R:如何对数据进行分组并在数据框中的不同组内分配因子水平?

R: How to group data and assign factor levels within different groups in a dataframe?

# Example data (looks like dput() output): a data.frame with 95 rows and six
# columns -- drug, dose (numeric), drug.dose, combo, cluster (integer), dosage.
# NOTE: the expression is not assigned to a name here; the pipelines further
# down refer to this data frame as `df`.
structure(list(drug = c("Chlorambucil", "Fludarabine", "FludarabineMafosfamide", 
"NDI031301", "CMPB", "Tofacitinib", "Peficitinib", "FludarabineMafosfamide", 
"PDB", "Filgotinib", "Dexamethasone", "CMPA", "Lenalidomide", 
"Dexamethasone", "Gandotinib", "NDI031301", "Filgotinib", "PDB", 
"CMPB", "Ruxolitinib", "CC122", "Atovaquone", "CC122", "SAR20347", 
"Momelotinib", "Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine", 
"Cerdulatinib", "Lenalidomide", "Atovaquone", "Chlorambucil", 
"CMPA", "FludarabineMafosfamide", "FludarabineMafosfamide", "Fludarabine", 
"Atovaquone", "Momelotinib", "PDB", "Filgotinib", "Chlorambucil", 
"Dexamethasone", "Tofacitinib", "SAR20347", "CMPB", "Momelotinib", 
"Fludarabine", "Cerdulatinib", "Peficitinib", "Atovaquone", "CC122", 
"CMPA", "NDI031301", "PDB", "CMPA", "Lenalidomide", "SAR20347", 
"Tofacitinib", "Gandotinib", "Lenalidomide", "Peficitinib", "CMPB", 
"CC122", "Dexamethasone", "FludarabineMafosfamide", "Ruxolitinib", 
"CMPB", "Peficitinib", "Tofacitinib", "FludarabineMafosfamide", 
"Filgotinib", "Dexamethasone", "CMPA", "Dexamethasone", "Gandotinib", 
"NDI031301", "Filgotinib", "SAR20347", "CMPB", "Ruxolitinib", 
"Peficitinib", "Atovaquone", "CC122", "SAR20347", "Momelotinib", 
"Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine", "Cerdulatinib", 
"Atovaquone", "Chlorambucil", "CMPA", "NDI031301"), dose = c(1, 
1, 10, 1, 0.1, 1, 1, 1, 100, 1, 10, 1, 10, 100, 1, 10, 10, 10, 
1, 1, 0.1, 3, 1, 1, 1, 0.1, 10, 1, 10, 1, 1, 30, 30, 0.1, 0.01, 
0.1, 0.01, 0.3, 0.001, 1, 0.01, 0.3, 0.1, 0.01, 0.1, 0.001, 0.01, 
0.1, 0.01, 0.1, 0.03, 0.01, 0.01, 0.01, 0.1, 0.001, 0.01, 0.01, 
0.1, 0.01, 0.1, 0.01, 0.01, 0.001, 1, 10, 10, 0.1, 1, 1, 1, 1, 
10, 1, 100, 1, 10, 10, 10, 1, 1, 10, 3, 1, 1, 1, 0.1, 10, 10, 
1, 1, 30, 30, 0.1, 1), drug.dose = c("Chlorambucil_1uM", "Fludarabine_1uM", 
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "NDI031301_1uM", 
"CMPB_0.1uM", "Tofacitinib_1uM", "Peficitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml", 
"PDB_100ng/ml", "Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM", 
"Lenalidomide_10uM", "Dexamethasone_100uM", "Gandotinib_1uM", 
"NDI031301_10uM", "Filgotinib_10uM", "PDB_10ng/ml", "CMPB_1uM", 
"Ruxolitinib_1uM", "CC122_0.1uM", "Atovaquone_3uM", "CC122_1uM", 
"SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM", "Tofacitinib_10uM", 
"Fludarabine_1ug/ml", "Fludarabine_10ug/ml", "Cerdulatinib_1uM", 
"Lenalidomide_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM", 
"FludarabineMafosfamide_0.01ug/mlplus1ug/ml", "FludarabineMafosfamide_0.1ug/mlplus1ug/ml", 
"Fludarabine_0.01ug/ml", "Atovaquone_0.3uM", "Momelotinib_0.001uM", 
"PDB_1ng/ml", "Filgotinib_0.01uM", "Chlorambucil_0.3uM", "Dexamethasone_0.1uM", 
"Tofacitinib_0.01uM", "SAR20347_0.1uM", "CMPB_0.001uM", "Momelotinib_0.01uM", 
"Fludarabine_0.1ug/ml", "Cerdulatinib_0.01uM", "Peficitinib_0.1uM", 
"Atovaquone_0.03uM", "CC122_0.01uM", "CMPA_0.01uM", "NDI031301_0.01uM", 
"PDB_0.1ng/ml", "CMPA_0.001uM", "Lenalidomide_0.01uM", "SAR20347_0.01uM", 
"Tofacitinib_0.1uM", "Gandotinib_0.01uM", "Lenalidomide_0.1uM", 
"Peficitinib_0.01uM", "CMPB_0.01uM", "CC122_0.001uM", "Dexamethasone_1uM", 
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "Ruxolitinib_10uM", 
"CMPB_0.1uM", "Peficitinib_1uM", "Tofacitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml", 
"Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM", "Dexamethasone_100uM", 
"Gandotinib_1uM", "NDI031301_10uM", "Filgotinib_10uM", "SAR20347_10uM", 
"CMPB_1uM", "Ruxolitinib_1uM", "Peficitinib_10uM", "Atovaquone_3uM", 
"CC122_1uM", "SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM", 
"Tofacitinib_10uM", "Fludarabine_10ug/ml", "Fludarabine_1ug/ml", 
"Cerdulatinib_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM", 
"NDI031301_1uM"), combo = c("none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none"), cluster = c(3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L), dosage = c("1uM", "1uM", "10ug/mlplus1ug/ml", 
"1uM", "0.1uM", "1uM", "1uM", "1ug/mlplus1ug/ml", "100ng/ml", 
"1uM", "10uM", "1uM", "10uM", "100uM", "1uM", "10uM", "10uM", 
"10ng/ml", "1uM", "1uM", "0.1uM", "3uM", "1uM", "1uM", "1uM", 
"0.1uM", "10uM", "1ug/ml", "10ug/ml", "1uM", "1uM", "30uM", "30uM", 
"0.1uM", "0.01ug/mlplus1ug/ml", "0.1ug/mlplus1ug/ml", "0.01ug/ml", 
"0.3uM", "0.001uM", "1ng/ml", "0.01uM", "0.3uM", "0.1uM", "0.01uM", 
"0.1uM", "0.001uM", "0.01uM", "0.1ug/ml", "0.01uM", "0.1uM", 
"0.03uM", "0.01uM", "0.01uM", "0.01uM", "0.1ng/ml", "0.001uM", 
"0.01uM", "0.01uM", "0.1uM", "0.01uM", "0.1uM", "0.01uM", "0.01uM", 
"0.001uM", "1uM", "10ug/mlplus1ug/ml", "10uM", "0.1uM", "1uM", 
"1uM", "1ug/mlplus1ug/ml", "1uM", "10uM", "1uM", "100uM", "1uM", 
"10uM", "10uM", "10uM", "1uM", "1uM", "10uM", "3uM", "1uM", "1uM", 
"1uM", "0.1uM", "10uM", "10ug/ml", "1ug/ml", "1uM", "30uM", "30uM", 
"0.1uM", "1uM")), row.names = c(NA, -95L), class = "data.frame")

抱歉菜鸟问题,我有这个复杂的药物集群数据,如屏幕截图所示。

我想将它们显示为堆叠式 geom_col 类型的绘图,x 轴为“药物”,Y 轴为出现次数,并按簇排列。

到目前为止还很简单。但我也想通过使用颜色填充来匹配它们的剂量来查看这些药物和剂量在每个集群中的分布。实际剂量有不同的单位等

我已将数值剂量提取到单独的一列中。我想指定一个因子向量(“最小”、“低”、“高”、“最大”)来反映剂量水平,因为我知道每种药物有 4 种不同的剂量。

问题是不同药物的数值剂量范围不同,所以我不能简单地对整列剂量直接使用排名(rank)。

例如,一些药物的剂量范围是 0.03 到 30,一些是 0.3 到 300,还有一些是 0.01 到 10。

那么,我如何利用该数值剂量列,为每种药物的各个剂量分配剂量水平?

这是一种使用 rank() 和连接(join)的方法。我们可以利用这样一个事实:同一种药物的各个剂量使用相同的单位,因此可以直接在每种药物内部对数值剂量进行排名。

library(dplyr)
# Build a per-drug lookup table that maps each distinct dose to a dose level.
# Doses are ranked within each drug, so the labels are relative to that drug's
# own dose range rather than to absolute values.
df %>%
  arrange(drug) %>% # sort by drug so the printed result is easy to read
  select(drug, dose) %>% # keep the key explicitly; avoids the implicit
                         # "Adding missing grouping variables" message
  distinct() %>% # one row per (drug, dose); first occurrence is kept
  group_by(drug) %>% # rank doses separately within each drug
  mutate(
    rank = rank(dose), # 1 = smallest dose of this drug
    # Index the labels by rank. This assumes at most four distinct doses per
    # drug; a fifth would yield NA, and a drug with fewer than four doses never
    # reaches "max".
    category = c("min", "low", "high", "max")[rank]
  )
# A tibble: 67 x 4
# Groups:   drug [19]
   drug           dose  rank category
   <chr>         <dbl> <dbl> <chr>   
 1 Atovaquone    3         3 high    
 2 Atovaquone   30         4 max     
 3 Atovaquone    0.3       2 low     
 4 Atovaquone    0.03      1 min     
 5 CC122         0.1       3 high    
 6 CC122         1         4 max     
 7 CC122         0.01      2 low     
 8 CC122         0.001     1 min     
 9 Cerdulatinib  1         2 low     
10 Cerdulatinib  0.01      1 min     
# … with 57 more rows

现在我们可以把这个结果连接(join)回原来的 data.frame:

# Compute the per-drug dose levels and attach them to every row of the
# original data frame.
df %>%
  arrange(drug) %>% # sort by drug so the printed result is easy to read
  select(drug, dose) %>% # keep the key explicitly; avoids the implicit
                         # "Adding missing grouping variables" message
  distinct() %>% # one row per (drug, dose); first occurrence is kept
  group_by(drug) %>% # rank doses separately within each drug
  mutate(
    rank = rank(dose), # 1 = smallest dose of this drug
    # Assumes at most four distinct doses per drug; a fifth would yield NA.
    category = c("min", "low", "high", "max")[rank]
  ) %>%
  # Name the join keys explicitly instead of relying on the natural join over
  # all shared columns; these are the only columns the two tables share.
  right_join(df, by = c("drug", "dose"))
# A tibble: 95 x 8
# Groups:   drug [19]
   drug        dose dosage  rank category drug.dose         combo cluster
   <chr>      <dbl> <chr>  <dbl> <chr>    <chr>             <chr>   <int>
 1 Atovaquone  3    3uM        3 high     Atovaquone_3uM    none        4
 2 Atovaquone  3    3uM        3 high     Atovaquone_3uM    none        6
 3 Atovaquone 30    30uM       4 max      Atovaquone_30uM   none        4
 4 Atovaquone 30    30uM       4 max      Atovaquone_30uM   none        6
 5 Atovaquone  0.3  0.3uM      2 low      Atovaquone_0.3uM  none        5
 6 Atovaquone  0.03 0.03uM     1 min      Atovaquone_0.03uM none        5
 7 CC122       0.1  0.1uM      3 high     CC122_0.1uM       none        4
 8 CC122       1    1uM        4 max      CC122_1uM         none        4
 9 CC122       1    1uM        4 max      CC122_1uM         none        6
10 CC122       0.01 0.01uM     2 low      CC122_0.01uM      none        5
# … with 85 more rows