用 dcast 重塑长到宽

Question

我需要将数据框 csf_antibio（下面的示例）从长格式重塑为宽格式，按行分隔每个患者的所有值。

study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12

我尝试了以下但没有成功：

csfmelt <- melt(csf_antibio, id.vars=1:1)
csf <- dcast(csfmelt, study_id ~ variable, value.var = "value", fun.aggregate = sum)

我希望最终的数据框每行都有每个研究 ID 和变量

study_id,proanbx_dt1,proanbx_tm1,name1,othername1,route1,dosage1,units1,doses1,freq1,proanbx_dt2,proanbx_tm2,name2,othername2,route2,dosage2,units2,doses2,freq2,proanbx_dt3,proanbx_tm3,name3,othername3,route3,dosage3,units3,doses3,freq3,proanbx_dt4,proanbx_tm4,name4,othername4,route4,dosage4,units4,doses4,freq4
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13, 2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9, 2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12

提前致谢！

Answer 1

您想要的输出包含 dcast 无法自然推断的“数字”部分。我们可以使用 ave 相对容易地添加它（基础 R，当然这可以在 data.table 或 dplyr 分组中同样容易地完成）。

reshape2 和基础 R

csfmelt$num <- ave(seq(nrow(csfmelt)), csfmelt[c("study_id","variable")], FUN = seq_along)
head(csfmelt)
#    study_id   variable      value num
# 1 CHLA_0001 proanbx_dt 2021-07-22   1
# 2 CHLA_0001 proanbx_dt 2021-07-22   2
# 3 CHLA_0001 proanbx_dt 2021-07-22   3
# 4 CHLA_0001 proanbx_dt 2021-08-09   4
# 5 CHLA_0002 proanbx_dt 2020-12-18   1
# 6 CHLA_0002 proanbx_dt 2020-12-18   2
csfwide <- reshape2::dcast(csfmelt, study_id ~ variable + num, value.var = "value")
csfwide
#    study_id proanbx_dt_1 proanbx_dt_2 proanbx_dt_3 proanbx_dt_4 proanbx_tm_1 proanbx_tm_2 proanbx_tm_3 proanbx_tm_4      name_1        name_2     name_3     name_4 othername_1 othername_2                   othername_3 othername_4 route_1 route_2 route_3 route_4 dosage_1 dosage_2 dosage_3 dosage_4 units_1 units_2 units_3 units_4 doses_1 doses_2 doses_3 doses_4 freq_1 freq_2 freq_3 freq_4
# 1 CHLA_0001   2021-07-22   2021-07-22   2021-07-22   2021-08-09     20:01:00     20:19:00     23:17:00     19:34:00 ceftriaxone metronidazole vancomycin  cefazolin                                                                        IV      IV      IV      IV     1250      250      350      738      mg      mg      mg      mg       4       5       3       1     13      9      6      8
# 2 CHLA_0002   2020-12-18   2020-12-18   2020-12-19   2020-12-20      0:30:00      1:03:00     18:15:00      4:18:00    cefepime    vancomycin   cefepime vancomycin                                                                        IV      IV      IV      IV       75       23       60       24      mg      mg      mg      mg       5       4       6       4      8     13      8     12
# 3 CHLA_0003   2021-04-20   2021-04-21   2021-04-22         <NA>     15:17:00      2:20:00     14:16:00         <NA>   meropenem     meropenem      Other       <NA>                         sulfamethoxazole-trimethoprim        <NA>      IV      IV      IV    <NA>      200      400       50     <NA>      mg      mg      mg    <NA>       2      17       9    <NA>      1      8     12   <NA>

列顺序不是您要求的，但可以稍微符合一下：

variables <- as.character(unique(csfmelt$variable))
sub(".*_", "", names(csfwide)[-(1:2)])
#  [1] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
sub("_[^_]*$", "", names(csfwide)[-(1:2)])
#  [1] "proanbx_dt" "proanbx_dt" "proanbx_dt" "proanbx_tm" "proanbx_tm" "proanbx_tm" "proanbx_tm" "name"       "name"       "name"       "name"       "othername"  "othername"  "othername"  "othername"  "route"      "route"      "route"      "route"      "dosage"    
# [21] "dosage"     "dosage"     "dosage"     "units"      "units"      "units"      "units"      "doses"      "doses"      "doses"      "doses"      "freq"       "freq"       "freq"       "freq"      

nms <- names(csfwide)[-(1:2)]
newnms <- nms[order(sub(".*_", "", nms), match(nms, variables))]
csfwide2 <- subset(csfwide, select = c(names(csfwide)[1:2], newnms))
csfwide2
#    study_id proanbx_dt_1 proanbx_tm_1      name_1 othername_1 route_1 dosage_1 units_1 doses_1 freq_1 proanbx_dt_2 proanbx_tm_2        name_2 othername_2 route_2 dosage_2 units_2 doses_2 freq_2 proanbx_dt_3 proanbx_tm_3     name_3                   othername_3 route_3 dosage_3 units_3 doses_3 freq_3 proanbx_dt_4 proanbx_tm_4     name_4 othername_4 route_4 dosage_4 units_4 doses_4 freq_4
# 1 CHLA_0001   2021-07-22     20:01:00 ceftriaxone                  IV     1250      mg       4     13   2021-07-22     20:19:00 metronidazole                  IV      250      mg       5      9   2021-07-22     23:17:00 vancomycin                                    IV      350      mg       3      6   2021-08-09     19:34:00  cefazolin                  IV      738      mg       1      8
# 2 CHLA_0002   2020-12-18      0:30:00    cefepime                  IV       75      mg       5      8   2020-12-18      1:03:00    vancomycin                  IV       23      mg       4     13   2020-12-19     18:15:00   cefepime                                    IV       60      mg       6      8   2020-12-20      4:18:00 vancomycin                  IV       24      mg       4     12
# 3 CHLA_0003   2021-04-20     15:17:00   meropenem                  IV      200      mg       2      1   2021-04-21      2:20:00     meropenem                  IV      400      mg      17      8   2021-04-22     14:16:00      Other sulfamethoxazole-trimethoprim      IV       50      mg       9     12         <NA>         <NA>       <NA>        <NA>    <NA>     <NA>    <NA>    <NA>   <NA>

Answer 2

@r2evans 给了你一个很好的答案，但我在考虑你对日期和时间的评论。你没有提供你是如何收集这些数据的，所以我不能告诉你如何以这种方式导入它。但是，我确实在以下代码中转换了这些变量。话虽这么说，添加日期是没有意义的。我当时在想，经过的天数和时间量可能更符合您为那些特定变量寻找的内容。不幸的是，我无法弄清楚如何让它与 reshape2 一起工作。这使用 dplyr、tidyselect 和 hms。不过，您只需调用 dplyr，因为我已经附加了适用函数的包。（不过，您需要安装软件包。）

我没有保留 name 和 othername 因为它们不是多个条目。

library(dplyr)

csf_antibio = read.table(header = T, sep = ",", text = "study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12")

因为时间真正和日期挂钩，所以我写了一个处理时差的函数

timer <- function(df1){
  maxtm = max(df1$proanbx_tm[df1$proanbx_dt == max(df1$proanbx_dt)]) %>% hms::as_hms()
  mintm = min(df1$proanbx_tm[df1$proanbx_dt == min(df1$proanbx_dt)]) %>% hms::as_hms()
  if(maxtm > mintm){
    tmr = (maxtm - mintm) %>% hms::as_hms() # captures mult entries in the same day
  } else if(mintm > maxtm) {
    tmr = (maxtm - mintm) + hms::as_hms('24:00:00')  # add a full day
  } else {      # only one entry or the time is identical in max/min
    tmr = hms::as_hms('0')
  }
  return(tmr)
}

我将列名称收集到 return 列的原始顺序。

ordNames = names(csf_antibio) # collect names to return order to columns
#  [1] "study_id"   "proanbx_dt" "proanbx_tm" "name"       "othername"  "route"     
#  [7] "dosage"     "units"      "doses"      "freq"       

# names kept = ordNames[,c(1:3,6:10)]

求时间的和与差

csf2 <- csf_antibio %>% 
  mutate(proanbx_dt = as.Date(proanbx_dt),         # convert to date
         proanbx_tm = hms::as_hms(proanbx_tm)) %>% # convert to time
  group_by(study_id) %>%                           # group by study
  summarise(proanbx_tm = timer(.data), # difference in time
            proanbx_dt = max(proanbx_dt) - min(proanbx_dt), # difference in days
            across(tidyselect:::where(is.integer), sum),
            units = "mg",
            route = "IV") %>% 
  select(ordNames[c(1:3,6:10)])
head(csf2)
# # A tibble: 3 × 8
#   study_id  proanbx_dt proanbx_tm route dosage units doses  freq
#   <chr>     <drtn>     <time>     <chr>  <int> <chr> <int> <int>
# 1 CHLA_0001 18 days    23:33      IV      2588 mg       13    36
# 2 CHLA_0002  2 days    03:48      IV       182 mg       19    41
# 3 CHLA_0003  2 days    22:59      IV       650 mg       28    21

用 dcast 重塑长到宽

Reshape long to wide with dcast

r

dcast

reshape2 和基础 R