通过分组低效和缓慢来连接列

Concatenate a column by grouping inefficient and slow

代码完全可以满足需要,但这些嵌套循环的速度非常慢(一次超过 10k 列),是否有更多 effective/simple 方法来做到这一点?

all_xpath<-unique(dplyr::pull(tibble(all.df['Xpath']), Xpath))
all_section<-unique(dplyr::pull(tibble(all.df['section']), section))
valueS<-unique(dplyr::pull(tibble(all.df['attr.']), attr.))

for (j in all_section){
  for (f in all_xpath){
    for (g in valueS){
      allx <-  data.frame(filter(all.df, section==j & attr. == g & Xpath == f), stringsAsFactors=FALSE)
      if (nrow(allx)>1){
          value<-paste(allx$value. , collapse = ' | ')
          if ( any(all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$elem.!='telecom')){
            all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$value. <- value
}}}}}

valueS<-unique(dplyr::pull(tibble(all.df['elem.']), elem.))
#concat by  elem. and xpath within component
for (j in all_section){
  for (f in all_xpath){
    for (g in valueS){
      allx <-  data.frame(filter(all.df, section==j & elem. == g & Xpath == f & attr.==""), stringsAsFactors=FALSE)
      if (nrow(allx)>1){
        value<-paste(allx$value. , collapse = ' | ')
        all.df[all.df$section==j & all.df$elem.==g & all.df$Xpath==f & all.df$attr.=="",]$value. <- value
      }}}}
all.df<-dplyr::distinct(all.df)

数据格式如下:

dput(all.df)
structure(list(section = c("LastFiled", "LastFiled", "LastFiled", 
"LastFiled", "Results", "Results", "History", "History", "Cable", 
"Cable", "Fan", "Fan"), elem. = c("code", "code", "id", "id", 
"code", "code", "effectiveTime", "effectiveTime", "value", "value", 
"code", "code"), attr. = c("code", "code", "root", "root", "code", 
"code", "value", "value", "", "", "", ""), value. = c("8462-4", 
"8462-5", "39156-7", "39156-8", "59408-11", "39156-13", "59408-13", 
"39156-14", "59408-8", "39156-9", "59408-9", "39156-11"), Xpath = c("/Document/othersection/entry/body/sceen/code", 
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/body/sceen/id", 
"/Document/othersection/entry/body/sceen/id", "/Document/othersection/entry/body/sceen/code", 
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/sceen/effectiveTime", 
"/Document/othersection/entry/sceen/effectiveTime", "/Document/othersection/entry/entryRelationship", 
"/Document/othersection/entry/entryRelationship", "/Document/othersection/entry/procedure/entryRelationship/sceen", 
"/Document/othersection/entry/procedure/entryRelationship/sceen"
)), row.names = c(NA, -12L), class = "data.frame")

结果应如下所示:

section     elem.       attr.   value.                  Xpath
LastFiled   code        code    8462-4 | 8462-5     /Document/othersection/entry/body/sceen/code
LastFiled   id          root    39156-7 | 39156-8   /Document/othersection/entry/body/sceen/id
Results     code        code    59408-11 | 39156-13 /Document/othersection/entry/body/sceen/code
History     effectiveTime   value   59408-13 | 39156-14     /Document/othersection/entry/sceen/effectiveTime
Cable       value               59408-8 | 39156-9 | 
                                59408-8 | 39156-9   /Document/othersection/entry/entryRelationship
Fan         code                59408-9 | 39156-11 | 
                                59408-9 | 39156-11  /Document/othersection/entry/procedure/entryRelationship

问题可以用两个 dplyr 管道解决,每个管道一个用于数据转换。结果包裹在 bind_rows.

library(dplyr)

bind_rows(
  all.df %>%
    filter(elem. != 'telecom') %>%
    group_by(section, attr., elem., Xpath) %>%
    summarise(value. = paste(value., collapse = "|"), .groups = "keep"),
  
  all.df %>%
    filter(attr. == "") %>%
    group_by(section, elem., Xpath) %>%
    summarise(value. = paste(value., collapse = "|"), .groups = "keep")
) %>%
  mutate(attr. = ifelse(is.na(attr.), "", attr.)) %>%
  relocate(value., .before = Xpath)
## A tibble: 8 x 5
## Groups:   section, attr., elem., Xpath [6]
#  section   attr.   elem.         value.            Xpath                                                         
#  <chr>     <chr>   <chr>         <chr>             <chr>                                                         
#1 Cable     ""      value         59408-8|39156-9   /Document/othersection/entry/entryRelationship                
#2 Fan       ""      code          59408-9|39156-11  /Document/othersection/entry/procedure/entryRelationship/sceen
#3 History   "value" effectiveTime 59408-13|39156-14 /Document/othersection/entry/sceen/effectiveTime              
#4 LastFiled "code"  code          8462-4|8462-5     /Document/othersection/entry/body/sceen/code                  
#5 LastFiled "root"  id            39156-7|39156-8   /Document/othersection/entry/body/sceen/id                    
#6 Results   "code"  code          59408-11|39156-13 /Document/othersection/entry/body/sceen/code                  
#7 Cable     ""      value         59408-8|39156-9   /Document/othersection/entry/entryRelationship                
#8 Fan       ""      code          59408-9|39156-11  /Document/othersection/entry/procedure/entryRelationship/sceen