通过分组低效和缓慢来连接列
Concatenate a column by grouping inefficient and slow
代码完全可以满足需要,但这些嵌套循环的速度非常慢(一次超过 10k 列),是否有更多 effective/simple 方法来做到这一点?
all_xpath<-unique(dplyr::pull(tibble(all.df['Xpath']), Xpath))
all_section<-unique(dplyr::pull(tibble(all.df['section']), section))
valueS<-unique(dplyr::pull(tibble(all.df['attr.']), attr.))
for (j in all_section){
for (f in all_xpath){
for (g in valueS){
allx <- data.frame(filter(all.df, section==j & attr. == g & Xpath == f), stringsAsFactors=FALSE)
if (nrow(allx)>1){
value<-paste(allx$value. , collapse = ' | ')
if ( any(all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$elem.!='telecom')){
all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$value. <- value
}}}}}
valueS<-unique(dplyr::pull(tibble(all.df['elem.']), elem.))
#concat by elem. and xpath within component
for (j in all_section){
for (f in all_xpath){
for (g in valueS){
allx <- data.frame(filter(all.df, section==j & elem. == g & Xpath == f & attr.==""), stringsAsFactors=FALSE)
if (nrow(allx)>1){
value<-paste(allx$value. , collapse = ' | ')
all.df[all.df$section==j & all.df$elem.==g & all.df$Xpath==f & all.df$attr.=="",]$value. <- value
}}}}
all.df<-dplyr::distinct(all.df)
数据格式如下:
dput(all.df)
structure(list(section = c("LastFiled", "LastFiled", "LastFiled",
"LastFiled", "Results", "Results", "History", "History", "Cable",
"Cable", "Fan", "Fan"), elem. = c("code", "code", "id", "id",
"code", "code", "effectiveTime", "effectiveTime", "value", "value",
"code", "code"), attr. = c("code", "code", "root", "root", "code",
"code", "value", "value", "", "", "", ""), value. = c("8462-4",
"8462-5", "39156-7", "39156-8", "59408-11", "39156-13", "59408-13",
"39156-14", "59408-8", "39156-9", "59408-9", "39156-11"), Xpath = c("/Document/othersection/entry/body/sceen/code",
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/body/sceen/id",
"/Document/othersection/entry/body/sceen/id", "/Document/othersection/entry/body/sceen/code",
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/sceen/effectiveTime",
"/Document/othersection/entry/sceen/effectiveTime", "/Document/othersection/entry/entryRelationship",
"/Document/othersection/entry/entryRelationship", "/Document/othersection/entry/procedure/entryRelationship/sceen",
"/Document/othersection/entry/procedure/entryRelationship/sceen"
)), row.names = c(NA, -12L), class = "data.frame")
结果应如下所示:
section elem. attr. value. Xpath
LastFiled code code 8462-4 | 8462-5 /Document/othersection/entry/body/sceen/code
LastFiled id root 39156-7 | 39156-8 /Document/othersection/entry/body/sceen/id
Results code code 59408-11 | 39156-13 /Document/othersection/entry/body/sceen/code
History effectiveTime value 59408-13 | 39156-14 /Document/othersection/entry/sceen/effectiveTime
Cable value 59408-8 | 39156-9 |
59408-8 | 39156-9 /Document/othersection/entry/entryRelationship
Fan code 59408-9 | 39156-11 |
59408-9 | 39156-11 /Document/othersection/entry/procedure/entryRelationship
问题可以用两个 dplyr
管道解决,每个管道一个用于数据转换。结果包裹在 bind_rows
.
中
library(dplyr)
bind_rows(
all.df %>%
filter(elem. != 'telecom') %>%
group_by(section, attr., elem., Xpath) %>%
summarise(value. = paste(value., collapse = "|"), .groups = "keep"),
all.df %>%
filter(attr. == "") %>%
group_by(section, elem., Xpath) %>%
summarise(value. = paste(value., collapse = "|"), .groups = "keep")
) %>%
mutate(attr. = ifelse(is.na(attr.), "", attr.)) %>%
relocate(value., .before = Xpath)
## A tibble: 8 x 5
## Groups: section, attr., elem., Xpath [6]
# section attr. elem. value. Xpath
# <chr> <chr> <chr> <chr> <chr>
#1 Cable "" value 59408-8|39156-9 /Document/othersection/entry/entryRelationship
#2 Fan "" code 59408-9|39156-11 /Document/othersection/entry/procedure/entryRelationship/sceen
#3 History "value" effectiveTime 59408-13|39156-14 /Document/othersection/entry/sceen/effectiveTime
#4 LastFiled "code" code 8462-4|8462-5 /Document/othersection/entry/body/sceen/code
#5 LastFiled "root" id 39156-7|39156-8 /Document/othersection/entry/body/sceen/id
#6 Results "code" code 59408-11|39156-13 /Document/othersection/entry/body/sceen/code
#7 Cable "" value 59408-8|39156-9 /Document/othersection/entry/entryRelationship
#8 Fan "" code 59408-9|39156-11 /Document/othersection/entry/procedure/entryRelationship/sceen
代码完全可以满足需要,但这些嵌套循环的速度非常慢(一次超过 10k 列),是否有更多 effective/simple 方法来做到这一点?
all_xpath<-unique(dplyr::pull(tibble(all.df['Xpath']), Xpath))
all_section<-unique(dplyr::pull(tibble(all.df['section']), section))
valueS<-unique(dplyr::pull(tibble(all.df['attr.']), attr.))
for (j in all_section){
for (f in all_xpath){
for (g in valueS){
allx <- data.frame(filter(all.df, section==j & attr. == g & Xpath == f), stringsAsFactors=FALSE)
if (nrow(allx)>1){
value<-paste(allx$value. , collapse = ' | ')
if ( any(all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$elem.!='telecom')){
all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$value. <- value
}}}}}
valueS<-unique(dplyr::pull(tibble(all.df['elem.']), elem.))
#concat by elem. and xpath within component
for (j in all_section){
for (f in all_xpath){
for (g in valueS){
allx <- data.frame(filter(all.df, section==j & elem. == g & Xpath == f & attr.==""), stringsAsFactors=FALSE)
if (nrow(allx)>1){
value<-paste(allx$value. , collapse = ' | ')
all.df[all.df$section==j & all.df$elem.==g & all.df$Xpath==f & all.df$attr.=="",]$value. <- value
}}}}
all.df<-dplyr::distinct(all.df)
数据格式如下:
dput(all.df)
structure(list(section = c("LastFiled", "LastFiled", "LastFiled",
"LastFiled", "Results", "Results", "History", "History", "Cable",
"Cable", "Fan", "Fan"), elem. = c("code", "code", "id", "id",
"code", "code", "effectiveTime", "effectiveTime", "value", "value",
"code", "code"), attr. = c("code", "code", "root", "root", "code",
"code", "value", "value", "", "", "", ""), value. = c("8462-4",
"8462-5", "39156-7", "39156-8", "59408-11", "39156-13", "59408-13",
"39156-14", "59408-8", "39156-9", "59408-9", "39156-11"), Xpath = c("/Document/othersection/entry/body/sceen/code",
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/body/sceen/id",
"/Document/othersection/entry/body/sceen/id", "/Document/othersection/entry/body/sceen/code",
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/sceen/effectiveTime",
"/Document/othersection/entry/sceen/effectiveTime", "/Document/othersection/entry/entryRelationship",
"/Document/othersection/entry/entryRelationship", "/Document/othersection/entry/procedure/entryRelationship/sceen",
"/Document/othersection/entry/procedure/entryRelationship/sceen"
)), row.names = c(NA, -12L), class = "data.frame")
结果应如下所示:
section elem. attr. value. Xpath
LastFiled code code 8462-4 | 8462-5 /Document/othersection/entry/body/sceen/code
LastFiled id root 39156-7 | 39156-8 /Document/othersection/entry/body/sceen/id
Results code code 59408-11 | 39156-13 /Document/othersection/entry/body/sceen/code
History effectiveTime value 59408-13 | 39156-14 /Document/othersection/entry/sceen/effectiveTime
Cable value 59408-8 | 39156-9 |
59408-8 | 39156-9 /Document/othersection/entry/entryRelationship
Fan code 59408-9 | 39156-11 |
59408-9 | 39156-11 /Document/othersection/entry/procedure/entryRelationship
问题可以用两个 dplyr
管道解决,每个管道一个用于数据转换。结果包裹在 bind_rows
.
library(dplyr)
bind_rows(
all.df %>%
filter(elem. != 'telecom') %>%
group_by(section, attr., elem., Xpath) %>%
summarise(value. = paste(value., collapse = "|"), .groups = "keep"),
all.df %>%
filter(attr. == "") %>%
group_by(section, elem., Xpath) %>%
summarise(value. = paste(value., collapse = "|"), .groups = "keep")
) %>%
mutate(attr. = ifelse(is.na(attr.), "", attr.)) %>%
relocate(value., .before = Xpath)
## A tibble: 8 x 5
## Groups: section, attr., elem., Xpath [6]
# section attr. elem. value. Xpath
# <chr> <chr> <chr> <chr> <chr>
#1 Cable "" value 59408-8|39156-9 /Document/othersection/entry/entryRelationship
#2 Fan "" code 59408-9|39156-11 /Document/othersection/entry/procedure/entryRelationship/sceen
#3 History "value" effectiveTime 59408-13|39156-14 /Document/othersection/entry/sceen/effectiveTime
#4 LastFiled "code" code 8462-4|8462-5 /Document/othersection/entry/body/sceen/code
#5 LastFiled "root" id 39156-7|39156-8 /Document/othersection/entry/body/sceen/id
#6 Results "code" code 59408-11|39156-13 /Document/othersection/entry/body/sceen/code
#7 Cable "" value 59408-8|39156-9 /Document/othersection/entry/entryRelationship
#8 Fan "" code 59408-9|39156-11 /Document/othersection/entry/procedure/entryRelationship/sceen