R 字符串拆分,转换为带连续索引的规范化(长)格式

R string split, to normalized (long) format with running index

我有这个数据框

# Sample input: two rule ids, each with a comma-separated string stored as a
# factor.
data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)

#  rule.id           rules
#1       1 Lamp1.1,Lamp1.2
#2       2 Lamp2.1,Lamp2.2

我需要按逗号 (",") 拆分 "rules" 列;每个值中可能包含多个逗号分隔的元素(不限于示例中的 2 个)。拆分后需要转换为规范化(长)格式,并保留原始 df 中对应的 rule.id 值。结果应如下所示:

# Expected long-format result: one row per split value, each keeping the
# rule.id of the row it came from. The last value is "Lamp2.2", the second
# element of rule 2.
data.frame(
  rule.id = c(1, 1, 2, 2),
  lhs = c("Lamp1.1", "Lamp1.2", "Lamp2.1", "Lamp2.2"),
  stringsAsFactors = FALSE
)

#  rule.id     lhs
#1       1 Lamp1.1
#2       1 Lamp1.2
#3       2 Lamp2.1
#4       2 Lamp2.2

我有一段代码可以完成字符串拆分并转换为规范化(长)格式,但不确定如何满足保留 rule.id 的要求:

# Attempt at splitting the rules into long format (rule.id is not yet carried
# along correctly -- that is the open question).
rules_chr <- as.character(rules.df$rules)
# Drop the first character of every rule string (presumably a leading "{" --
# TODO confirm against the real data).
rules_body <- substr(rules_chr, 2, nchar(rules_chr))
# Keep only the text before the first literal "} =>".
lhs_text <- vapply(
  strsplit(rules_body, "} =>", fixed = TRUE),
  function(x) x[1],
  character(1)
)
lhs.norm <- as.data.frame(
  cbind(
    # NOTE(review): the sample data names this column `rule.id`; if rules.df
    # has no column starting with "ruleid", `$ruleid` is NULL and the id is
    # silently dropped from the result -- verify.
    rules.df$ruleid,
    # One element per comma-separated item across all rules.
    unlist(strsplit(lhs_text, ","))
  )
)

感谢 @akrun 提供的解决方案,使用:

# Split the "lhs" column on "," and return the result in long format.
cSplit(rules.df.lhs, "lhs", ",", "long")

我用 100 万行数据做了基准测试,耗时 19 秒(结果约为 200 万行)。

我们可以使用 splitstackshape 中的 cSplit:
library(splitstackshape)
# Split the "rules" column of `df` on "," and stack the pieces in long format.
cSplit(df, splitCols = "rules", sep = ",", direction = "long")
#   rule.id   rules
#1:       1 Lamp1.1
#2:       1 Lamp1.2
#3:       2 Lamp2.1
#4:       2 Lamp2.2

如果这是一个巨大的数据集,我们可以使用stringi拆分

library(stringi)
# Split every rule string on literal commas; `lst` holds one character vector
# per input row.
lst <- stri_split_fixed(str = df$rules, pattern = ",")
# Repeat each rule.id once for every piece produced by its own row, so ids
# and pieces stay aligned.
df2 <- data.frame(
  rule.id = rep(df$rule.id, times = lengths(lst)),
  rules = unlist(lst)
)
df2
#   rule.id   rules
#1       1 Lamp1.1
#2       1 Lamp1.2
#3       2 Lamp2.1
#4       2 Lamp2.2

另一种选择是data.table

library(data.table)
# setDT() converts `df` to a data.table by reference, so `df` itself changes.
setDT(df)
# For each rule.id, split its rule string on "," and return the pieces as rows.
df[, strsplit(as.character(rules), split = ","), by = "rule.id"]

借助 R 新增的原生管道运算符 (|>),我们可以让 @akrun 基于 stringi::stri_split_fixed 的方案更快。原答案还利用了 cbind() 对 rule.id 列的循环补齐 (recycling);请注意,循环补齐只是按 1, 2, 1, 2 的顺序重复 rule.id,并不会按每行拆分出的元素个数对齐。

# Split each rule string on literal commas, then repeat every rule.id once per
# piece produced by its own row. Binding the pieces straight to d[1] would
# rely on recycling, which cycles rule.id as 1, 2, 1, 2 and pairs Lamp1.2 with
# rule 2 and Lamp2.1 with rule 1.
parts <- stringi::stri_split_fixed(d$rules, ",")
data.frame(
  rules = unlist(parts),
  rule.id = rep(d$rule.id, lengths(parts))
)
#     rules rule.id
# 1 Lamp1.1       1
# 2 Lamp1.2       1
# 3 Lamp2.1       2
# 4 Lamp2.2       2

基准

# Attach every package used by the benchmark. invisible() keeps the value
# returned by library() from being printed.
invisible(lapply(
  c("splitstackshape", "stringi", "data.table", "reshape2"),
  library,
  character.only = TRUE
))
# 1e6-row test data: the two sample rule strings in `d` are recycled.
dl <- data.frame(rule.id = 1:1e6, rules = d$rules)
# Time each approach three times on the same input.
microbenchmark::microbenchmark(
  melt = cbind(
    dl[1],
    do.call(rbind, strsplit(as.character(dl$rules), ",", fixed = TRUE))
  ) |>
    reshape2::melt("rule.id"),
  # NOTE(review): this entry relies on cbind() recycling dl[1], so rule.id is
  # repeated as 1..1e6, 1..1e6 instead of once per piece of its own row; its
  # result is not aligned like the other entries. The expression is kept as-is
  # so the recorded timings remain applicable.
  cbind = stri_split_fixed(dl$rules, ",") |>
    unlist() |>
    cbind(dl[1]),
  dtable = as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
  cSplit = cSplit(dl, "rules", ",", "long"),
  stringi = {
    lst <- stri_split_fixed(dl$rules, ",")
    data.frame(
      rule.id = rep(dl$rule.id, lengths(lst)),
      rules = unlist(lst)
    )
  },
  times = 3L
)
# Unit: milliseconds
#    expr        min         lq       mean     median         uq        max neval  cld
#    melt  1472.5459  1518.5649  1608.8124  1564.5838  1676.9456  1789.3075     3  b  
#   cbind   335.7105   365.9372   380.9120   396.1639   403.5128   410.8617     3 a   
#  dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951     3    d
#  cSplit  3003.0660  3079.3098  3232.6108  3155.5537  3347.3832  3539.2128     3   c 
# stringi   421.1481   469.1054   518.9577   517.0627   567.8626   618.6624     3 a   
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE

注意:这些警告源于 cSplit(),其代码可能已经很长时间没有更新了。


数据

# Sample data used by the examples above: two rule ids, each with a
# comma-separated string stored as a factor.
d <- data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)