R 字符串拆分:转换为带连续索引的规范化(长)格式
R string split, to normalized (long) format with running index
我有这个数据框
# Example input: one row per rule; `rules` is a factor whose values are
# comma-separated item lists.
data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)
# rule.id rules
#1 1 Lamp1.1,Lamp1.2
#2 2 Lamp2.1,Lamp2.2
我需要以逗号 (",") 为分隔符拆分 "rules" 列(逗号可能出现多次,不仅限于示例中的 2 个元素),然后将结果转换为规范化(长)格式,并保留原始 df 中对应的 rule.id 值。
结果应如下所示:
# Expected long-format result: one row per comma-separated item, each row
# keeping the rule.id of the input row it came from.
structure(list(rule.id = c(1, 1, 2, 2), lhs = c("Lamp1.1", "Lamp1.2",
"Lamp2.1", "Lamp2.2")), .Names = c("rule.id", "lhs"), row.names = c(NA,
-4L), class = "data.frame")
# rule.id lhs
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
我有一个代码可以处理 str 拆分和规范化(长)格式,但不确定如何处理 rule.id 要求
# Attempted long-format conversion of the rule strings in rules.df$rules.
# Innermost first: drop the first character of each rule string, keep only the
# text before the literal "} =>", then split that text on ",".
# NOTE(review): presumably the rule strings look like "{a,b} => {c}" — confirm.
# NOTE(review): cbind() pairs the ids with the split pieces by position, not by
# the row each piece came from, so the rule.id requirement is not met here.
lhs.norm <- as.data.frame(
cbind(
# NOTE(review): the example data names this column `rule.id`, not `ruleid` — confirm.
rules.df$ruleid,
unlist(strsplit(
unlist(lapply(strsplit(unlist(lapply(as.character(rules.df$rules),function(x) substr(x,2,nchar(x)))), "} =>", fixed = T), function(x) x[1]))
,","))))
感谢 @akrun,解决方案使用:
# Split the comma-separated `lhs` column and stack the pieces into rows
# (the stray trailing parenthesis of the original call is removed).
cSplit(rules.df.lhs, "lhs", ",", "long")
我在 100 万行数据上做了基准测试,耗时约 19 秒(结果约为 200 万行)。
我们可以使用 splitstackshape 包中的 cSplit:
library(splitstackshape)
# Split the `rules` column on commas and return one row per piece ("long").
cSplit(indt = df, splitCols = "rules", sep = ",", direction = "long")
# rule.id rules
#1: 1 Lamp1.1
#2: 1 Lamp1.2
#3: 2 Lamp2.1
#4: 2 Lamp2.2
如果数据集非常大,我们可以使用 stringi 进行拆分:
library(stringi)
# Split every rule string on literal commas: one character vector per row.
lst <- stri_split_fixed(str = df$rules, pattern = ",")
# Repeat each rule.id once per extracted piece so ids stay aligned with them.
df2 <- data.frame(
  rule.id = rep(df$rule.id, times = lengths(lst)),
  rules = unlist(lst)
)
df2
# rule.id rules
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
另一种选择是data.table
library(data.table)
# Convert df to a data.table by reference, then split `rules` per rule.id.
# unlist() flattens the strsplit() result so the pieces always form a single
# column (named `rules`) even when a rule.id occurs on several rows;
# fixed = TRUE treats "," as a literal separator rather than a regex.
setDT(df)[
  ,
  .(rules = unlist(strsplit(as.character(rules), ",", fixed = TRUE))),
  by = rule.id
]
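借助新的原生管道(|>),可以把 @akrun 使用 stringi::stri_split_fixed 的方案写得更快。
注意:不要依赖 cbind() 对 rule.id 列的回收(循环补齐)——回收按位置循环而不是按行重复,会使 rule.id 与拆分结果错位;
应使用 rep(rule.id, lengths(...)) 按每行拆出的元素个数重复 rule.id。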
# Split on literal commas, then repeat each rule.id once per extracted piece.
# Binding the unlisted pieces straight to d[1] would recycle rule.id as
# 1, 2, 1, 2 and pair "Lamp1.2" with rule 2 and "Lamp2.1" with rule 1.
stringi::stri_split_fixed(d$rules, ",") |>
  (\(pieces) data.frame(
    rule.id = rep(d$rule.id, lengths(pieces)),
    rules = unlist(pieces)
  ))()
#   rule.id   rules
# 1       1 Lamp1.1
# 2       1 Lamp1.2
# 3       2 Lamp2.1
# 4       2 Lamp2.2
基准
# Attach the packages compared below. lapply() is used only for its side
# effect, so the returned list is discarded.
invisible(lapply(
  c("splitstackshape", "stringi", "data.table", "reshape2"),
  library, character.only = TRUE
))
# One million rows; the two example rule strings in `d` are recycled.
dl <- data.frame(rule.id = seq_len(1e6), rules = d$rules)
microbenchmark::microbenchmark(
  melt = cbind(
    dl[1],
    do.call(rbind, strsplit(as.character(dl$rules), ",", fixed = TRUE))
  ) |>
    reshape2::melt("rule.id"),
  # NOTE(review): this entry lets cbind() recycle rule.id over the unlisted
  # pieces instead of repeating it per row, so its ids are misaligned and its
  # timing is not like-for-like with the other entries.
  cbind = stri_split_fixed(dl$rules, ",") |>
    unlist() |>
    cbind(dl[1]),
  dtable = as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
  cSplit = cSplit(dl, "rules", ",", "long"),
  stringi = {
    lst <- stri_split_fixed(dl$rules, ",")
    data.frame(rule.id = rep(dl$rule.id, lengths(lst)), rules = unlist(lst))
  },
  times = 3L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# melt 1472.5459 1518.5649 1608.8124 1564.5838 1676.9456 1789.3075 3 b
# cbind 335.7105 365.9372 380.9120 396.1639 403.5128 410.8617 3 a
# dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951 3 d
# cSplit 3003.0660 3079.3098 3232.6108 3155.5537 3347.3832 3539.2128 3 c
# stringi 421.1481 469.1054 518.9577 517.0627 567.8626 618.6624 3 a
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
注意:这些警告来自 cSplit(),其代码可能已经很长时间没有更新了。
数据
# Example data used by the snippets above: `rules` is a factor whose values
# are comma-separated item lists.
d <- data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)
我有这个数据框
# Example input: one row per rule; `rules` is a factor whose values are
# comma-separated item lists.
data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)
# rule.id rules
#1 1 Lamp1.1,Lamp1.2
#2 2 Lamp2.1,Lamp2.2
我需要以逗号 (",") 为分隔符拆分 "rules" 列(逗号可能出现多次,不仅限于示例中的 2 个元素),然后将结果转换为规范化(长)格式,并保留原始 df 中对应的 rule.id 值。结果应如下所示:
# Expected long-format result: one row per comma-separated item, each row
# keeping the rule.id of the input row it came from.
structure(list(rule.id = c(1, 1, 2, 2), lhs = c("Lamp1.1", "Lamp1.2",
"Lamp2.1", "Lamp2.2")), .Names = c("rule.id", "lhs"), row.names = c(NA,
-4L), class = "data.frame")
# rule.id lhs
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
我有一个代码可以处理 str 拆分和规范化(长)格式,但不确定如何处理 rule.id 要求
# Attempted long-format conversion of the rule strings in rules.df$rules.
# Innermost first: drop the first character of each rule string, keep only the
# text before the literal "} =>", then split that text on ",".
# NOTE(review): presumably the rule strings look like "{a,b} => {c}" — confirm.
# NOTE(review): cbind() pairs the ids with the split pieces by position, not by
# the row each piece came from, so the rule.id requirement is not met here.
lhs.norm <- as.data.frame(
cbind(
# NOTE(review): the example data names this column `rule.id`, not `ruleid` — confirm.
rules.df$ruleid,
unlist(strsplit(
unlist(lapply(strsplit(unlist(lapply(as.character(rules.df$rules),function(x) substr(x,2,nchar(x)))), "} =>", fixed = T), function(x) x[1]))
,","))))
感谢 @akrun,解决方案使用:
# Split the comma-separated `lhs` column and stack the pieces into rows
# (the stray trailing parenthesis of the original call is removed).
cSplit(rules.df.lhs, "lhs", ",", "long")
我在 100 万行数据上做了基准测试,耗时约 19 秒(结果约为 200 万行)。
我们可以使用 splitstackshape 包中的 cSplit:
library(splitstackshape)
# Split the `rules` column on commas and return one row per piece ("long").
cSplit(indt = df, splitCols = "rules", sep = ",", direction = "long")
# rule.id rules
#1: 1 Lamp1.1
#2: 1 Lamp1.2
#3: 2 Lamp2.1
#4: 2 Lamp2.2
如果数据集非常大,我们可以使用 stringi 进行拆分:
library(stringi)
# Split every rule string on literal commas: one character vector per row.
lst <- stri_split_fixed(str = df$rules, pattern = ",")
# Repeat each rule.id once per extracted piece so ids stay aligned with them.
df2 <- data.frame(
  rule.id = rep(df$rule.id, times = lengths(lst)),
  rules = unlist(lst)
)
df2
# rule.id rules
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
另一种选择是data.table
library(data.table)
# Convert df to a data.table by reference, then split `rules` per rule.id.
# unlist() flattens the strsplit() result so the pieces always form a single
# column (named `rules`) even when a rule.id occurs on several rows;
# fixed = TRUE treats "," as a literal separator rather than a regex.
setDT(df)[
  ,
  .(rules = unlist(strsplit(as.character(rules), ",", fixed = TRUE))),
  by = rule.id
]
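借助新的原生管道(|>),可以把 @akrun 使用 stringi::stri_split_fixed 的方案写得更快。
注意:不要依赖 cbind() 对 rule.id 列的回收(循环补齐)——回收按位置循环而不是按行重复,会使 rule.id 与拆分结果错位;
应使用 rep(rule.id, lengths(...)) 按每行拆出的元素个数重复 rule.id。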
# Split on literal commas, then repeat each rule.id once per extracted piece.
# Binding the unlisted pieces straight to d[1] would recycle rule.id as
# 1, 2, 1, 2 and pair "Lamp1.2" with rule 2 and "Lamp2.1" with rule 1.
stringi::stri_split_fixed(d$rules, ",") |>
  (\(pieces) data.frame(
    rule.id = rep(d$rule.id, lengths(pieces)),
    rules = unlist(pieces)
  ))()
#   rule.id   rules
# 1       1 Lamp1.1
# 2       1 Lamp1.2
# 3       2 Lamp2.1
# 4       2 Lamp2.2
基准
# Attach the packages compared below. lapply() is used only for its side
# effect, so the returned list is discarded.
invisible(lapply(
  c("splitstackshape", "stringi", "data.table", "reshape2"),
  library, character.only = TRUE
))
# One million rows; the two example rule strings in `d` are recycled.
dl <- data.frame(rule.id = seq_len(1e6), rules = d$rules)
microbenchmark::microbenchmark(
  melt = cbind(
    dl[1],
    do.call(rbind, strsplit(as.character(dl$rules), ",", fixed = TRUE))
  ) |>
    reshape2::melt("rule.id"),
  # NOTE(review): this entry lets cbind() recycle rule.id over the unlisted
  # pieces instead of repeating it per row, so its ids are misaligned and its
  # timing is not like-for-like with the other entries.
  cbind = stri_split_fixed(dl$rules, ",") |>
    unlist() |>
    cbind(dl[1]),
  dtable = as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
  cSplit = cSplit(dl, "rules", ",", "long"),
  stringi = {
    lst <- stri_split_fixed(dl$rules, ",")
    data.frame(rule.id = rep(dl$rule.id, lengths(lst)), rules = unlist(lst))
  },
  times = 3L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# melt 1472.5459 1518.5649 1608.8124 1564.5838 1676.9456 1789.3075 3 b
# cbind 335.7105 365.9372 380.9120 396.1639 403.5128 410.8617 3 a
# dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951 3 d
# cSplit 3003.0660 3079.3098 3232.6108 3155.5537 3347.3832 3539.2128 3 c
# stringi 421.1481 469.1054 518.9577 517.0627 567.8626 618.6624 3 a
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
注意:这些警告来自 cSplit(),其代码可能已经很长时间没有更新了。
数据
# Example data used by the snippets above: `rules` is a factor whose values
# are comma-separated item lists.
d <- data.frame(
  rule.id = c(1, 2),
  rules = factor(c("Lamp1.1,Lamp1.2", "Lamp2.1,Lamp2.2"))
)