在 R 中根据分隔符按两列将行拆分为多行
separate rows based on delim based on two columns in r
我有以下 df:
# Example input: two columns whose cells hold ";"-delimited values.
df_1 <- data.frame(
  col_1 = c("a;b;c", "c;d", "e", "f", "g", "h;j"),
  col_2 = c("1;2;3", "4", "5;6", "7", "8;9", "10;11;12")
)
所以我想将 col_1 分成单独的行,如果存在 col_2 对应的值。
例如,如果 col_1 中的元素数等于 col_2 中的元素数,那么应按 col_1 和 col_2 中的对应值逐一拆分成行(第 1 行)
如果两列的元素数量不同,但其中一列只有一个元素,那么也应拆分成多行,并让该单个元素在每一行中重复(第 2 行)
如果两列的元素数量不匹配(两列都多于 1 个元素且数量不相等),则该行应保持原样
这里是 final_dataset:
# Expected result: one row per element pair; the last row stays unsplit.
df_2 <- data.frame(
  col_1 = c("a", "b", "c", "c", "d", "e", "e", "f", "g", "g", "h;j"),
  col_2 = c("1", "2", "3", "4", "4", "5", "6", "7", "8", "9", "10;11;12")
)
我们可以使用cSplit
library(splitstackshape)
library(zoo)
# Count the ";"-separated elements in each cell. Counting elements (instead of
# the characters left after deleting ";") keeps multi-character values such as
# "ab;cd" vs "1;2" from being misclassified as having mismatched counts.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows where both cells hold several elements, but in different numbers, are
# left unsplit.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# Split the remaining rows to long form, keep only rows where at least one
# column is non-NA, fill NAs in each column with na.locf0, then append the
# unsplit rows.
rbind(
  cSplit(df_1[!i1, ], c("col_1", "col_2"), sep = ";", "long")[
    !is.na(col_1) | !is.na(col_2), lapply(.SD, na.locf0)],
  df_1[i1, ]
)
# col_1 col_2
# 1: a 1
# 2: b 2
# 3: c 3
# 4: c 4
# 5: d 4
# 6: e 5
# 7: e 6
# 8: f 7
# 9: g 8
#10: g 9
#11: h;j 10;11;12
或使用具有所有约束的base R
# Count the ";"-separated elements in each cell. Counting elements (instead of
# the characters left after deleting ";") keeps multi-character values such as
# "ab;cd" vs "1;2" from being misclassified as having mismatched counts.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows where both cells hold several elements, but in different numbers, are
# left unsplit.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# For the rows that will be split, turn every cell into a character vector
# (one list of vectors per column).
lst1 <- lapply(df_1[!i1, ], function(x) strsplit(x, ";"))
out <- rbind(
  do.call(rbind, Map(function(x, y) {
    l1 <- length(x)
    l2 <- length(y)
    mx <- max(l1, l2)
    # Repeat a single-element side so it lines up with the longer side.
    # `&&` is used because these are scalar conditions inside `if`.
    x <- if (l1 != l2 && l1 == 1) rep(x, mx) else x
    y <- if (l1 != l2 && l2 == 1) rep(y, mx) else y
    data.frame(col_1 = x, col_2 = y)
  }, lst1[[1]], lst1[[2]])),
  df_1[i1, ]
)
# Reset row names to a plain 1..n sequence.
row.names(out) <- NULL
out
# col_1 col_2
#1 a 1
#2 b 2
#3 c 3
#4 c 4
#5 d 4
#6 e 5
#7 e 6
#8 f 7
#9 g 8
#10 g 9
#11 h;j 10;11;12
这是另一个 base R 方案:定义一个自定义函数 f
# Expand one (col_1, col_2) pair into a data frame of rows.
#
# v: a two-element vector/list; v[[1]] and v[[2]] are ";"-delimited strings.
# Returns a data frame with columns col_1 and col_2. The pair is split into
# its elements when both sides have the same number of elements or when the
# smaller side has exactly one element; otherwise the original strings are
# returned as a single row.
f <- function(v) {
  parts_1 <- unlist(strsplit(v[[1]], ";"))
  parts_2 <- unlist(strsplit(v[[2]], ";"))
  n_1 <- length(parts_1)
  n_2 <- length(parts_2)

  splittable <- n_1 == n_2 || min(n_1, n_2) == 1
  if (!splittable) {
    # Mismatched multi-element cells: keep the row as it was.
    return(data.frame(col_1 = v[[1]], col_2 = v[[2]]))
  }
  # data.frame() recycles the single-element side to the longer length.
  data.frame(col_1 = parts_1, col_2 = parts_2)
}
# Apply f to each row of df_1 and stack the per-row data frames with rbind.
df_2 <- do.call(
  what = rbind,
  args = apply(X = df_1, MARGIN = 1, FUN = f)
)
我们会得到
col_1 col_2
1 a 1
2 b 2
3 c 3
4 c 4
5 d 4
6 e 5
7 e 6
8 f 7
9 g 8
10 g 9
11 h;j 10;11;12
我有以下 df:
# Example input: two columns whose cells hold ";"-delimited values.
df_1 <- data.frame(
  col_1 = c("a;b;c", "c;d", "e", "f", "g", "h;j"),
  col_2 = c("1;2;3", "4", "5;6", "7", "8;9", "10;11;12")
)
所以我想将 col_1 分成单独的行,如果存在 col_2 对应的值。
例如,如果 col_1 中的元素数等于 col_2 中的元素数,那么应按 col_1 和 col_2 中的对应值逐一拆分成行(第 1 行)
如果两列的元素数量不同,但其中一列只有一个元素,那么也应拆分成多行,并让该单个元素在每一行中重复(第 2 行)
如果两列的元素数量不匹配(两列都多于 1 个元素且数量不相等),则该行应保持原样
这里是 final_dataset:
# Expected result: one row per element pair; the last row stays unsplit.
df_2 <- data.frame(
  col_1 = c("a", "b", "c", "c", "d", "e", "e", "f", "g", "g", "h;j"),
  col_2 = c("1", "2", "3", "4", "4", "5", "6", "7", "8", "9", "10;11;12")
)
我们可以使用cSplit
library(splitstackshape)
library(zoo)
# Count the ";"-separated elements in each cell. Counting elements (instead of
# the characters left after deleting ";") keeps multi-character values such as
# "ab;cd" vs "1;2" from being misclassified as having mismatched counts.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows where both cells hold several elements, but in different numbers, are
# left unsplit.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# Split the remaining rows to long form, keep only rows where at least one
# column is non-NA, fill NAs in each column with na.locf0, then append the
# unsplit rows.
rbind(
  cSplit(df_1[!i1, ], c("col_1", "col_2"), sep = ";", "long")[
    !is.na(col_1) | !is.na(col_2), lapply(.SD, na.locf0)],
  df_1[i1, ]
)
# col_1 col_2
# 1: a 1
# 2: b 2
# 3: c 3
# 4: c 4
# 5: d 4
# 6: e 5
# 7: e 6
# 8: f 7
# 9: g 8
#10: g 9
#11: h;j 10;11;12
或使用具有所有约束的base R
# Count the ";"-separated elements in each cell. Counting elements (instead of
# the characters left after deleting ";") keeps multi-character values such as
# "ab;cd" vs "1;2" from being misclassified as having mismatched counts.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows where both cells hold several elements, but in different numbers, are
# left unsplit.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# For the rows that will be split, turn every cell into a character vector
# (one list of vectors per column).
lst1 <- lapply(df_1[!i1, ], function(x) strsplit(x, ";"))
out <- rbind(
  do.call(rbind, Map(function(x, y) {
    l1 <- length(x)
    l2 <- length(y)
    mx <- max(l1, l2)
    # Repeat a single-element side so it lines up with the longer side.
    # `&&` is used because these are scalar conditions inside `if`.
    x <- if (l1 != l2 && l1 == 1) rep(x, mx) else x
    y <- if (l1 != l2 && l2 == 1) rep(y, mx) else y
    data.frame(col_1 = x, col_2 = y)
  }, lst1[[1]], lst1[[2]])),
  df_1[i1, ]
)
# Reset row names to a plain 1..n sequence.
row.names(out) <- NULL
out
# col_1 col_2
#1 a 1
#2 b 2
#3 c 3
#4 c 4
#5 d 4
#6 e 5
#7 e 6
#8 f 7
#9 g 8
#10 g 9
#11 h;j 10;11;12
这是另一个 base R 方案:定义一个自定义函数 f
# Expand one (col_1, col_2) pair into a data frame of rows.
#
# v: a two-element vector/list; v[[1]] and v[[2]] are ";"-delimited strings.
# Returns a data frame with columns col_1 and col_2. The pair is split into
# its elements when both sides have the same number of elements or when the
# smaller side has exactly one element; otherwise the original strings are
# returned as a single row.
f <- function(v) {
  parts_1 <- unlist(strsplit(v[[1]], ";"))
  parts_2 <- unlist(strsplit(v[[2]], ";"))
  n_1 <- length(parts_1)
  n_2 <- length(parts_2)

  splittable <- n_1 == n_2 || min(n_1, n_2) == 1
  if (!splittable) {
    # Mismatched multi-element cells: keep the row as it was.
    return(data.frame(col_1 = v[[1]], col_2 = v[[2]]))
  }
  # data.frame() recycles the single-element side to the longer length.
  data.frame(col_1 = parts_1, col_2 = parts_2)
}
# Apply f to each row of df_1 and stack the per-row data frames with rbind.
df_2 <- do.call(
  what = rbind,
  args = apply(X = df_1, MARGIN = 1, FUN = f)
)
我们会得到
col_1 col_2
1 a 1
2 b 2
3 c 3
4 c 4
5 d 4
6 e 5
7 e 6
8 f 7
9 g 8
10 g 9
11 h;j 10;11;12