R - 生成动态列数和子字符串列值
R - generate dynamic number of columns and substring column values
寻求有关 R 中数据操作的帮助。我有以下格式的数据;
ID L1 L2 L3
1 BBCBCACCBCB CBCBBBB BEBBBAAB
2 BBCBCCCCBCB CBCCCBC BBAACCCB
3 BBCBCACCBCB CBCBBBB BEBBBAAB
4 BBCBCACCBCB CBCBBBB BEBBBAAB
5 BBCBACBCCCB BBCCCBC BBCBAAAAB
6 BBCBBCCBBCB BBCBCEB BBBBCAACB
7 BBCBBCCBBCB BBCBCEB BBBBCAACB
8
9 BBCBCACCBCB CBCBBBB BEBBBAAB
10 BBCBBCCBBCB BBCBCEB BBBBCAACB
11 BBCBBCCBBCB BBCBCEB BBBBCAACB
每列中的值将是不同长度的字符串。我想要一个 R 函数,对于上面的每一列,will
1) 根据列中任意字符串的最大长度生成动态列数,例如L1 最大长度 = 11,因此有 11 个新列,每个列都标记为 L1_1:L1_11
2) 然后将字符串拆分为三元组,例如
ID L1 L2 L3 L1_1 L1_2 L1_3 L1_4 L1_5 L1_6 L1_7 L1_8 L1_9
1 BBCBCACCBCB CBCBBBB BEBBBAAB BBC BCB CBC BCA CAC ACC CCB CBC BCB
3) 对这个三元组进行计算,即('a' * 1 的数量)+('b' * 3 的数量)+('c' * 7 的数量)三胞胎。
4) return 此计算在新列中的值。
我发现当 运行 用于列 L1、L2 但不适用于 L3 时,建议的代码完全符合我的需要。我收到的错误是 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'
有什么想法吗?
非常感谢。
编辑
dput(df):
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
结构(列表(ID = 1:11,L1 = 结构(c(4L,5L,4L,4L,2L,3L,3L,1L,4L,3L,3L),.Label = c ("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = 结构(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC","CBCBBBB","CBCCCBC"), class = "factor"), L3 = 结构(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c ("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
#DATA
df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda",
"bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA,
-4L))
#Go through the strings and split into subgroups of 3 characters.
#Put the substrings in a list
temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
#Obtain the length of the subgroup with the most triplets
temp_l = max(lengths(temp))
#Subset the subgroups from 1 to temp_l so that remianing values are NA
cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc abb bbb bbc bcc
#2 2 aabacd aab aba bac acd
#3 3 abbda abb bbd bda <NA>
#4 4 bbad bba bad <NA> <NA>
如果要基于三元组进行计算,运行在执行 cbind
步骤之前执行以下操作
temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
#REPEAT FOR L2, L3, ...
cbind(df, temp_L1) #Run cbind(df, temp_L1, temp_L2, ...)
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc 7 9 13 17
#2 2 aabacd 5 5 11 8
#3 3 abbda 7 6 4 NA
#4 4 bbad 7 4 NA NA
更新
您可以创建一个函数并如下所示使用它
#FUNCTION
foo = function(data, column){
temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
return(temp_L1)
}
#USING ON NEW DATA
cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))
如果你想使用tidyverse
个动词
library(tidyverse)
df1 <- df %>%
mutate(L2=L1) %>% # copies L1
nest(L2) %>% # nest L1
mutate(data=map(data,~sapply(1:(nchar(.x)-2), function(y) substr(.x, y, y+2)))) %>% # makes triplets
unnest(data) %>% # unnest triplets
group_by(ID) %>% # perform next operations group wise
mutate(rn=letters[row_number()]) %>% # make future column names
spread(rn,data) # spread long format into wide format (columns)
ID L1 a b c d
1 1 abbbcc abb bbb bbc bcc
2 2 aabacd aab aba bac acd
3 3 abbda abb bbd bda <NA>
4 4 bbad bba bad <NA> <NA>
寻求有关 R 中数据操作的帮助。我有以下格式的数据;
ID L1 L2 L3
1 BBCBCACCBCB CBCBBBB BEBBBAAB
2 BBCBCCCCBCB CBCCCBC BBAACCCB
3 BBCBCACCBCB CBCBBBB BEBBBAAB
4 BBCBCACCBCB CBCBBBB BEBBBAAB
5 BBCBACBCCCB BBCCCBC BBCBAAAAB
6 BBCBBCCBBCB BBCBCEB BBBBCAACB
7 BBCBBCCBBCB BBCBCEB BBBBCAACB
8
9 BBCBCACCBCB CBCBBBB BEBBBAAB
10 BBCBBCCBBCB BBCBCEB BBBBCAACB
11 BBCBBCCBBCB BBCBCEB BBBBCAACB
每列中的值将是不同长度的字符串。我想要一个 R 函数,对于上面的每一列,will
1) 根据列中任意字符串的最大长度生成动态列数,例如L1 最大长度 = 11,因此有 11 个新列,每个列都标记为 L1_1:L1_11
2) 然后将字符串拆分为三元组,例如
ID L1 L2 L3 L1_1 L1_2 L1_3 L1_4 L1_5 L1_6 L1_7 L1_8 L1_9
1 BBCBCACCBCB CBCBBBB BEBBBAAB BBC BCB CBC BCA CAC ACC CCB CBC BCB
3) 对这个三元组进行计算,即('a' * 1 的数量)+('b' * 3 的数量)+('c' * 7 的数量)三胞胎。
4) return 此计算在新列中的值。
我发现当 运行 用于列 L1、L2 但不适用于 L3 时,建议的代码完全符合我的需要。我收到的错误是 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'
有什么想法吗? 非常感谢。
编辑
dput(df):
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
结构(列表(ID = 1:11,L1 = 结构(c(4L,5L,4L,4L,2L,3L,3L,1L,4L,3L,3L),.Label = c ("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = 结构(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC","CBCBBBB","CBCCCBC"), class = "factor"), L3 = 结构(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c ("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
#DATA
df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda",
"bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA,
-4L))
#Go through the strings and split into subgroups of 3 characters.
#Put the substrings in a list
temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
#Obtain the length of the subgroup with the most triplets
temp_l = max(lengths(temp))
#Subset the subgroups from 1 to temp_l so that remianing values are NA
cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc abb bbb bbc bcc
#2 2 aabacd aab aba bac acd
#3 3 abbda abb bbd bda <NA>
#4 4 bbad bba bad <NA> <NA>
如果要基于三元组进行计算,运行在执行 cbind
步骤之前执行以下操作
temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
#REPEAT FOR L2, L3, ...
cbind(df, temp_L1) #Run cbind(df, temp_L1, temp_L2, ...)
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc 7 9 13 17
#2 2 aabacd 5 5 11 8
#3 3 abbda 7 6 4 NA
#4 4 bbad 7 4 NA NA
更新
您可以创建一个函数并如下所示使用它
#FUNCTION
foo = function(data, column){
temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
return(temp_L1)
}
#USING ON NEW DATA
cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))
如果你想使用tidyverse
个动词
library(tidyverse)
df1 <- df %>%
mutate(L2=L1) %>% # copies L1
nest(L2) %>% # nest L1
mutate(data=map(data,~sapply(1:(nchar(.x)-2), function(y) substr(.x, y, y+2)))) %>% # makes triplets
unnest(data) %>% # unnest triplets
group_by(ID) %>% # perform next operations group wise
mutate(rn=letters[row_number()]) %>% # make future column names
spread(rn,data) # spread long format into wide format (columns)
ID L1 a b c d
1 1 abbbcc abb bbb bbc bcc
2 2 aabacd aab aba bac acd
3 3 abbda abb bbd bda <NA>
4 4 bbad bba bad <NA> <NA>