R:使用 plyr 对整个数据框循环应用 str_extract_all
R: applying str_extract_all across a whole data frame using plyr
library(plyr)
library(stringr)
# Example data: an identifier column (id_info) plus two free-text columns
# (filterme1, filterme2) whose values embed codes of the form "ABB" + digits.
examp<- data.frame(id_info = c("123", "3464", "7156", "3171", "5299", "4541", "4956", "9926", "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195", "7827", "365", "9062", "5558", "239", "8700", "6995", "9853"),
filterme1 = c("ABB123460sadjasd", "ABB123461asjdjs", "ABB123462ranogvmg", "ABB123463dkfohsd", "ABB123464fff///sss", "ABB123465jfsdf", "ABB123466 sdf", "ABB123467 sdf", "ABB123468 fff///sss", "ABB123469 ty", "ABB123470 fff///sss", "ABB123471 dfs", "ABB123472 ", "ABB123473 gt", "ABB123474 y", "ABB123475 f", "ABB123476 gfgABB123462", "ABB123477 dsd", "ABB123478 re", "ABB123479 fgh", "ABB123480 tu", "ABB123481 yu", "ABB123482 dfg", "ABB123483 s"),
filterme2 = c("sadjasdABB123460", "asjdjsABB123461", "ranogvmgABB123462", "dkfohsdABB123463", "fff///sssABB123464", "jfsdfABB123465", "sdfABB123466", "sdfABB123467", "fff///sssABB123468", "tyABB123469", "fff///sssABB123470", "dfsABB123471", "ABB123472", "gtABB123473", "yABB123474", "fABB123475", "ABB123462gfgABB123476", "dsdABB123477", "reABB123478", "fghABB123479", "tuABB123480", "yuABB123481", "dfgABB123482", "sABB123483"))
# id_info is meant to be treated as a factor (an identifier) even though its
# values look numeric.
# NOTE(review): whether data.frame() converts these strings to factors depends
# on the stringsAsFactors default of the R version in use - check the str()
# output below.
str(examp)
我想从字符串中提取一个元素。该元素应以 "ABB" 开头,后跟 6 位数字。
# Extract every substring made of "ABB" followed by six digits (the {6}
# quantifier in the pattern below) from filterme1.
examp_str<-str_extract_all(as.character(examp$filterme1),pattern="ABB[0-9]{6}")
# Check how many elements the extraction produced.
length(examp_str)
# Thanks for the help with the expression.
下面是我尝试在整个数据帧中使用我的 stringr 函数。
# Is this how I can eventually loop through the whole data frame? I know this
# will create a list, but the result isn't quite right.
# NOTE(review): the second argument (1) presumably tells dlply to split on the
# first column, and each piece handed to the function is presumably a data
# frame rather than a character vector - confirm against the plyr docs.
examp_str_big<-dlply(.data=examp, 1,
function(x) str_extract_all(x,pattern="ABB[0-9]{6}"))
创建列表后,我想把结果全部放回数据框中。我找到了一个关于将未知长度的列表放入数据框的链接,但我不确定能否用在这里。
# Number of entries held by each element of the extraction result.
indx <- sapply(examp_str_big, length)
# Alternative noted by the author: indx <- lengths(lst)
# Stretch every element to the size of the largest one, stack the padded
# elements row-wise and convert the result to a data frame.
res <- as.data.frame(
  do.call(
    rbind,
    lapply(examp_str_big, function(el) `length<-`(el, max(indx)))
  )
)
# Reuse the names of the largest element as column names.
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)
所以我想要的最终结果是这样的:
id_info filterme1 filterme2 filterme3 filterme4
123 ABB123460 ABB123460
3464 ABB123461 ABB123461
7156 ABB123462 ABB123462
3171 ABB123463 ABB123463
5299 ABB123464 ABB123464
4541 ABB123465 ABB123465
4956 ABB123466 ABB123466
9926 ABB123467 ABB123467
8418 ABB123468 ABB123468
1392 ABB123469 ABB123469
9080 ABB123470 ABB123470
6455 ABB123471 ABB123471
2423 ABB123472 ABB123472
9101 ABB123473 ABB123473
7807 ABB123474 ABB123474
5195 ABB123475 ABB123475
7827 ABB123476 ABB123462 ABB123462 ABB123476
365 ABB123477 ABB123477
9062 ABB123478 ABB123478
5558 ABB123479 ABB123479
239 ABB123480 ABB123480
8700 ABB123481 ABB123481
6995 ABB123482 ABB123482
9853 ABB123483 ABB123483
我的实际数据集更长并且有更多 "filterme" 列。任何帮助将不胜感激。如果有另一种更聪明的方法来实现这个目标,我很想听听。
谢谢。
这是一种方法(基于您的原始数据框 `examp`,我假设它是用 `stringsAsFactors = FALSE` 读取的):
library(stringr)
# Extract every occurrence of the pattern from filterme1. The pattern uses six
# digits because the sample data contains no seven-digit codes; change {6} to
# {7} if the real data requires it.
examp$pattern <- str_extract_all(examp$filterme1, "ABB[0-9]{6}")
# Largest number of matches found in any single row; this decides how many
# new columns are appended.
maxlength <- max(lengths(examp$pattern))
# Pad each row's matches with NA up to maxlength and stack them row-wise.
# Padding with `length<-` keeps every row an atomic character vector, so the
# appended columns are plain character columns rather than list-columns.
examp <- cbind(examp,
               as.data.frame(do.call(rbind,
                                     lapply(examp$pattern,
                                            `length<-`, maxlength)),
                             stringsAsFactors = FALSE))
# The result is a wider data frame with all found patterns appended as new
# columns (V1, V2, ...).
examp
id_info filterme1 filterme2 pattern V1
1 123 ABB123460sadjasd sadjasdABB123460 ABB123460 ABB123460
2 3464 ABB123461asjdjs asjdjsABB123461 ABB123461 ABB123461
3 7156 ABB123462ranogvmg ranogvmgABB123462 ABB123462 ABB123462
4 3171 ABB123463dkfohsd dkfohsdABB123463 ABB123463 ABB123463
5 5299 ABB123464fff///sss fff///sssABB123464 ABB123464 ABB123464
6 4541 ABB123465jfsdf jfsdfABB123465 ABB123465 ABB123465
7 4956 ABB123466 sdf sdfABB123466 ABB123466 ABB123466
8 9926 ABB123467 sdf sdfABB123467 ABB123467 ABB123467
9 8418 ABB123468 fff///sss fff///sssABB123468 ABB123468 ABB123468
10 1392 ABB123469 ty tyABB123469 ABB123469 ABB123469
11 9080 ABB123470 fff///sss fff///sssABB123470 ABB123470 ABB123470
12 6455 ABB123471 dfs dfsABB123471 ABB123471 ABB123471
13 2423 ABB123472 ABB123472 ABB123472 ABB123472
14 9101 ABB123473 gt gtABB123473 ABB123473 ABB123473
15 7807 ABB123474 y yABB123474 ABB123474 ABB123474
16 5195 ABB123475 f fABB123475 ABB123475 ABB123475
17 7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476, ABB123462 ABB123476
18 365 ABB123477 dsd dsdABB123477 ABB123477 ABB123477
19 9062 ABB123478 re reABB123478 ABB123478 ABB123478
20 5558 ABB123479 fgh fghABB123479 ABB123479 ABB123479
21 239 ABB123480 tu tuABB123480 ABB123480 ABB123480
22 8700 ABB123481 yu yuABB123481 ABB123481 ABB123481
23 6995 ABB123482 dfg dfgABB123482 ABB123482 ABB123482
24 9853 ABB123483 s sABB123483 ABB123483 ABB123483
V2
1 NA
2 NA
3 NA
4 NA
5 NA
6 NA
7 NA
8 NA
9 NA
10 NA
11 NA
12 NA
13 NA
14 NA
15 NA
16 NA
17 ABB123462
18 NA
19 NA
20 NA
21 NA
22 NA
23 NA
24 NA
在这种情况下,只添加了两个新列,因为在提供的示例数据中该模式最多出现两次(即使如上所述把模式改为匹配 6 位数字)。
编辑:添加在多列(本例中为 `filterme1` 和 `filterme2`)中匹配该模式的代码:
library(tidyr)
# Merge filterme1 and filterme2 into one helper column named filterme while
# keeping the source columns (remove = FALSE). sep = "_" spells out unite()'s
# default separator.
examp <- unite(
  examp,
  col = filterme,
  filterme1, filterme2,
  sep = "_",
  remove = FALSE
)
# Search the merged text so that matches from both columns are collected.
examp$pattern <- str_extract_all(
  string = examp$filterme,
  pattern = "ABB[0-9]{6}"
)
此时,您可以继续运行给 `examp$pattern` 赋值的那一行之后的其余代码。
我们也可以使用 `lengths` 函数:
# Collect all "ABB" + six-digit matches for each filterme1 entry.
lst <- str_extract_all(string = examp$filterme1, pattern = "ABB[0-9]{6}")
# Stretch every match vector to the size of the longest one (missing slots
# become NA) and bind the rows into a matrix.
m1 <- do.call(
  rbind,
  lapply(lst, function(hits) `length<-`(hits, max(lengths(lst))))
)
# Add one column per matrix column: pattern1, pattern2, ...
examp[paste0("pattern", seq_len(ncol(m1)))] <- m1
examp
# id_info filterme1 filterme2 pattern1 pattern2
#1 123 ABB123460sadjasd sadjasdABB123460 ABB123460 <NA>
#2 3464 ABB123461asjdjs asjdjsABB123461 ABB123461 <NA>
#3 7156 ABB123462ranogvmg ranogvmgABB123462 ABB123462 <NA>
#4 3171 ABB123463dkfohsd dkfohsdABB123463 ABB123463 <NA>
#5 5299 ABB123464fff///sss fff///sssABB123464 ABB123464 <NA>
#6 4541 ABB123465jfsdf jfsdfABB123465 ABB123465 <NA>
#7 4956 ABB123466 sdf sdfABB123466 ABB123466 <NA>
#8 9926 ABB123467 sdf sdfABB123467 ABB123467 <NA>
#9 8418 ABB123468 fff///sss fff///sssABB123468 ABB123468 <NA>
#10 1392 ABB123469 ty tyABB123469 ABB123469 <NA>
#11 9080 ABB123470 fff///sss fff///sssABB123470 ABB123470 <NA>
#12 6455 ABB123471 dfs dfsABB123471 ABB123471 <NA>
#13 2423 ABB123472 ABB123472 ABB123472 <NA>
#14 9101 ABB123473 gt gtABB123473 ABB123473 <NA>
#15 7807 ABB123474 y yABB123474 ABB123474 <NA>
#16 5195 ABB123475 f fABB123475 ABB123475 <NA>
#17 7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476 ABB123462
#18 365 ABB123477 dsd dsdABB123477 ABB123477 <NA>
#19 9062 ABB123478 re reABB123478 ABB123478 <NA>
#20 5558 ABB123479 fgh fghABB123479 ABB123479 <NA>
#21 239 ABB123480 tu tuABB123480 ABB123480 <NA>
#22 8700 ABB123481 yu yuABB123481 ABB123481 <NA>
#23 6995 ABB123482 dfg dfgABB123482 ABB123482 <NA>
#24 9853 ABB123483 s sABB123483 ABB123483 <NA>
library(plyr)
library(stringr)
# Example data: an identifier column (id_info) plus two free-text columns
# (filterme1, filterme2) whose values embed codes of the form "ABB" + digits.
examp<- data.frame(id_info = c("123", "3464", "7156", "3171", "5299", "4541", "4956", "9926", "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195", "7827", "365", "9062", "5558", "239", "8700", "6995", "9853"),
filterme1 = c("ABB123460sadjasd", "ABB123461asjdjs", "ABB123462ranogvmg", "ABB123463dkfohsd", "ABB123464fff///sss", "ABB123465jfsdf", "ABB123466 sdf", "ABB123467 sdf", "ABB123468 fff///sss", "ABB123469 ty", "ABB123470 fff///sss", "ABB123471 dfs", "ABB123472 ", "ABB123473 gt", "ABB123474 y", "ABB123475 f", "ABB123476 gfgABB123462", "ABB123477 dsd", "ABB123478 re", "ABB123479 fgh", "ABB123480 tu", "ABB123481 yu", "ABB123482 dfg", "ABB123483 s"),
filterme2 = c("sadjasdABB123460", "asjdjsABB123461", "ranogvmgABB123462", "dkfohsdABB123463", "fff///sssABB123464", "jfsdfABB123465", "sdfABB123466", "sdfABB123467", "fff///sssABB123468", "tyABB123469", "fff///sssABB123470", "dfsABB123471", "ABB123472", "gtABB123473", "yABB123474", "fABB123475", "ABB123462gfgABB123476", "dsdABB123477", "reABB123478", "fghABB123479", "tuABB123480", "yuABB123481", "dfgABB123482", "sABB123483"))
# id_info is meant to be treated as a factor (an identifier) even though its
# values look numeric.
# NOTE(review): whether data.frame() converts these strings to factors depends
# on the stringsAsFactors default of the R version in use - check the str()
# output below.
str(examp)
我想从字符串中提取一个元素。该元素应以 "ABB" 开头,后跟 6 位数字。
# Extract every substring made of "ABB" followed by six digits (the {6}
# quantifier in the pattern below) from filterme1.
examp_str<-str_extract_all(as.character(examp$filterme1),pattern="ABB[0-9]{6}")
# Check how many elements the extraction produced.
length(examp_str)
# Thanks for the help with the expression.
下面是我尝试在整个数据帧中使用我的 stringr 函数。
# Is this how I can eventually loop through the whole data frame? I know this
# will create a list, but the result isn't quite right.
# NOTE(review): the second argument (1) presumably tells dlply to split on the
# first column, and each piece handed to the function is presumably a data
# frame rather than a character vector - confirm against the plyr docs.
examp_str_big<-dlply(.data=examp, 1,
function(x) str_extract_all(x,pattern="ABB[0-9]{6}"))
创建列表后,我想把结果全部放回数据框中。我找到了一个关于将未知长度的列表放入数据框的链接,但我不确定能否用在这里。
# Number of entries held by each element of the extraction result.
indx <- sapply(examp_str_big, length)
# Alternative noted by the author: indx <- lengths(lst)
# Stretch every element to the size of the largest one, stack the padded
# elements row-wise and convert the result to a data frame.
res <- as.data.frame(
  do.call(
    rbind,
    lapply(examp_str_big, function(el) `length<-`(el, max(indx)))
  )
)
# Reuse the names of the largest element as column names.
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)
所以我想要的最终结果是这样的:
id_info filterme1 filterme2 filterme3 filterme4
123 ABB123460 ABB123460
3464 ABB123461 ABB123461
7156 ABB123462 ABB123462
3171 ABB123463 ABB123463
5299 ABB123464 ABB123464
4541 ABB123465 ABB123465
4956 ABB123466 ABB123466
9926 ABB123467 ABB123467
8418 ABB123468 ABB123468
1392 ABB123469 ABB123469
9080 ABB123470 ABB123470
6455 ABB123471 ABB123471
2423 ABB123472 ABB123472
9101 ABB123473 ABB123473
7807 ABB123474 ABB123474
5195 ABB123475 ABB123475
7827 ABB123476 ABB123462 ABB123462 ABB123476
365 ABB123477 ABB123477
9062 ABB123478 ABB123478
5558 ABB123479 ABB123479
239 ABB123480 ABB123480
8700 ABB123481 ABB123481
6995 ABB123482 ABB123482
9853 ABB123483 ABB123483
我的实际数据集更长并且有更多 "filterme" 列。任何帮助将不胜感激。如果有另一种更聪明的方法来实现这个目标,我很想听听。
谢谢。
这是一种方法(基于您的原始数据框 `examp`,我假设它是用 `stringsAsFactors = FALSE` 读取的):
library(stringr)
# Extract every occurrence of the pattern from filterme1. The pattern uses six
# digits because the sample data contains no seven-digit codes; change {6} to
# {7} if the real data requires it.
examp$pattern <- str_extract_all(examp$filterme1, "ABB[0-9]{6}")
# Largest number of matches found in any single row; this decides how many
# new columns are appended.
maxlength <- max(lengths(examp$pattern))
# Pad each row's matches with NA up to maxlength and stack them row-wise.
# Padding with `length<-` keeps every row an atomic character vector, so the
# appended columns are plain character columns rather than list-columns.
examp <- cbind(examp,
               as.data.frame(do.call(rbind,
                                     lapply(examp$pattern,
                                            `length<-`, maxlength)),
                             stringsAsFactors = FALSE))
# The result is a wider data frame with all found patterns appended as new
# columns (V1, V2, ...).
examp
id_info filterme1 filterme2 pattern V1
1 123 ABB123460sadjasd sadjasdABB123460 ABB123460 ABB123460
2 3464 ABB123461asjdjs asjdjsABB123461 ABB123461 ABB123461
3 7156 ABB123462ranogvmg ranogvmgABB123462 ABB123462 ABB123462
4 3171 ABB123463dkfohsd dkfohsdABB123463 ABB123463 ABB123463
5 5299 ABB123464fff///sss fff///sssABB123464 ABB123464 ABB123464
6 4541 ABB123465jfsdf jfsdfABB123465 ABB123465 ABB123465
7 4956 ABB123466 sdf sdfABB123466 ABB123466 ABB123466
8 9926 ABB123467 sdf sdfABB123467 ABB123467 ABB123467
9 8418 ABB123468 fff///sss fff///sssABB123468 ABB123468 ABB123468
10 1392 ABB123469 ty tyABB123469 ABB123469 ABB123469
11 9080 ABB123470 fff///sss fff///sssABB123470 ABB123470 ABB123470
12 6455 ABB123471 dfs dfsABB123471 ABB123471 ABB123471
13 2423 ABB123472 ABB123472 ABB123472 ABB123472
14 9101 ABB123473 gt gtABB123473 ABB123473 ABB123473
15 7807 ABB123474 y yABB123474 ABB123474 ABB123474
16 5195 ABB123475 f fABB123475 ABB123475 ABB123475
17 7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476, ABB123462 ABB123476
18 365 ABB123477 dsd dsdABB123477 ABB123477 ABB123477
19 9062 ABB123478 re reABB123478 ABB123478 ABB123478
20 5558 ABB123479 fgh fghABB123479 ABB123479 ABB123479
21 239 ABB123480 tu tuABB123480 ABB123480 ABB123480
22 8700 ABB123481 yu yuABB123481 ABB123481 ABB123481
23 6995 ABB123482 dfg dfgABB123482 ABB123482 ABB123482
24 9853 ABB123483 s sABB123483 ABB123483 ABB123483
V2
1 NA
2 NA
3 NA
4 NA
5 NA
6 NA
7 NA
8 NA
9 NA
10 NA
11 NA
12 NA
13 NA
14 NA
15 NA
16 NA
17 ABB123462
18 NA
19 NA
20 NA
21 NA
22 NA
23 NA
24 NA
在这种情况下,只添加了两个新列,因为在提供的示例数据中该模式最多出现两次(即使如上所述把模式改为匹配 6 位数字)。
编辑:添加在多列(本例中为 `filterme1` 和 `filterme2`)中匹配该模式的代码:
library(tidyr)
# Merge filterme1 and filterme2 into one helper column named filterme while
# keeping the source columns (remove = FALSE). sep = "_" spells out unite()'s
# default separator.
examp <- unite(
  examp,
  col = filterme,
  filterme1, filterme2,
  sep = "_",
  remove = FALSE
)
# Search the merged text so that matches from both columns are collected.
examp$pattern <- str_extract_all(
  string = examp$filterme,
  pattern = "ABB[0-9]{6}"
)
此时,您可以继续运行给 `examp$pattern` 赋值的那一行之后的其余代码。
我们也可以使用 `lengths` 函数:
# Collect all "ABB" + six-digit matches for each filterme1 entry.
lst <- str_extract_all(string = examp$filterme1, pattern = "ABB[0-9]{6}")
# Stretch every match vector to the size of the longest one (missing slots
# become NA) and bind the rows into a matrix.
m1 <- do.call(
  rbind,
  lapply(lst, function(hits) `length<-`(hits, max(lengths(lst))))
)
# Add one column per matrix column: pattern1, pattern2, ...
examp[paste0("pattern", seq_len(ncol(m1)))] <- m1
examp
# id_info filterme1 filterme2 pattern1 pattern2
#1 123 ABB123460sadjasd sadjasdABB123460 ABB123460 <NA>
#2 3464 ABB123461asjdjs asjdjsABB123461 ABB123461 <NA>
#3 7156 ABB123462ranogvmg ranogvmgABB123462 ABB123462 <NA>
#4 3171 ABB123463dkfohsd dkfohsdABB123463 ABB123463 <NA>
#5 5299 ABB123464fff///sss fff///sssABB123464 ABB123464 <NA>
#6 4541 ABB123465jfsdf jfsdfABB123465 ABB123465 <NA>
#7 4956 ABB123466 sdf sdfABB123466 ABB123466 <NA>
#8 9926 ABB123467 sdf sdfABB123467 ABB123467 <NA>
#9 8418 ABB123468 fff///sss fff///sssABB123468 ABB123468 <NA>
#10 1392 ABB123469 ty tyABB123469 ABB123469 <NA>
#11 9080 ABB123470 fff///sss fff///sssABB123470 ABB123470 <NA>
#12 6455 ABB123471 dfs dfsABB123471 ABB123471 <NA>
#13 2423 ABB123472 ABB123472 ABB123472 <NA>
#14 9101 ABB123473 gt gtABB123473 ABB123473 <NA>
#15 7807 ABB123474 y yABB123474 ABB123474 <NA>
#16 5195 ABB123475 f fABB123475 ABB123475 <NA>
#17 7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476 ABB123462
#18 365 ABB123477 dsd dsdABB123477 ABB123477 <NA>
#19 9062 ABB123478 re reABB123478 ABB123478 <NA>
#20 5558 ABB123479 fgh fghABB123479 ABB123479 <NA>
#21 239 ABB123480 tu tuABB123480 ABB123480 <NA>
#22 8700 ABB123481 yu yuABB123481 ABB123481 <NA>
#23 6995 ABB123482 dfg dfgABB123482 ABB123482 <NA>
#24 9853 ABB123483 s sABB123483 ABB123483 <NA>