R string_extract_all 使用 plyr 循环遍历数据帧

Question

library(plyr)
library(stringr)



### Example data: an id column plus two free-text columns (filterme1,
### filterme2) whose values contain codes of the form "ABB" + 6 digits.
### Row 17 (id 7827) is the only row holding two codes in a single cell.
examp<- data.frame(id_info = c("123",   "3464", "7156", "3171", "5299", "4541", "4956", "9926", "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195", "7827", "365",  "9062", "5558", "239",  "8700", "6995", "9853"),
                   filterme1 = c("ABB123460sadjasd",    "ABB123461asjdjs",  "ABB123462ranogvmg",    "ABB123463dkfohsd", "ABB123464fff///sss",   "ABB123465jfsdf",   "ABB123466 sdf",    "ABB123467 sdf",    "ABB123468 fff///sss",  "ABB123469 ty", "ABB123470 fff///sss",  "ABB123471 dfs",    "ABB123472 ",   "ABB123473 gt", "ABB123474 y",  "ABB123475 f",  "ABB123476 gfgABB123462",   "ABB123477 dsd",    "ABB123478 re", "ABB123479 fgh",    "ABB123480 tu", "ABB123481 yu", "ABB123482 dfg",    "ABB123483 s"),
                    filterme2  = c("sadjasdABB123460",  "asjdjsABB123461",  "ranogvmgABB123462",    "dkfohsdABB123463", "fff///sssABB123464",   "jfsdfABB123465",   "sdfABB123466", "sdfABB123467", "fff///sssABB123468",   "tyABB123469",  "fff///sssABB123470",   "dfsABB123471", "ABB123472",    "gtABB123473",  "yABB123474",   "fABB123475",   "ABB123462gfgABB123476",    "dsdABB123477", "reABB123478",  "fghABB123479", "tuABB123480",  "yuABB123481",  "dfgABB123482", "sABB123483"))

## id_info is meant to be treated as a factor (an identifier) even though its
## values look numeric.
## NOTE(review): whether data.frame() stores these string columns as factor or
## character depends on the stringsAsFactors default of the R version in use
## (FALSE since R 4.0) -- check the str() output below.
str(examp)

我想从字符串中提取一个元素。该元素应以 "ABB" 开头，后跟 6 位数字。

# Pull out every substring that starts with "ABB" and is followed by 6 digits.
# The result is a list with one element per row of filterme1; as.character()
# guards against filterme1 being stored as a factor.
examp_str <- str_extract_all(
  string = as.character(examp$filterme1),
  pattern = "ABB[0-9]{6}"
)
length(examp_str)
# thanks for the help with the expression

下面是我尝试在整个数据帧中使用我的 stringr 函数。

# Attempt to apply the extraction across the whole data frame with plyr.
# Is this how I can eventually loop through the whole data frame? I know this
# will create a list, but the result isn't quite right.
# NOTE(review): the anonymous function receives a data-frame piece `x`, not a
# character vector, so str_extract_all() is presumably applied to a coerced
# version of the whole piece rather than to each filterme column -- confirm
# against the plyr::dlply documentation.
examp_str_big<-dlply(.data=examp, 1,
                   function(x) str_extract_all(x,pattern="ABB[0-9]{6}"))

创建列表后，我想尝试将其全部放回数据帧中。我找到了一个关于将未知长度的列表放入数据帧的链接，但我不确定能不能用在这里。

# Number of elements held by each entry of the list.
indx <- sapply(X = examp_str_big, FUN = length)
# indx <- lengths(lst)

# Stretch every entry to the length of the longest one (the extra slots are
# filled with NA), stack the entries as rows, and convert to a data frame.
res <- as.data.frame(
  do.call(
    rbind,
    lapply(examp_str_big, function(entry) {
      length(entry) <- max(indx)
      entry
    })
  )
)

# Take the column names from the longest entry.
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)

所以我想要的最终结果是这样的：

id_info filterme1   filterme2   filterme3   filterme4
123     ABB123460               ABB123460   
3464    ABB123461               ABB123461   
7156    ABB123462               ABB123462   
3171    ABB123463               ABB123463   
5299    ABB123464               ABB123464   
4541    ABB123465               ABB123465   
4956    ABB123466               ABB123466   
9926    ABB123467               ABB123467   
8418    ABB123468               ABB123468   
1392    ABB123469               ABB123469   
9080    ABB123470               ABB123470   
6455    ABB123471               ABB123471   
2423    ABB123472               ABB123472   
9101    ABB123473               ABB123473   
7807    ABB123474               ABB123474   
5195    ABB123475               ABB123475   
7827    ABB123476   ABB123462   ABB123462   ABB123476
365      ABB123477              ABB123477   
9062    ABB123478               ABB123478   
5558    ABB123479               ABB123479   
239     ABB123480               ABB123480   
8700    ABB123481               ABB123481   
6995    ABB123482               ABB123482   
9853    ABB123483               ABB123483

我的实际数据集更长并且有更多 "filterme" 列。任何帮助将不胜感激。如果有另一种更聪明的方法来实现这个目标，我很想听听。

谢谢。

Answer 1

这是一种方法（基于您的原始数据框 examp，我假设它是用 stringsAsFactors = FALSE 读取的）：

library(stringr)
# Extract all occurrences of the pattern from each row of filterme1.
# NOTE: the regex uses 6 digits because the provided sample data contains no
# 7-digit example; change {6} to {7} if your real data needs it.
examp$pattern <- str_extract_all(examp$filterme1, 'ABB[0-9]{6}')

# Append one column per match position. Each row's matches are padded with NA
# up to the largest number of matches found in any row, so that rbind() yields
# a rectangular character matrix. The matches are already a character vector,
# so they are padded directly: passing them through strsplit() would return a
# list and turn the appended columns into list columns.
maxlength <- max(lengths(examp$pattern))
examp <- cbind(
  examp,
  as.data.frame(do.call(rbind, lapply(examp$pattern, `length<-`, maxlength)))
)

# This results in a wider data frame with all found patterns appended as new
# columns (V1, V2, ...); rows with fewer matches hold NA in the extra columns
# (printed as <NA>).

examp

   id_info              filterme1             filterme2              pattern        V1
1      123       ABB123460sadjasd      sadjasdABB123460            ABB123460 ABB123460
2     3464        ABB123461asjdjs       asjdjsABB123461            ABB123461 ABB123461
3     7156      ABB123462ranogvmg     ranogvmgABB123462            ABB123462 ABB123462
4     3171       ABB123463dkfohsd      dkfohsdABB123463            ABB123463 ABB123463
5     5299     ABB123464fff///sss    fff///sssABB123464            ABB123464 ABB123464
6     4541         ABB123465jfsdf        jfsdfABB123465            ABB123465 ABB123465
7     4956          ABB123466 sdf          sdfABB123466            ABB123466 ABB123466
8     9926          ABB123467 sdf          sdfABB123467            ABB123467 ABB123467
9     8418    ABB123468 fff///sss    fff///sssABB123468            ABB123468 ABB123468
10    1392           ABB123469 ty           tyABB123469            ABB123469 ABB123469
11    9080    ABB123470 fff///sss    fff///sssABB123470            ABB123470 ABB123470
12    6455          ABB123471 dfs          dfsABB123471            ABB123471 ABB123471
13    2423             ABB123472              ABB123472            ABB123472 ABB123472
14    9101           ABB123473 gt           gtABB123473            ABB123473 ABB123473
15    7807            ABB123474 y            yABB123474            ABB123474 ABB123474
16    5195            ABB123475 f            fABB123475            ABB123475 ABB123475
17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476, ABB123462 ABB123476
18     365          ABB123477 dsd          dsdABB123477            ABB123477 ABB123477
19    9062           ABB123478 re           reABB123478            ABB123478 ABB123478
20    5558          ABB123479 fgh          fghABB123479            ABB123479 ABB123479
21     239           ABB123480 tu           tuABB123480            ABB123480 ABB123480
22    8700           ABB123481 yu           yuABB123481            ABB123481 ABB123481
23    6995          ABB123482 dfg          dfgABB123482            ABB123482 ABB123482
24    9853            ABB123483 s            sABB123483            ABB123483 ABB123483
          V2
1         NA
2         NA
3         NA
4         NA
5         NA
6         NA
7         NA
8         NA
9         NA
10        NA
11        NA
12        NA
13        NA
14        NA
15        NA
16        NA
17 ABB123462
18        NA
19        NA
20        NA
21        NA
22        NA
23        NA
24        NA

在这种情况下，只添加了两个新列，因为在提供的示例数据中，该模式在同一个单元格里最多出现两次（即使按上文所述改为匹配 6 位数字也是如此）。

编辑：添加匹配多列模式的代码（在本例中为 filterme1 和 filterme2）：

library(tidyr)
# Paste filterme1 and filterme2 together into a single new column "filterme"
# (remove = FALSE keeps the two source columns), then search the combined
# text so that codes from both columns are found in one pass.
examp <- unite(
  data = examp,
  col = "filterme",
  filterme1, filterme2,
  remove = FALSE
)
examp[["pattern"]] <- str_extract_all(
  string = examp[["filterme"]],
  pattern = 'ABB[0-9]{6}'
)

此时，您可以运行在分配了 examp$pattern 的行之后的其余代码。

Answer 2

我们也可以使用lengths函数

 # Find every "ABB" + 6-digit code in each row of filterme1 (one list element
 # per row).
 lst <- str_extract_all(string = examp$filterme1, pattern = 'ABB[0-9]{6}')
 # Pad each row's matches with NA up to the longest match count and stack the
 # rows into a matrix.
 m1 <- do.call(
   rbind,
   lapply(lst, function(hits) {
     length(hits) <- max(lengths(lst))
     hits
   })
 )
 # Attach the matrix columns to examp as pattern1, pattern2, ...
 examp[paste0("pattern", seq_len(ncol(m1)))] <- m1
 examp
#   id_info              filterme1             filterme2  pattern1  pattern2
#1      123       ABB123460sadjasd      sadjasdABB123460 ABB123460      <NA>
#2     3464        ABB123461asjdjs       asjdjsABB123461 ABB123461      <NA>
#3     7156      ABB123462ranogvmg     ranogvmgABB123462 ABB123462      <NA>
#4     3171       ABB123463dkfohsd      dkfohsdABB123463 ABB123463      <NA>
#5     5299     ABB123464fff///sss    fff///sssABB123464 ABB123464      <NA>
#6     4541         ABB123465jfsdf        jfsdfABB123465 ABB123465      <NA>
#7     4956          ABB123466 sdf          sdfABB123466 ABB123466      <NA>
#8     9926          ABB123467 sdf          sdfABB123467 ABB123467      <NA>
#9     8418    ABB123468 fff///sss    fff///sssABB123468 ABB123468      <NA>
#10    1392           ABB123469 ty           tyABB123469 ABB123469      <NA>
#11    9080    ABB123470 fff///sss    fff///sssABB123470 ABB123470      <NA>
#12    6455          ABB123471 dfs          dfsABB123471 ABB123471      <NA>
#13    2423             ABB123472              ABB123472 ABB123472      <NA>
#14    9101           ABB123473 gt           gtABB123473 ABB123473      <NA>
#15    7807            ABB123474 y            yABB123474 ABB123474      <NA>
#16    5195            ABB123475 f            fABB123475 ABB123475      <NA>
#17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476 ABB123462
#18     365          ABB123477 dsd          dsdABB123477 ABB123477      <NA>
#19    9062           ABB123478 re           reABB123478 ABB123478      <NA>
#20    5558          ABB123479 fgh          fghABB123479 ABB123479      <NA>
#21     239           ABB123480 tu           tuABB123480 ABB123480      <NA>
#22    8700           ABB123481 yu           yuABB123481 ABB123481      <NA>
#23    6995          ABB123482 dfg          dfgABB123482 ABB123482      <NA>
#24    9853            ABB123483 s            sABB123483 ABB123483      <NA>

R string_extract_all 使用 plyr 循环遍历数据帧

R string_extract_all looped through data frame using plyr

r

plyr

stringr