R按列中的新行拆分数据框

Question

我正在尝试通过换行符“\n”拆分列中的字符串。这是一个数据框 sample_data:

 # Sample data: one row per user; `Changes` holds a single multi-line string
 # (a header line followed by one line per price change).
 # NOTE(review): the dollar amounts were mangled during extraction ("[=10=]"
 # stood for "$0", and a leading "$<digit>" was dropped from ".07", ".44" and
 # ".22"). "$0" is restored as-is; "$1" is ASSUMED for the dropped prefix --
 # confirm against the original post.
 test_data <- data.frame(ID = c('john@xxx.com', 'sally@xxx.com'),
                   Changes = c('3 max cost changes
  productxyz > pb100  > a : Max cost decreased from $0.98 to $0.83
  productxyz > pb2  > a : Max cost decreased from $1.07 to $0.91
  productxyz > pb2  > b : Max cost decreased from $0.65 to $0.55',
                             '2 max cost changes
  productabc > pb1000  > d : Max cost decreased from $1.07 to $0.91
  productabc > pb1000  > x : Max cost decreased from $1.44 to $1.22'),
                   stringsAsFactors = FALSE)

我的目标是将价格提取到列中并获得如下结果集：

ID              Prev_Price    New_Price
john@xxx.com     $0.98            $0.83
john@xxx.com     $1.07            $0.91
john@xxx.com     $0.65            $0.55
sally@xxx.com    $1.07            $0.91
sally@xxx.com    $1.44            $1.22

我试过使用 tidyr 包，但我的结果充满了 N/A。

# Attempt from the question (reported to return mostly NA values).
# `separate()` comes from tidyr; the original call misspelled it as
# `seperate` and referred to `sample_data`, although the data frame defined
# earlier in the question is named `test_data`.
library(tidyr)
vars <- c("Prev_Price", "New_Price")
separate(test_data, Changes, into = vars, sep = "[A-Za-z]+from",
         extra = "drop")

如有任何帮助，我们将不胜感激。

谢谢！

Answer 1

尝试

# Fill blank IDs downward: keep the non-empty IDs and index them by the running
# count of non-empty entries, so each blank row inherits the ID above it.
df1$ID <- df1$ID[df1$ID != ''][cumsum(df1$ID != '')]
library(stringi)
# Extract every "$<digits>" token per row and bind the matches row-wise into
# the two price columns. The backslashes are doubled because '\$' and '\d' are
# not valid escapes inside an R string literal.
setNames(
  data.frame(
    df1$ID,
    do.call(rbind, stri_extract_all(df1$Changes, regex = '\\$\\d*'))
  ),
  c('ID', 'Prev_Price', 'New_Price')
)
# NOTE(review): the price values in the printed output below appear to have
# been lost during extraction; only the ID column survived.
 #   ID Prev_Price New_Price
 #1  A
 #2  A
 #3  B
 #4  B

或

library(tidyr)
# Capture the two "$<digits>" tokens of each row into separate columns.
# '\\$' and '\\d' are written with doubled backslashes: a single backslash
# before '$' or 'd' is not a valid escape in an R string literal.
extract(df1, Changes, into = c('Prev_Price', 'New_Price'),
        regex = '[^$]*(\\$\\d*)[^$]*(\\$\\d*)')
# NOTE(review): the price values in the printed output below appear to have
# been lost during extraction.
#   ID Prev_Price New_Price
#1  A
#2  A
#3  B
#4  B

或

library(data.table) # v1.9.5+
# Convert df1 to a data.table by reference, then split `Changes` on runs of
# letters/spaces; the first piece is dropped and the remaining pieces become
# the two new price columns. The trailing [] prints the updated table.
setDT(df1)
df1[
  ,
  c('Prev_Price', 'New_Price') := tstrsplit(Changes, '[A-Za-z ]+')[-1]
][]
#   ID              Changes Prev_Price New_Price
#1:  A down from  to                
#2:  A down from  to                
#3:  B down from  to                
#4:  B down from  to

注意："Changes" 可以删除

或仅使用 base R 方法

# Base R only: rewrite each row so that only its "$<digits>" tokens remain,
# separated by spaces, then let read.table() parse them into two columns.
# The pattern and the back-reference use doubled backslashes: '\$' and '\d'
# are invalid escapes in an R string, and '\1' would be read as an octal
# character escape instead of a back-reference.
data.frame(
  ID = df1$ID,
  read.table(
    text = gsub('[^$]*(\\$\\d+)', ' \\1 ', df1$Changes),
    col.names = c('Prev_Price', 'New_Price'),
    stringsAsFactors = FALSE
  )
)
# NOTE(review): the price values in the printed output below appear to have
# been lost during extraction.
 #   ID Prev_Price New_Price
 #1  A
 #2  A
 #3  B
 #4  B

更新

如果元素位于同一个单元格中，一种选择是使用 data.table 的开发版本（即 v1.9.5+），它可以从此处安装（原帖中的链接）。

这里，我们使用相同的代码拆分 'Changes'（tstrsplit(Changes, ..)），然后用 melt 转换数据集，并把 measure.vars 指定为一个 list；如有需要，再按 'ID' 排序（order）并删除不需要的列（'variable'）。

 # Wide-to-long conversion for the case where several changes sit in one cell.
 # Steps, from the inside out:
 #   1. setDT(df2) converts df2 to a data.table by reference.
 #   2. tstrsplit() splits `Changes` on runs of letters/spaces; [-1] drops the
 #      first piece and the rest are stored as columns V1..V4.
 #   3. [,-2, with=FALSE] drops the second column (presumably `Changes`).
 #   4. melt() pairs V1/V3 into Prev_Price and V2/V4 into New_Price.
 #   5. The result is ordered by ID and the helper `variable` column removed.
 melt(
   setDT(df2)[, paste0('V',1:4) := tstrsplit(Changes,
           '[A-Za-z ]+')[-1]][,-2, with=FALSE],
      id.var='ID', measure=list(c('V1', 'V3'), c('V2', 'V4')), 
        value.name=c('Prev_Price', 'New_Price'))[order(ID)][, variable:=NULL]
  # NOTE(review): the price values in the printed output below appear to have
  # been lost during extraction.
  #    ID Prev_Price New_Price
  #1:  A               
  #2:  A               
  #3:  B               
  #4:  B

或者我们可以像以前一样使用 gsub，然后用 base R 的 reshape 转换成 long 格式

 # Keep only the "$<digits>" tokens of each row (space separated) and parse
 # them with read.table(). Backslashes are doubled because '\$' and '\d' are
 # invalid escapes in an R string and '\1' would be an octal character escape
 # rather than a back-reference.
 d1 <- data.frame(ID = df2$ID,
                  read.table(text = gsub('[^$]*(\\$\\d+)', ' \\1 ',
                                         df2$Changes)))

# Name the value columns "<name>.<time>" so that reshape() can split them on
# the "." separator when going to long format.
colnames(d1)[-1] <- paste0(c('Prev_Price.', 'New_Price.'),
                          rep(1:2, each = 2))
reshape(d1, idvar = 'ID', varying = 2:ncol(d1), sep = ".", direction = 'long')
# NOTE(review): the price values in the printed output below appear to have
# been lost during extraction.
#    ID time Prev_Price New_Price
#A.1  A    1
#B.1  B    1
#A.2  A    2
#B.2  B    2

更新2

对于新数据集 ("df3")，我们可以使用 stri_extract_all_regex 从 'Changes' 列中提取 $ 及其后面的数字（包括小数点，正则为 '\\$[0-9.]+'），把得到的 list 输出中的每个元素转换为 matrix（因为我们需要交替出现的元素位于不同的列中），再用 Map 将其与第一列组合，最后用 rbind 合并（do.call(rbind, ...)）。

library(stringi)
# For each row, extract every "$<amount>" token (digits and decimal point),
# lay the tokens out two per row (Prev_Price, New_Price), attach the row's ID,
# and stack the per-ID data frames.
# '\\$' is written with a doubled backslash: '\$' is not a valid escape in an
# R string literal.
# NOTE(review): `df3` is not defined in this file; presumably it is the data
# frame from the question -- confirm.
res <- do.call(
  rbind,
  Map(
    function(x, y) {
      data.frame(x, matrix(y, ncol = 2, byrow = TRUE,
                           dimnames = list(NULL, c("Prev_Price", "New_Price"))))
    },
    df3$ID,
    stri_extract_all_regex(df3$Changes, '\\$[0-9.]+')
  )
)
row.names(res) <- NULL
res
# NOTE(review): the amounts below were mangled during extraction ("[=16=]"
# stood for "$0"; a leading "$<digit>" was dropped elsewhere). "$1" is ASSUMED
# for the dropped prefix -- confirm against the original post.
#              x Prev_Price New_Price
#1  john@xxx.com      $0.98     $0.83
#2  john@xxx.com      $1.07     $0.91
#3  john@xxx.com      $0.65     $0.55
#4 sally@xxx.com      $1.07     $0.91
#5 sally@xxx.com      $1.44     $1.22

数据

# Example data used by the answers above.
# df1: one change per row; blank IDs mean "same ID as the row above".
# NOTE(review): the amounts that should follow "from" and "to" appear to have
# been stripped during extraction (the answers' regexes look for "$" followed
# by digits, which these strings no longer contain). Restore them from the
# original post before running the examples.
df1 <- structure(list(ID = c("A", "", "B", ""), 
 Changes = c("down from  to ", 
"down from  to ", "down from  to ", "down from  to "
)), .Names = c("ID", "Changes"), class = "data.frame", 
row.names = c(NA, -4L))

# df2: two changes packed into a single cell per ID (same caveat about the
# missing amounts applies).
df2 <- data.frame(ID=c('A', 'B'),
   Changes=c('down from  to  down from  to ', 
  'down from  to  down from  to '), stringsAsFactors=FALSE)

Answer 2

# NOTE(review): the amounts that should follow "from" and "to" in `Changes`
# appear to have been stripped during extraction; with the strings as shown,
# the 3rd/5th whitespace-separated tokens are not prices. Restore the amounts
# from the original post.
df <- data.frame(
  ID = c('A', '', 'B', ''),
  Changes = c('down from  to ', 'down from  to ',
              'down from  to ', 'down from  to '),
  stringsAsFactors = FALSE
)
# Split each `Changes` string on whitespace and take the 3rd and 5th tokens as
# the previous/new price; blank IDs are filled from the last non-blank ID.
# '\\s+' needs the doubled backslash ('\s' is not a valid R string escape);
# vapply() pins the result type to character, unlike sapply().
with(
  list(ss = strsplit(df$Changes, '\\s+')),
  transform(
    df,
    ID = ID[ID != ''][cumsum(ID != '')],
    Prev_Price = vapply(ss, function(v) v[3], character(1)),
    New_Price = vapply(ss, function(v) v[5], character(1)),
    Changes = NULL
  )
)
# NOTE(review): the price values in the printed output below appear to have
# been lost during extraction.
##   ID Prev_Price New_Price
## 1  A
## 2  A
## 3  B
## 4  B

另一种方法：

# Same idea in a single expression: split `Changes` on whitespace, bind the
# pieces into a matrix, and keep the 3rd and 5th tokens as the price columns.
# '\\s+' needs the doubled backslash ('\s' is not a valid R string escape).
with(df, cbind(
  ID = ID[ID != ''][cumsum(ID != '')],
  setNames(
    as.data.frame(do.call(rbind, strsplit(Changes, '\\s+'))[, c(3, 5)]),
    c('Prev_Price', 'New_Price')
  )
))
## same result

R按列中的新行拆分数据框

R Split dataframe by new line in column

regex

substring

r

更新

更新2

数据