R按列中的新行拆分数据框
R Split dataframe by new line in column
我正在尝试通过换行符“\n”拆分列中的字符串。
这是一个数据框 sample_data:
# Sample input: one row per user. `Changes` is a multi-line change log whose
# first line is a summary and whose remaining lines each carry a
# "from <old price> to <new price>" pair.
# NOTE(review): the dollar amounts were restored from a mangled copy of the
# post ("[=10=]" presumably stood for "$0", and a bare ".07" for "$1.07");
# confirm against the original question if exact values matter.
test_data <- data.frame(ID=c('john@xxx.com', 'sally@xxx.com'),
Changes=c('3 max cost changes
productxyz > pb100 > a : Max cost decreased from $0.98 to $0.83
productxyz > pb2 > a : Max cost decreased from $1.07 to $0.91
productxyz > pb2 > b : Max cost decreased from $0.65 to $0.55',
'2 max cost changes
productabc > pb1000 > d : Max cost decreased from $1.07 to $0.91
productabc > pb1000 > x : Max cost decreased from $1.44 to $1.22'), stringsAsFactors=FALSE)
我的目标是将价格提取到列中并获得如下结果集:
ID Prev_Price New_Price
john@xxx.com $0.98 $0.83
john@xxx.com $1.07 $0.91
john@xxx.com $0.65 $0.55
sally@xxx.com $1.07 $0.91
sally@xxx.com $1.44 $1.22
我试过使用 tidyr 包,但我的结果充满了 N/A。
# Names for the two price columns the asker wants to produce.
vars <- c("Prev_Price","New_Price")
# The asker's attempt, kept verbatim (it is the code being asked about).
# NOTE(review): `seperate` is presumably a misspelling of tidyr's `separate`,
# and `sample_data` is not defined in the snippet above (the data frame there
# is named `test_data`) -- TODO confirm against the original post.
seperate(sample_data, Changes, into = vars, sep = "[A-Za-z]+from", extra= "drop")
如有任何帮助,我们将不胜感激。
谢谢!
尝试
# Fill blank IDs downward: cumsum() over the "non-blank" flags yields, for
# each row, the index of the most recent non-blank ID.
df1$ID <- df1$ID[df1$ID != ''][cumsum(df1$ID != '')]
library(stringi)
# Extract every "$<digits>" token from each Changes string, stack the
# per-row matches into a matrix, and attach them to the IDs.
# The backslashes are doubled because '\$' and '\d' are not valid escapes
# inside an R string literal (the single-backslash form does not parse).
setNames(
  data.frame(
    df1$ID,
    do.call(rbind, stri_extract_all(df1$Changes, regex = '\\$\\d*'))
  ),
  c('ID', 'Prev_Price', 'New_Price')
)
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
或
library(tidyr)
# Replace Changes with two columns, one per capture group: each group is a
# "$" followed by digits, preceded by any run of non-"$" characters.
# Backslashes are doubled because '\$' and '\d' are not valid escapes in an
# R string literal.
extract(df1, Changes, into = c('Prev_Price', 'New_Price'),
        '[^$]*(\\$\\d*)[^$]*(\\$\\d*)')
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
或
library(data.table)  # v1.9.5+
# Turn df1 into a data.table by reference, then split Changes on runs of
# letters/spaces; the first piece of the split is dropped and the remaining
# pieces become the two price columns.
setDT(df1)
df1[, c('Prev_Price', 'New_Price') := tstrsplit(Changes, '[A-Za-z ]+')[-1]]
# Trailing [] so the updated table is printed.
df1[]
# ID Changes Prev_Price New_Price
#1: A down from to
#2: A down from to
#3: B down from to
#4: B down from to
注意:"Changes" 可以删除
或仅使用 base R
方法
# Base R only: gsub() keeps just the "$<digits>" tokens (space-separated),
# and read.table() parses that text into two columns.
# Backslashes are doubled because '\$', '\d' and '\1' are not valid escapes
# in an R string literal; '\\1' is the back-reference to the captured price.
data.frame(
  ID = df1$ID,
  read.table(
    text = gsub('[^$]*(\\$\\d+)', ' \\1 ', df1$Changes),
    col.names = c('Prev_Price', 'New_Price'),
    stringsAsFactors = FALSE
  )
)
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
更新
如果这些元素位于同一个单元格中,一种选择是使用 data.table 的开发版本,即 v1.9.5+(安装方式见原文中的链接)。
这里,我们使用相同的代码拆分 'Changes'(tstrsplit(Changes, ..)),然后用 melt 把数据集转换成长格式,并把 measure.vars 指定为一个 list;如有需要,再按 'ID' 排序(order),并删除不需要的列('variable')。
# Step 1: convert df2 to a data.table by reference and add columns V1..V4
# from the pieces of Changes (the first piece of the split is dropped).
setDT(df2)
df2[, paste0('V', 1:4) := tstrsplit(Changes, '[A-Za-z ]+')[-1]]
# Step 2: drop column 2, melt (V1, V3) and (V2, V4) into the two price
# columns, sort by ID, and remove the 'variable' column that melt adds.
melt(
  df2[, -2, with = FALSE],
  id.var = 'ID',
  measure = list(c('V1', 'V3'), c('V2', 'V4')),
  value.name = c('Prev_Price', 'New_Price')
)[order(ID)][, variable := NULL]
# ID Prev_Price New_Price
#1: A
#2: A
#3: B
#4: B
或者我们可以像以前一样使用gsub
,然后用reshape
从base R
转换成long
格式
# Keep only the "$<digits>" tokens of each Changes string and parse them into
# columns. Backslashes are doubled because '\$', '\d' and '\1' are not valid
# escapes in an R string literal.
d1 <- data.frame(
  ID = df2$ID,
  read.table(text = gsub('[^$]*(\\$\\d+)', ' \\1 ', df2$Changes))
)
# Name the value columns Prev_Price.1, New_Price.1, Prev_Price.2, New_Price.2
# so reshape() can split each name at "." into a variable and a time index.
colnames(d1)[-1] <- paste0(c('Prev_Price.', 'New_Price.'),
                           rep(1:2, each = 2))
reshape(d1, idvar = 'ID', varying = 2:ncol(d1), sep = ".", direction = 'long')
# ID time Prev_Price New_Price
#A.1 A 1
#B.1 B 1
#A.2 A 2
#B.2 B 2
更新2
对于新数据集("df3"),我们可以使用 stri_extract_all_regex 从 'Changes' 列中提取 $ 及其后面的数字(包括小数点,正则为 '\\$[0-9.]+')。
由于需要把交替出现的元素放到不同的列中,先把 stri_extract_all_regex 返回的 list 中的每个元素转换成两列的 matrix,
再用 Map 把它与第一列('ID')组合起来,最后用 do.call(rbind, ...) 按行合并。
library(stringi)
# NOTE(review): df3 is not defined in this snippet; presumably it is the
# question's data frame (ID plus the multi-line Changes column) -- confirm.
# For each row, extract every "$<number>" token (digits and dots), lay the
# tokens out two per row (old price, new price), pair them with that row's
# ID, then stack the per-ID data frames.
# The backslash is doubled because '\$' is not a valid escape in an R string
# literal.
res <- do.call(rbind,
  Map(function(x, y) data.frame(x, matrix(y, ncol = 2, byrow = TRUE,
        dimnames = list(NULL, c("Prev_Price", "New_Price")))),
      df3$ID, stri_extract_all_regex(df3$Changes, '\\$[0-9.]+')))
# Drop the row names that rbind builds.
row.names(res) <- NULL
res
#              x Prev_Price New_Price
#1  john@xxx.com      $0.98     $0.83
#2  john@xxx.com      $1.07     $0.91
#3  john@xxx.com      $0.65     $0.55
#4 sally@xxx.com      $1.07     $0.91
#5 sally@xxx.com      $1.44     $1.22
数据
# Example inputs used by the answer's snippets.
# NOTE(review): the Changes strings contain no "$" amounts here, although the
# regexes in the snippets look for them -- the prices appear to have been
# lost from this copy of the post.

# df1: one change per row; blank IDs belong to the ID above them.
df1 <- data.frame(
  ID = c("A", "", "B", ""),
  Changes = c("down from to ", "down from to ",
              "down from to ", "down from to "),
  stringsAsFactors = FALSE
)

# df2: one row per ID, with two changes in the same cell.
df2 <- data.frame(
  ID = c('A', 'B'),
  Changes = c(
    'down from to down from to ',
    'down from to down from to '
  ),
  stringsAsFactors = FALSE
)
# Example input: blank IDs belong to the ID above them.
df <- data.frame(
  ID = c('A', '', 'B', ''),
  Changes = c('down from to ', 'down from to ', 'down from to ', 'down from to '),
  stringsAsFactors = FALSE
)
# Split each Changes string on whitespace once (ss), then build the result:
# fill IDs downward via cumsum() over the non-blank flags, take the 3rd and
# 5th tokens as the old and new price, and drop Changes.
# '\\s+' is doubled because '\s' is not a valid escape in an R string literal;
# vapply() pins the result type to one string per row.
with(
  list(ss = strsplit(df$Changes, '\\s+')),
  transform(
    df,
    ID = ID[ID != ''][cumsum(ID != '')],
    Prev_Price = vapply(ss, function(v) v[3], character(1)),
    New_Price = vapply(ss, function(v) v[5], character(1)),
    Changes = NULL
  )
)
## ID Prev_Price New_Price
## 1 A
## 2 A
## 3 B
## 4 B
另一种方法:
# Same idea without transform(): split every Changes string on whitespace,
# stack the token vectors into a matrix, keep columns 3 and 5 as the prices,
# and bind them to the downward-filled IDs.
# '\\s+' is doubled because '\s' is not a valid escape in an R string literal.
with(df, cbind(
  ID = ID[ID != ''][cumsum(ID != '')],
  setNames(
    as.data.frame(do.call(rbind, strsplit(Changes, '\\s+'))[, c(3, 5)]),
    c('Prev_Price', 'New_Price')
  )
))
## same result
我正在尝试通过换行符“\n”拆分列中的字符串。 这是一个数据框 sample_data:
# Sample input: one row per user. `Changes` is a multi-line change log whose
# first line is a summary and whose remaining lines each carry a
# "from <old price> to <new price>" pair.
# NOTE(review): the dollar amounts were restored from a mangled copy of the
# post ("[=10=]" presumably stood for "$0", and a bare ".07" for "$1.07");
# confirm against the original question if exact values matter.
test_data <- data.frame(ID=c('john@xxx.com', 'sally@xxx.com'),
Changes=c('3 max cost changes
productxyz > pb100 > a : Max cost decreased from $0.98 to $0.83
productxyz > pb2 > a : Max cost decreased from $1.07 to $0.91
productxyz > pb2 > b : Max cost decreased from $0.65 to $0.55',
'2 max cost changes
productabc > pb1000 > d : Max cost decreased from $1.07 to $0.91
productabc > pb1000 > x : Max cost decreased from $1.44 to $1.22'), stringsAsFactors=FALSE)
我的目标是将价格提取到列中并获得如下结果集:
ID Prev_Price New_Price
john@xxx.com $0.98 $0.83
john@xxx.com $1.07 $0.91
john@xxx.com $0.65 $0.55
sally@xxx.com $1.07 $0.91
sally@xxx.com $1.44 $1.22
我试过使用 tidyr 包,但我的结果充满了 N/A。
# Names for the two price columns the asker wants to produce.
vars <- c("Prev_Price","New_Price")
# The asker's attempt, kept verbatim (it is the code being asked about).
# NOTE(review): `seperate` is presumably a misspelling of tidyr's `separate`,
# and `sample_data` is not defined in the snippet above (the data frame there
# is named `test_data`) -- TODO confirm against the original post.
seperate(sample_data, Changes, into = vars, sep = "[A-Za-z]+from", extra= "drop")
如有任何帮助,我们将不胜感激。
谢谢!
尝试
# Fill blank IDs downward: cumsum() over the "non-blank" flags yields, for
# each row, the index of the most recent non-blank ID.
df1$ID <- df1$ID[df1$ID != ''][cumsum(df1$ID != '')]
library(stringi)
# Extract every "$<digits>" token from each Changes string, stack the
# per-row matches into a matrix, and attach them to the IDs.
# The backslashes are doubled because '\$' and '\d' are not valid escapes
# inside an R string literal (the single-backslash form does not parse).
setNames(
  data.frame(
    df1$ID,
    do.call(rbind, stri_extract_all(df1$Changes, regex = '\\$\\d*'))
  ),
  c('ID', 'Prev_Price', 'New_Price')
)
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
或
library(tidyr)
# Replace Changes with two columns, one per capture group: each group is a
# "$" followed by digits, preceded by any run of non-"$" characters.
# Backslashes are doubled because '\$' and '\d' are not valid escapes in an
# R string literal.
extract(df1, Changes, into = c('Prev_Price', 'New_Price'),
        '[^$]*(\\$\\d*)[^$]*(\\$\\d*)')
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
或
library(data.table)  # v1.9.5+
# Turn df1 into a data.table by reference, then split Changes on runs of
# letters/spaces; the first piece of the split is dropped and the remaining
# pieces become the two price columns.
setDT(df1)
df1[, c('Prev_Price', 'New_Price') := tstrsplit(Changes, '[A-Za-z ]+')[-1]]
# Trailing [] so the updated table is printed.
df1[]
# ID Changes Prev_Price New_Price
#1: A down from to
#2: A down from to
#3: B down from to
#4: B down from to
注意:"Changes" 可以删除
或仅使用 base R
方法
# Base R only: gsub() keeps just the "$<digits>" tokens (space-separated),
# and read.table() parses that text into two columns.
# Backslashes are doubled because '\$', '\d' and '\1' are not valid escapes
# in an R string literal; '\\1' is the back-reference to the captured price.
data.frame(
  ID = df1$ID,
  read.table(
    text = gsub('[^$]*(\\$\\d+)', ' \\1 ', df1$Changes),
    col.names = c('Prev_Price', 'New_Price'),
    stringsAsFactors = FALSE
  )
)
# ID Prev_Price New_Price
#1 A
#2 A
#3 B
#4 B
更新
如果这些元素位于同一个单元格中,一种选择是使用 data.table 的开发版本,即 v1.9.5+(安装方式见原文中的链接)。
这里,我们使用相同的代码拆分 'Changes'(tstrsplit(Changes, ..)),然后用 melt 把数据集转换成长格式,并把 measure.vars 指定为一个 list;如有需要,再按 'ID' 排序(order),并删除不需要的列('variable')。
# Step 1: convert df2 to a data.table by reference and add columns V1..V4
# from the pieces of Changes (the first piece of the split is dropped).
setDT(df2)
df2[, paste0('V', 1:4) := tstrsplit(Changes, '[A-Za-z ]+')[-1]]
# Step 2: drop column 2, melt (V1, V3) and (V2, V4) into the two price
# columns, sort by ID, and remove the 'variable' column that melt adds.
melt(
  df2[, -2, with = FALSE],
  id.var = 'ID',
  measure = list(c('V1', 'V3'), c('V2', 'V4')),
  value.name = c('Prev_Price', 'New_Price')
)[order(ID)][, variable := NULL]
# ID Prev_Price New_Price
#1: A
#2: A
#3: B
#4: B
或者我们可以像以前一样使用 gsub,然后用 base R 的 reshape 转换成 long 格式
# Keep only the "$<digits>" tokens of each Changes string and parse them into
# columns. Backslashes are doubled because '\$', '\d' and '\1' are not valid
# escapes in an R string literal.
d1 <- data.frame(
  ID = df2$ID,
  read.table(text = gsub('[^$]*(\\$\\d+)', ' \\1 ', df2$Changes))
)
# Name the value columns Prev_Price.1, New_Price.1, Prev_Price.2, New_Price.2
# so reshape() can split each name at "." into a variable and a time index.
colnames(d1)[-1] <- paste0(c('Prev_Price.', 'New_Price.'),
                           rep(1:2, each = 2))
reshape(d1, idvar = 'ID', varying = 2:ncol(d1), sep = ".", direction = 'long')
# ID time Prev_Price New_Price
#A.1 A 1
#B.1 B 1
#A.2 A 2
#B.2 B 2
更新2
对于新数据集("df3"),我们可以使用 stri_extract_all_regex 从 'Changes' 列中提取 $ 及其后面的数字(包括小数点,正则为 '\\$[0-9.]+')。
由于需要把交替出现的元素放到不同的列中,先把 stri_extract_all_regex 返回的 list 中的每个元素转换成两列的 matrix,
再用 Map 把它与第一列('ID')组合起来,最后用 do.call(rbind, ...) 按行合并。
library(stringi)
# NOTE(review): df3 is not defined in this snippet; presumably it is the
# question's data frame (ID plus the multi-line Changes column) -- confirm.
# For each row, extract every "$<number>" token (digits and dots), lay the
# tokens out two per row (old price, new price), pair them with that row's
# ID, then stack the per-ID data frames.
# The backslash is doubled because '\$' is not a valid escape in an R string
# literal.
res <- do.call(rbind,
  Map(function(x, y) data.frame(x, matrix(y, ncol = 2, byrow = TRUE,
        dimnames = list(NULL, c("Prev_Price", "New_Price")))),
      df3$ID, stri_extract_all_regex(df3$Changes, '\\$[0-9.]+')))
# Drop the row names that rbind builds.
row.names(res) <- NULL
res
#              x Prev_Price New_Price
#1  john@xxx.com      $0.98     $0.83
#2  john@xxx.com      $1.07     $0.91
#3  john@xxx.com      $0.65     $0.55
#4 sally@xxx.com      $1.07     $0.91
#5 sally@xxx.com      $1.44     $1.22
数据
# Example inputs used by the answer's snippets.
# NOTE(review): the Changes strings contain no "$" amounts here, although the
# regexes in the snippets look for them -- the prices appear to have been
# lost from this copy of the post.

# df1: one change per row; blank IDs belong to the ID above them.
df1 <- data.frame(
  ID = c("A", "", "B", ""),
  Changes = c("down from to ", "down from to ",
              "down from to ", "down from to "),
  stringsAsFactors = FALSE
)

# df2: one row per ID, with two changes in the same cell.
df2 <- data.frame(
  ID = c('A', 'B'),
  Changes = c(
    'down from to down from to ',
    'down from to down from to '
  ),
  stringsAsFactors = FALSE
)
# Example input: blank IDs belong to the ID above them.
df <- data.frame(
  ID = c('A', '', 'B', ''),
  Changes = c('down from to ', 'down from to ', 'down from to ', 'down from to '),
  stringsAsFactors = FALSE
)
# Split each Changes string on whitespace once (ss), then build the result:
# fill IDs downward via cumsum() over the non-blank flags, take the 3rd and
# 5th tokens as the old and new price, and drop Changes.
# '\\s+' is doubled because '\s' is not a valid escape in an R string literal;
# vapply() pins the result type to one string per row.
with(
  list(ss = strsplit(df$Changes, '\\s+')),
  transform(
    df,
    ID = ID[ID != ''][cumsum(ID != '')],
    Prev_Price = vapply(ss, function(v) v[3], character(1)),
    New_Price = vapply(ss, function(v) v[5], character(1)),
    Changes = NULL
  )
)
## ID Prev_Price New_Price
## 1 A
## 2 A
## 3 B
## 4 B
另一种方法:
# Same idea without transform(): split every Changes string on whitespace,
# stack the token vectors into a matrix, keep columns 3 and 5 as the prices,
# and bind them to the downward-filled IDs.
# '\\s+' is doubled because '\s' is not a valid escape in an R string literal.
with(df, cbind(
  ID = ID[ID != ''][cumsum(ID != '')],
  setNames(
    as.data.frame(do.call(rbind, strsplit(Changes, '\\s+'))[, c(3, 5)]),
    c('Prev_Price', 'New_Price')
  )
))
## same result