自动合并列,其中合并的值在容器中用定界符分隔
Automate columns merging where merged values are separated with a delimiter in the container
我有一个大数据框,其中很多列的列名称中都有 .
。这是我在下面示例中的方法:
# Example data that roughly mirrors the real data set: several columns share
# a prefix (mon, tue, wed) followed by an arbitrary ".suffix".
df <- data.frame(
  id = c("HD1", "HD2", "HD3", "HD4"),
  mon.1 = c(1, 0, 1, 4),
  mon.2 = c("a", "b", "c", "d"),
  mon.2.4.1...1 = c("#ji", "#ki", NA, "#ui"),
  tue.6 = c("1", "2", "3", "4"),
  tue = c(190, 2345, 41, 89),
  heh = c("1mn", "2a", "g78", "asd324"),
  wed = c(1890, 9002, 14341, 657),
  wed.01 = c(NA, "@ksdf", NA, NA),
  thu.0234 = c("@jksdff", "@sfd", "@kukg.676", "@jdkfjk"),
  rating = c(1, 2, 3, 4)
)

# Collapse each prefix group by hand into a list column. Map() walks the
# given columns in parallel and wraps the values of one row in a list.
# combined1 gathers every mon.* column.
df[["combined1"]] <- with(
  df,
  Map(
    list,
    as.character(mon.1),
    as.character(mon.2),
    as.character(mon.2.4.1...1)
  )
)

# The same step is repeated for the tue.* group ...
df[["combined2"]] <- with(
  df,
  Map(list, as.character(tue.6), as.character(tue))
)

# ... and for the wed.* group.
df[["combined3"]] <- with(
  df,
  Map(list, as.character(wed), as.character(wed.01))
)
产生逗号分隔的列 Combined1
、Combined2
、Combined3
:
# A tibble: 4 x 14
id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating
<fct> <dbl> <fct> <fct> <fct> <dbl> <fct> <dbl> <fct> <fct> <dbl>
1 HD1 1 a #ji 1 190 1mn 1890 NA @jksdff 1
2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2
3 HD3 1 c NA 3 41 g78 14341 NA @kukg.6~ 3
4 HD4 4 d #ui 4 89 asd3~ 657 NA @jdkfjk 4
# ... with 3 more variables: combined1 <named list>, combined2 <named list>,
# combined3 <named list>
我的问题是 mon
、tue
和 wed
大约有 20-30 列,我在创建一个方法时遇到问题,该方法将读取所有内容,例如, wed
、wed1.43654
、wed.46
等。这样我就不必手动输入它们了。非常感谢您的帮助!
编辑
像这样
> df[,11:14]
rating combined1 combined2 combined3
1 1 1, a, #ji 1, 190 1890, NA
2 2 0, b, #ki 2, 2345 9002, @ksdf
3 3 1, c, NA 3, 41 14341, NA
4 4 4, d, #ui 4, 89 657, NA
这是一种做法:先去掉列名中从第一个 . 开始的后缀部分(得到 'nm1'),用 table
创建频数表,取出出现次数大于 1 的名称(得到 'nm2'),然后遍历这些唯一名称,使用 grep
从数据集中提取对应的列,并将结果赋值给新建的 'combined' 列
# Strip everything from the first "." onwards so that columns of the same
# group share one key (e.g. "mon.1" and "mon.2" both become "mon").
# The dot is escaped twice: once for the R string, once for the regex.
nm1 <- sub("\\..*", "", names(df))
# Keep only the prefixes that are shared by more than one column.
nm2 <- names(which(table(nm1) > 1))
# For every shared prefix, store the matching columns as a nested data frame.
# Columns are selected by exact prefix equality instead of grep(x, names(df)),
# which would also pick up any column that merely contains the prefix
# somewhere in its name.
df[paste0("combined", seq_along(nm2))] <- lapply(
  nm2,
  function(x) df[nm1 == x]
)
df$combined1
#  mon.1 mon.2 mon.2.4.1...1
#1     1     a           #ji
#2     0     b           #ki
#3     1     c          <NA>
#4     4     d           #ui
如果我们希望这是一个 list
列
# Build one list column per shared prefix in nm2: each row holds a list with
# the (character) values of all columns belonging to that prefix.
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # A column belongs to the group when its name is the prefix itself or the
  # prefix followed by "."; an unanchored grep() would match too much.
  is_member <- names(df) == x | startsWith(names(df), paste0(x, "."))
  # Convert column by column. apply() would first coerce the data frame to a
  # character matrix, which pads numbers with blanks (190 becomes " 190").
  cols <- lapply(df[is_member], as.character)
  lapply(seq_len(nrow(df)), function(i) lapply(cols, `[[`, i))
})
df
# id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating combined1 combined2 combined3
#1 HD1 1 a #ji 1 190 1mn 1890 <NA> @jksdff 1 1, a, #ji 1, 190 1890, NA
#2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2 0, b, #ki 2, 2345 9002, @ksdf
#3 HD3 1 c <NA> 3 41 g78 14341 <NA> @kukg.676 3 1, c, NA 3, 41 14341, NA
#4 HD4 4 d #ui 4 89 asd324 657 <NA> @jdkfjk 4 4, d, #ui 4, 89 657, NA
WolfgangBagdanow-
Yeah please, it would be nice if I could have them in a list form or
something, because I will be writing them to a file. My goal is to
cut down on unnecessary column creation. I put down how I would like it in
my question above
这是对@akrun 的第二个答案的扩展。如果您决定将此 df
写入 csv 文件,那么请知道 readr
、data.table
和 base
的 csv 写入方法都不起作用,因为(据我所知)它们尚未实现将列表列写入 csv 文件的功能。为了将含有列表列的数据框写入 csv 文件,您需要类似下面这样的代码-
#@akrun 's list method-
#df <- data.frame(...)
# Build one list column per shared prefix in nm2: each row holds a list with
# the (character) values of all columns belonging to that prefix.
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # A column belongs to the group when its name is the prefix itself or the
  # prefix followed by "."; an unanchored grep() would match too much.
  is_member <- names(df) == x | startsWith(names(df), paste0(x, "."))
  # Convert column by column. apply() would first coerce the data frame to a
  # character matrix, which pads numbers with blanks (190 becomes " 190").
  cols <- lapply(df[is_member], as.character)
  lapply(seq_len(nrow(df)), function(i) lapply(cols, `[[`, i))
})
#------------------------------------
# Turn a list column into a plain character vector so it can be written to a
# delimited text file.
#
# x: one column of a data frame (any vector or list).
#
# Returns x unchanged when it is not a list. For a list, returns a character
# vector with one string per element, where the (flattened) contents of the
# element are joined with ", ".
set_lists_to_chars <- function(x) {
  # inherits() copes with objects carrying several classes, for which
  # `class(x) == "list"` yields a condition of length > 1.
  if (!inherits(x, "list")) {
    return(x)
  }
  # Convert every element, not just the first one: a single string would be
  # recycled into every row once the result is put into a data frame.
  vapply(
    x,
    function(el) paste(unlist(el), collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )
}
# Flatten every list column to text and write the result to test.csv.
# lapply() already iterates over the columns of a data frame, so no tibble
# wrapper (and no extra package) is needed; FALSE is spelled out because F
# is an ordinary variable that can be reassigned.
new_frame <- data.frame(
  lapply(df, set_lists_to_chars),
  stringsAsFactors = FALSE
)
write.csv(new_frame, file = "test.csv")
或者,您也可以使用 apply
方法将列表转换为字符。无论如何,这会让你得到 csv 中想要的东西。希望。顺便提一下,这个问题很好,@akrun 给出了很好的答案。祝你好运!
好吧,如果你是在为写入 csv 做准备,一种常见的做法是使用竖线(|)作为分隔符。困难部分的全部功劳归 @akrun,但是...
# Strip everything from the first "." onwards to obtain the group prefix of
# every column (the dot is escaped for both the R string and the regex).
nm1 <- sub("\\..*", "", names(df))
# Keep only the prefixes that are shared by more than one column.
nm2 <- names(which(table(nm1) > 1))
# One character column per shared prefix; the values of a row are joined
# with "|".
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # Convert column by column. apply() would coerce the data frame to a
  # character matrix first, which pads numbers with blanks ("1| 190").
  cols <- lapply(df[nm1 == x], as.character)
  # Join with "|" directly rather than building a ", "-separated string and
  # replacing the separator, which would also rewrite any ", " in the data.
  do.call(paste, c(unname(cols), sep = "|"))
})
> df
id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating combined1 combined2 combined3
1 HD1 1 a #ji 1 190 1mn 1890 <NA> @jksdff 1 1|a|#ji 1| 190 1890|NA
2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2 0|b|#ki 2|2345 9002|@ksdf
3 HD3 1 c <NA> 3 41 g78 14341 <NA> @kukg.676 3 1|c|NA 3| 41 14341|NA
4 HD4 4 d #ui 4 89 asd324 657 <NA> @jdkfjk 4 4|d|#ui 4| 89 657|NA
我有一个大数据框,其中很多列的列名称中都有 .
。这是我在下面示例中的方法:
# Example data that roughly mirrors the real data set: several columns share
# a prefix (mon, tue, wed) followed by an arbitrary ".suffix".
df <- data.frame(
  id = c("HD1", "HD2", "HD3", "HD4"),
  mon.1 = c(1, 0, 1, 4),
  mon.2 = c("a", "b", "c", "d"),
  mon.2.4.1...1 = c("#ji", "#ki", NA, "#ui"),
  tue.6 = c("1", "2", "3", "4"),
  tue = c(190, 2345, 41, 89),
  heh = c("1mn", "2a", "g78", "asd324"),
  wed = c(1890, 9002, 14341, 657),
  wed.01 = c(NA, "@ksdf", NA, NA),
  thu.0234 = c("@jksdff", "@sfd", "@kukg.676", "@jdkfjk"),
  rating = c(1, 2, 3, 4)
)

# Collapse each prefix group by hand into a list column. Map() walks the
# given columns in parallel and wraps the values of one row in a list.
# combined1 gathers every mon.* column.
df[["combined1"]] <- with(
  df,
  Map(
    list,
    as.character(mon.1),
    as.character(mon.2),
    as.character(mon.2.4.1...1)
  )
)

# The same step is repeated for the tue.* group ...
df[["combined2"]] <- with(
  df,
  Map(list, as.character(tue.6), as.character(tue))
)

# ... and for the wed.* group.
df[["combined3"]] <- with(
  df,
  Map(list, as.character(wed), as.character(wed.01))
)
产生逗号分隔的列 Combined1
、Combined2
、Combined3
:
# A tibble: 4 x 14
id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating
<fct> <dbl> <fct> <fct> <fct> <dbl> <fct> <dbl> <fct> <fct> <dbl>
1 HD1 1 a #ji 1 190 1mn 1890 NA @jksdff 1
2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2
3 HD3 1 c NA 3 41 g78 14341 NA @kukg.6~ 3
4 HD4 4 d #ui 4 89 asd3~ 657 NA @jdkfjk 4
# ... with 3 more variables: combined1 <named list>, combined2 <named list>,
# combined3 <named list>
我的问题是 mon
、tue
和 wed
大约有 20-30 列,我在创建一个方法时遇到问题,该方法将读取所有内容,例如, wed
、wed1.43654
、wed.46
等。这样我就不必手动输入它们了。非常感谢您的帮助!
编辑 像这样
> df[,11:14]
rating combined1 combined2 combined3
1 1 1, a, #ji 1, 190 1890, NA
2 2 0, b, #ki 2, 2345 9002, @ksdf
3 3 1, c, NA 3, 41 14341, NA
4 4 4, d, #ui 4, 89 657, NA
这是一种做法:先去掉列名中从第一个 . 开始的后缀部分(得到 'nm1'),用 table
创建频数表,取出出现次数大于 1 的名称(得到 'nm2'),然后遍历这些唯一名称,使用 grep
从数据集中提取对应的列,并将结果赋值给新建的 'combined' 列
# Strip everything from the first "." onwards so that columns of the same
# group share one key (e.g. "mon.1" and "mon.2" both become "mon").
# The dot is escaped twice: once for the R string, once for the regex.
nm1 <- sub("\\..*", "", names(df))
# Keep only the prefixes that are shared by more than one column.
nm2 <- names(which(table(nm1) > 1))
# For every shared prefix, store the matching columns as a nested data frame.
# Columns are selected by exact prefix equality instead of grep(x, names(df)),
# which would also pick up any column that merely contains the prefix
# somewhere in its name.
df[paste0("combined", seq_along(nm2))] <- lapply(
  nm2,
  function(x) df[nm1 == x]
)
df$combined1
#  mon.1 mon.2 mon.2.4.1...1
#1     1     a           #ji
#2     0     b           #ki
#3     1     c          <NA>
#4     4     d           #ui
如果我们希望这是一个 list
列
# Build one list column per shared prefix in nm2: each row holds a list with
# the (character) values of all columns belonging to that prefix.
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # A column belongs to the group when its name is the prefix itself or the
  # prefix followed by "."; an unanchored grep() would match too much.
  is_member <- names(df) == x | startsWith(names(df), paste0(x, "."))
  # Convert column by column. apply() would first coerce the data frame to a
  # character matrix, which pads numbers with blanks (190 becomes " 190").
  cols <- lapply(df[is_member], as.character)
  lapply(seq_len(nrow(df)), function(i) lapply(cols, `[[`, i))
})
df
# id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating combined1 combined2 combined3
#1 HD1 1 a #ji 1 190 1mn 1890 <NA> @jksdff 1 1, a, #ji 1, 190 1890, NA
#2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2 0, b, #ki 2, 2345 9002, @ksdf
#3 HD3 1 c <NA> 3 41 g78 14341 <NA> @kukg.676 3 1, c, NA 3, 41 14341, NA
#4 HD4 4 d #ui 4 89 asd324 657 <NA> @jdkfjk 4 4, d, #ui 4, 89 657, NA
WolfgangBagdanow-
Yeah please, it would be nice if I could have them in a list form or something, because I will be writing them to a file. My goal is to cut down on unnecessary column creation. I put down how I would like it in my question above
这是对@akrun 的第二个答案的扩展。如果您决定将此 df
写入 csv 文件,那么请知道 readr
、data.table
和 base
的 csv 写入方法都不起作用,因为(据我所知)它们尚未实现将列表列写入 csv 文件的功能。为了将含有列表列的数据框写入 csv 文件,您需要类似下面这样的代码-
#@akrun 's list method-
#df <- data.frame(...)
# Build one list column per shared prefix in nm2: each row holds a list with
# the (character) values of all columns belonging to that prefix.
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # A column belongs to the group when its name is the prefix itself or the
  # prefix followed by "."; an unanchored grep() would match too much.
  is_member <- names(df) == x | startsWith(names(df), paste0(x, "."))
  # Convert column by column. apply() would first coerce the data frame to a
  # character matrix, which pads numbers with blanks (190 becomes " 190").
  cols <- lapply(df[is_member], as.character)
  lapply(seq_len(nrow(df)), function(i) lapply(cols, `[[`, i))
})
#------------------------------------
# Turn a list column into a plain character vector so it can be written to a
# delimited text file.
#
# x: one column of a data frame (any vector or list).
#
# Returns x unchanged when it is not a list. For a list, returns a character
# vector with one string per element, where the (flattened) contents of the
# element are joined with ", ".
set_lists_to_chars <- function(x) {
  # inherits() copes with objects carrying several classes, for which
  # `class(x) == "list"` yields a condition of length > 1.
  if (!inherits(x, "list")) {
    return(x)
  }
  # Convert every element, not just the first one: a single string would be
  # recycled into every row once the result is put into a data frame.
  vapply(
    x,
    function(el) paste(unlist(el), collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )
}
# Flatten every list column to text and write the result to test.csv.
# lapply() already iterates over the columns of a data frame, so no tibble
# wrapper (and no extra package) is needed; FALSE is spelled out because F
# is an ordinary variable that can be reassigned.
new_frame <- data.frame(
  lapply(df, set_lists_to_chars),
  stringsAsFactors = FALSE
)
write.csv(new_frame, file = "test.csv")
或者,您也可以使用 apply
方法将列表转换为字符。无论如何,这会让你得到 csv 中想要的东西。希望。顺便提一下,这个问题很好,@akrun 给出了很好的答案。祝你好运!
好吧,如果你是在为写入 csv 做准备,一种常见的做法是使用竖线(|)作为分隔符。困难部分的全部功劳归 @akrun,但是...
# Strip everything from the first "." onwards to obtain the group prefix of
# every column (the dot is escaped for both the R string and the regex).
nm1 <- sub("\\..*", "", names(df))
# Keep only the prefixes that are shared by more than one column.
nm2 <- names(which(table(nm1) > 1))
# One character column per shared prefix; the values of a row are joined
# with "|".
df[paste0("combined", seq_along(nm2))] <- lapply(nm2, function(x) {
  # Convert column by column. apply() would coerce the data frame to a
  # character matrix first, which pads numbers with blanks ("1| 190").
  cols <- lapply(df[nm1 == x], as.character)
  # Join with "|" directly rather than building a ", "-separated string and
  # replacing the separator, which would also rewrite any ", " in the data.
  do.call(paste, c(unname(cols), sep = "|"))
})
> df
id mon.1 mon.2 mon.2.4.1...1 tue.6 tue heh wed wed.01 thu.0234 rating combined1 combined2 combined3
1 HD1 1 a #ji 1 190 1mn 1890 <NA> @jksdff 1 1|a|#ji 1| 190 1890|NA
2 HD2 0 b #ki 2 2345 2a 9002 @ksdf @sfd 2 0|b|#ki 2|2345 9002|@ksdf
3 HD3 1 c <NA> 3 41 g78 14341 <NA> @kukg.676 3 1|c|NA 3| 41 14341|NA
4 HD4 4 d #ui 4 89 asd324 657 <NA> @jdkfjk 4 4|d|#ui 4| 89 657|NA