合并数据框中的行而不分组
Combine rows in data frame without grouping
# Example input: a row whose col3 is empty continues the record that was
# started by the closest row above it with a non-empty col3.
tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  # Mixing numbers with "" coerces the whole vector to character anyway, so
  # the values are written as strings directly.
  col3 = c("1", "", "2", "", "3")
)
我想合并 col2 中的行,得到一个不含空字符串的 tibble,如下所示:
# Desired result: one row per record, the col2 pieces joined by a single
# space, and the empty col1 value represented as NA.
tibble::tibble(
  col1 = c("A", "C", NA_character_),
  col2 = c(
    "string1 part 1 string1 part 2",
    "string2 part 1 string2 part 2",
    "string3"
  ),
  col3 = c(1, 2, 3)
)
# A tibble: 3 x 3
col1 col2 col3
<chr> <chr> <dbl>
1 A string1 part 1 string1 part 2 1
2 C string2 part 1 string2 part 2 2
3 NA string3 3
正如我在其他答案中发现的那样,分组似乎不是一个选项,因为我没有任何参考列
这适用于 dplyr
# Collapse continuation rows into the record they belong to.
# Assumes dplyr is attached (for %>% and the verbs used below).
tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  col3 = c(1, "", 2, "", 3)
) %>%
  # A non-empty col3 marks the first row of a record, so the running count of
  # those markers gives every row the id of its record. The id lives in its
  # own column so the real col3 values are not overwritten.
  mutate(grp = cumsum(col3 != "")) %>%
  group_by(grp) %>%
  summarise(
    # The first row of a record carries its col1; "" means "no value".
    col1 = na_if(first(col1), ""),
    col2 = paste(col2, collapse = " "),
    # Keep the record's own col3 value (not the group counter) as a number.
    col3 = as.numeric(first(col3))
  ) %>%
  # summarise() returns the groups ordered by grp, i.e. in input order.
  select(col1, col2, col3)
输出:
# A tibble: 3 x 3
# col1 col2 col3
# <chr> <chr> <dbl>
#1 A string1 part 1 string1 part 2 1
#2 C string2 part 1 string2 part 2 2
#3 <NA> string3 3
这是一个基于 base R 的解决方案,其中使用了 findInterval() 和 split():
# Split df into one chunk per record: a row with a non-empty col3 starts a
# record, and findInterval() gives every row the index of the most recent
# such row. seq_len() keeps this well-defined for a zero-row df.
dfs <- split(
  df,
  findInterval(seq_len(nrow(df)), which(nchar(as.vector(df$col3)) > 0))
)
# Collapse each chunk to a single row by pasting every column's values
# together (trimws() removes the padding left by empty cells), then stack
# the one-row frames into the result.
dfout <- Reduce(
  rbind,
  lapply(dfs, function(chunk) {
    data.frame(lapply(chunk, function(column) {
      trimws(paste(column, collapse = " "))
    }))
  })
)
这样
> dfout
col1 col2 col3
1 A string1 part 1 string1 part 2 1
2 C string2 part 1 string2 part 2 2
3 string3 3
数据
# Sample data as a base data.frame whose columns are all factors; "" is a
# regular level of col1 and col3 and stands for an empty cell.
df <- data.frame(
  col1 = factor(c("A", "", "C", "", ""), levels = c("", "A", "C")),
  col2 = factor(
    c(
      "string1 part 1", "string1 part 2",
      "string2 part 1", "string2 part 2",
      "string3"
    )
  ),
  col3 = factor(c("1", "", "2", "", "3"), levels = c("", "1", "2", "3"))
)
如果忽略"without grouping"这一要求,可以按"col1 不为空"这一指示量的 cumsum 进行分组,并把每个 cumsum 组再拆分为前两行和其余行;然后在每个组内粘贴非空元素,如果粘贴结果为空,则返回 NA。
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Running count of non-empty col1 values: each such row opens a new id.
df[, cs := cumsum(col1 != "")]
# Within an id, the first two rows form one group and any further rows form
# another. Each column is reduced to its non-empty values pasted together,
# or NA when nothing is left. The helper grouping columns are dropped last.
df[,
  lapply(.SD, function(values) {
    joined <- paste(values[values != ""], collapse = " ")
    if (joined == "") NA_character_ else joined
  }),
  by = .(cs, r = rowid(cs) <= 2)
][, -c("cs", "r")]
# col1 col2 col3
# 1: A string1 part 1 string1 part 2 1
# 2: C string2 part 1 string2 part 2 2
# 3: <NA> string3 3
更常规的dplyr
解决方案:
# library() fails loudly when dplyr is missing, unlike require().
library(dplyr)
df1 %>%
  # Rows of one record share the first word of col2 ("string1", "string2",
  # ...), so that word is used as the grouping key. strsplit() is already
  # vectorised, so no rowwise() pass is needed.
  mutate(grp = vapply(strsplit(col2, " "), `[`, character(1), 1L)) %>%
  group_by(grp) %>%
  # Paste every column's values within a group; trimws() removes the padding
  # left behind by empty cells.
  summarise_all(list(~ trimws(paste0(., collapse = " ")))) %>%
  select(-grp) %>%
  # Cells that ended up empty become NA.
  mutate_all(na_if, "")
#> # A tibble: 3 x 3
#> col1 col2 col3
#> <chr> <chr> <chr>
#> 1 A string1 part 1 string1 part 2 1
#> 2 C string2 part 1 string2 part 2 2
#> 3 <NA> string3 3
# Example input: a row whose col3 is empty continues the record that was
# started by the closest row above it with a non-empty col3.
tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  # Mixing numbers with "" coerces the whole vector to character anyway, so
  # the values are written as strings directly.
  col3 = c("1", "", "2", "", "3")
)
我想合并 col2 中的行,得到一个不含空字符串的 tibble,如下所示:
# Desired result: one row per record, the col2 pieces joined by a single
# space, and the empty col1 value represented as NA.
tibble::tibble(
  col1 = c("A", "C", NA_character_),
  col2 = c(
    "string1 part 1 string1 part 2",
    "string2 part 1 string2 part 2",
    "string3"
  ),
  col3 = c(1, 2, 3)
)
# A tibble: 3 x 3
col1 col2 col3
<chr> <chr> <dbl>
1 A string1 part 1 string1 part 2 1
2 C string2 part 1 string2 part 2 2
3 NA string3 3
正如我在其他答案中发现的那样,分组似乎不是一个选项,因为我没有任何参考列
这适用于 dplyr
# Collapse continuation rows into the record they belong to.
# Assumes dplyr is attached (for %>% and the verbs used below).
tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  col3 = c(1, "", 2, "", 3)
) %>%
  # A non-empty col3 marks the first row of a record, so the running count of
  # those markers gives every row the id of its record. The id lives in its
  # own column so the real col3 values are not overwritten.
  mutate(grp = cumsum(col3 != "")) %>%
  group_by(grp) %>%
  summarise(
    # The first row of a record carries its col1; "" means "no value".
    col1 = na_if(first(col1), ""),
    col2 = paste(col2, collapse = " "),
    # Keep the record's own col3 value (not the group counter) as a number.
    col3 = as.numeric(first(col3))
  ) %>%
  # summarise() returns the groups ordered by grp, i.e. in input order.
  select(col1, col2, col3)
输出:
# A tibble: 3 x 3
# col1 col2 col3
# <chr> <chr> <dbl>
#1 A string1 part 1 string1 part 2 1
#2 C string2 part 1 string2 part 2 2
#3 <NA> string3 3
这是一个基于 base R 的解决方案,其中使用了 findInterval() 和 split():
# Split df into one chunk per record: a row with a non-empty col3 starts a
# record, and findInterval() gives every row the index of the most recent
# such row. seq_len() keeps this well-defined for a zero-row df.
dfs <- split(
  df,
  findInterval(seq_len(nrow(df)), which(nchar(as.vector(df$col3)) > 0))
)
# Collapse each chunk to a single row by pasting every column's values
# together (trimws() removes the padding left by empty cells), then stack
# the one-row frames into the result.
dfout <- Reduce(
  rbind,
  lapply(dfs, function(chunk) {
    data.frame(lapply(chunk, function(column) {
      trimws(paste(column, collapse = " "))
    }))
  })
)
这样
> dfout
col1 col2 col3
1 A string1 part 1 string1 part 2 1
2 C string2 part 1 string2 part 2 2
3 string3 3
数据
# Sample data as a base data.frame whose columns are all factors; "" is a
# regular level of col1 and col3 and stands for an empty cell.
df <- data.frame(
  col1 = factor(c("A", "", "C", "", ""), levels = c("", "A", "C")),
  col2 = factor(
    c(
      "string1 part 1", "string1 part 2",
      "string2 part 1", "string2 part 2",
      "string3"
    )
  ),
  col3 = factor(c("1", "", "2", "", "3"), levels = c("", "1", "2", "3"))
)
如果忽略"without grouping"这一要求,可以按"col1 不为空"这一指示量的 cumsum 进行分组,并把每个 cumsum 组再拆分为前两行和其余行;然后在每个组内粘贴非空元素,如果粘贴结果为空,则返回 NA。
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Running count of non-empty col1 values: each such row opens a new id.
df[, cs := cumsum(col1 != "")]
# Within an id, the first two rows form one group and any further rows form
# another. Each column is reduced to its non-empty values pasted together,
# or NA when nothing is left. The helper grouping columns are dropped last.
df[,
  lapply(.SD, function(values) {
    joined <- paste(values[values != ""], collapse = " ")
    if (joined == "") NA_character_ else joined
  }),
  by = .(cs, r = rowid(cs) <= 2)
][, -c("cs", "r")]
# col1 col2 col3
# 1: A string1 part 1 string1 part 2 1
# 2: C string2 part 1 string2 part 2 2
# 3: <NA> string3 3
更常规的dplyr
解决方案:
# library() fails loudly when dplyr is missing, unlike require().
library(dplyr)
df1 %>%
  # Rows of one record share the first word of col2 ("string1", "string2",
  # ...), so that word is used as the grouping key. strsplit() is already
  # vectorised, so no rowwise() pass is needed.
  mutate(grp = vapply(strsplit(col2, " "), `[`, character(1), 1L)) %>%
  group_by(grp) %>%
  # Paste every column's values within a group; trimws() removes the padding
  # left behind by empty cells.
  summarise_all(list(~ trimws(paste0(., collapse = " ")))) %>%
  select(-grp) %>%
  # Cells that ended up empty become NA.
  mutate_all(na_if, "")
#> # A tibble: 3 x 3
#> col1 col2 col3
#> <chr> <chr> <chr>
#> 1 A string1 part 1 string1 part 2 1
#> 2 C string2 part 1 string2 part 2 2
#> 3 <NA> string3 3