合并数据框中的行而不分组

Combine rows in data frame without grouping

# Example input: every record starts on a row where col1/col3 may be filled in,
# and continuation rows carry empty strings in those columns.
tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  # Written as strings on purpose: mixing numbers with "" inside c() coerces
  # the whole vector to character anyway.
  col3 = c("1", "", "2", "", "3")
)

我想合并 col2 中的行,得到一个像下面这样不含空字符串的 tibble:

# Desired result: one row per record, col2 fragments joined by a space, and a
# missing col1 represented as NA instead of an empty string.
tibble::tibble(
  col1 = c("A", "C", NA_character_),
  col2 = c(
    "string1 part 1 string1 part 2",
    "string2 part 1 string2 part 2",
    "string3"
  ),
  col3 = c(1, 2, 3)
)

# A tibble: 3 x 3
  col1  col2                           col3
  <chr> <chr>                         <dbl>
1 A     string1 part 1 string1 part 2     1
2 C     string2 part 1 string2 part 2     2
3 NA    string3                           3

我在其他答案中看到的分组方法在这里似乎行不通,因为我没有任何可以作为分组依据的参考列。

可以用 dplyr 这样实现:

library(dplyr)

tibble::tibble(
  col1 = c("A", "", "C", "", ""),
  col2 = c(
    "string1 part 1", "string1 part 2",
    "string2 part 1", "string2 part 2",
    "string3"
  ),
  # c(1, "", 2, "", 3) is coerced to character; spell the strings out.
  col3 = c("1", "", "2", "", "3")
) %>%
  # A non-empty col3 marks the first row of a record, so the running count of
  # non-empty values is a record id shared by that row and its continuations.
  mutate(grp = cumsum(col3 != "")) %>%
  group_by(grp) %>%
  # One row per record (summarise() on a single grouping variable returns an
  # ungrouped tibble ordered by grp, i.e. in the original row order).
  summarise(
    # the record's col1 is the value on its first row; "" means "missing"
    col1 = na_if(first(col1), ""),
    col2 = paste(col2, collapse = " "),
    # keep the real col3 value of the record rather than the running index,
    # so inputs that are not simply 1, 2, 3, ... are preserved
    col3 = as.numeric(first(col3))
  ) %>%
  select(col1, col2, col3)

输出:

# A tibble: 3 x 3
#  col1  col2                           col3
#  <chr> <chr>                         <dbl>
#1 A     string1 part 1 string1 part 2     1
#2 C     string2 part 1 string2 part 2     2
#3 <NA>  string3                           3

这是一个基础 R(base R)的解决方案,其中使用了 findInterval() 和 split()

# Split df into one piece per record. Rows with a non-empty col3 start a new
# record; findInterval() assigns every row the index of the latest such start.
# seq_len() (rather than 1:nrow(df)) keeps this correct for a zero-row df,
# where 1:0 would yield c(1, 0) and create a spurious group.
dfs <- split(
  df,
  findInterval(seq_len(nrow(df)), which(nchar(as.character(df$col3)) > 0))
)

# Collapse each piece into a single row: every column is pasted with a space
# and trimmed, so empty continuation cells leave no stray whitespace.
# The one-row frames are bound in a single rbind() call; unname() keeps the
# automatic 1..n row names instead of taking them from the split labels.
dfout <- do.call(rbind, unname(lapply(dfs, function(v) {
  data.frame(lapply(v, function(x) trimws(paste(x, collapse = " "))))
})))

结果如下:

> dfout
  col1                          col2 col3
1    A string1 part 1 string1 part 2    1
2    C string2 part 1 string2 part 2    2
3                            string3    3

数据

# Sample data as a plain data.frame whose three columns are all factors.
# Levels are given explicitly so the integer codes do not depend on the
# locale's sort order.
df <- data.frame(
  col1 = factor(c("A", "", "C", "", ""), levels = c("", "A", "C")),
  col2 = factor(
    c(
      "string1 part 1", "string1 part 2",
      "string2 part 1", "string2 part 2",
      "string3"
    ),
    levels = c(
      "string1 part 1", "string1 part 2",
      "string2 part 1", "string2 part 2",
      "string3"
    )
  ),
  col3 = factor(c("1", "", "2", "", "3"), levels = c("", "1", "2", "3"))
)

如果忽略 "without grouping"(不分组)这一要求,可以先按"col1 是否非空"这一指示量的 cumsum 进行分组,再把每个 cumsum 组中的前两行单独拆分出来;然后在每个分组内把非空元素粘贴在一起,如果粘贴结果为空,则返回 NA。

library(data.table)
setDT(df)  # converts df to a data.table by reference

# cs: running count of non-empty col1 values, added to df in place
df[, cs := cumsum(col1 != "")]

# Group by cs and by whether a row is among the first two rows of its cs
# group; within each group paste the non-empty values of every column,
# yielding NA when nothing is left. The helper columns are dropped at the end.
df[
  ,
  lapply(.SD, function(x) {
    out <- paste(x[x != ""], collapse = " ")
    if (nzchar(out)) out else NA_character_
  }),
  by = .(cs, r = rowid(cs) <= 2)
][, -c("cs", "r")]

#    col1                          col2 col3
# 1:    A string1 part 1 string1 part 2    1
# 2:    C string2 part 1 string2 part 2    2
# 3: <NA>                       string3    3

更常规的dplyr解决方案:

# library() fails loudly when dplyr is missing; require() would only return
# FALSE and let the pipeline below fail with a less helpful error.
library(dplyr)

df1 %>%
  # Group key: the first space-separated token of col2 (e.g. "string1").
  # strsplit() is already vectorised, so no rowwise() pass is needed.
  mutate(
    grp = vapply(strsplit(col2, " "), function(parts) parts[1], character(1))
  ) %>%
  group_by(grp) %>%
  # Collapse every remaining column to one trimmed, space-separated string.
  summarise_all(list(~trimws(paste(., collapse = " ")))) %>%
  select(-grp) %>%
  # Columns that collapsed to an empty string become NA.
  mutate_all(na_if, "")

#> # A tibble: 3 x 3
#>   col1  col2                          col3 
#>   <chr> <chr>                         <chr>
#> 1 A     string1 part 1 string1 part 2 1    
#> 2 C     string2 part 1 string2 part 2 2    
#> 3 <NA>  string3                       3