在 dplyr 中用真实的 NA 汇总分组字符数据

Summarize grouped character data with true NA in dplyr

我有一份语音数据,我想把其中同一说话者的连续话语折叠(合并)到一起:

# Example data: seven transcript lines stored as a tibble-classed data frame.
# Lines 5-7 share the same Speaker and Sequ and are the rows to be collapsed;
# `c7` and `N_c7` are (true) NA on lines 2 and 6.
df <- structure(
  list(
    Line = 1:7,
    Speaker = c(
      "ID01.A", NA, "ID01.C", "ID01.C", "ID01.A", "ID01.A", "ID01.A"
    ),
    Utterance = c(
      "how old's your mom¿",
      "(0.855)",
      "eh six:ty:::-one=",
      "[when was] that¿=",
      "[yes]",                        # collapse this row ...
      "(0.163)",                      # ... with this one ...
      "=!this! was on °Wednesday°"    # ... and this one
    ),
    Sequ = c(1L, 1L, 1L, 2L, 2L, 2L, 2L),
    c7 = c(
      "how_RGQ old_JJ 's_VBZ your_APPGE mom_NN1",
      NA,
      "eh_UH sixty-one_MC",
      "when_RRQ was_VBDZ that_DD1",
      "yes_UH",                                   # collapse this row ...
      NA,                                         # ... with this one ...
      "this_DD1 was_VBDZ on_II Wednesday_NPD1"    # ... and this one
    ),
    N_c7 = c(12L, NA, 2L, 3L, 1L, NA, 4L)
  ),
  row.names = c(NA, -7L),
  class = c("tbl_df", "tbl", "data.frame")
)

就 Utterance 和 N_c7 的汇总/折叠而言,我做得很好。只有列 c7 有问题:第 6 行(Line 6)存在一个真实的 (true) NA。它的存在会阻止汇总操作——除非我先将真实的 NA 转换为字符 "NA":

library(dplyr)
library(data.table) # provides rleid()
library(stringr)    # provides str_c(); was missing, so str_c() was not found

# Collapse runs of consecutive rows that share Speaker and Sequ into one row.
df %>%
  mutate(
    # Workaround: convert true NA to the character string "NA", because
    # str_c() returns NA as soon as one of its inputs is NA.
    c7 = ifelse(is.na(c7), "NA", c7)
  ) %>%
  # group: one run-length id per run of identical (Speaker, Sequ) pairs
  group_by(grp = rleid(Speaker, Sequ)) %>%
  # summarise:
  summarise(
    # across Line, Speaker, Sequ, by taking the first value:
    across(c(Line, Speaker, Sequ), first),
    # collapse same-speaker `Utterance`s:
    Utterance = str_c(Utterance, collapse = " "),
    # collapse same-speaker `c7` data:
    c7 = str_c(c7, collapse = " "),
    # sum same-speaker `N_c7` values, ignoring NA:
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  # deactivate grouping and drop the helper column:
  ungroup() %>%
  select(-grp)
# A tibble: 5 × 6
   Line Speaker  Sequ Utterance                                                c7                                                             N_c7
  <int> <chr>   <int> <chr>                                                    <chr>                                                         <int>
1     1 ID01.A      1 >like I don't understand< sorry like how old's your mom¿ like_VV0 I_PPIS1 do_VD0 n't_XX understand_VVI sorry_JJ like_…    12
2     2 NA          1 (0.855)                                                  NA                                                                0
3     3 ID01.C      1 eh six:ty:::-one=                                        eh_UH sixty-one_MC                                                2
4     4 ID01.C      2 [when was] that¿=                                        when_RRQ was_VBDZ that_DD1                                        3
5     5 ID01.A      2 [yes] (0.163) =!this! was on °Wednesday°                 yes_UH NA this_DD1 was_VBDZ on_II Wednesday_NPD1                  5

将 NA 转换为 "NA" 并不理想,因为该列中所有的 NA 值都会被转换为字符,包括那些我需要保持为真实 NA 的值。如何在不将 NA 转换为 "NA" 的情况下折叠同一说话者 (same-speaker) 的 c7 值?

我认为您遇到的问题是由 str_c 的实现方式造成的:只要输入中有一个 NA,它就会有意地返回 NA。如果你用 paste 替换 str_c(paste 会把 NA 当作字符串 "NA" 处理),就能得到相同的结果,而不需要事先将 NA 转换为字符。

您的代码

# The approach from the question: replace true NA in `c7` by the string "NA"
# first, then collapse each run of rows with the same Speaker and Sequ using
# str_c().
question_output <- df %>%
  mutate(c7 = ifelse(is.na(c7), "NA", c7)) %>%
  # One id per run of consecutive rows with identical Speaker and Sequ.
  group_by(run_id = rleid(Speaker, Sequ)) %>%
  summarise(
    # Keep the first value of the identifying columns in each run.
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    # Join the texts of a run with single spaces.
    Utterance = str_c(Utterance, collapse = " "),
    c7 = str_c(c7, collapse = " "),
    # Add up the counts of a run, skipping NA.
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # The run id was only needed for grouping.
  select(-run_id)

我用 paste 做的尝试

# Same collapse as above, but with paste() instead of str_c(), so no prior
# NA -> "NA" conversion of `c7` is done.
trial_output <- df %>%
  # One id per run of consecutive rows with identical Speaker and Sequ.
  group_by(run_id = rleid(Speaker, Sequ)) %>%
  summarise(
    # First value of the identifying columns in each run.
    across(c(Line, Speaker, Sequ), first),
    # Join the texts of a run with single spaces.
    Utterance = paste(Utterance, collapse = " "),
    c7 = paste(c7, collapse = " "),
    # Add up the counts of a run, skipping NA.
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # The run id was only needed for grouping.
  select(-run_id)

结果相同

# Check that the str_c() version (with the NA -> "NA" workaround) and the
# paste() version produce exactly the same tibble.
identical(question_output, trial_output)
#> [1] TRUE

由 reprex package (v2.0.1) 创建于 2022-01-11

我对"真实 NA"这一说法的理解是:在说话者有文字的分组中去掉 NA,而在说话者完全没有文字的分组中保留 NA。为此,可以先用 na.omit 去掉 NA,再把由此得到的空字符串转换回 NA:

# Collapse each run of rows with the same Speaker and Sequ, dropping NA from
# `c7` inside a run but keeping a true NA when the run has no `c7` text.
df %>%
  # One id per run of consecutive rows with identical Speaker and Sequ.
  group_by(run_id = rleid(Speaker, Sequ)) %>%
  summarise(
    across(c(Line, Speaker, Sequ), first),
    Utterance = str_c(Utterance, collapse = " "),
    # Drop NA before joining; a run with only NA collapses to "", which
    # na_if() then turns back into NA.
    c7 = str_c(na.omit(c7), collapse = " ") %>% na_if(""),
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # The run id was only needed for grouping.
  select(-run_id)

   Line Speaker  Sequ Utterance                                c7                                     N_c7
  <int> <chr>   <int> <chr>                                    <chr>                                 <int>
1     1 ID01.A      1 how old's your mom¿                      how_RGQ old_JJ 's_VBZ your_APPGE mom~    12
2     2 NA          1 (0.855)                                  NA                                        0
3     3 ID01.C      1 eh six:ty:::-one=                        eh_UH sixty-one_MC                        2
4     4 ID01.C      2 [when was] that¿=                        when_RRQ was_VBDZ that_DD1                3
5     5 ID01.A      2 [yes] (0.163) =!this! was on °Wednesday° yes_UH this_DD1 was_VBDZ on_II Wedne~     5