在 dplyr 中用真实的 NA 汇总分组字符数据
Summarize grouped character data with true NA in dplyr
我有语音数据,其中包含我想要折叠的同一说话者的话语:
# Example transcript: one row per transcript line, holding the speaker,
# the raw utterance, the sequence number, the tagged text (`c7`) and the
# tag count (`N_c7`). Rows without tagged text carry a true NA.
df <- structure(
  list(
    Line = seq_len(7L),
    Speaker = c(
      "ID01.A", NA, "ID01.C", "ID01.C", "ID01.A", "ID01.A", "ID01.A"
    ),
    Utterance = c(
      "how old's your mom¿",
      "(0.855)",
      "eh six:ty:::-one=",
      "[when was] that¿=",
      "[yes]",                        # rows 5-7 are to be collapsed
      "(0.163)",
      "=!this! was on °Wednesday°"
    ),
    Sequ = rep(1:2, times = c(3L, 4L)),
    c7 = c(
      "how_RGQ old_JJ 's_VBZ your_APPGE mom_NN1",
      NA,
      "eh_UH sixty-one_MC",
      "when_RRQ was_VBDZ that_DD1",
      "yes_UH",                       # rows 5-7 are to be collapsed
      NA,
      "this_DD1 was_VBDZ on_II Wednesday_NPD1"
    ),
    N_c7 = c(12L, NA, 2L, 3L, 1L, NA, 4L)
  ),
  row.names = c(NA, -7L),
  class = c("tbl_df", "tbl", "data.frame")
)
就 `Utterance` 和 `N_c7` 的汇总/折叠而言,一切顺利。只有 `c7` 列有问题:第 6 行(`Line` 6)存在真实的 `NA`。它会妨碍汇总操作——除非我先把真实的 `NA` 转换为字符串 "NA":
library(dplyr)
library(data.table)
# str_c() comes from stringr, which must be attached for this snippet to run.
library(stringr)

# Collapse consecutive rows that share the same Speaker and Sequ into one row.
df %>%
  mutate(
    # convert true NA to character "NA" so that str_c() does not return NA
    # for a whole group that contains a missing value:
    c7 = ifelse(is.na(c7), "NA", c7)
  ) %>%
  # group consecutive runs of identical Speaker/Sequ pairs:
  group_by(grp = rleid(Speaker, Sequ)) %>%
  # summarise:
  summarise(
    # across Line, Speaker, Sequ, by taking the first value:
    across(c(Line, Speaker, Sequ), first),
    # collapse same-speaker `Utterance`s:
    Utterance = str_c(Utterance, collapse = " "),
    # collapse same-speaker `c7` data:
    c7 = str_c(c7, collapse = " "),
    # sum same-speaker `N_c7` values, ignoring missing counts:
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  # deactivate grouping and drop the helper column:
  ungroup() %>%
  select(-grp)
# A tibble: 5 × 6
Line Speaker Sequ Utterance c7 N_c7
<int> <chr> <int> <chr> <chr> <int>
1 1 ID01.A 1 >like I don't understand< sorry like how old's your mom¿ like_VV0 I_PPIS1 do_VD0 n't_XX understand_VVI sorry_JJ like_… 12
2 2 NA 1 (0.855) NA 0
3 3 ID01.C 1 eh six:ty:::-one= eh_UH sixty-one_MC 2
4 4 ID01.C 2 [when was] that¿= when_RRQ was_VBDZ that_DD1 3
5 5 ID01.A 2 [yes] (0.163) =!this! was on °Wednesday° yes_UH NA this_DD1 was_VBDZ on_II Wednesday_NPD1 5
把 `NA` 转换为 "NA" 并不理想,因为该列中所有的 `NA` 值都被变成了字符串,包括那些我需要保留为真实 `NA` 的值。如何在不把 `NA` 转换为 "NA" 的情况下折叠同一说话者(same-speaker)的 `c7`?
我认为您遇到的问题源于 `str_c` 的实现方式:只要输入中有一个 `NA`,它就会有意返回 `NA`。如果用 `paste` 替换 `str_c`,无需先把 `NA` 转换为字符,就能得到相同的结果(`paste` 会自行把 `NA` 写成字符串 "NA")。
您的代码
# Reproduction of the question's approach: missing `c7` entries are first
# replaced by the string "NA", then consecutive same-speaker rows are merged.
question_output <- df %>%
  mutate(c7 = replace(c7, is.na(c7), "NA")) %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    # identifying columns: keep the first value of each run
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    # text columns: join with single spaces
    Utterance = str_c(Utterance, collapse = " "),
    c7 = str_c(c7, collapse = " "),
    # counts: add up, skipping missing values
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # the run id is only a helper
  select(-grp)
我用 `paste` 的尝试
# Same collapse as above, but with base paste(), which needs no prior
# NA-to-string conversion.
trial_output <- df %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    # identifying columns: keep the first value of each run
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    # text columns: join with single spaces
    Utterance = paste(Utterance, collapse = " "),
    c7 = paste(c7, collapse = " "),
    # counts: add up, skipping missing values
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # the run id is only a helper
  select(-grp)
结果相同
# Compare the str_c()-based result with the paste()-based result.
identical(question_output, trial_output)
#> [1] TRUE
由 reprex package (v2.0.1)
创建于 2022-01-11
我把"真实 NA"的说法理解为:在说话者有文字的分组中去掉 `NA`,而在说话者完全没有文字的分组中保留 `NA`。为此,可以先用 `na.omit` 去掉 `NA`,再把折叠后得到的空字符串转换为 `NA`。
# Collapse same-speaker rows; missing `c7` entries are dropped before joining,
# and a group left with an empty string gets NA back.
df %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    Utterance = str_c(Utterance, collapse = " "),
    # drop NAs, join the rest, then turn "" back into NA
    c7 = c7 %>%
      na.omit() %>%
      str_c(collapse = " ") %>%
      na_if(""),
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(-grp)
Line Speaker Sequ Utterance c7 N_c7
<int> <chr> <int> <chr> <chr> <int>
1 1 ID01.A 1 how old's your mom¿ how_RGQ old_JJ 's_VBZ your_APPGE mom~ 12
2 2 NA 1 (0.855) NA 0
3 3 ID01.C 1 eh six:ty:::-one= eh_UH sixty-one_MC 2
4 4 ID01.C 2 [when was] that¿= when_RRQ was_VBDZ that_DD1 3
5 5 ID01.A 2 [yes] (0.163) =!this! was on °Wednesday° yes_UH this_DD1 was_VBDZ on_II Wedne~ 5
我有语音数据,其中包含我想要折叠的同一说话者的话语:
# Example transcript: one row per transcript line, holding the speaker,
# the raw utterance, the sequence number, the tagged text (`c7`) and the
# tag count (`N_c7`). Rows without tagged text carry a true NA.
df <- structure(
  list(
    Line = seq_len(7L),
    Speaker = c(
      "ID01.A", NA, "ID01.C", "ID01.C", "ID01.A", "ID01.A", "ID01.A"
    ),
    Utterance = c(
      "how old's your mom¿",
      "(0.855)",
      "eh six:ty:::-one=",
      "[when was] that¿=",
      "[yes]",                        # rows 5-7 are to be collapsed
      "(0.163)",
      "=!this! was on °Wednesday°"
    ),
    Sequ = rep(1:2, times = c(3L, 4L)),
    c7 = c(
      "how_RGQ old_JJ 's_VBZ your_APPGE mom_NN1",
      NA,
      "eh_UH sixty-one_MC",
      "when_RRQ was_VBDZ that_DD1",
      "yes_UH",                       # rows 5-7 are to be collapsed
      NA,
      "this_DD1 was_VBDZ on_II Wednesday_NPD1"
    ),
    N_c7 = c(12L, NA, 2L, 3L, 1L, NA, 4L)
  ),
  row.names = c(NA, -7L),
  class = c("tbl_df", "tbl", "data.frame")
)
就 `Utterance` 和 `N_c7` 的汇总/折叠而言,一切顺利。只有 `c7` 列有问题:第 6 行(`Line` 6)存在真实的 `NA`。它会妨碍汇总操作——除非我先把真实的 `NA` 转换为字符串 "NA":
library(dplyr)
library(data.table)
# str_c() comes from stringr, which must be attached for this snippet to run.
library(stringr)

# Collapse consecutive rows that share the same Speaker and Sequ into one row.
df %>%
  mutate(
    # convert true NA to character "NA" so that str_c() does not return NA
    # for a whole group that contains a missing value:
    c7 = ifelse(is.na(c7), "NA", c7)
  ) %>%
  # group consecutive runs of identical Speaker/Sequ pairs:
  group_by(grp = rleid(Speaker, Sequ)) %>%
  # summarise:
  summarise(
    # across Line, Speaker, Sequ, by taking the first value:
    across(c(Line, Speaker, Sequ), first),
    # collapse same-speaker `Utterance`s:
    Utterance = str_c(Utterance, collapse = " "),
    # collapse same-speaker `c7` data:
    c7 = str_c(c7, collapse = " "),
    # sum same-speaker `N_c7` values, ignoring missing counts:
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  # deactivate grouping and drop the helper column:
  ungroup() %>%
  select(-grp)
# A tibble: 5 × 6
Line Speaker Sequ Utterance c7 N_c7
<int> <chr> <int> <chr> <chr> <int>
1 1 ID01.A 1 >like I don't understand< sorry like how old's your mom¿ like_VV0 I_PPIS1 do_VD0 n't_XX understand_VVI sorry_JJ like_… 12
2 2 NA 1 (0.855) NA 0
3 3 ID01.C 1 eh six:ty:::-one= eh_UH sixty-one_MC 2
4 4 ID01.C 2 [when was] that¿= when_RRQ was_VBDZ that_DD1 3
5 5 ID01.A 2 [yes] (0.163) =!this! was on °Wednesday° yes_UH NA this_DD1 was_VBDZ on_II Wednesday_NPD1 5
把 `NA` 转换为 "NA" 并不理想,因为该列中所有的 `NA` 值都被变成了字符串,包括那些我需要保留为真实 `NA` 的值。如何在不把 `NA` 转换为 "NA" 的情况下折叠同一说话者(same-speaker)的 `c7`?
我认为您遇到的问题源于 `str_c` 的实现方式:只要输入中有一个 `NA`,它就会有意返回 `NA`。如果用 `paste` 替换 `str_c`,无需先把 `NA` 转换为字符,就能得到相同的结果(`paste` 会自行把 `NA` 写成字符串 "NA")。
您的代码
# Reproduction of the question's approach: missing `c7` entries are first
# replaced by the string "NA", then consecutive same-speaker rows are merged.
question_output <- df %>%
  mutate(c7 = replace(c7, is.na(c7), "NA")) %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    # identifying columns: keep the first value of each run
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    # text columns: join with single spaces
    Utterance = str_c(Utterance, collapse = " "),
    c7 = str_c(c7, collapse = " "),
    # counts: add up, skipping missing values
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # the run id is only a helper
  select(-grp)
我用 `paste` 的尝试
# Same collapse as above, but with base paste(), which needs no prior
# NA-to-string conversion.
trial_output <- df %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    # identifying columns: keep the first value of each run
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    # text columns: join with single spaces
    Utterance = paste(Utterance, collapse = " "),
    c7 = paste(c7, collapse = " "),
    # counts: add up, skipping missing values
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # the run id is only a helper
  select(-grp)
结果相同
# Compare the str_c()-based result with the paste()-based result.
identical(question_output, trial_output)
#> [1] TRUE
由 reprex package (v2.0.1)
创建于 2022-01-11

我把"真实 NA"的说法理解为:在说话者有文字的分组中去掉 `NA`,而在说话者完全没有文字的分组中保留 `NA`。为此,可以先用 `na.omit` 去掉 `NA`,再把折叠后得到的空字符串转换为 `NA`。
# Collapse same-speaker rows; missing `c7` entries are dropped before joining,
# and a group left with an empty string gets NA back.
df %>%
  # one id per run of consecutive identical Speaker/Sequ pairs
  mutate(grp = rleid(Speaker, Sequ)) %>%
  group_by(grp) %>%
  summarise(
    Line = first(Line),
    Speaker = first(Speaker),
    Sequ = first(Sequ),
    Utterance = str_c(Utterance, collapse = " "),
    # drop NAs, join the rest, then turn "" back into NA
    c7 = c7 %>%
      na.omit() %>%
      str_c(collapse = " ") %>%
      na_if(""),
    N_c7 = sum(N_c7, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(-grp)
Line Speaker Sequ Utterance c7 N_c7
<int> <chr> <int> <chr> <chr> <int>
1 1 ID01.A 1 how old's your mom¿ how_RGQ old_JJ 's_VBZ your_APPGE mom~ 12
2 2 NA 1 (0.855) NA 0
3 3 ID01.C 1 eh six:ty:::-one= eh_UH sixty-one_MC 2
4 4 ID01.C 2 [when was] that¿= when_RRQ was_VBDZ that_DD1 3
5 5 ID01.A 2 [yes] (0.163) =!this! was on °Wednesday° yes_UH this_DD1 was_VBDZ on_II Wedne~ 5