如何计算分配给一列的几个字符串?

how to count several strings assigned to a column?

我最近问了一个问题,并得到了很好的解答,感谢帮助我的人。

# Example data frame with 28 rows and two factor columns:
#   V1 - free-text item counts such as "2 x Bruit (U)"; some cells contain
#        several "<n> x <item>" entries, and some cells are NA.
#   V2 - an identifier such as "BUX1_T10963".
# Built inside local() so that only `df2` is created in the calling
# environment.
df2 <- local({
  item_levels <- c(
    "1 x Bruit (U)", "1 x Bruit (U) 1 x TAMAN (M)",
    "1 x Bruit (U) 2 x TAMAN (M)", "1 x TAMAN (M) 2 x TAMAN (M)",
    "2 x Bruit (U)", "2 x TIKIam(T)"
  )
  id_levels <- c(
    "BUX1_T10963", "BUX1_T10964", "BUX1_T10965", "BUX1_T10966", "BUX2_T10076"
  )

  # Integer codes index into the level vectors above; NA marks a missing V1.
  item_codes <- c(
    1L, 5L, 5L, 1L, 5L, 5L, 5L, 5L, NA, NA, NA, NA, 4L, 2L, 3L, 5L, NA,
    1L, 1L, 1L, 1L, 5L, 6L, 6L, 6L, NA, NA, 5L
  )
  # The identifiers appear in consecutive runs of length 3, 9, 5, 10 and 1.
  id_codes <- rep(1:5, times = c(3L, 9L, 5L, 10L, 1L))

  data.frame(
    V1 = factor(item_levels[item_codes], levels = item_levels),
    V2 = factor(id_levels[id_codes], levels = id_levels)
  )
})

我有另一个问题,我想我应该问一个不同的问题而不是同一个问题。

如果第一列的某个单元格中包含多个字符串(例如上面的数据),我就无法得到正确的结果。

如果我执行以下操作

library(stringr)
library(reshape2)
# Work on a copy of the example data: the snippet refers to `df`, which is
# never defined in this post, and copying leaves `df2` itself unmodified.
df <- df2
# strip the leading "<n> x " multiplier to get the item name; only the first
# multiplier in a cell is removed, so multi-item cells stay combined
df$goodname <- gsub(pattern = "^[0-9]+ x ", replacement = "", x = df$V1)
# extract the number
df$quantity <- as.numeric(str_extract(df$V1, "^[0-9]+"))
# any missing values assume to be 1
df$quantity[is.na(df$quantity)] <- 1
# one row per goodname, one column per V2 value, cells hold summed quantity
dcast(
  data = df, formula = goodname ~ V2, value.var = "quantity",
  fun.aggregate = sum, na.rm = TRUE
)

我得到了

这样的答案
                 goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1               Bruit (U)           5           9           2           6           2
2 Bruit (U) 1 x TAMAN (M)           0           0           1           0           0
3 Bruit (U) 2 x TAMAN (M)           0           0           1           0           0
4 TAMAN (M) 2 x TAMAN (M)           0           0           1           0           0
5               TIKIam(T)           0           0           0           6           0
6                    <NA>           0           4           1           2           0

但我想要这样的答案

                goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1               Bruit (U)           5           9           4           6       2
2               TAMAN (M)           0           0           6           0       0
3               TIKIam(T)           0           0           0           6       0
4                    <NA>           0           4           1           2       0
5               TTXI1(M)            0           0           0            2       0

使用 dplyr 和 tidyr 的解决方案。我们的输出有所不同,但由于您的 df2 中没有任何 TTXI1(M),我认为我的输出可能更有意义。关键是用 separate_rows 把包含多条记录的行拆分成多行,再用 separate 分隔 Times(如 1 x、2 x)和 Label。

library(dplyr)
library(tidyr)

# Count items per Label and V2 value, handling cells that hold several
# "<n> x <item>" entries.
df3 <- df2 %>%
  # Split multi-item cells into one row per item: break at a whitespace
  # character that follows ")" and precedes a digit. Backslashes must be
  # doubled inside an R string literal ("\)" and "\s" are invalid escapes).
  separate_rows(V1, sep = "(?<=\\))\\s(?=[0-9]+)") %>%
  # "2 x Bruit (U)" -> Times = 2, Label = "Bruit (U)"; convert = TRUE makes
  # Times numeric instead of character.
  separate(V1, into = c("Times", "Label"), sep = " x ", convert = TRUE) %>%
  # Rows whose V1 was NA have no multiplier; count each of them once.
  mutate(Times = ifelse(is.na(Times), 1, Times)) %>%
  group_by(Label, V2) %>%
  summarise(n = sum(Times)) %>%
  # One column per V2 value; Label/V2 combinations that never occur get 0.
  spread(V2, n, fill = 0) %>%
  ungroup()
df3

# # A tibble: 4 x 6
#       Label BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
# *     <chr>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
# 1 Bruit (U)           5           9           4           6           2
# 2 TAMAN (M)           0           0           6           0           0
# 3 TIKIam(T)           0           0           0           6           0
# 4      <NA>           0           4           1           2           0