如何计算分配给一列的几个字符串?
how to count several strings assigned to a column?
我最近问了一个问题,得到了很好的回答,感谢帮助我的人。示例数据如下:
# Sample data: 28 rows pairing an item description (V1, may be missing)
# with a sample identifier (V2). Both columns are factors.
df2 <- local({
  item_labels <- c(
    "1 x Bruit (U)",
    "1 x Bruit (U) 1 x TAMAN (M)",
    "1 x Bruit (U) 2 x TAMAN (M)",
    "1 x TAMAN (M) 2 x TAMAN (M)",
    "2 x Bruit (U)",
    "2 x TIKIam(T)"
  )
  sample_ids <- c(
    "BUX1_T10963", "BUX1_T10964", "BUX1_T10965", "BUX1_T10966", "BUX2_T10076"
  )
  # Positions into item_labels, one per row; NA marks a missing description.
  item_codes <- c(
    1L, 5L, 5L,
    1L, 5L, 5L, 5L, 5L, NA, NA, NA, NA,
    4L, 2L, 3L, 5L, NA,
    1L, 1L, 1L, 1L, 5L, 6L, 6L, 6L, NA, NA,
    5L
  )
  # Number of consecutive rows belonging to each sample identifier.
  rows_per_sample <- c(3L, 9L, 5L, 10L, 1L)
  sample_codes <- rep(seq_along(sample_ids), times = rows_per_sample)
  data.frame(
    V1 = factor(item_labels[item_codes], levels = item_labels),
    V2 = factor(sample_ids[sample_codes], levels = sample_ids)
  )
})
我还有一个相关的问题,我想应该单独提问,而不是在原来的问题下继续追问。
如果第一列的某个单元格中包含多个字符串(如上面的数据),我就无法得到正确的结果。
例如,如果我执行以下操作:
library(stringr)
library(reshape2)
# Asker's original attempt, kept so that it reproduces the undesired output
# shown after it. The snippet used `df`, which was never defined (only `df2`
# exists), so work on a copy of `df2` and leave `df2` itself untouched.
df <- df2
# Strip only the leading "<n> x " prefix. Cells holding several items, e.g.
# "1 x Bruit (U) 2 x TAMAN (M)", keep the remaining items inside one label.
df$goodname <- gsub(pattern = "^[0-9]+ x ", replacement = "", x = df$V1)
# extract the number
df$quantity <- as.numeric(str_extract(df$V1, "^[0-9]+"))
# any missing values assume to be 1
df$quantity[is.na(df$quantity)] <- 1
# One row per goodname, one column per V2 value, quantities summed.
dcast(
  data = df,
  formula = goodname ~ V2,
  value.var = "quantity",
  fun.aggregate = sum,
  na.rm = TRUE
)
我得到了这样的结果:
goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1 Bruit (U) 5 9 2 6 2
2 Bruit (U) 1 x TAMAN (M) 0 0 1 0 0
3 Bruit (U) 2 x TAMAN (M) 0 0 1 0 0
4 TAMAN (M) 2 x TAMAN (M) 0 0 1 0 0
5 TIKIam(T) 0 0 0 6 0
6 <NA> 0 4 1 2 0
但我想要这样的答案
goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1 Bruit (U) 5 9 4 6 2
2 TAMAN (M) 0 0 6 0 0
3 TIKIam(T) 0 0 0 6 0
4 <NA> 0 4 1 2 0
5 TTXI1(M) 0 0 0 2 0
下面是使用 `dplyr` 和 `tidyr` 的解决方案。我们的输出有所不同,但由于您的 `df2` 中没有任何 `TTXI1(M)`,我认为我的输出可能更合理。
关键是先用 `separate_rows` 把包含多条记录的行展开成多行,再用 `separate` 把 `Times`(`1 x`、`2 x`)和 `Label` 拆分开。
library(dplyr)
library(tidyr)
# Sum the item counts for every (Label, V2) pair and lay the result out as
# one row per label with one column per V2 value.
df3 <- df2 %>%
  # Split cells that hold several items, e.g. "1 x Bruit (U) 2 x TAMAN (M)",
  # into one row per item: break at whitespace that follows ")" and precedes
  # a digit. The backslashes are doubled because "\)" and "\s" are not valid
  # escapes inside an R string literal.
  separate_rows(V1, sep = "(?<=\\))\\s(?=[0-9]+)") %>%
  # "2 x Bruit (U)" becomes Times = 2 and Label = "Bruit (U)".
  separate(V1, into = c("Times", "Label"), sep = " x ", convert = TRUE) %>%
  # Rows whose V1 is missing have no count; treat them as 1.
  mutate(Times = ifelse(is.na(Times), 1, Times)) %>%
  group_by(Label, V2) %>%
  summarise(n = sum(Times)) %>%
  # Label/V2 combinations that never occur are filled with 0.
  spread(V2, n, fill = 0) %>%
  ungroup()
df3
# # A tibble: 4 x 6
# Label BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Bruit (U) 5 9 4 6 2
# 2 TAMAN (M) 0 0 6 0 0
# 3 TIKIam(T) 0 0 0 6 0
# 4 <NA> 0 4 1 2 0
我最近问了一个问题,得到了很好的回答,感谢帮助我的人。示例数据如下:
# Sample data: 28 rows pairing an item description (V1, may be missing)
# with a sample identifier (V2). Both columns are factors.
df2 <- local({
  item_labels <- c(
    "1 x Bruit (U)",
    "1 x Bruit (U) 1 x TAMAN (M)",
    "1 x Bruit (U) 2 x TAMAN (M)",
    "1 x TAMAN (M) 2 x TAMAN (M)",
    "2 x Bruit (U)",
    "2 x TIKIam(T)"
  )
  sample_ids <- c(
    "BUX1_T10963", "BUX1_T10964", "BUX1_T10965", "BUX1_T10966", "BUX2_T10076"
  )
  # Positions into item_labels, one per row; NA marks a missing description.
  item_codes <- c(
    1L, 5L, 5L,
    1L, 5L, 5L, 5L, 5L, NA, NA, NA, NA,
    4L, 2L, 3L, 5L, NA,
    1L, 1L, 1L, 1L, 5L, 6L, 6L, 6L, NA, NA,
    5L
  )
  # Number of consecutive rows belonging to each sample identifier.
  rows_per_sample <- c(3L, 9L, 5L, 10L, 1L)
  sample_codes <- rep(seq_along(sample_ids), times = rows_per_sample)
  data.frame(
    V1 = factor(item_labels[item_codes], levels = item_labels),
    V2 = factor(sample_ids[sample_codes], levels = sample_ids)
  )
})
我还有一个相关的问题,我想应该单独提问,而不是在原来的问题下继续追问。
如果第一列的某个单元格中包含多个字符串(如上面的数据),我就无法得到正确的结果。
例如,如果我执行以下操作:
library(stringr)
library(reshape2)
# Asker's original attempt, kept so that it reproduces the undesired output
# shown after it. The snippet used `df`, which was never defined (only `df2`
# exists), so work on a copy of `df2` and leave `df2` itself untouched.
df <- df2
# Strip only the leading "<n> x " prefix. Cells holding several items, e.g.
# "1 x Bruit (U) 2 x TAMAN (M)", keep the remaining items inside one label.
df$goodname <- gsub(pattern = "^[0-9]+ x ", replacement = "", x = df$V1)
# extract the number
df$quantity <- as.numeric(str_extract(df$V1, "^[0-9]+"))
# any missing values assume to be 1
df$quantity[is.na(df$quantity)] <- 1
# One row per goodname, one column per V2 value, quantities summed.
dcast(
  data = df,
  formula = goodname ~ V2,
  value.var = "quantity",
  fun.aggregate = sum,
  na.rm = TRUE
)
我得到了这样的结果:
goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1 Bruit (U) 5 9 2 6 2
2 Bruit (U) 1 x TAMAN (M) 0 0 1 0 0
3 Bruit (U) 2 x TAMAN (M) 0 0 1 0 0
4 TAMAN (M) 2 x TAMAN (M) 0 0 1 0 0
5 TIKIam(T) 0 0 0 6 0
6 <NA> 0 4 1 2 0
但我想要这样的答案
goodname BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
1 Bruit (U) 5 9 4 6 2
2 TAMAN (M) 0 0 6 0 0
3 TIKIam(T) 0 0 0 6 0
4 <NA> 0 4 1 2 0
5 TTXI1(M) 0 0 0 2 0
下面是使用 `dplyr` 和 `tidyr` 的解决方案。我们的输出有所不同,但由于您的 `df2` 中没有任何 `TTXI1(M)`,我认为我的输出可能更合理。
关键是先用 `separate_rows` 把包含多条记录的行展开成多行,再用 `separate` 把 `Times`(`1 x`、`2 x`)和 `Label` 拆分开。
library(dplyr)
library(tidyr)
# Sum the item counts for every (Label, V2) pair and lay the result out as
# one row per label with one column per V2 value.
df3 <- df2 %>%
  # Split cells that hold several items, e.g. "1 x Bruit (U) 2 x TAMAN (M)",
  # into one row per item: break at whitespace that follows ")" and precedes
  # a digit. The backslashes are doubled because "\)" and "\s" are not valid
  # escapes inside an R string literal.
  separate_rows(V1, sep = "(?<=\\))\\s(?=[0-9]+)") %>%
  # "2 x Bruit (U)" becomes Times = 2 and Label = "Bruit (U)".
  separate(V1, into = c("Times", "Label"), sep = " x ", convert = TRUE) %>%
  # Rows whose V1 is missing have no count; treat them as 1.
  mutate(Times = ifelse(is.na(Times), 1, Times)) %>%
  group_by(Label, V2) %>%
  summarise(n = sum(Times)) %>%
  # Label/V2 combinations that never occur are filled with 0.
  spread(V2, n, fill = 0) %>%
  ungroup()
df3
# # A tibble: 4 x 6
# Label BUX1_T10963 BUX1_T10964 BUX1_T10965 BUX1_T10966 BUX2_T10076
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Bruit (U) 5 9 4 6 2
# 2 TAMAN (M) 0 0 6 0 0
# 3 TIKIam(T) 0 0 0 6 0
# 4 <NA> 0 4 1 2 0