无法使用 dplyr 进行分组
unable to group by using dplyr
我有下面这张表,想按 cnrd_marsh 和 timefact_hour 分组并计算 tf_diff 的平均值。但是,使用 dplyr 时出现了下面的错误,我不清楚原因。我还尝试过删除 NA 值,并将第一列从因子转换为字符,但都没有解决问题。有什么建议吗?我在下面附上了数据样本。
library(dplyr)
library(tidyr)
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)
Error: Key column 'cnrd_marsh' does not exist in input.
> foo
cnrd_marsh timefact_hour tf_diff
1 <NA> NA NA
2 БЧ 14 19
3 БЧ 14 5
4 <NA> NA NA
5 БЧ 13 1
6 БЧ 13 18
7 БЧ 13 31
8 БЧ 13 2
9 <NA> NA NA
10 БЧ 12 5
11 БЧ 12 10
12 БЧ 12 1
13 БЧ 12 17
14 <NA> NA NA
15 БЧ 11 2
16 БЧ 11 18
17 БЧ 11 4
18 БЧ 11 7
19 БЧ 11 16
20 БЧ 11 3
21 <NA> NA NA
22 БЧ 10 11
23 БЧ 10 6
24 БЧ 10 10
25 <NA> NA NA
26 БЧ 9 17
27 БЧ 9 6
28 БЧ 9 15
29 БЧ 9 4
30 БЧ 9 9
31 БЧ 9 1
32 <NA> NA NA
33 БЧ 8 16
34 БЧ 8 8
35 БЧ 8 14
36 БЧ 8 3
37 БЧ 8 11
38 БЧ 8 1
39 <NA> NA NA
40 БЧ 7 14
41 БЧ 7 6
42 БЧ 7 14
43 БЧ 7 5
44 <NA> NA NA
45 БЧ 6 0
46 БЧ 6 9
47 БЧ 6 10
48 БЧ 6 15
49 <NA> NA NA
50 <NA> NA NA
输出应该像这样
> head(output)
cnrd_marsh timefact_hour tf_diff
1 БЧ 14 12.00000000000000
2 БЧ 13 13.00000000000000
3 БЧ 12 8.25000000000000
4 БЧ 11 8.33333330000000
5 БЧ 10 9.00000000000000
6 БЧ 9 8.66666666666667
这是数据样本。
> dput(foo)
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ",
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA),
timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA,
12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA,
10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L,
8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1,
17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15,
4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0,
9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour",
"tf_diff"), vars = list(timefact_hour), row.names = c(NA, 50L
), class = "data.frame")
如果将该 dput 输出中 vars 的值改为带引号的字符串(即 list('timefact_hour')),代码就可以运行并生成输出:
foo <-
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ",
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA),
timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA,
12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA,
10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L,
8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1,
17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15,
4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0,
9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour",
"tf_diff"), vars = list('timefact_hour'), row.names = c(NA, 50L
), class = "data.frame")
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)
output
#-----------------------
Source: local data frame [10 x 3]
timefact_hour БЧ NA
* <int> <dbl> <dbl>
1 6 8.5000000 NA
2 7 9.7500000 NA
3 8 8.8333333 NA
4 9 8.6666667 NA
5 10 9.0000000 NA
6 11 8.3333333 NA
7 12 8.2500000 NA
8 13 13.0000000 NA
9 14 12.0000000 NA
10 NA NA NA
这确实印证了 @joran 的担忧。下面是一个在我看来更合理的 "base" R 解决方案:
> with(foo, tapply(tf_diff, list(timefact_hour, cnrd_marsh), mean, na.rm=TRUE))
БЧ
6 8.500000
7 9.750000
8 8.833333
9 8.666667
10 9.000000
11 8.333333
12 8.250000
13 13.000000
14 12.000000
这段代码似乎可以解决问题:
library(plyr)
cdata <- ddply(foo, c("cnrd_marsh", "timefact_hour"), summarise,
mean = mean(tf_diff)
)
cdata
您遇到的错误 "Error: Key column 'cnrd_marsh' does not exist in input" 是因为在 group_by 之后使用 summarise 时,它只返回了 "tf_diff" 这一列。因此在调用 spread 时,找不到 'cnrd_marsh' 和 'tf_diff' 这两列。
而是试试这个。它应该符合您的目的:
foo <- foo[complete.cases(foo),] ### or spread will fail
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
mutate(tf_diff = mean(tf_diff,na.rm=T)) %>% unique() %>%
spread(cnrd_marsh, tf_diff)
只需先去除数据集中的 NA 值,问题就解决了。
output <- foo %>%
na.omit %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)
我有下面这张表,想按 cnrd_marsh 和 timefact_hour 分组并计算 tf_diff 的平均值。但是,使用 dplyr 时出现了下面的错误,我不清楚原因。我还尝试过删除 NA 值,并将第一列从因子转换为字符,但都没有解决问题。有什么建议吗?我在下面附上了数据样本。
library(dplyr)
library(tidyr)
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)
Error: Key column 'cnrd_marsh' does not exist in input.
> foo
cnrd_marsh timefact_hour tf_diff
1 <NA> NA NA
2 БЧ 14 19
3 БЧ 14 5
4 <NA> NA NA
5 БЧ 13 1
6 БЧ 13 18
7 БЧ 13 31
8 БЧ 13 2
9 <NA> NA NA
10 БЧ 12 5
11 БЧ 12 10
12 БЧ 12 1
13 БЧ 12 17
14 <NA> NA NA
15 БЧ 11 2
16 БЧ 11 18
17 БЧ 11 4
18 БЧ 11 7
19 БЧ 11 16
20 БЧ 11 3
21 <NA> NA NA
22 БЧ 10 11
23 БЧ 10 6
24 БЧ 10 10
25 <NA> NA NA
26 БЧ 9 17
27 БЧ 9 6
28 БЧ 9 15
29 БЧ 9 4
30 БЧ 9 9
31 БЧ 9 1
32 <NA> NA NA
33 БЧ 8 16
34 БЧ 8 8
35 БЧ 8 14
36 БЧ 8 3
37 БЧ 8 11
38 БЧ 8 1
39 <NA> NA NA
40 БЧ 7 14
41 БЧ 7 6
42 БЧ 7 14
43 БЧ 7 5
44 <NA> NA NA
45 БЧ 6 0
46 БЧ 6 9
47 БЧ 6 10
48 БЧ 6 15
49 <NA> NA NA
50 <NA> NA NA
输出应该像这样
> head(output)
cnrd_marsh timefact_hour tf_diff
1 БЧ 14 12.00000000000000
2 БЧ 13 13.00000000000000
3 БЧ 12 8.25000000000000
4 БЧ 11 8.33333330000000
5 БЧ 10 9.00000000000000
6 БЧ 9 8.66666666666667
这是数据样本。
> dput(foo)
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ",
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA),
timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA,
12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA,
10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L,
8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1,
17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15,
4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0,
9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour",
"tf_diff"), vars = list(timefact_hour), row.names = c(NA, 50L
), class = "data.frame")
如果将该 dput 输出中 vars 的值改为带引号的字符串(即 list('timefact_hour')),代码就可以运行并生成输出:
foo <-
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ",
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA,
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ",
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA),
timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA,
12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA,
10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L,
8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1,
17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15,
4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0,
9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour",
"tf_diff"), vars = list('timefact_hour'), row.names = c(NA, 50L
), class = "data.frame")
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)
output
#-----------------------
Source: local data frame [10 x 3]
timefact_hour БЧ NA
* <int> <dbl> <dbl>
1 6 8.5000000 NA
2 7 9.7500000 NA
3 8 8.8333333 NA
4 9 8.6666667 NA
5 10 9.0000000 NA
6 11 8.3333333 NA
7 12 8.2500000 NA
8 13 13.0000000 NA
9 14 12.0000000 NA
10 NA NA NA
这确实印证了 @joran 的担忧。下面是一个在我看来更合理的 "base" R 解决方案:
> with(foo, tapply(tf_diff, list(timefact_hour, cnrd_marsh), mean, na.rm=TRUE))
БЧ
6 8.500000
7 9.750000
8 8.833333
9 8.666667
10 9.000000
11 8.333333
12 8.250000
13 13.000000
14 12.000000
这段代码似乎可以解决问题:
library(plyr)
cdata <- ddply(foo, c("cnrd_marsh", "timefact_hour"), summarise,
mean = mean(tf_diff)
)
cdata
您遇到的错误 "Error: Key column 'cnrd_marsh' does not exist in input" 是因为在 group_by 之后使用 summarise 时,它只返回了 "tf_diff" 这一列。因此在调用 spread 时,找不到 'cnrd_marsh' 和 'tf_diff' 这两列。您可以改用下面的写法,它应该能满足您的需求:
foo <- foo[complete.cases(foo),] ### or spread will fail
output <- foo %>%
group_by(cnrd_marsh, timefact_hour) %>%
mutate(tf_diff = mean(tf_diff,na.rm=T)) %>% unique() %>%
spread(cnrd_marsh, tf_diff)
只需先去除数据集中的 NA 值,问题就解决了。
output <- foo %>%
na.omit %>%
group_by(cnrd_marsh, timefact_hour) %>%
summarise(tf_diff = mean(tf_diff)) %>%
spread(cnrd_marsh, tf_diff)