如何按行计算选定列的平均值
How to take mean of values of select columns by rows
我的 tibble 附在问题末尾。
我有一个数据框,其中包含几组不同类型的列(每组都是同一指标的多次重复测量)。前四列应保持原样;名称以 “canopy” 开头的列(可以用 starts_with() 选出,我这样写是因为这个函数可能有用)应按行汇总为一个平均值,“understory”(列名写作 “under”)和“树胸径”(tdbh)也是如此,并且计算时要包括 na.rm = TRUE。我怎样才能做到这一点?我怎样才能这样汇总这些列?
数据(头):
# Example data as a structure() literal: a 6-row tibble with four leading
# columns (Site, Classification, transect, point), replicate columns
# canopy1-canopy5 and under1-under4, Numtrees, and t1dbh-t10dbh (with NAs).
# NOTE(review): the literal is not assigned to a name here; the snippets
# further down refer to the data as `df` or `dat`.
structure(list(Site = c("Bala", "Bala", "Bala", "Bala", "Bala",
"Bala"), Classification = c("Primary forest", "Primary forest",
"Primary forest", "Primary forest", "Primary forest", "Primary forest"
), transect = c(1, 1, 1, 1, 1, 2), point = c(1, 2, 3, 4, 5, 1
), canopy1 = c(10, 2, 5, 10, 2, 4), canopy2 = c(4, 3, 2, 3, 2,
6), canopy3 = c(5, 2, 3, 4, 5, 1), canopy4 = c(5, 3, 2, 2, 2,
2), canopy5 = c(3, 9, 6, 7, 4, 8), under1 = c(15, 17, 4, 23,
9, 27), under2 = c(13, 1, 0, 0, 0, 0), under3 = c(1, 4, 4, 4,
5, 23), under4 = c(19, 9, 0, 5, 0, 0), Numtrees = c(4, 3, 6,
1, 3, 7), t1dbh = c(110, 100, 50, 60, 100, 70), t2dbh = c(80,
60, 60, NA, 70, 80), t3dbh = c(80, 110, 70, NA, 100, 50), t4dbh = c(90,
NA, 110, NA, NA, 60), t5dbh = c(NA, NA, 90, NA, NA, 50), t6dbh = c(NA,
NA, 110, NA, NA, 60), t7dbh = c(NA, NA, NA, NA, NA, 250), t8dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), t9dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), t10dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
所需输出的开头如下:
# A tibble: 2 x 7
Site Classification transect point canopy understory tdbh
<chr> <chr> <dbl> <dbl> <dbl> <chr> <chr>
1 Bala Primary forest 1 1 5.4 ... ...
2 Bala Primary forest 1 2 3.8 ... ...
我希望只用 base R 和 tidyverse 中的工具(可能是 dplyr 和/或 tidyr)就能做到……
编辑:我知道 mutate(canopy = mean(c(canopy1, canopy2, ...))) 应该可以正常工作,但这样做有两个问题:首先,它是新增一列,而不是替换原有的列,这有点麻烦,但还不算严重;其次,我必须逐一列出所有的列,这正是低效解法的标志。
你就不能这样做吗:
# Row-wise mean over every column whose name starts with "canopy".
# na.rm = TRUE skips missing replicates, as the question explicitly asks.
df$canopy <- rowMeans(df[grep("^canopy", names(df))], na.rm = TRUE)
或者,用 tidyverse 的写法(并用 select 去掉中间的列,以便显示结果):
# Row-wise mean of every column whose name starts with "canopy"; NAs are
# dropped, as the question requires. The final select() keeps the four
# leading columns plus the new mean, chosen by name rather than by position
# so the snippet does not depend on the exact number of raw columns.
df %>%
  mutate(canopy = rowMeans(select(., starts_with("canopy")), na.rm = TRUE)) %>%
  select(Site:point, canopy)
#> # A tibble: 6 x 5
#> Site Classification transect point canopy
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Bala Primary forest 1 1 5.4
#> 2 Bala Primary forest 1 2 3.8
#> 3 Bala Primary forest 1 3 3.6
#> 4 Bala Primary forest 1 4 5.2
#> 5 Bala Primary forest 1 5 3
#> 6 Bala Primary forest 2 1 4.2
使用 sapply:
# For each name fragment, average (row by row, ignoring NAs) every column
# whose name matches it, then attach the means to the first four columns.
cbind(
  df[1:4],
  sapply(
    c("canopy", "under", "dbh"),
    function(pattern) {
      matching <- grep(pattern, names(df))
      rowMeans(df[matching], na.rm = TRUE)
    }
  )
)
# Site Classification transect point canopy under dbh
# 1 Bala Primary forest 1 1 5.4 12.00 90.00000
# 2 Bala Primary forest 1 2 3.8 7.75 90.00000
# 3 Bala Primary forest 1 3 3.6 2.00 81.66667
# 4 Bala Primary forest 1 4 5.2 8.00 60.00000
# 5 Bala Primary forest 1 5 3.0 3.50 90.00000
# 6 Bala Primary forest 2 1 4.2 12.50 88.57143
使用 tidyverse 包的解决方案:先用目标字符串创建一个向量,然后用 map_dfc 和 mutate 动态计算各组的平均值,最后把计算出的列合并回原始数据框。
library(tidyverse)
# Name fragments that identify each group of replicate columns; each
# fragment is also used as the name of the resulting mean column.
target <- c("canopy", "under", "dbh")

# For every fragment, build a data frame holding only the row-wise mean
# (NAs ignored) of all columns whose names contain it; map_dfc() binds the
# per-fragment results side by side.
dat2 <- map_dfc(target, function(pattern) {
  mutate(
    dat,
    "{pattern}" := rowMeans(select(dat, contains(pattern)), na.rm = TRUE),
    .keep = "none"
  )
})

# Drop the raw replicate columns from the original data and append the means.
dat3 <- bind_cols(select(dat, -contains(target)), dat2)

print(dat3)
# # A tibble: 6 x 8
# Site Classification transect point Numtrees canopy under dbh
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Bala Primary forest 1 1 4 5.4 12 90
# 2 Bala Primary forest 1 2 3 3.8 7.75 90
# 3 Bala Primary forest 1 3 6 3.6 2 81.7
# 4 Bala Primary forest 1 4 1 5.2 8 60
# 5 Bala Primary forest 1 5 3 3 3.5 90
# 6 Bala Primary forest 2 1 7 4.2 12.5 88.6
我的 tibble 附在问题末尾。
我有一个数据框,其中包含几组不同类型的列(每组都是同一指标的多次重复测量)。前四列应保持原样;名称以 “canopy” 开头的列(可以用 starts_with() 选出,我这样写是因为这个函数可能有用)应按行汇总为一个平均值,“understory”(列名写作 “under”)和“树胸径”(tdbh)也是如此,并且计算时要包括 na.rm = TRUE。我怎样才能做到这一点?我怎样才能这样汇总这些列?
数据(头):
# Example data as a structure() literal: a 6-row tibble with four leading
# columns (Site, Classification, transect, point), replicate columns
# canopy1-canopy5 and under1-under4, Numtrees, and t1dbh-t10dbh (with NAs).
# NOTE(review): the literal is not assigned to a name here; the snippets
# further down refer to the data as `df` or `dat`.
structure(list(Site = c("Bala", "Bala", "Bala", "Bala", "Bala",
"Bala"), Classification = c("Primary forest", "Primary forest",
"Primary forest", "Primary forest", "Primary forest", "Primary forest"
), transect = c(1, 1, 1, 1, 1, 2), point = c(1, 2, 3, 4, 5, 1
), canopy1 = c(10, 2, 5, 10, 2, 4), canopy2 = c(4, 3, 2, 3, 2,
6), canopy3 = c(5, 2, 3, 4, 5, 1), canopy4 = c(5, 3, 2, 2, 2,
2), canopy5 = c(3, 9, 6, 7, 4, 8), under1 = c(15, 17, 4, 23,
9, 27), under2 = c(13, 1, 0, 0, 0, 0), under3 = c(1, 4, 4, 4,
5, 23), under4 = c(19, 9, 0, 5, 0, 0), Numtrees = c(4, 3, 6,
1, 3, 7), t1dbh = c(110, 100, 50, 60, 100, 70), t2dbh = c(80,
60, 60, NA, 70, 80), t3dbh = c(80, 110, 70, NA, 100, 50), t4dbh = c(90,
NA, 110, NA, NA, 60), t5dbh = c(NA, NA, 90, NA, NA, 50), t6dbh = c(NA,
NA, 110, NA, NA, 60), t7dbh = c(NA, NA, NA, NA, NA, 250), t8dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), t9dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), t10dbh = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
所需输出的开头如下:
# A tibble: 2 x 7
Site Classification transect point canopy understory tdbh
<chr> <chr> <dbl> <dbl> <dbl> <chr> <chr>
1 Bala Primary forest 1 1 5.4 ... ...
2 Bala Primary forest 1 2 3.8 ... ...
我希望只用 base R 和 tidyverse 中的工具(可能是 dplyr 和/或 tidyr)就能做到……
编辑:我知道 mutate(canopy = mean(c(canopy1, canopy2, ...))) 应该可以正常工作,但这样做有两个问题:首先,它是新增一列,而不是替换原有的列,这有点麻烦,但还不算严重;其次,我必须逐一列出所有的列,这正是低效解法的标志。
你就不能这样做吗:
# Row-wise mean over every column whose name starts with "canopy".
# na.rm = TRUE skips missing replicates, as the question explicitly asks.
df$canopy <- rowMeans(df[grep("^canopy", names(df))], na.rm = TRUE)
或者,用 tidyverse 的写法(并用 select 去掉中间的列,以便显示结果):
# Row-wise mean of every column whose name starts with "canopy"; NAs are
# dropped, as the question requires. The final select() keeps the four
# leading columns plus the new mean, chosen by name rather than by position
# so the snippet does not depend on the exact number of raw columns.
df %>%
  mutate(canopy = rowMeans(select(., starts_with("canopy")), na.rm = TRUE)) %>%
  select(Site:point, canopy)
#> # A tibble: 6 x 5
#> Site Classification transect point canopy
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Bala Primary forest 1 1 5.4
#> 2 Bala Primary forest 1 2 3.8
#> 3 Bala Primary forest 1 3 3.6
#> 4 Bala Primary forest 1 4 5.2
#> 5 Bala Primary forest 1 5 3
#> 6 Bala Primary forest 2 1 4.2
使用 sapply:
# For each name fragment, average (row by row, ignoring NAs) every column
# whose name matches it, then attach the means to the first four columns.
cbind(
  df[1:4],
  sapply(
    c("canopy", "under", "dbh"),
    function(pattern) {
      matching <- grep(pattern, names(df))
      rowMeans(df[matching], na.rm = TRUE)
    }
  )
)
# Site Classification transect point canopy under dbh
# 1 Bala Primary forest 1 1 5.4 12.00 90.00000
# 2 Bala Primary forest 1 2 3.8 7.75 90.00000
# 3 Bala Primary forest 1 3 3.6 2.00 81.66667
# 4 Bala Primary forest 1 4 5.2 8.00 60.00000
# 5 Bala Primary forest 1 5 3.0 3.50 90.00000
# 6 Bala Primary forest 2 1 4.2 12.50 88.57143
使用 tidyverse 包的解决方案:先用目标字符串创建一个向量,然后用 map_dfc 和 mutate 动态计算各组的平均值,最后把计算出的列合并回原始数据框。
library(tidyverse)
# Name fragments that identify each group of replicate columns; each
# fragment is also used as the name of the resulting mean column.
target <- c("canopy", "under", "dbh")

# For every fragment, build a data frame holding only the row-wise mean
# (NAs ignored) of all columns whose names contain it; map_dfc() binds the
# per-fragment results side by side.
dat2 <- map_dfc(target, function(pattern) {
  mutate(
    dat,
    "{pattern}" := rowMeans(select(dat, contains(pattern)), na.rm = TRUE),
    .keep = "none"
  )
})

# Drop the raw replicate columns from the original data and append the means.
dat3 <- bind_cols(select(dat, -contains(target)), dat2)

print(dat3)
# # A tibble: 6 x 8
# Site Classification transect point Numtrees canopy under dbh
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Bala Primary forest 1 1 4 5.4 12 90
# 2 Bala Primary forest 1 2 3 3.8 7.75 90
# 3 Bala Primary forest 1 3 6 3.6 2 81.7
# 4 Bala Primary forest 1 4 1 5.2 8 60
# 5 Bala Primary forest 1 5 3 3 3.5 90
# 6 Bala Primary forest 2 1 7 4.2 12.5 88.6