如何根据多个条件对行求和 - R?
How to sum rows based on multiple conditions - R?
我有一个包含地块 ID (plotID)、树种代码 (species) 和覆盖值 (cover) 的数据框。可以看到,其中一个地块内同一树种有多条记录。如果每个地块中有重复的 "species" 行,我该如何对 "cover" 字段求和?
例如,这里有一些示例数据:
# Sample data: one row per (plot, species) record; a species may appear more
# than once within the same plot.
plotID <- rep(c("SUF200001035014", "SUF200046012040"), times = c(5, 7))
species <- c(
  "ABBA", "BEPA", "PIBA2", "PIMA", "PIRE",
  "PIBA2", "PIBA2", "PIMA", "PIMA", "PIRE", "POTR5", "POTR5"
)
cover <- c(
  26.893939, 5.681818, 9.469697, 16.287879, 1.893939,
  16.287879, 4.166667, 10.984848, 16.666667, 11.363636, 18.181818, 13.257576
)
df_original <- data.frame(plotID, species, cover)
这是预期的输出:
# Intended output: one row per (plot, species) pair, with the cover values of
# duplicated rows added together.
plotID2 <- rep(c("SUF200001035014", "SUF200046012040"), times = c(5, 4))
species2 <- c(
  "ABBA", "BEPA", "PIBA2", "PIMA", "PIRE",
  "PIBA2", "PIMA", "PIRE", "POTR5"
)
# Element 7 (PIMA in plot SUF200046012040) is 10.984848 + 16.666667.
cover2 <- c(
  26.893939, 5.681818, 9.469697, 16.287879, 1.893939,
  20.454546, 27.651515, 11.363636, 31.439394
)
df_intended_output <- data.frame(plotID2, species2, cover2)
您可以通过多种方式做到这一点,其中最典型的是使用 base R、dplyr 和 data.table。
这是 dplyr 的方式:
library(dplyr)
# Sum cover within each plotID/species pair.
# summarise() is namespace-qualified so the call still resolves to dplyr when
# plyr (which also exports summarise/summarize) is attached in the same
# session; .groups = "drop" returns an ungrouped result.
df_original %>%
  group_by(plotID, species) %>%
  dplyr::summarise(cover = sum(cover), .groups = "drop")
# plotID species cover
#1 SUF200001035014 ABBA 26.893939
#2 SUF200001035014 BEPA 5.681818
#3 SUF200001035014 PIBA2 9.469697
#4 SUF200001035014 PIMA 16.287879
#5 SUF200001035014 PIRE 1.893939
#6 SUF200046012040 PIBA2 20.454546
#7 SUF200046012040 PIMA 27.651515
#8 SUF200046012040 PIRE 11.363636
#9 SUF200046012040 POTR5 31.439394
这将是 base-r 方式:
# Base R: passing named data-frame subsets keeps the original column names
# (plotID, species, cover) in the result instead of Group.1, Group.2 and x.
aggregate(
  df_original["cover"],
  by = df_original[c("plotID", "species")],
  FUN = sum
)
还有一个 data.table 方式 -
library(data.table)
# Convert to a data.table, then sum every non-grouping column per
# plotID/species group.
DT <- as.data.table(df_original)
DT[, lapply(.SD, sum), by = c("plotID", "species")]
使用 aggregate 可以轻松实现:
# Formula interface: sum cover for each species/plotID combination.
aggregate(
  cover ~ species + plotID,
  data = df_original,
  FUN = sum
)
使用 data.table 更容易:
# Name the summed column explicitly; an unnamed j expression would be
# auto-named V1 in the result.
as.data.table(df_original)[, .(cover = sum(cover)), by = .(plotID, species)]
如上所述,也可以使用 plyr 包中的 ddply:
# ddply() is called through the plyr namespace rather than via library(plyr):
# attaching plyr after dplyr masks dplyr verbs such as summarise().
plyr::ddply(
  df_original,
  c("plotID", "species"),
  plyr::summarise,
  cover2 = sum(cover)
)
# plotID species cover2
# 1 SUF200001035014 ABBA 26.893939
# 2 SUF200001035014 BEPA 5.681818
# 3 SUF200001035014 PIBA2 9.469697
# 4 SUF200001035014 PIMA 16.287879
# 5 SUF200001035014 PIRE 1.893939
# 6 SUF200046012040 PIBA2 20.454546
# 7 SUF200046012040 PIMA 27.651515
# 8 SUF200046012040 PIRE 11.363636
# 9 SUF200046012040 POTR5 31.439394
我有一个包含地块 ID (plotID)、树种代码 (species) 和覆盖值 (cover) 的数据框。可以看到,其中一个地块内同一树种有多条记录。如果每个地块中有重复的 "species" 行,我该如何对 "cover" 字段求和?
例如,这里有一些示例数据:
# Sample data: one row per (plot, species) record; a species may appear more
# than once within the same plot.
plotID <- rep(c("SUF200001035014", "SUF200046012040"), times = c(5, 7))
species <- c(
  "ABBA", "BEPA", "PIBA2", "PIMA", "PIRE",
  "PIBA2", "PIBA2", "PIMA", "PIMA", "PIRE", "POTR5", "POTR5"
)
cover <- c(
  26.893939, 5.681818, 9.469697, 16.287879, 1.893939,
  16.287879, 4.166667, 10.984848, 16.666667, 11.363636, 18.181818, 13.257576
)
df_original <- data.frame(plotID, species, cover)
这是预期的输出:
# Intended output: one row per (plot, species) pair, with the cover values of
# duplicated rows added together.
plotID2 <- rep(c("SUF200001035014", "SUF200046012040"), times = c(5, 4))
species2 <- c(
  "ABBA", "BEPA", "PIBA2", "PIMA", "PIRE",
  "PIBA2", "PIMA", "PIRE", "POTR5"
)
# Element 7 (PIMA in plot SUF200046012040) is 10.984848 + 16.666667.
cover2 <- c(
  26.893939, 5.681818, 9.469697, 16.287879, 1.893939,
  20.454546, 27.651515, 11.363636, 31.439394
)
df_intended_output <- data.frame(plotID2, species2, cover2)
您可以通过多种方式做到这一点,其中最典型的是使用 base R、dplyr 和 data.table。
这是 dplyr 的方式:
library(dplyr)
# Sum cover within each plotID/species pair.
# summarise() is namespace-qualified so the call still resolves to dplyr when
# plyr (which also exports summarise/summarize) is attached in the same
# session; .groups = "drop" returns an ungrouped result.
df_original %>%
  group_by(plotID, species) %>%
  dplyr::summarise(cover = sum(cover), .groups = "drop")
# plotID species cover
#1 SUF200001035014 ABBA 26.893939
#2 SUF200001035014 BEPA 5.681818
#3 SUF200001035014 PIBA2 9.469697
#4 SUF200001035014 PIMA 16.287879
#5 SUF200001035014 PIRE 1.893939
#6 SUF200046012040 PIBA2 20.454546
#7 SUF200046012040 PIMA 27.651515
#8 SUF200046012040 PIRE 11.363636
#9 SUF200046012040 POTR5 31.439394
这将是 base-r 方式:
# Base R: passing named data-frame subsets keeps the original column names
# (plotID, species, cover) in the result instead of Group.1, Group.2 and x.
aggregate(
  df_original["cover"],
  by = df_original[c("plotID", "species")],
  FUN = sum
)
还有一个 data.table 方式 -
library(data.table)
# Convert to a data.table, then sum every non-grouping column per
# plotID/species group.
DT <- as.data.table(df_original)
DT[, lapply(.SD, sum), by = c("plotID", "species")]
使用 aggregate 可以轻松实现:
# Formula interface: sum cover for each species/plotID combination.
aggregate(
  cover ~ species + plotID,
  data = df_original,
  FUN = sum
)
使用 data.table:
# Name the summed column explicitly; an unnamed j expression would be
# auto-named V1 in the result.
as.data.table(df_original)[, .(cover = sum(cover)), by = .(plotID, species)]
如上所述,也可以使用 plyr 包中的 ddply:
# ddply() is called through the plyr namespace rather than via library(plyr):
# attaching plyr after dplyr masks dplyr verbs such as summarise().
plyr::ddply(
  df_original,
  c("plotID", "species"),
  plyr::summarise,
  cover2 = sum(cover)
)
# plotID species cover2
# 1 SUF200001035014 ABBA 26.893939
# 2 SUF200001035014 BEPA 5.681818
# 3 SUF200001035014 PIBA2 9.469697
# 4 SUF200001035014 PIMA 16.287879
# 5 SUF200001035014 PIRE 1.893939
# 6 SUF200046012040 PIBA2 20.454546
# 7 SUF200046012040 PIMA 27.651515
# 8 SUF200046012040 PIRE 11.363636
# 9 SUF200046012040 POTR5 31.439394