返回满足多个条件的向量中的最大值
Return highest values in a vector with multiple conditionals
我有这些示例数据
# Example data: 20 rows, two per individual (IndID), with the individual's
# Species, the Season of the measurement, and the measured Percent.
Data <- data.frame(
  IndID = factor(
    rep(c("1", "2", "3", "4", "5", "56", "58", "59", "60", "63"), each = 2L),
    levels = c("1", "2", "3", "4", "5", "56", "58", "59", "60", "63")
  ),
  Species = factor(
    rep(c("BHS", "MTG"), each = 10L),
    levels = c("BHS", "MTG")
  ),
  Season = factor(
    c(
      "Summer", "Winter", "Summer", "Winter", "Winter",
      "Summer", "Summer", "Winter", "Winter", "Summer",
      "Winter", "Summer", "Summer", "Winter", "Summer",
      "Winter", "Summer", "Winter", "Summer", "Winter"
    ),
    levels = c("Summer", "Winter")
  ),
  Percent = c(
    0.992, 0.992, 0.996, 0.976, 0.995, 0.871, 0.996, 0.996, 0.916, 0.875,
    0.652, 0.802, 0.964, 0.673, 0.956, 0.879, 0.972, 0.782, 0.968, 0.832
  )
)
看起来像这样
> head(Data)
IndID Species Season Percent
1 1 BHS Summer 0.992
2 1 BHS Winter 0.992
3 2 BHS Summer 0.996
4 2 BHS Winter 0.976
5 3 BHS Winter 0.995
6 3 BHS Summer 0.871
共有 10 个不同的个体,分别属于两个物种之一(BHS 或 MTG)。每个个体 (IndID) 在每个季节(冬季和夏季)各有一个百分比值。
对于每个物种,我想选出平均百分比值最高的两个个体。
编辑:另请参阅下面我的评论。我没有给出一个特定的预期结果,因为有多种结果都可以满足我的需要。由于每个季节都会测量一次百分比,我认为取百分比的平均值是选出最佳个体的最好方法。每个季节都会测量百分比,但我想要选出排名最高的 IndID。我也可以按百分比总和(而不是平均值)对 IndID 进行排名。
除了 @akrun 发布的第二段代码的结果之外,一个由 4 个 IndID(每个物种排名最高的两个)组成的向量也是很好的输出。
在此先感谢您的帮助。
假设您想要一个 `dplyr` 解决方案(根据问题的标签),我们按 'Species' 对数据进行分组,将 'Percent' 列降序排列(`arrange`),并使用 `slice` 获取每个 'Species' 的前两行。
library(dplyr)
# Group the rows by Species, order them by Percent from highest to lowest,
# and keep the first two rows of each Species group.
slice(
  arrange(
    group_by(Data, Species),
    desc(Percent)
  ),
  1:2
)
# IndID Species Season Percent
#1 2 BHS Summer 0.996
#2 4 BHS Summer 0.996
#3 60 MTG Summer 0.972
#4 63 MTG Summer 0.968
如果能给出预期的输出,回答起来会更容易。如果这是基于平均百分比,我们先按 'Species' 和 'IndID' 分组,根据 'Percent' 的 `mean` 创建一个新列 'AvgPercent';然后按 'Species' 分组,将 'AvgPercent' 列降序排列,得到前两个 'IndID'。
# Rank individuals by their mean Percent and keep the two highest-ranked
# individuals of each Species, shown by the first row of each individual.
Data %>%
  group_by(Species, IndID) %>%
  mutate(AvgPercent = mean(Percent)) %>%
  # Reduce to one row per individual before ranking, so the result does not
  # depend on every individual having exactly two rows (the former
  # slice(1:4) + !duplicated(IndID) combination relied on that).
  slice(1) %>%
  group_by(Species) %>%
  arrange(desc(AvgPercent)) %>%
  slice(1:2) %>%
  select(-AvgPercent)
# IndID Species Season Percent
#1 4 BHS Summer 0.996
#2 1 BHS Summer 0.992
#3 59 MTG Summer 0.956
#4 63 MTG Summer 0.968
或者用plyr
# For each Species, sort Percent from highest to lowest; columns 1:3 of the
# result hold the Species label and its two largest Percent values.
# `plyr::` is used so the call works without attaching plyr (attaching it
# after dplyr would mask dplyr verbs of the same name).
plyr::ddply(
  Data,
  "Species",
  function(x) sort(x[, "Percent"], decreasing = TRUE)
)[, 1:3]
Species V1 V2
1 BHS 0.996 0.996
2 MTG 0.972 0.968
另一个选项是利用 `tidyr` 的 `gather` 和 `spread`(下面的代码实际只用到了 `spread`):
library(dplyr)
library(tidyr)
# Reshape to one row per individual with a Summer and a Winter column,
# average the two seasons, and keep the two highest averages per Species.
Data %>%
  spread(Season, Percent) %>%
  mutate(avg = (Summer + Winter) / 2) %>%
  group_by(Species) %>%
  arrange(desc(avg)) %>%
  # Name the ranking column explicitly rather than relying on top_n()
  # defaulting to whichever column happens to be last.
  top_n(2, avg)
# IndID Species Summer Winter avg
#1 4 BHS 0.996 0.996 0.9960
#2 1 BHS 0.992 0.992 0.9920
#3 59 MTG 0.956 0.879 0.9175
#4 63 MTG 0.968 0.832 0.9000
这是一个 `data.table` 方法:
library(data.table)
# Turn `Data` into a data.table and add `avg`, the mean Percent of each
# individual (grouped by IndID and Species), as a new column.
setDT(Data)[, avg := mean(Percent), by = .(IndID, Species)]
# Among the Summer rows, collect per Species the row numbers (.I) of the two
# highest `avg` values, then use those row numbers to subset `Data`.
Data[
  Data[
    Season == "Summer",
    .I[order(avg, decreasing = TRUE)[1:2]],
    by = Species
  ]$V1
]
# IndID Species Season Percent avg
#1: 4 BHS Summer 0.996 0.9960
#2: 1 BHS Summer 0.992 0.9920
#3: 59 MTG Summer 0.956 0.9175
#4: 63 MTG Summer 0.968 0.9000
另一个 `data.table` 解决方案(需要先执行 `library(data.table)`)。
# Build a data.table `d` from `Data`; the following steps work on `d`.
d <- data.table(Data)
这一步将您的 `Data` 包装到一个 `data.table` 对象中。
制作一个新的 table,其中还列出平均百分比(每个个体夏季和冬季两个值的平均)。
# Add each individual's mean Percent to `d` as `meanPercent` (`:=` adds the
# column to `d` itself), then make `t` refer to that same table.
d[, meanPercent := mean(Percent), by = IndID]
t <- d
根据 `IndID` 合并行,使每个个体只保留一行:
# Collapse to one row per IndID: take the first row of each individual's
# subset and keep only its Species and meanPercent columns.
t <- t[, .SD[1L, list(Species, meanPercent)], by = IndID]
最后 select 每个物种平均百分比最高的两个个体。
# For each species in turn: keep that species' rows, order them by
# meanPercent from highest to lowest, and show the first two.
t[Species == "BHS"][order(-meanPercent)][1:2]
t[Species == "MTG"][order(-meanPercent)][1:2]
我有这些示例数据
# Example data: 20 rows, two per individual (IndID), with the individual's
# Species, the Season of the measurement, and the measured Percent.
Data <- data.frame(
  IndID = factor(
    rep(c("1", "2", "3", "4", "5", "56", "58", "59", "60", "63"), each = 2L),
    levels = c("1", "2", "3", "4", "5", "56", "58", "59", "60", "63")
  ),
  Species = factor(
    rep(c("BHS", "MTG"), each = 10L),
    levels = c("BHS", "MTG")
  ),
  Season = factor(
    c(
      "Summer", "Winter", "Summer", "Winter", "Winter",
      "Summer", "Summer", "Winter", "Winter", "Summer",
      "Winter", "Summer", "Summer", "Winter", "Summer",
      "Winter", "Summer", "Winter", "Summer", "Winter"
    ),
    levels = c("Summer", "Winter")
  ),
  Percent = c(
    0.992, 0.992, 0.996, 0.976, 0.995, 0.871, 0.996, 0.996, 0.916, 0.875,
    0.652, 0.802, 0.964, 0.673, 0.956, 0.879, 0.972, 0.782, 0.968, 0.832
  )
)
看起来像这样
> head(Data)
IndID Species Season Percent
1 1 BHS Summer 0.992
2 1 BHS Winter 0.992
3 2 BHS Summer 0.996
4 2 BHS Winter 0.976
5 3 BHS Winter 0.995
6 3 BHS Summer 0.871
共有 10 个不同的个体,分别属于两个物种之一(BHS 或 MTG)。每个个体 (IndID) 在每个季节(冬季和夏季)各有一个百分比值。
对于每个物种,我想选出平均百分比值最高的两个个体。
编辑:另请参阅下面我的评论。我没有给出一个特定的预期结果,因为有多种结果都可以满足我的需要。由于每个季节都会测量一次百分比,我认为取百分比的平均值是选出最佳个体的最好方法。每个季节都会测量百分比,但我想要选出排名最高的 IndID。我也可以按百分比总和(而不是平均值)对 IndID 进行排名。
除了 @akrun 发布的第二段代码的结果之外,一个由 4 个 IndID(每个物种排名最高的两个)组成的向量也是很好的输出。
在此先感谢您的帮助。
假设您想要一个 `dplyr` 解决方案(根据问题的标签),我们按 'Species' 对数据进行分组,将 'Percent' 列降序排列(`arrange`),并使用 `slice` 获取每个 'Species' 的前两行。
library(dplyr)
# Group the rows by Species, order them by Percent from highest to lowest,
# and keep the first two rows of each Species group.
slice(
  arrange(
    group_by(Data, Species),
    desc(Percent)
  ),
  1:2
)
# IndID Species Season Percent
#1 2 BHS Summer 0.996
#2 4 BHS Summer 0.996
#3 60 MTG Summer 0.972
#4 63 MTG Summer 0.968
如果能给出预期的输出,回答起来会更容易。如果这是基于平均百分比,我们先按 'Species' 和 'IndID' 分组,根据 'Percent' 的 `mean` 创建一个新列 'AvgPercent';然后按 'Species' 分组,将 'AvgPercent' 列降序排列,得到前两个 'IndID'。
# Rank individuals by their mean Percent and keep the two highest-ranked
# individuals of each Species, shown by the first row of each individual.
Data %>%
  group_by(Species, IndID) %>%
  mutate(AvgPercent = mean(Percent)) %>%
  # Reduce to one row per individual before ranking, so the result does not
  # depend on every individual having exactly two rows (the former
  # slice(1:4) + !duplicated(IndID) combination relied on that).
  slice(1) %>%
  group_by(Species) %>%
  arrange(desc(AvgPercent)) %>%
  slice(1:2) %>%
  select(-AvgPercent)
# IndID Species Season Percent
#1 4 BHS Summer 0.996
#2 1 BHS Summer 0.992
#3 59 MTG Summer 0.956
#4 63 MTG Summer 0.968
或者用plyr
# For each Species, sort Percent from highest to lowest; columns 1:3 of the
# result hold the Species label and its two largest Percent values.
# `plyr::` is used so the call works without attaching plyr (attaching it
# after dplyr would mask dplyr verbs of the same name).
plyr::ddply(
  Data,
  "Species",
  function(x) sort(x[, "Percent"], decreasing = TRUE)
)[, 1:3]
Species V1 V2
1 BHS 0.996 0.996
2 MTG 0.972 0.968
另一个选项是利用 `tidyr` 的 `gather` 和 `spread`(下面的代码实际只用到了 `spread`):
library(dplyr)
library(tidyr)
# Reshape to one row per individual with a Summer and a Winter column,
# average the two seasons, and keep the two highest averages per Species.
Data %>%
  spread(Season, Percent) %>%
  mutate(avg = (Summer + Winter) / 2) %>%
  group_by(Species) %>%
  arrange(desc(avg)) %>%
  # Name the ranking column explicitly rather than relying on top_n()
  # defaulting to whichever column happens to be last.
  top_n(2, avg)
# IndID Species Summer Winter avg
#1 4 BHS 0.996 0.996 0.9960
#2 1 BHS 0.992 0.992 0.9920
#3 59 MTG 0.956 0.879 0.9175
#4 63 MTG 0.968 0.832 0.9000
这是一个 `data.table` 方法:
library(data.table)
# Turn `Data` into a data.table and add `avg`, the mean Percent of each
# individual (grouped by IndID and Species), as a new column.
setDT(Data)[, avg := mean(Percent), by = .(IndID, Species)]
# Among the Summer rows, collect per Species the row numbers (.I) of the two
# highest `avg` values, then use those row numbers to subset `Data`.
Data[
  Data[
    Season == "Summer",
    .I[order(avg, decreasing = TRUE)[1:2]],
    by = Species
  ]$V1
]
# IndID Species Season Percent avg
#1: 4 BHS Summer 0.996 0.9960
#2: 1 BHS Summer 0.992 0.9920
#3: 59 MTG Summer 0.956 0.9175
#4: 63 MTG Summer 0.968 0.9000
另一个 `data.table` 解决方案(需要先执行 `library(data.table)`)。
# Build a data.table `d` from `Data`; the following steps work on `d`.
d <- data.table(Data)
这一步将您的 `Data` 包装到一个 `data.table` 对象中。
制作一个新的 table,其中还列出平均百分比(每个个体夏季和冬季两个值的平均)。
# Add each individual's mean Percent to `d` as `meanPercent` (`:=` adds the
# column to `d` itself), then make `t` refer to that same table.
d[, meanPercent := mean(Percent), by = IndID]
t <- d
根据 `IndID` 合并行,使每个个体只保留一行:
# Collapse to one row per IndID: take the first row of each individual's
# subset and keep only its Species and meanPercent columns.
t <- t[, .SD[1L, list(Species, meanPercent)], by = IndID]
最后 select 每个物种平均百分比最高的两个个体。
# For each species in turn: keep that species' rows, order them by
# meanPercent from highest to lowest, and show the first two.
t[Species == "BHS"][order(-meanPercent)][1:2]
t[Species == "MTG"][order(-meanPercent)][1:2]