数据帧 R 的条件子集
Conditional subsetting of a data frame R
设数据框为:
set.seed(123)
df<-data.frame(name=sample(LETTERS,260,replace=TRUE),
hobby=rep(c("outdoor","indoor"),260),chess=rnorm(1:10))
我将从 df 中提取的条件是:
df_cond<-df %>% group_by(name,hobby) %>%
summarize(count=n()) %>%
mutate(sum.var=sum(count),sum.name=length(name)) %>%
filter(sum.name==2) %>%
mutate(min.var=min(count)) %>%
mutate(use=ifelse(min.var==count,"yes","no")) %>%
filter(grepl("yes",use))
我想从 df
中随机提取与 df_cond
中的(姓名、爱好、计数)组合相对应的行以及 df
中的其余行。我在组合 %in%
和 sample
时遇到了一些麻烦。感谢您提供任何线索!
编辑:例如:
head(df_cond)
name hobby count sum.var sum.name min.var use
<fctr> <fctr> <int> <int> <int> <int> <chr>
1 A indoor 2 6 2 2 yes
2 B indoor 8 16 2 8 yes
3 B outdoor 8 16 2 8 yes
4 C outdoor 6 14 2 6 yes
5 D indoor 10 24 2 10 yes
6 E outdoor 8 18 2 8 yes
使用上面的数据框,我想从 df
中随机提取 2 行(=count),组合为 A+indoor(row1),
8 行,组合 B+室内(第 2 行)来自 df
....等等。
结合@denrous 和@Jacob 的答案来得到我需要的。像这样:
m2<-df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {df %>% filter(name == x, hobby == y)})) %>%
ungroup() %>%
select(data) %>%
unnest()
test<-m2 %>%
group_by(name,hobby) %>%
summarize(num.levels=length(unique(hobby))) %>%
ungroup() %>%
group_by(name) %>%
summarize(total_levels=sum(num.levels)) %>%
filter(total_levels>1)
fin<-semi_join(m2,test)
不清楚这是否正是您想要的,但您可能正在寻找 left_join
:
df %>%
left_join(df_cond, by = "name")
根据 OP 说明进行编辑。
必须有更好的方法,但我会使用循环:
library(dplyr)
master_df <- data.frame()
for (i in 1:nrow(df_cond)){
name = as.character(df_cond[i, 1])
hobby = as.character(df_cond[i, 2])
n = as.numeric(df_cond[i, 3])
temp_df <- df %>% filter(name == name, hobby == hobby)
temp_df <- sample_n(temp_df, n)
master_df <- rbind(master_df, temp_df)
}
如果我没理解错的话,你可以使用purrr
来实现你想要的:
df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {filter(df, name == x, hobby == y)})) %>%
mutate(data = map2(data, count, function(x, y) sample_n(x, size = y)))
如果您想要与 df 相同的形式:
df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {df %>% filter(name == x, hobby == y)})) %>%
mutate(data = map2(data, count, function(x, y) sample_n(x, size = y))) %>%
ungroup() %>%
select(data) %>%
unnest()
设数据框为:
set.seed(123)
df<-data.frame(name=sample(LETTERS,260,replace=TRUE),
hobby=rep(c("outdoor","indoor"),260),chess=rnorm(1:10))
我将从 df 中提取的条件是:
df_cond<-df %>% group_by(name,hobby) %>%
summarize(count=n()) %>%
mutate(sum.var=sum(count),sum.name=length(name)) %>%
filter(sum.name==2) %>%
mutate(min.var=min(count)) %>%
mutate(use=ifelse(min.var==count,"yes","no")) %>%
filter(grepl("yes",use))
我想从 df
中随机提取与 df_cond
中的(姓名、爱好、计数)组合相对应的行以及 df
中的其余行。我在组合 %in%
和 sample
时遇到了一些麻烦。感谢您提供任何线索!
编辑:例如:
head(df_cond)
name hobby count sum.var sum.name min.var use
<fctr> <fctr> <int> <int> <int> <int> <chr>
1 A indoor 2 6 2 2 yes
2 B indoor 8 16 2 8 yes
3 B outdoor 8 16 2 8 yes
4 C outdoor 6 14 2 6 yes
5 D indoor 10 24 2 10 yes
6 E outdoor 8 18 2 8 yes
使用上面的数据框,我想从 df
中随机提取 2 行(=count),组合为 A+indoor(row1),
8 行,组合 B+室内(第 2 行)来自 df
....等等。
结合@denrous 和@Jacob 的答案来得到我需要的。像这样:
m2<-df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {df %>% filter(name == x, hobby == y)})) %>%
ungroup() %>%
select(data) %>%
unnest()
test<-m2 %>%
group_by(name,hobby) %>%
summarize(num.levels=length(unique(hobby))) %>%
ungroup() %>%
group_by(name) %>%
summarize(total_levels=sum(num.levels)) %>%
filter(total_levels>1)
fin<-semi_join(m2,test)
不清楚这是否正是您想要的,但您可能正在寻找 left_join
:
df %>%
left_join(df_cond, by = "name")
根据 OP 说明进行编辑。
必须有更好的方法,但我会使用循环:
library(dplyr)
master_df <- data.frame()
for (i in 1:nrow(df_cond)){
name = as.character(df_cond[i, 1])
hobby = as.character(df_cond[i, 2])
n = as.numeric(df_cond[i, 3])
temp_df <- df %>% filter(name == name, hobby == hobby)
temp_df <- sample_n(temp_df, n)
master_df <- rbind(master_df, temp_df)
}
如果我没理解错的话,你可以使用purrr
来实现你想要的:
df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {filter(df, name == x, hobby == y)})) %>%
mutate(data = map2(data, count, function(x, y) sample_n(x, size = y)))
如果您想要与 df 相同的形式:
df_cond %>%
mutate(data = map2(name, hobby, function(x, y) {df %>% filter(name == x, hobby == y)})) %>%
mutate(data = map2(data, count, function(x, y) sample_n(x, size = y))) %>%
ungroup() %>%
select(data) %>%
unnest()