将数据框分成两组
Split dataframe into two groups
我模拟了这个data.frame
:
# Attach plyr (provides llply) and ggplot2 (used by the plots that follow).
library(plyr)
library(ggplot2)

# Expected count at each tide step: 500 down to 0 in steps of 20.
count <- rev(seq(from = 0, to = 500, by = 20))
# One evenly spaced tide value between 0 and 5 per expected count.
tide <- seq(from = 0, to = 5, length.out = length(count))
df <- data.frame(count, tide)

# Draw 20 normally distributed observations (sd = 50) around each expected
# count, then flatten the per-count draws into one vector.
count_sim <- unlist(
  llply(count, function(mu) rnorm(n = 20, mean = mu, sd = 50))
)
# Repeat each tide value 20 times so it lines up with its 20 draws.
count_sim_df <- data.frame(tide = rep(tide, each = 20), count_sim)
也可以画成这样:
# Simulated counts as a jittered point cloud (jitter width 0.09), with the
# expected-count line drawn in red on top.
ggplot(data = df, mapping = aes(x = tide, y = count)) +
  geom_jitter(
    mapping = aes(x = tide, y = count_sim),
    data = count_sim_df,
    position = position_jitter(width = 0.09)
  ) +
  geom_line(colour = "red")
我现在想将 count_sim_df 分成两组:high 和 low。当我绘制拆分后的 count_sim_df 时,它应该看起来像这样(图中所有绿色和蓝色的部分都是用 Photoshop 处理出来的)。我发现棘手的一点是:high 和 low 要在 tide 的中间值附近相互重叠。
这就是我想要将 count_sim_df
分成高低的方式:
- 将 count_sim_df 的一半分配给 high,另一半分配给 low
- 重新分配 count 的值,使 high 和 low 在 tide 的中间值附近产生重叠
这是我修改后的建议。我希望它有所帮助。
# Split count_sim_df into a "high" and a "low" group that overlap around the
# mean tide: rows above the overlap zone go to high, rows below it go to low,
# and rows inside the zone are assigned to one of the two groups at random.
middle_tide <- mean(count_sim_df$tide)
# Half-width of the overlap zone, as a fraction of middle_tide.
hilo_margin <- 0.3

# Compute the zone limits once so all three subsets share the same cut-offs.
hilo_lower <- middle_tide * (1 - hilo_margin)
hilo_upper <- middle_tide * (1 + hilo_margin)

# Inclusive limits: a row whose tide equals a bound belongs to the overlap
# zone instead of silently dropping out of all three subsets.
middle_df <- subset(count_sim_df, tide >= hilo_lower & tide <= hilo_upper)
upper_df <- count_sim_df[count_sim_df$tide > hilo_upper, ]
lower_df <- count_sim_df[count_sim_df$tide < hilo_lower, ]

# One independent draw per overlap row: 1 -> high, 2 -> low. The split is
# random, so the two halves are only approximately equal in size.
idx <- sample(2, nrow(middle_df), replace = TRUE)
count_sim_high <- rbind(middle_df[idx == 1, ], upper_df)
count_sim_low <- rbind(middle_df[idx == 2, ], lower_df)

# Both groups as semi-transparent jittered points, with the expected-count
# line in red on top. The plot is stored in `p`; print it to display it.
p <- ggplot(df, aes(tide, count)) +
  geom_jitter(
    data = count_sim_high, aes(tide, count_sim),
    position = position_jitter(width = 0.09),
    alpha = 0.4, col = 3, size = 3
  ) +
  geom_jitter(
    data = count_sim_low, aes(tide, count_sim),
    position = position_jitter(width = 0.09),
    alpha = 0.4, col = 4, size = 3
  ) +
  geom_line(color = "red")
我可能还没有完全理解你划分 high 和 low 的过程,尤其是你说的 "reassigning the value of count" 是什么意思。在这里,我在 tide 的中间值周围定义了一个 30% 的重叠区域,并将该过渡区域内的点随机分配:大约一半归入 "high" 组,另一半归入 "low" 组。
这是一种使用相对较少的代码和仅使用基本 R 来生成示例数据集和分组的方法:
library(ggplot2)

# Expected counts fall from 500 to 0 while tide rises from 0 to 5.
count <- rev(seq(0, 500, 20))
tide <- seq(0, 5, length.out = length(count))
df <- data.frame(count, tide)

# 20 simulated observations (sd = 50) per expected count in one vectorised
# call: rep(count, each = 20) supplies the matching mean for every draw.
count_sim_df <- data.frame(
  tide = rep(tide, each = 20),
  count = rnorm(20 * nrow(df), rep(count, each = 20), 50)
)

# Share of the tide distribution, centred on the median, in which the two
# groups overlap.
margin <- 0.3

# Label every row "High" or "Low": rows at or above the upper quantile are
# always High, rows below the lower quantile are always Low, and rows in
# between are High or Low by an independent coin flip.
count_sim_df$`tide level` <- with(count_sim_df, {
  overlap_start <- quantile(tide, 0.5 - margin / 2)
  overlap_end <- quantile(tide, 0.5 + margin / 2)
  coin_flip <- sample(c(FALSE, TRUE), length(tide), replace = TRUE)
  is_high <- tide >= overlap_end | (tide >= overlap_start & coin_flip)
  # Explicit levels keep the Low/High labelling valid even if only one of
  # the two outcomes occurs in the data.
  factor(is_high, levels = c(FALSE, TRUE), labels = c("Low", "High"))
})

# Expected-count line in red, simulated points coloured by group.
ggplot(df, aes(x = tide, y = count)) +
  geom_line(colour = "red") +
  geom_point(aes(colour = `tide level`), count_sim_df, position = "jitter") +
  scale_colour_manual(values = c(High = "green", Low = "blue"))
我模拟了这个data.frame
:
# Attach plyr (provides llply) and ggplot2 (used by the plots that follow).
library(plyr)
library(ggplot2)

# Expected count at each tide step: 500 down to 0 in steps of 20.
count <- rev(seq(from = 0, to = 500, by = 20))
# One evenly spaced tide value between 0 and 5 per expected count.
tide <- seq(from = 0, to = 5, length.out = length(count))
df <- data.frame(count, tide)

# Draw 20 normally distributed observations (sd = 50) around each expected
# count, then flatten the per-count draws into one vector.
count_sim <- unlist(
  llply(count, function(mu) rnorm(n = 20, mean = mu, sd = 50))
)
# Repeat each tide value 20 times so it lines up with its 20 draws.
count_sim_df <- data.frame(tide = rep(tide, each = 20), count_sim)
也可以画成这样:
# Simulated counts as a jittered point cloud (jitter width 0.09), with the
# expected-count line drawn in red on top.
ggplot(data = df, mapping = aes(x = tide, y = count)) +
  geom_jitter(
    mapping = aes(x = tide, y = count_sim),
    data = count_sim_df,
    position = position_jitter(width = 0.09)
  ) +
  geom_line(colour = "red")
我现在想将 count_sim_df 分成两组:high 和 low。当我绘制拆分后的 count_sim_df 时,它应该看起来像这样(图中所有绿色和蓝色的部分都是用 Photoshop 处理出来的)。我发现棘手的一点是:high 和 low 要在 tide 的中间值附近相互重叠。
这就是我想要将 count_sim_df
分成高低的方式:
- 将 count_sim_df 的一半分配给 high,另一半分配给 low
- 重新分配 count 的值,使 high 和 low 在 tide 的中间值附近产生重叠
这是我修改后的建议。我希望它有所帮助。
# Split count_sim_df into a "high" and a "low" group that overlap around the
# mean tide: rows above the overlap zone go to high, rows below it go to low,
# and rows inside the zone are assigned to one of the two groups at random.
middle_tide <- mean(count_sim_df$tide)
# Half-width of the overlap zone, as a fraction of middle_tide.
hilo_margin <- 0.3

# Compute the zone limits once so all three subsets share the same cut-offs.
hilo_lower <- middle_tide * (1 - hilo_margin)
hilo_upper <- middle_tide * (1 + hilo_margin)

# Inclusive limits: a row whose tide equals a bound belongs to the overlap
# zone instead of silently dropping out of all three subsets.
middle_df <- subset(count_sim_df, tide >= hilo_lower & tide <= hilo_upper)
upper_df <- count_sim_df[count_sim_df$tide > hilo_upper, ]
lower_df <- count_sim_df[count_sim_df$tide < hilo_lower, ]

# One independent draw per overlap row: 1 -> high, 2 -> low. The split is
# random, so the two halves are only approximately equal in size.
idx <- sample(2, nrow(middle_df), replace = TRUE)
count_sim_high <- rbind(middle_df[idx == 1, ], upper_df)
count_sim_low <- rbind(middle_df[idx == 2, ], lower_df)

# Both groups as semi-transparent jittered points, with the expected-count
# line in red on top. The plot is stored in `p`; print it to display it.
p <- ggplot(df, aes(tide, count)) +
  geom_jitter(
    data = count_sim_high, aes(tide, count_sim),
    position = position_jitter(width = 0.09),
    alpha = 0.4, col = 3, size = 3
  ) +
  geom_jitter(
    data = count_sim_low, aes(tide, count_sim),
    position = position_jitter(width = 0.09),
    alpha = 0.4, col = 4, size = 3
  ) +
  geom_line(color = "red")
我可能还没有完全理解你划分 high 和 low 的过程,尤其是你说的 "reassigning the value of count" 是什么意思。在这里,我在 tide 的中间值周围定义了一个 30% 的重叠区域,并将该过渡区域内的点随机分配:大约一半归入 "high" 组,另一半归入 "low" 组。
这是一种使用相对较少的代码和仅使用基本 R 来生成示例数据集和分组的方法:
library(ggplot2)

# Expected counts fall from 500 to 0 while tide rises from 0 to 5.
count <- rev(seq(0, 500, 20))
tide <- seq(0, 5, length.out = length(count))
df <- data.frame(count, tide)

# 20 simulated observations (sd = 50) per expected count in one vectorised
# call: rep(count, each = 20) supplies the matching mean for every draw.
count_sim_df <- data.frame(
  tide = rep(tide, each = 20),
  count = rnorm(20 * nrow(df), rep(count, each = 20), 50)
)

# Share of the tide distribution, centred on the median, in which the two
# groups overlap.
margin <- 0.3

# Label every row "High" or "Low": rows at or above the upper quantile are
# always High, rows below the lower quantile are always Low, and rows in
# between are High or Low by an independent coin flip.
count_sim_df$`tide level` <- with(count_sim_df, {
  overlap_start <- quantile(tide, 0.5 - margin / 2)
  overlap_end <- quantile(tide, 0.5 + margin / 2)
  coin_flip <- sample(c(FALSE, TRUE), length(tide), replace = TRUE)
  is_high <- tide >= overlap_end | (tide >= overlap_start & coin_flip)
  # Explicit levels keep the Low/High labelling valid even if only one of
  # the two outcomes occurs in the data.
  factor(is_high, levels = c(FALSE, TRUE), labels = c("Low", "High"))
})

# Expected-count line in red, simulated points coloured by group.
ggplot(df, aes(x = tide, y = count)) +
  geom_line(colour = "red") +
  geom_point(aes(colour = `tide level`), count_sim_df, position = "jitter") +
  scale_colour_manual(values = c(High = "green", Low = "blue"))