计算R中每个组的凸包
Calculating convex hull for each group in R
我有以下数据集:
structure(list(time = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
x = c(40.8914337158203, 20.0796813964844, 13.9093618392944,
17.1513957977295, 18.5109558105469, 40.7868537902832, 19.9750995635986,
13.804780960083, 16.8376483917236, 18.4063758850098, 40.6822700500488,
19.7659358978271, 13.7001991271973, 16.6284866333008, 18.3017921447754,
40.5776901245117, 19.66135597229, 13.5956182479858, 16.3147411346436,
18.1972122192383, 40.5776901245117, 19.5567722320557, 13.4910354614258,
16.1055774688721, 17.9880485534668), y = c(0.603550314903259,
-8.24852085113525, 9.65680503845215, -19.0118350982666, 6.43787002563477,
0.704141974449158, -8.34911251068115, 9.75739574432373, -19.2130165100098,
6.43787002563477, 0.704141974449158, -8.44970417022705, 9.75739574432373,
-19.5147914886475, 6.43787002563477, 0.704141974449158, -8.65088748931885,
9.85798835754395, -19.8165683746338, 6.33727836608887, 0.704141974449158,
-8.85207080841064, 9.85798835754395, -20.1183433532715, 6.33727836608887
), object = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -25L), .Names = c("time",
"x", "y", "object"))
现在,我想为 time
的每个值计算一个凸包(使用 chull
函数)并将其存储在同一数据集中(因为我想用ggplot2
然后)。
我可以使用 with
为每个时间值使用 chull
chull(filter(data_sample, time == 1)$x, filter(data_sample, time == 1)$y)
其中 returns 是 4 3 1
的向量。所以我想我可以先按时间分组,然后用
之类的东西计算组内的凸包点
data_sample %>% group_by(time) %>% summarise(pts = chull(data_sample$x, data_sample$y))
问题是我无法连续存储向量。将每个顶点存储在单独的列中是一个选项,但以下
data_sample %>% group_by(time) %>% summarise(pt1 = chull(data_sample$x, data_sample$y)[1])
没有给出合理的结果。所以我的问题是:
1. 如何为一列中的每一行存储一个向量?我读过 tibbles 实际上可以有一个列表列,但我该如何创建它呢?
2. 我尝试在每个组中计算 chull
有什么问题?
- (额外问题,如果可以的话)为什么实际上
data_sample %>% filter(time == 1) %>% chull(.$x, .$y)
不起作用?这是因为 chull
不是设计用于管道和 dplyr
吗?
您可以简单地在列表中传递 chull 函数:
df <- df %>%
group_by(time) %>%
mutate(chull_val = list(chull(x,y)))
如果您不想使用列表列*,您可以考虑使用(更灵活的)data.table
.
library(data.table)
setDT(d)
d[d[ , .I[chull(x, y)], by = time]$V1]
说明:将您的数据转换为data.table
(setDT(d)
)。对于每次 (by = time
),计算 chull
索引和 select 对应的行 (.I
)(参见 here)。
如果要绘制chull
个多边形,需要添加第一个索引来闭合多边形。
d2 <- d[ , {
# for each time (by = time):
# compute the indices lying on the convex hull
ix <- chull(x, y)
# use indices to select data of each subset (.SD)
# possibly also add the first coordinate to close the polygon for plotting
.SD[c(ix, ix[1])]}, by = time]
# plot chull and original polygons
library(ggplot2)
ggplot(d2, aes(x, y, fill = factor(time))) +
geom_polygon(alpha = 0.2) +
geom_polygon(data = d, alpha = 0.2)
*相关 dplyr
问题:Summarising verbs with variable-length outputs, Optional parameter to control length of summarise.
由于 chull
为您提供了原始数据的索引,您可能希望随时保留坐标,这意味着您可能不应该使用 summarize
。我建议您像使用 tidyr
一样使用 "nested" 概念。第一步是嵌套数据:
library(tidyr)
data_sample %>%
group_by(time) %>%
nest()
# # A tibble: 5 × 2
# time data
# <int> <list>
# 1 1 <tibble [5 × 3]>
# 2 2 <tibble [5 × 3]>
# 3 3 <tibble [5 × 3]>
# 4 4 <tibble [5 × 3]>
# 5 5 <tibble [5 × 3]>
从这里开始,只需计算外壳(这将 return 一个索引向量),然后按提供的顺序输出相关行。这将受益于 purrr
:
提供的 map
功能
library(purrr)
data_sample %>% data_sample %>%
group_by(time) %>%
nest() %>%
mutate(
hull = map(data, ~ with(.x, chull(x, y))),
out = map2(data, hull, ~ .x[.y,,drop=FALSE])
)
# # A tibble: 5 × 4
# time data hull out
# <int> <list> <list> <list>
# 1 1 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 2 2 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 3 3 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 4 4 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 5 5 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
(您应该能够将两个作业放在一个 mutate
中。我
从这里,您可以通过删除现在不需要的列并取消嵌套,将其转换为您需要的坐标:
data_sample %>%
group_by(time) %>%
nest() %>%
mutate(
hull = map(data, ~ with(.x, chull(x, y))),
out = map2(data, hull, ~ .x[.y,,drop=FALSE])
) %>%
select(-data) %>%
unnest()
# # A tibble: 15 × 5
# time hull x y object
# <int> <int> <dbl> <dbl> <int>
# 1 1 4 17.15140 -19.0118351 4
# 2 1 3 13.90936 9.6568050 3
# 3 1 1 40.89143 0.6035503 1
# 4 2 4 16.83765 -19.2130165 4
# 5 2 3 13.80478 9.7573957 3
# 6 2 1 40.78685 0.7041420 1
# 7 3 4 16.62849 -19.5147915 4
# 8 3 3 13.70020 9.7573957 3
# 9 3 1 40.68227 0.7041420 1
# 10 4 4 16.31474 -19.8165684 4
# 11 4 3 13.59562 9.8579884 3
# 12 4 1 40.57769 0.7041420 1
# 13 5 4 16.10558 -20.1183434 4
# 14 5 3 13.49104 9.8579884 3
# 15 5 1 40.57769 0.7041420 1
(出于演示目的,我在此处保留了 hull
;您可能可以在上方 select(-data, -hull)
,因为您将拥有所需的内容,尤其是在 object
多余的情况下。)
对于你的最后一个问题,你可以选择以下任一方法:
filter(data_sample, time == 1) %>%
with(., chull(x, y))
with(filter(data_sample, time == 1), chull(x, y))
我有以下数据集:
structure(list(time = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
x = c(40.8914337158203, 20.0796813964844, 13.9093618392944,
17.1513957977295, 18.5109558105469, 40.7868537902832, 19.9750995635986,
13.804780960083, 16.8376483917236, 18.4063758850098, 40.6822700500488,
19.7659358978271, 13.7001991271973, 16.6284866333008, 18.3017921447754,
40.5776901245117, 19.66135597229, 13.5956182479858, 16.3147411346436,
18.1972122192383, 40.5776901245117, 19.5567722320557, 13.4910354614258,
16.1055774688721, 17.9880485534668), y = c(0.603550314903259,
-8.24852085113525, 9.65680503845215, -19.0118350982666, 6.43787002563477,
0.704141974449158, -8.34911251068115, 9.75739574432373, -19.2130165100098,
6.43787002563477, 0.704141974449158, -8.44970417022705, 9.75739574432373,
-19.5147914886475, 6.43787002563477, 0.704141974449158, -8.65088748931885,
9.85798835754395, -19.8165683746338, 6.33727836608887, 0.704141974449158,
-8.85207080841064, 9.85798835754395, -20.1183433532715, 6.33727836608887
), object = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -25L), .Names = c("time",
"x", "y", "object"))
现在,我想为 time
的每个值计算一个凸包(使用 chull
函数)并将其存储在同一数据集中(因为我想用ggplot2
然后)。
我可以使用 with
chull
chull(filter(data_sample, time == 1)$x, filter(data_sample, time == 1)$y)
其中 returns 是 4 3 1
的向量。所以我想我可以先按时间分组,然后用
data_sample %>% group_by(time) %>% summarise(pts = chull(data_sample$x, data_sample$y))
问题是我无法连续存储向量。将每个顶点存储在单独的列中是一个选项,但以下
data_sample %>% group_by(time) %>% summarise(pt1 = chull(data_sample$x, data_sample$y)[1])
没有给出合理的结果。所以我的问题是:
1. 如何为一列中的每一行存储一个向量?我读过 tibbles 实际上可以有一个列表列,但我该如何创建它呢?
2. 我尝试在每个组中计算 chull
有什么问题?
- (额外问题,如果可以的话)为什么实际上
data_sample %>% filter(time == 1) %>% chull(.$x, .$y)
不起作用?这是因为chull
不是设计用于管道和dplyr
吗?
您可以简单地在列表中传递 chull 函数:
df <- df %>%
group_by(time) %>%
mutate(chull_val = list(chull(x,y)))
如果您不想使用列表列*,您可以考虑使用(更灵活的)data.table
.
library(data.table)
setDT(d)
d[d[ , .I[chull(x, y)], by = time]$V1]
说明:将您的数据转换为data.table
(setDT(d)
)。对于每次 (by = time
),计算 chull
索引和 select 对应的行 (.I
)(参见 here)。
如果要绘制chull
个多边形,需要添加第一个索引来闭合多边形。
d2 <- d[ , {
# for each time (by = time):
# compute the indices lying on the convex hull
ix <- chull(x, y)
# use indices to select data of each subset (.SD)
# possibly also add the first coordinate to close the polygon for plotting
.SD[c(ix, ix[1])]}, by = time]
# plot chull and original polygons
library(ggplot2)
ggplot(d2, aes(x, y, fill = factor(time))) +
geom_polygon(alpha = 0.2) +
geom_polygon(data = d, alpha = 0.2)
*相关 dplyr
问题:Summarising verbs with variable-length outputs, Optional parameter to control length of summarise.
由于 chull
为您提供了原始数据的索引,您可能希望随时保留坐标,这意味着您可能不应该使用 summarize
。我建议您像使用 tidyr
一样使用 "nested" 概念。第一步是嵌套数据:
library(tidyr)
data_sample %>%
group_by(time) %>%
nest()
# # A tibble: 5 × 2
# time data
# <int> <list>
# 1 1 <tibble [5 × 3]>
# 2 2 <tibble [5 × 3]>
# 3 3 <tibble [5 × 3]>
# 4 4 <tibble [5 × 3]>
# 5 5 <tibble [5 × 3]>
从这里开始,只需计算外壳(这将 return 一个索引向量),然后按提供的顺序输出相关行。这将受益于 purrr
:
map
功能
library(purrr)
data_sample %>% data_sample %>%
group_by(time) %>%
nest() %>%
mutate(
hull = map(data, ~ with(.x, chull(x, y))),
out = map2(data, hull, ~ .x[.y,,drop=FALSE])
)
# # A tibble: 5 × 4
# time data hull out
# <int> <list> <list> <list>
# 1 1 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 2 2 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 3 3 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 4 4 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
# 5 5 <tibble [5 × 3]> <int [3]> <tibble [3 × 3]>
(您应该能够将两个作业放在一个 mutate
中。我
从这里,您可以通过删除现在不需要的列并取消嵌套,将其转换为您需要的坐标:
data_sample %>%
group_by(time) %>%
nest() %>%
mutate(
hull = map(data, ~ with(.x, chull(x, y))),
out = map2(data, hull, ~ .x[.y,,drop=FALSE])
) %>%
select(-data) %>%
unnest()
# # A tibble: 15 × 5
# time hull x y object
# <int> <int> <dbl> <dbl> <int>
# 1 1 4 17.15140 -19.0118351 4
# 2 1 3 13.90936 9.6568050 3
# 3 1 1 40.89143 0.6035503 1
# 4 2 4 16.83765 -19.2130165 4
# 5 2 3 13.80478 9.7573957 3
# 6 2 1 40.78685 0.7041420 1
# 7 3 4 16.62849 -19.5147915 4
# 8 3 3 13.70020 9.7573957 3
# 9 3 1 40.68227 0.7041420 1
# 10 4 4 16.31474 -19.8165684 4
# 11 4 3 13.59562 9.8579884 3
# 12 4 1 40.57769 0.7041420 1
# 13 5 4 16.10558 -20.1183434 4
# 14 5 3 13.49104 9.8579884 3
# 15 5 1 40.57769 0.7041420 1
(出于演示目的,我在此处保留了 hull
;您可能可以在上方 select(-data, -hull)
,因为您将拥有所需的内容,尤其是在 object
多余的情况下。)
对于你的最后一个问题,你可以选择以下任一方法:
filter(data_sample, time == 1) %>%
with(., chull(x, y))
with(filter(data_sample, time == 1), chull(x, y))