R - 如何在数据框中每行找到三个最小值?
R - How to find three smallest values per row in a data frame?
我有一个数据框,其中包含几个雨量计之间的距离,如下所示:
df <- structure(list(`1549` = c(NA, 490774.05, 290832.68, 87750.38,
138531.18, 103870.34, 112919.7, 19625.65, 398693.43, 73514.23,
129691.63, 64279.9, 141587.34, 43643.42, 122327.17, 223922.21,
159877.72, 259277.48, 397058.24, 162170.11), `1550` = c(490774.05,
NA, 346526.87, 575898.52, 437822.83, 576249.76, 568218.1, 509608.26,
123809.9, 562535.73, 564793.88, 550108.84, 407124.69, 484522.14,
438163.36, 266959.95, 426227.74, 288355.87, 118585.43, 451437.16
), `1551` = c(290832.68, 346526.87, NA, 344074.16, 162014.5,
322268.72, 302065.93, 308396.36, 222759.32, 336164.42, 285694.49,
351932.11, 149572.61, 256425.81, 174567.46, 207661.81, 140177.31,
83159.56, 228092.01, 154156.1), `1552` = c(87750.38, 575898.52,
344074.16, NA, 182091.17, 40867.26, 66590.75, 71839.53, 479119.61,
14460.63, 93261.62, 55663.25, 198493.52, 94903.03, 169984.4,
309812.1, 204134.81, 328832.15, 478194.47, 195100.02), `1553` = c(138531.18,
437822.83, 162014.5, 182091.17, NA, 161552.92, 143725.71, 153432.89,
325263.63, 174539.44, 132252.45, 194340.98, 31927.37, 98444.64,
16790.01, 198440.03, 22316.41, 159524.37, 326631.15, 29460.62
), `1554` = c(103870.34, 576249.76, 322268.72, 40867.26, 161552.92,
NA, 26830.77, 93867.72, 473756.48, 47917.78, 53473.37, 92965.23,
183147.09, 92489.91, 152245.46, 313474.9, 182522.18, 316347.16,
473608.16, 169315.19), `1555` = c(112919.7, 568218.1, 302065.93,
66590.75, 143725.71, 26830.77, NA, 107114.83, 462486.94, 71081.79,
26755.47, 114938.18, 168181.74, 90811.33, 136477.11, 308852.76,
163624.08, 301551.94, 462815.84, 148089.64), `1556` = c(19625.65,
509608.26, 308396.36, 71839.53, 153432.89, 93867.72, 107114.83,
NA, 418219.65, 57381.36, 127019.34, 44887.51, 158865.19, 55484.99,
137694.58, 242667.11, 175231.39, 278552.75, 416536.23, 175455.6
), `1557` = c(398693.43, 123809.9, 222759.32, 479119.61, 325263.63,
473756.48, 462486.94, 418219.65, NA, 466733.89, 455884.87, 461489.12,
296185.66, 384844.71, 327993.12, 185684.32, 311265.55, 169212.4,
9134.61, 335535.36), `1558` = c(73514.23, 562535.73, 336164.42,
14460.63, 174539.44, 47917.78, 71081.79, 57381.36, 466733.89,
NA, 96997.95, 45101.5, 189181.18, 83727.19, 161645.62, 296199.74,
196763.58, 318209.6, 465667.59, 189348.21), `1559` = c(129691.63,
564793.88, 285694.49, 93261.62, 132252.45, 53473.37, 26755.47,
127019.34, 455884.87, 96997.95, NA, 139876.37, 159753.19, 99985.18,
127897.21, 309991.97, 150269.39, 291728.65, 456729.45, 131782.96
), `1561` = c(64279.9, 550108.84, 351932.11, 55663.25, 194340.98,
92965.23, 114938.18, 44887.51, 461489.12, 45101.5, 139876.37,
NA, 202394.04, 96114.27, 179318.52, 283357.46, 216503.97, 323440.15,
459552.78, 214003.26), `1562` = c(141587.34, 407124.69, 149572.61,
198493.52, 31927.37, 183147.09, 168181.74, 158865.19, 296185.66,
189181.18, 159753.19, 202394.04, NA, 107341.48, 31911.1, 166860.76,
33107.97, 133450.41, 297181.53, 55912.87), `1563` = c(43643.42,
484522.14, 256425.81, 94903.03, 98444.64, 92489.91, 90811.33,
55484.99, 384844.71, 83727.19, 99985.18, 96114.27, 107341.48,
NA, 83210.44, 220988.12, 120496.54, 234506.15, 384171.99, 119976.68
), `1564` = c(122327.17, 438163.36, 174567.46, 169984.4, 16790.01,
152245.46, 136477.11, 137694.58, 327993.12, 161645.62, 127897.21,
179318.52, 31911.1, 83210.44, NA, 192483.7, 37576.32, 165076.25,
328900.73, 45164.38), `1565` = c(223922.21, 266959.95, 207661.81,
309812.1, 198440.03, 313474.9, 308852.76, 242667.11, 185684.32,
296199.74, 309991.97, 283357.46, 166860.76, 220988.12, 192483.7,
NA, 196022.38, 126522.76, 181932.96, 221505.05), `1566` = c(159877.72,
426227.74, 140177.31, 204134.81, 22316.41, 182522.18, 163624.08,
175231.39, 311265.55, 196763.58, 150269.39, 216503.97, 33107.97,
120496.54, 37576.32, 196022.38, NA, 143436.02, 313128.42, 26013.61
), `1567` = c(259277.48, 288355.87, 83159.56, 328832.15, 159524.37,
316347.16, 301551.94, 278552.75, 169212.4, 318209.6, 291728.65,
323440.15, 133450.41, 234506.15, 165076.25, 126522.76, 143436.02,
NA, 171940.61, 166888.68), `1568` = c(397058.24, 118585.43, 228092.01,
478194.47, 326631.15, 473608.16, 462815.84, 416536.23, 9134.61,
465667.59, 456729.45, 459552.78, 297181.53, 384171.99, 328900.73,
181932.96, 313128.42, 171940.61, NA, 337646.57), `1569` = c(162170.11,
451437.16, 154156.1, 195100.02, 29460.62, 169315.19, 148089.64,
175455.6, 335535.36, 189348.21, 131782.96, 214003.26, 55912.87,
119976.68, 45164.38, 221505.05, 26013.61, 166888.68, 337646.57,
NA)), row.names = c(1549L, 1550L, 1551L, 1552L, 1553L, 1554L,
1555L, 1556L, 1557L, 1558L, 1559L, 1561L, 1562L, 1563L, 1564L,
1565L, 1566L, 1567L, 1568L, 1569L), class = "data.frame")
行名和列名是每个雨量计的 ID。雨量计到自身的距离为 0,所以我用 NA 替换了 0。
对于每一行,我想找到距离最近的三个雨量计,并将它们的 ID 和距离一起保存。预期的输出如下:
id nearest distance
1549 1556 19625.65
1549 1563 43643.42
1549 1561 64279.90
1550 1568 118585.43
1550 1557 123809.90
1550 1565 266959.95
等等。我可以用代码 t(apply(df, 1, sort)[ 1:3, ])
找到最近的三个点,但我很难获得它们的 ID 并组装最终数据框...
我的实际数据集更大 (6289 x 6289),那么达到预期结果的最快方法是什么?
我建议将您的数据转换为如下所示的长格式。然后,您可以按 ID 对它们进行分组,并找到每个 ID 的最小值:
library(tidyr)
library(dplyr)
# Reshape the distance table to long form (one row per ID1/ID2 pair) and
# report, for every ID1, its smallest distance and the ID2 that attains it.
df %>%
  # keep the row names as an ordinary column so they survive the reshape
  mutate(ID1 = rownames(.)) %>%
  # every column except ID1 is folded into an (ID2, dist) pair
  gather(key = 'ID2', value = 'dist', -ID1) %>%
  # discard the NA entries before taking minima
  filter(!is.na(dist)) %>%
  group_by(ID1) %>%
  summarise(
    mindist = min(dist),
    nearest = ID2[which.min(dist)]
  )
编辑:对于最小的 n 个距离,您可以在分组后进行排序,然后选取前 n 个:
# For every ID1, keep its three smallest distances together with the matching
# ID2 and a 1-based rank (dist_rank) of each distance within that ID1 group.
df %>%
  # keep the row names as an ordinary column so they survive the reshape
  mutate(ID1 = rownames(.)) %>%
  gather('ID2', 'dist', -ID1) %>%
  # discard the NA entries so they can never be picked as a neighbour
  filter(!is.na(dist)) %>%
  group_by(ID1) %>%
  # sorting the whole table by dist also leaves each group's rows in ascending order
  arrange(dist) %>%
  # slice() silently returns fewer rows when a group has fewer than three
  slice(1:3) %>%
  # row_number() adapts to the actual group size; a fixed 1:3 raises a
  # size-mismatch error for any group that kept fewer than three rows
  mutate(dist_rank = row_number())
一种选择是使用 pivot_longer() 和 top_n()。
library(dplyr)
library(tidyr)
library(tibble)
# Long-format variant: one row per (rowname, name) pair, then keep the three
# smallest values per rowname and order the result by rowname and value.
df %>%
  # move the row names into a column called "rowname"
  rownames_to_column(var = "rowname") %>%
  # all remaining columns become name/value pairs
  pivot_longer(cols = -rowname, names_to = "name", values_to = "value") %>%
  group_by(rowname) %>%
  # a negative n asks top_n() for the smallest values instead of the largest
  top_n(n = -3, wt = value) %>%
  arrange(rowname, value)
# A tibble: 60 x 3
# Groups: rowname [20]
rowname name value
<chr> <chr> <dbl>
1 1549 1556 19626.
2 1549 1563 43643.
3 1549 1561 64280.
4 1550 1568 118585.
5 1550 1557 123810.
6 1550 1565 266960.
7 1551 1567 83160.
8 1551 1566 140177.
9 1551 1562 149573.
10 1552 1558 14461.
# ... with 50 more rows
在基础 R 中,使用行名称创建所需的变量。
# Build a long table with one row per (id, nearest) pair: for every row of df,
# the (up to) three smallest non-NA values and the column names they belong to.
# Result columns: id (row name of df), nearest (column name), distance (value).
res <- local({
  # Convert once up front: slicing a row out of a matrix is much cheaper than
  # df[i, ], which matters for a table with thousands of rows and columns.
  m <- as.matrix(df)
  # seq_len() yields an empty loop for a zero-row input (1:nrow(df) would give 1, 0).
  do.call(rbind, lapply(seq_len(nrow(m)), function(i) {
    d <- m[i, ]
    # na.last = NA drops NA entries; ordering a plain numeric vector also
    # avoids calling sort() on a data frame, which recent R versions warn about.
    idx <- head(order(d, na.last = NA), 3L)
    data.frame(id = rownames(df)[i], nearest = names(df)[idx],
               distance = unname(d[idx]), stringsAsFactors = FALSE)
  }))
})
head(res)
# id nearest distance
# 1 1549 1556 19625.65
# 2 1549 1563 43643.42
# 3 1549 1561 64279.90
# 4 1550 1568 118585.43
# 5 1550 1557 123809.90
# 6 1550 1565 266959.95
我有一个数据框,其中包含几个雨量计之间的距离,如下所示:
df <- structure(list(`1549` = c(NA, 490774.05, 290832.68, 87750.38,
138531.18, 103870.34, 112919.7, 19625.65, 398693.43, 73514.23,
129691.63, 64279.9, 141587.34, 43643.42, 122327.17, 223922.21,
159877.72, 259277.48, 397058.24, 162170.11), `1550` = c(490774.05,
NA, 346526.87, 575898.52, 437822.83, 576249.76, 568218.1, 509608.26,
123809.9, 562535.73, 564793.88, 550108.84, 407124.69, 484522.14,
438163.36, 266959.95, 426227.74, 288355.87, 118585.43, 451437.16
), `1551` = c(290832.68, 346526.87, NA, 344074.16, 162014.5,
322268.72, 302065.93, 308396.36, 222759.32, 336164.42, 285694.49,
351932.11, 149572.61, 256425.81, 174567.46, 207661.81, 140177.31,
83159.56, 228092.01, 154156.1), `1552` = c(87750.38, 575898.52,
344074.16, NA, 182091.17, 40867.26, 66590.75, 71839.53, 479119.61,
14460.63, 93261.62, 55663.25, 198493.52, 94903.03, 169984.4,
309812.1, 204134.81, 328832.15, 478194.47, 195100.02), `1553` = c(138531.18,
437822.83, 162014.5, 182091.17, NA, 161552.92, 143725.71, 153432.89,
325263.63, 174539.44, 132252.45, 194340.98, 31927.37, 98444.64,
16790.01, 198440.03, 22316.41, 159524.37, 326631.15, 29460.62
), `1554` = c(103870.34, 576249.76, 322268.72, 40867.26, 161552.92,
NA, 26830.77, 93867.72, 473756.48, 47917.78, 53473.37, 92965.23,
183147.09, 92489.91, 152245.46, 313474.9, 182522.18, 316347.16,
473608.16, 169315.19), `1555` = c(112919.7, 568218.1, 302065.93,
66590.75, 143725.71, 26830.77, NA, 107114.83, 462486.94, 71081.79,
26755.47, 114938.18, 168181.74, 90811.33, 136477.11, 308852.76,
163624.08, 301551.94, 462815.84, 148089.64), `1556` = c(19625.65,
509608.26, 308396.36, 71839.53, 153432.89, 93867.72, 107114.83,
NA, 418219.65, 57381.36, 127019.34, 44887.51, 158865.19, 55484.99,
137694.58, 242667.11, 175231.39, 278552.75, 416536.23, 175455.6
), `1557` = c(398693.43, 123809.9, 222759.32, 479119.61, 325263.63,
473756.48, 462486.94, 418219.65, NA, 466733.89, 455884.87, 461489.12,
296185.66, 384844.71, 327993.12, 185684.32, 311265.55, 169212.4,
9134.61, 335535.36), `1558` = c(73514.23, 562535.73, 336164.42,
14460.63, 174539.44, 47917.78, 71081.79, 57381.36, 466733.89,
NA, 96997.95, 45101.5, 189181.18, 83727.19, 161645.62, 296199.74,
196763.58, 318209.6, 465667.59, 189348.21), `1559` = c(129691.63,
564793.88, 285694.49, 93261.62, 132252.45, 53473.37, 26755.47,
127019.34, 455884.87, 96997.95, NA, 139876.37, 159753.19, 99985.18,
127897.21, 309991.97, 150269.39, 291728.65, 456729.45, 131782.96
), `1561` = c(64279.9, 550108.84, 351932.11, 55663.25, 194340.98,
92965.23, 114938.18, 44887.51, 461489.12, 45101.5, 139876.37,
NA, 202394.04, 96114.27, 179318.52, 283357.46, 216503.97, 323440.15,
459552.78, 214003.26), `1562` = c(141587.34, 407124.69, 149572.61,
198493.52, 31927.37, 183147.09, 168181.74, 158865.19, 296185.66,
189181.18, 159753.19, 202394.04, NA, 107341.48, 31911.1, 166860.76,
33107.97, 133450.41, 297181.53, 55912.87), `1563` = c(43643.42,
484522.14, 256425.81, 94903.03, 98444.64, 92489.91, 90811.33,
55484.99, 384844.71, 83727.19, 99985.18, 96114.27, 107341.48,
NA, 83210.44, 220988.12, 120496.54, 234506.15, 384171.99, 119976.68
), `1564` = c(122327.17, 438163.36, 174567.46, 169984.4, 16790.01,
152245.46, 136477.11, 137694.58, 327993.12, 161645.62, 127897.21,
179318.52, 31911.1, 83210.44, NA, 192483.7, 37576.32, 165076.25,
328900.73, 45164.38), `1565` = c(223922.21, 266959.95, 207661.81,
309812.1, 198440.03, 313474.9, 308852.76, 242667.11, 185684.32,
296199.74, 309991.97, 283357.46, 166860.76, 220988.12, 192483.7,
NA, 196022.38, 126522.76, 181932.96, 221505.05), `1566` = c(159877.72,
426227.74, 140177.31, 204134.81, 22316.41, 182522.18, 163624.08,
175231.39, 311265.55, 196763.58, 150269.39, 216503.97, 33107.97,
120496.54, 37576.32, 196022.38, NA, 143436.02, 313128.42, 26013.61
), `1567` = c(259277.48, 288355.87, 83159.56, 328832.15, 159524.37,
316347.16, 301551.94, 278552.75, 169212.4, 318209.6, 291728.65,
323440.15, 133450.41, 234506.15, 165076.25, 126522.76, 143436.02,
NA, 171940.61, 166888.68), `1568` = c(397058.24, 118585.43, 228092.01,
478194.47, 326631.15, 473608.16, 462815.84, 416536.23, 9134.61,
465667.59, 456729.45, 459552.78, 297181.53, 384171.99, 328900.73,
181932.96, 313128.42, 171940.61, NA, 337646.57), `1569` = c(162170.11,
451437.16, 154156.1, 195100.02, 29460.62, 169315.19, 148089.64,
175455.6, 335535.36, 189348.21, 131782.96, 214003.26, 55912.87,
119976.68, 45164.38, 221505.05, 26013.61, 166888.68, 337646.57,
NA)), row.names = c(1549L, 1550L, 1551L, 1552L, 1553L, 1554L,
1555L, 1556L, 1557L, 1558L, 1559L, 1561L, 1562L, 1563L, 1564L,
1565L, 1566L, 1567L, 1568L, 1569L), class = "data.frame")
行名和列名是每个雨量计的 ID。雨量计到自身的距离为 0,所以我用 NA 替换了 0。
对于每一行,我想找到距离最近的三个雨量计,并将它们的 ID 和距离一起保存。预期的输出如下:
id nearest distance
1549 1556 19625.65
1549 1563 43643.42
1549 1561 64279.90
1550 1568 118585.43
1550 1557 123809.90
1550 1565 266959.95
等等。我可以用代码 t(apply(df, 1, sort)[ 1:3, ])
找到最近的三个点,但我很难获得它们的 ID 并组装最终数据框...
我的实际数据集更大 (6289 x 6289),那么达到预期结果的最快方法是什么?
我建议将您的数据转换为如下所示的长格式。然后,您可以按 ID 对它们进行分组,并找到每个 ID 的最小值:
library(tidyr)
library(dplyr)
# Reshape the distance table to long form (one row per ID1/ID2 pair) and
# report, for every ID1, its smallest distance and the ID2 that attains it.
df %>%
  # keep the row names as an ordinary column so they survive the reshape
  mutate(ID1 = rownames(.)) %>%
  # every column except ID1 is folded into an (ID2, dist) pair
  gather(key = 'ID2', value = 'dist', -ID1) %>%
  # discard the NA entries before taking minima
  filter(!is.na(dist)) %>%
  group_by(ID1) %>%
  summarise(
    mindist = min(dist),
    nearest = ID2[which.min(dist)]
  )
编辑:对于最小的 n 个距离,您可以在分组后进行排序,然后选取前 n 个:
df %>%
mutate(ID1=rownames(.)) %>%
gather('ID2','dist',-ID1) %>%
filter(!is.na(dist)) %>%
group_by(ID1) %>%
arrange(dist) %>%
slice(1:3) %>%
mutate(dist_rank=1:3)
一种选择是使用 pivot_longer() 和 top_n()。
library(dplyr)
library(tidyr)
library(tibble)
# Long-format variant: one row per (rowname, name) pair, then keep the three
# smallest values per rowname and order the result by rowname and value.
df %>%
  # move the row names into a column called "rowname"
  rownames_to_column(var = "rowname") %>%
  # all remaining columns become name/value pairs
  pivot_longer(cols = -rowname, names_to = "name", values_to = "value") %>%
  group_by(rowname) %>%
  # a negative n asks top_n() for the smallest values instead of the largest
  top_n(n = -3, wt = value) %>%
  arrange(rowname, value)
# A tibble: 60 x 3
# Groups: rowname [20]
rowname name value
<chr> <chr> <dbl>
1 1549 1556 19626.
2 1549 1563 43643.
3 1549 1561 64280.
4 1550 1568 118585.
5 1550 1557 123810.
6 1550 1565 266960.
7 1551 1567 83160.
8 1551 1566 140177.
9 1551 1562 149573.
10 1552 1558 14461.
# ... with 50 more rows
在基础 R 中,使用行名称创建所需的变量。
# Build a long table with one row per (id, nearest) pair: for every row of df,
# the (up to) three smallest non-NA values and the column names they belong to.
# Result columns: id (row name of df), nearest (column name), distance (value).
res <- local({
  # Convert once up front: slicing a row out of a matrix is much cheaper than
  # df[i, ], which matters for a table with thousands of rows and columns.
  m <- as.matrix(df)
  # seq_len() yields an empty loop for a zero-row input (1:nrow(df) would give 1, 0).
  do.call(rbind, lapply(seq_len(nrow(m)), function(i) {
    d <- m[i, ]
    # na.last = NA drops NA entries; ordering a plain numeric vector also
    # avoids calling sort() on a data frame, which recent R versions warn about.
    idx <- head(order(d, na.last = NA), 3L)
    data.frame(id = rownames(df)[i], nearest = names(df)[idx],
               distance = unname(d[idx]), stringsAsFactors = FALSE)
  }))
})
head(res)
# id nearest distance
# 1 1549 1556 19625.65
# 2 1549 1563 43643.42
# 3 1549 1561 64279.90
# 4 1550 1568 118585.43
# 5 1550 1557 123809.90