仅从向量列表中获取最常出现的值
Get only the most often occurring values from a list of vectors
我有数据如下:
# Example input: a named list (nr1, nr2, nr3) whose entries are lists of
# account-number strings. Every string starts with a space, and only the first
# element of each inner list carries the name list_of_account_numbers.
# The placeholder " 0000000000" occurs in nr1 only.
dat <- list(nr1 = list(list_of_account_numbers = " 0000000000",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111111",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111112", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111112", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " 0000000000",
" 0000000000"), nr2 = list(list_of_account_numbers = " NL30ABNA0111111111",
" NL31RABO0111111111", " NL30ABNA0111111111", " NL30ABNA0111111111",
" NL30ABNA0111111111", " NL31RABO0111111111", " NL31RABO0111111111",
" NL52RABO0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL30ABNA0111111111", " NL30ABNA0111111111", " NL30ABNA0111111111",
" NL74INGB0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL74INGB0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL16DEUT0111111111"), nr3 = list(
list_of_account_numbers = " NL11BANKO0111111111", " NL11BANKO0111111111",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111"))
我正在尝试为每个列表项(nr1、nr2、nr3)编写一段代码,获取出现次数最多的前 3 个值。不过有两个问题:
- 某些列表项包含值 0000000000,应将其排除。
- 有些列表项没有3个值,只有一两个。
我认为首先要做的是对每个列表项执行 unlist 将其展平,并删除其中出现的 0000000000:
# Placeholder account number to exclude (note the leading space).
y <- " 0000000000"
# Flatten every top-level entry of `dat` into a character vector and drop the
# placeholder. Iterating over seq_along(dat) is safe for an empty `dat`
# (1:length(dat) would run over c(1, 0)), and lapply() builds the result list
# in one go instead of growing it inside a loop. The result is an unnamed list
# with one character vector per entry of `dat`.
IBAN_numbers <- lapply(seq_along(dat), function(i) {
  accounts <- unlist(dat[i])
  accounts[!accounts %in% y]
})
但我不确定如何实现最后一点。
# Count how often each remaining account number occurs, one table per element
# of IBAN_numbers. The commented lines below each call show the printed counts.
table(IBAN_numbers[[1]])
# NL11BANKO0111111111 NL11BANKO0111111112 NL11BANKO0111111113
# 9 2 3
table(IBAN_numbers[[2]])
# NL16DEUT0111111111 NL30ABNA0111111111 NL31RABO0111111111 NL52RABO0111111111 NL74INGB0111111111
# 1 7 3 1 8
table(IBAN_numbers[[3]])
# NL11BANKO0111111111 NL11BANKO0111111113
# 10 3
我可以这样做:
# Placeholder account number to exclude (note the leading space).
y <- " 0000000000"
# For every top-level entry of `dat`: flatten it to a character vector, drop
# the placeholder and tabulate the remaining values. seq_along(dat) is safe
# for an empty `dat` (1:length(dat) would run over c(1, 0)), and lapply()
# avoids growing the result list inside a loop. The result is an unnamed list
# holding one frequency table per entry of `dat`.
IBAN_numbers <- lapply(seq_along(dat), function(i) {
  accounts <- unlist(dat[i])
  table(accounts[!accounts %in% y])
})
所以对于中间那个 table(即 nr2),我只想保留三个条目(对于只出现一次的值,我不在意选中哪一个,只要代码不报错即可)。
谁能帮我完成最后一步?
您可以使用 lapply:
# Placeholder account number to ignore (note the leading space).
y <- " 0000000000"
# For every entry of `dat`: flatten it to a vector, count the values other
# than `y`, order the counts from most to least frequent and keep at most the
# first three.
lapply(dat, function(x) {
  accounts <- unlist(x)
  counts <- table(accounts[accounts != y])
  ranked <- sort(counts, decreasing = TRUE)
  head(ranked, 3)
})
#$nr1
#NL11BANKO0111111111 NL11BANKO0111111113 NL11BANKO0111111112
# 9 3 2
#$nr2
# NL74INGB0111111111 NL30ABNA0111111111 NL31RABO0111111111
# 8 7 3
#$nr3
# NL11BANKO0111111111 NL11BANKO0111111113
# 10 3
如果您只对名称感兴趣,可以使用 names(head(sort(table(x[x != y]), decreasing = TRUE), 3))。
使用tidyverse
library(dplyr)
library(purrr)
# For each entry of `dat`: flatten it to a character column, drop the
# placeholder, count every account number and keep the three most frequent.
# `y` is not defined in this snippet; it must already hold the placeholder
# string (y <- " 0000000000" in the earlier code).
# with_ties = FALSE caps the result at three rows: by default slice_max()
# also keeps every row tied with the last selected one and can return more.
map(dat, ~ tibble(col1 = flatten_chr(.x)) %>%
  filter(col1 != y) %>%
  count(col1) %>%
  slice_max(n = 3, order_by = n, with_ties = FALSE))
输出:
$nr1
# A tibble: 3 × 2
col1 n
<chr> <int>
1 " NL11BANKO0111111111" 9
2 " NL11BANKO0111111113" 3
3 " NL11BANKO0111111112" 2
$nr2
# A tibble: 3 × 2
col1 n
<chr> <int>
1 " NL74INGB0111111111" 8
2 " NL30ABNA0111111111" 7
3 " NL31RABO0111111111" 3
$nr3
# A tibble: 2 × 2
col1 n
<chr> <int>
1 " NL11BANKO0111111111" 10
2 " NL11BANKO0111111113" 3
我有数据如下:
# Example input: a named list (nr1, nr2, nr3) whose entries are lists of
# account-number strings. Every string starts with a space, and only the first
# element of each inner list carries the name list_of_account_numbers.
# The placeholder " 0000000000" occurs in nr1 only.
dat <- list(nr1 = list(list_of_account_numbers = " 0000000000",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111111",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111112", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111112", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " 0000000000",
" 0000000000"), nr2 = list(list_of_account_numbers = " NL30ABNA0111111111",
" NL31RABO0111111111", " NL30ABNA0111111111", " NL30ABNA0111111111",
" NL30ABNA0111111111", " NL31RABO0111111111", " NL31RABO0111111111",
" NL52RABO0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL30ABNA0111111111", " NL30ABNA0111111111", " NL30ABNA0111111111",
" NL74INGB0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL74INGB0111111111", " NL74INGB0111111111", " NL74INGB0111111111",
" NL16DEUT0111111111"), nr3 = list(
list_of_account_numbers = " NL11BANKO0111111111", " NL11BANKO0111111111",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111", " NL11BANKO0111111113",
" NL11BANKO0111111111", " NL11BANKO0111111111"))
我正在尝试为每个列表项(nr1、nr2、nr3)编写一段代码,获取出现次数最多的前 3 个值。不过有两个问题:
- 某些列表项包含值 0000000000,应将其排除。
- 有些列表项没有 3 个值,只有一两个。
我认为首先要做的是对每个列表项执行 unlist 将其展平,并删除其中出现的 0000000000:
# Placeholder account number to exclude (note the leading space).
y <- " 0000000000"
# Flatten every top-level entry of `dat` into a character vector and drop the
# placeholder. Iterating over seq_along(dat) is safe for an empty `dat`
# (1:length(dat) would run over c(1, 0)), and lapply() builds the result list
# in one go instead of growing it inside a loop. The result is an unnamed list
# with one character vector per entry of `dat`.
IBAN_numbers <- lapply(seq_along(dat), function(i) {
  accounts <- unlist(dat[i])
  accounts[!accounts %in% y]
})
但我不确定如何实现最后一点。
# Count how often each remaining account number occurs, one table per element
# of IBAN_numbers. The commented lines below each call show the printed counts.
table(IBAN_numbers[[1]])
# NL11BANKO0111111111 NL11BANKO0111111112 NL11BANKO0111111113
# 9 2 3
table(IBAN_numbers[[2]])
# NL16DEUT0111111111 NL30ABNA0111111111 NL31RABO0111111111 NL52RABO0111111111 NL74INGB0111111111
# 1 7 3 1 8
table(IBAN_numbers[[3]])
# NL11BANKO0111111111 NL11BANKO0111111113
# 10 3
我可以这样做:
# Placeholder account number to exclude (note the leading space).
y <- " 0000000000"
# For every top-level entry of `dat`: flatten it to a character vector, drop
# the placeholder and tabulate the remaining values. seq_along(dat) is safe
# for an empty `dat` (1:length(dat) would run over c(1, 0)), and lapply()
# avoids growing the result list inside a loop. The result is an unnamed list
# holding one frequency table per entry of `dat`.
IBAN_numbers <- lapply(seq_along(dat), function(i) {
  accounts <- unlist(dat[i])
  table(accounts[!accounts %in% y])
})
所以对于中间那个 table(即 nr2),我只想保留三个条目(对于只出现一次的值,我不在意选中哪一个,只要代码不报错即可)。
谁能帮我完成最后一步?
您可以使用 lapply:
# Placeholder account number to ignore (note the leading space).
y <- " 0000000000"
# For every entry of `dat`: flatten it to a vector, count the values other
# than `y`, order the counts from most to least frequent and keep at most the
# first three.
lapply(dat, function(x) {
  accounts <- unlist(x)
  counts <- table(accounts[accounts != y])
  ranked <- sort(counts, decreasing = TRUE)
  head(ranked, 3)
})
#$nr1
#NL11BANKO0111111111 NL11BANKO0111111113 NL11BANKO0111111112
# 9 3 2
#$nr2
# NL74INGB0111111111 NL30ABNA0111111111 NL31RABO0111111111
# 8 7 3
#$nr3
# NL11BANKO0111111111 NL11BANKO0111111113
# 10 3
如果您只对名称感兴趣,可以使用 names(head(sort(table(x[x != y]), decreasing = TRUE), 3))。
使用tidyverse
library(dplyr)
library(purrr)
# For each entry of `dat`: flatten it to a character column, drop the
# placeholder, count every account number and keep the three most frequent.
# `y` is not defined in this snippet; it must already hold the placeholder
# string (y <- " 0000000000" in the earlier code).
# with_ties = FALSE caps the result at three rows: by default slice_max()
# also keeps every row tied with the last selected one and can return more.
map(dat, ~ tibble(col1 = flatten_chr(.x)) %>%
  filter(col1 != y) %>%
  count(col1) %>%
  slice_max(n = 3, order_by = n, with_ties = FALSE))
输出:
$nr1
# A tibble: 3 × 2
col1 n
<chr> <int>
1 " NL11BANKO0111111111" 9
2 " NL11BANKO0111111113" 3
3 " NL11BANKO0111111112" 2
$nr2
# A tibble: 3 × 2
col1 n
<chr> <int>
1 " NL74INGB0111111111" 8
2 " NL30ABNA0111111111" 7
3 " NL31RABO0111111111" 3
$nr3
# A tibble: 2 × 2
col1 n
<chr> <int>
1 " NL11BANKO0111111111" 10
2 " NL11BANKO0111111113" 3