bind_rows() 在 R 中绑定每个数据帧时创建它们的副本
bind_rows() creates duplicate of each dataframe when binding them in R
假设这是我的数据框:
# Example data: the same eight "ab <number>" labels appear twice, followed by
# the plain labels "d", "e", "e", "e"; `freq` holds one count per row.
df <- data.frame(
  grp = c(
    rep(
      c("ab -10", "ab 0", "ab 8", "ab -1", "ab 6", "ab 6", "ab -10", "ab 1"),
      times = 2
    ),
    "d", "e", "e", "e"
  ),
  freq = c(
    1, 0, 0, 1, 0, 1, 2, 0,
    1, 0, 2, 2, 1, 1, 0, 1,
    0, 2, 2, 1
  )
)
df
grp freq
1 ab -10 1
2 ab 0 0
3 ab 8 0
4 ab -1 1
5 ab 6 0
6 ab 6 1
7 ab -10 2
8 ab 1 0
9 ab -10 1
10 ab 0 0
11 ab 8 2
12 ab -1 2
13 ab 6 1
14 ab 6 1
15 ab -10 0
16 ab 1 1
17 d 0
18 e 2
19 e 2
20 e 1
我想要:
> finaldf
grp freq
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
这是我试过的:
# OP's original attempt, kept exactly as posted (this is the code being asked about).
# NOTE: the leading `df %>%` pipes the full data frame into bind_rows() as its
# first argument, so all 20 original rows are bound in addition to the two
# filtered subsets below -- that is where the "duplicated" rows come from.
df %>%
bind_rows(df %>%
# Rows whose label does not contain "ab" are kept unchanged.
filter(!grepl("ab", grp)),
df %>%
# Rows whose label contains "ab": extract the number and bin it.
filter(grepl("ab", grp)) %>%
mutate(grp = parse_number(grp)) %>%
mutate(grp = cut(as.numeric(grp),
breaks = c(-999, 0, 6, 999),
labels = c("ab < 0", "ab 0-5", "ab 5+"),
right = F))) %>%
group_by(grp) %>%
# NOTE: n() counts rows per group; the desired output is the sum of `freq`.
summarise(N =n())
但似乎 bind_rows
正在复制数据帧。
grp freq
1 ab -10 1
2 ab 0 0
3 ab 8 0
4 ab -1 1
5 ab 6 0
6 ab 6 1
7 ab -10 2
8 ab 1 0
9 ab -10 1
10 ab 0 0
11 ab 8 2
12 ab -1 2
13 ab 6 1
14 ab 6 1
15 ab -10 0
16 ab 1 1
17 d 0
18 e 2
19 e 2
20 e 1
21 d 0
22 e 2
23 e 2
24 e 1
25 ab < 0 1
26 ab 0-5 0
27 ab 5+ 0
28 ab < 0 1
29 ab 5+ 0
30 ab 5+ 1
31 ab < 0 2
32 ab 0-5 0
33 ab < 0 1
34 ab 0-5 0
35 ab 5+ 2
36 ab < 0 2
37 ab 5+ 1
38 ab 5+ 1
39 ab < 0 0
40 ab 0-5 1
我可以 slice()
一半的行,但我更想知道我做错了什么?
任何其他简洁漂亮的方法也非常感谢!
这是一种方法:先用 separate 将该列拆分为两列,对数值部分重新编码,再用 unite 合并回一列,最后按组用 sum 求和:
library(dplyr)
library(tidyr)
# Split "ab <number>" into a label and a numeric value, bin the value, glue the
# two pieces back together, then total `freq` per resulting group.
df %>%
  # The backslash must be doubled inside an R string literal: "\s" is an
  # invalid escape and fails to parse. fill = "right" pads labels that have no
  # number ("d", "e") with NA; convert = TRUE turns `value` into a number.
  separate(grp, into = c("grp1", "value"), sep = "(?<=ab)\\s+",
           fill = "right", convert = TRUE) %>%
  mutate(value = case_when(value < 0 ~ "< 0",
                           between(value, 0, 5) ~ "0-5",
                           value > 5 ~ "5+")) %>%
  # na.rm = TRUE drops the NA piece so labels without a number stay unchanged.
  unite(grp, grp1, value, na.rm = TRUE, sep = " ") %>%
  group_by(grp) %>%
  summarise(freq = sum(freq), .groups = "drop")
-输出
# A tibble: 5 × 2
grp freq
<chr> <dbl>
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
在 OP 的代码中,需要删除开头的 df %>%,因为我们要传给 bind_rows 的只是两个经过 filter 筛选的数据集。加上 df %>% 之后,df 本身会作为第一个参数传递给 bind_rows,从而导致行被重复:
library(readr)
# Bind only the two filtered subsets (no leading `df %>%`), then total `freq`
# per group.
bind_rows(
  # Rows whose label does not contain "ab" pass through unchanged.
  df %>%
    filter(!grepl("ab", grp)),
  # Rows whose label contains "ab": pull out the number and bin it into
  # left-closed intervals [-999, 0), [0, 6), [6, 999).
  df %>%
    filter(grepl("ab", grp)) %>%
    mutate(
      grp = cut(
        as.numeric(parse_number(grp)),
        breaks = c(-999, 0, 6, 999),
        labels = c("ab < 0", "ab 0-5", "ab 5+"),
        right = FALSE
      )
    )
) %>%
  group_by(grp) %>%
  summarise(N = sum(freq))
# A tibble: 5 × 2
grp N
<chr> <dbl>
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
tidyverse
解决方案 count()
+ str_replace()
:
library(dplyr)
library(stringr)
# Replace the number inside each label with its bin name, then sum `freq` per
# resulting label via count(..., wt = freq).
df %>%
  # The backslash must be doubled inside an R string literal: "\d" is an
  # invalid escape and fails to parse. Labels without a number are left as-is.
  count(grp = str_replace(grp, "-?\\d+", function(x) {
    # `x` is the matched text (e.g. "-10"); convert it before comparing.
    x <- as.numeric(x)
    case_when(x < 0 ~ "< 0", between(x, 0, 5) ~ "0-5", x > 5 ~ "5+")
  }), wt = freq)
# grp n
# 1 ab < 0 7
# 2 ab 0-5 1
# 3 ab 5+ 5
# 4 d 0
# 5 e 5
诀窍是将一个函数传递给 str_replace():该函数会对每个匹配项调用一次,其返回值将用于替换该匹配项。另外,df %>% count(a, wt = b) 是 df %>% group_by(a) %>% summarise(n = sum(b)) 的快捷写法。
假设这是我的数据框:
# Example data: the same eight "ab <number>" labels appear twice, followed by
# the plain labels "d", "e", "e", "e"; `freq` holds one count per row.
df <- data.frame(
  grp = c(
    rep(
      c("ab -10", "ab 0", "ab 8", "ab -1", "ab 6", "ab 6", "ab -10", "ab 1"),
      times = 2
    ),
    "d", "e", "e", "e"
  ),
  freq = c(
    1, 0, 0, 1, 0, 1, 2, 0,
    1, 0, 2, 2, 1, 1, 0, 1,
    0, 2, 2, 1
  )
)
df
grp freq
1 ab -10 1
2 ab 0 0
3 ab 8 0
4 ab -1 1
5 ab 6 0
6 ab 6 1
7 ab -10 2
8 ab 1 0
9 ab -10 1
10 ab 0 0
11 ab 8 2
12 ab -1 2
13 ab 6 1
14 ab 6 1
15 ab -10 0
16 ab 1 1
17 d 0
18 e 2
19 e 2
20 e 1
我想要:
> finaldf
grp freq
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
这是我试过的:
# OP's original attempt, kept exactly as posted (this is the code being asked about).
# NOTE: the leading `df %>%` pipes the full data frame into bind_rows() as its
# first argument, so all 20 original rows are bound in addition to the two
# filtered subsets below -- that is where the "duplicated" rows come from.
df %>%
bind_rows(df %>%
# Rows whose label does not contain "ab" are kept unchanged.
filter(!grepl("ab", grp)),
df %>%
# Rows whose label contains "ab": extract the number and bin it.
filter(grepl("ab", grp)) %>%
mutate(grp = parse_number(grp)) %>%
mutate(grp = cut(as.numeric(grp),
breaks = c(-999, 0, 6, 999),
labels = c("ab < 0", "ab 0-5", "ab 5+"),
right = F))) %>%
group_by(grp) %>%
# NOTE: n() counts rows per group; the desired output is the sum of `freq`.
summarise(N =n())
但似乎 bind_rows
正在复制数据帧。
grp freq
1 ab -10 1
2 ab 0 0
3 ab 8 0
4 ab -1 1
5 ab 6 0
6 ab 6 1
7 ab -10 2
8 ab 1 0
9 ab -10 1
10 ab 0 0
11 ab 8 2
12 ab -1 2
13 ab 6 1
14 ab 6 1
15 ab -10 0
16 ab 1 1
17 d 0
18 e 2
19 e 2
20 e 1
21 d 0
22 e 2
23 e 2
24 e 1
25 ab < 0 1
26 ab 0-5 0
27 ab 5+ 0
28 ab < 0 1
29 ab 5+ 0
30 ab 5+ 1
31 ab < 0 2
32 ab 0-5 0
33 ab < 0 1
34 ab 0-5 0
35 ab 5+ 2
36 ab < 0 2
37 ab 5+ 1
38 ab 5+ 1
39 ab < 0 0
40 ab 0-5 1
我可以 slice()
一半的行,但我更想知道我做错了什么?
任何其他简洁漂亮的方法也非常感谢!
这是一种方法:先用 separate 将该列拆分为两列,对数值部分重新编码,再用 unite 合并回一列,最后按组用 sum 求和:
library(dplyr)
library(tidyr)
# Split "ab <number>" into a label and a numeric value, bin the value, glue the
# two pieces back together, then total `freq` per resulting group.
df %>%
  # The backslash must be doubled inside an R string literal: "\s" is an
  # invalid escape and fails to parse. fill = "right" pads labels that have no
  # number ("d", "e") with NA; convert = TRUE turns `value` into a number.
  separate(grp, into = c("grp1", "value"), sep = "(?<=ab)\\s+",
           fill = "right", convert = TRUE) %>%
  mutate(value = case_when(value < 0 ~ "< 0",
                           between(value, 0, 5) ~ "0-5",
                           value > 5 ~ "5+")) %>%
  # na.rm = TRUE drops the NA piece so labels without a number stay unchanged.
  unite(grp, grp1, value, na.rm = TRUE, sep = " ") %>%
  group_by(grp) %>%
  summarise(freq = sum(freq), .groups = "drop")
-输出
# A tibble: 5 × 2
grp freq
<chr> <dbl>
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
在 OP 的代码中,需要删除开头的 df %>%,因为我们要传给 bind_rows 的只是两个经过 filter 筛选的数据集。加上 df %>% 之后,df 本身会作为第一个参数传递给 bind_rows,从而导致行被重复:
library(readr)
# Bind only the two filtered subsets (no leading `df %>%`), then total `freq`
# per group.
bind_rows(
  # Rows whose label does not contain "ab" pass through unchanged.
  df %>%
    filter(!grepl("ab", grp)),
  # Rows whose label contains "ab": pull out the number and bin it into
  # left-closed intervals [-999, 0), [0, 6), [6, 999).
  df %>%
    filter(grepl("ab", grp)) %>%
    mutate(
      grp = cut(
        as.numeric(parse_number(grp)),
        breaks = c(-999, 0, 6, 999),
        labels = c("ab < 0", "ab 0-5", "ab 5+"),
        right = FALSE
      )
    )
) %>%
  group_by(grp) %>%
  summarise(N = sum(freq))
# A tibble: 5 × 2
grp N
<chr> <dbl>
1 ab < 0 7
2 ab 0-5 1
3 ab 5+ 5
4 d 0
5 e 5
tidyverse
解决方案 count()
+ str_replace()
:
library(dplyr)
library(stringr)
# Replace the number inside each label with its bin name, then sum `freq` per
# resulting label via count(..., wt = freq).
df %>%
  # The backslash must be doubled inside an R string literal: "\d" is an
  # invalid escape and fails to parse. Labels without a number are left as-is.
  count(grp = str_replace(grp, "-?\\d+", function(x) {
    # `x` is the matched text (e.g. "-10"); convert it before comparing.
    x <- as.numeric(x)
    case_when(x < 0 ~ "< 0", between(x, 0, 5) ~ "0-5", x > 5 ~ "5+")
  }), wt = freq)
# grp n
# 1 ab < 0 7
# 2 ab 0-5 1
# 3 ab 5+ 5
# 4 d 0
# 5 e 5
诀窍是将一个函数传递给 str_replace():该函数会对每个匹配项调用一次,其返回值将用于替换该匹配项。另外,df %>% count(a, wt = b) 是 df %>% group_by(a) %>% summarise(n = sum(b)) 的快捷写法。