使用逗号分隔的长度不等的数字字符串对多列进行数学运算
Do mathematical operation on multiple columns with comma-separated strings of numbers of unequal length
我有一个大型数据框,其中一些列包含由逗号分隔的长度不等的长串数字数据,这里是 A
、B
和 C
列:
df <- data.frame(
id = 1:3,
A = c("200, 100, 80, 100","120, 210, 220", "170, 200"),
B = c("0.1, 0.2, 0.3","0.2, 0.3, 1.0, 0.4, 0.9", "0.55, 0.77, 0.99, 0.35"),
C = c("700.1, 701.0, 699.2", "702.5, 702.9", "705.4, 705.4, 706.0")
)
我需要为 A
、B
和 C
中的这些数值数据计算 百分比变化 。我认为,为了促进这一点,我需要使用 separate_rows
将每个数字分隔到它自己的行中。 但是我如何一次性对所有三列 A
、B
和 C
执行此步骤?
我所能做的就是逐列进行 - 首先是 A
,然后是 B
,最后是 C
:
library(tidyverse)
df %>%
# Step 1 - column `A`:
separate_rows(A, sep = ",", convert = TRUE) %>%
mutate(A_0 = lag((lead(A)-A)/A*100)) %>%
group_by(id) %>%
summarise(across(c(B,C), first),
A = paste0(A, collapse = ", "),
A_0 = paste0(A_0, collapse = ", ")
) %>%
ungroup() %>%
# Step 2 - column `B`:
separate_rows(B, sep = ",", convert = TRUE) %>%
mutate(B_0 = lag((lead(B)-B)/B*100)) %>%
group_by(id) %>%
summarise(across(c(A,A_0,C), first),
B = paste0(B, collapse = ", "),
B_0 = paste0(B_0, collapse = ", ")
) %>%
ungroup() %>%
# Step 3 - column `C`:
separate_rows(C, sep = ",", convert = TRUE) %>%
mutate(C_0 = lag((lead(C)-C)/C*100)) %>%
group_by(id) %>%
summarise(across(c(A,A_0,B,B_0), first),
C = paste0(C, collapse = ", "),
C_0 = paste0(C_0, collapse = ", ")
)
# A tibble: 3 × 7
id A A_0 B B_0 C C_0
<int> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 200, 100, 80, 100 NA, -50, -20, 25 0.1, 0.… NA, 100, 50 700.1… NA, 0.12…
2 2 120, 210, 220 20, 75, 4.76190476190476 0.2, 0.… -33.33333333… 702.5… 0.471967…
3 3 170, 200 -22.7272727272727, 17.6470588235294 0.55, 0… -38.88888888… 705.4… 0.355669…
有没有更好的方法?
我们可以循环 across
列,在 ,
处拆分,后跟一个或多个空格 (\s+
),在 list
处循环 [=16] =],转换为numeric
,得到lead
与当前值比例之差的lag
,pasted
(toString
)return作为字符向量 (_chr
),如果需要,对 select
中的列进行排序
library(dplyr)
library(purrr)
df %>%
mutate(across(A:C, ~ {
map_chr(strsplit(.x, ",\s+"), ~ {
tmp <- as.numeric(.x)
toString(lag((lead(tmp)- tmp)/tmp *100))})
}, .names = "{.col}_0")) %>%
select(id, gtools::mixedsort(names(.)[-1]))
-输出
id A A_0 B B_0 C
1 1 200, 100, 80, 100 NA, -50, -20, 25 0.1, 0.2, 0.3 NA, 100, 50 700.1, 701.0, 699.2
2 2 120, 210, 220 NA, 75, 4.76190476190476 0.2, 0.3, 1.0, 0.4, 0.9 NA, 50, 233.333333333333, -60, 125 702.5, 702.9
3 3 170, 200 NA, 17.6470588235294 0.55, 0.77, 0.99, 0.35 NA, 40, 28.5714285714286, -64.6464646464647 705.4, 705.4, 706.0
C_0
1 NA, 0.128553063848018, -0.256776034236798
2 NA, 0.0569395017793562
3 NA, 0, 0.0850581230507546
这样的事情可以让你开始
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.1.3
#> Warning: package 'readr' was built under R version 4.1.3
#> Warning: package 'dplyr' was built under R version 4.1.3
df_example <- data.frame(
id = 1:3,
A = c("200, 100, 80, 100","120, 210, 220", "170, 200"),
B = c("0.1, 0.2, 0.3","0.2, 0.3, 1.0, 0.4, 0.9", "0.55, 0.77, 0.99, 0.35"),
C = c("700.1, 701.0, 699.2", "702.5, 702.9", "705.4, 705.4, 706.0")
)
df_example |>
as_tibble() |>
mutate(across(-id,
.names = "{.col}_0",
\(x) x |>
str_split(',') |>
map(as.numeric) |>
map(\(x) lag((lead(x)-x)/x*100)) |>
map_chr(toString))
)
#> # A tibble: 3 x 7
#> id A B C A_0 B_0 C_0
#> <int> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 200, 100, 80, 100 0.1, 0.2, 0.3 700.1, 701.~ NA, ~ NA, ~ NA, ~
#> 2 2 120, 210, 220 0.2, 0.3, 1.0, 0.4, 0.9 702.5, 702.9 NA, ~ NA, ~ NA, ~
#> 3 3 170, 200 0.55, 0.77, 0.99, 0.35 705.4, 705.~ NA, ~ NA, ~ NA, ~
由 reprex package (v2.0.1)
于 2022-03-28 创建
我有一个大型数据框,其中一些列包含由逗号分隔的长度不等的长串数字数据,这里是 A
、B
和 C
列:
df <- data.frame(
id = 1:3,
A = c("200, 100, 80, 100","120, 210, 220", "170, 200"),
B = c("0.1, 0.2, 0.3","0.2, 0.3, 1.0, 0.4, 0.9", "0.55, 0.77, 0.99, 0.35"),
C = c("700.1, 701.0, 699.2", "702.5, 702.9", "705.4, 705.4, 706.0")
)
我需要为 A
、B
和 C
中的这些数值数据计算 百分比变化 。我认为,为了促进这一点,我需要使用 separate_rows
将每个数字分隔到它自己的行中。 但是我如何一次性对所有三列 A
、B
和 C
执行此步骤?
我所能做的就是逐列进行 - 首先是 A
,然后是 B
,最后是 C
:
library(tidyverse)
df %>%
# Step 1 - column `A`:
separate_rows(A, sep = ",", convert = TRUE) %>%
mutate(A_0 = lag((lead(A)-A)/A*100)) %>%
group_by(id) %>%
summarise(across(c(B,C), first),
A = paste0(A, collapse = ", "),
A_0 = paste0(A_0, collapse = ", ")
) %>%
ungroup() %>%
# Step 2 - column `B`:
separate_rows(B, sep = ",", convert = TRUE) %>%
mutate(B_0 = lag((lead(B)-B)/B*100)) %>%
group_by(id) %>%
summarise(across(c(A,A_0,C), first),
B = paste0(B, collapse = ", "),
B_0 = paste0(B_0, collapse = ", ")
) %>%
ungroup() %>%
# Step 3 - column `C`:
separate_rows(C, sep = ",", convert = TRUE) %>%
mutate(C_0 = lag((lead(C)-C)/C*100)) %>%
group_by(id) %>%
summarise(across(c(A,A_0,B,B_0), first),
C = paste0(C, collapse = ", "),
C_0 = paste0(C_0, collapse = ", ")
)
# A tibble: 3 × 7
id A A_0 B B_0 C C_0
<int> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 200, 100, 80, 100 NA, -50, -20, 25 0.1, 0.… NA, 100, 50 700.1… NA, 0.12…
2 2 120, 210, 220 20, 75, 4.76190476190476 0.2, 0.… -33.33333333… 702.5… 0.471967…
3 3 170, 200 -22.7272727272727, 17.6470588235294 0.55, 0… -38.88888888… 705.4… 0.355669…
有没有更好的方法?
我们可以循环 across
列,在 ,
处拆分,后跟一个或多个空格 (\s+
),在 list
处循环 [=16] =],转换为numeric
,得到lead
与当前值比例之差的lag
,pasted
(toString
)return作为字符向量 (_chr
),如果需要,对 select
library(dplyr)
library(purrr)
df %>%
mutate(across(A:C, ~ {
map_chr(strsplit(.x, ",\s+"), ~ {
tmp <- as.numeric(.x)
toString(lag((lead(tmp)- tmp)/tmp *100))})
}, .names = "{.col}_0")) %>%
select(id, gtools::mixedsort(names(.)[-1]))
-输出
id A A_0 B B_0 C
1 1 200, 100, 80, 100 NA, -50, -20, 25 0.1, 0.2, 0.3 NA, 100, 50 700.1, 701.0, 699.2
2 2 120, 210, 220 NA, 75, 4.76190476190476 0.2, 0.3, 1.0, 0.4, 0.9 NA, 50, 233.333333333333, -60, 125 702.5, 702.9
3 3 170, 200 NA, 17.6470588235294 0.55, 0.77, 0.99, 0.35 NA, 40, 28.5714285714286, -64.6464646464647 705.4, 705.4, 706.0
C_0
1 NA, 0.128553063848018, -0.256776034236798
2 NA, 0.0569395017793562
3 NA, 0, 0.0850581230507546
这样的事情可以让你开始
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.1.3
#> Warning: package 'readr' was built under R version 4.1.3
#> Warning: package 'dplyr' was built under R version 4.1.3
df_example <- data.frame(
id = 1:3,
A = c("200, 100, 80, 100","120, 210, 220", "170, 200"),
B = c("0.1, 0.2, 0.3","0.2, 0.3, 1.0, 0.4, 0.9", "0.55, 0.77, 0.99, 0.35"),
C = c("700.1, 701.0, 699.2", "702.5, 702.9", "705.4, 705.4, 706.0")
)
df_example |>
as_tibble() |>
mutate(across(-id,
.names = "{.col}_0",
\(x) x |>
str_split(',') |>
map(as.numeric) |>
map(\(x) lag((lead(x)-x)/x*100)) |>
map_chr(toString))
)
#> # A tibble: 3 x 7
#> id A B C A_0 B_0 C_0
#> <int> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 200, 100, 80, 100 0.1, 0.2, 0.3 700.1, 701.~ NA, ~ NA, ~ NA, ~
#> 2 2 120, 210, 220 0.2, 0.3, 1.0, 0.4, 0.9 702.5, 702.9 NA, ~ NA, ~ NA, ~
#> 3 3 170, 200 0.55, 0.77, 0.99, 0.35 705.4, 705.~ NA, ~ NA, ~ NA, ~
由 reprex package (v2.0.1)
于 2022-03-28 创建