仅当字符串的第三部分存在时将其提取到新列
Extracting the third part of a string to a new column only if it exists
我有如下数据:
数据
library(dplyr)
# Example input whose interval labels carry no suffix: `rn` names the two
# rows, each back-quoted interval label is an integer column, `Sum_bin` holds
# the row totals of those columns and `strata` is a list-column with one
# numeric vector per row.
dat_in_one <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
library(dplyr)
# Example input whose interval labels are followed by a direction word
# (East/South/West/North): `rn` names the two rows, each back-quoted label is
# an integer column, `Sum_bin` holds the row totals of those columns and
# `strata` is a list-column with one numeric vector per row.
dat_in_two <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25) East` = c(5L, 0L),
    `[25,50) South` = c(0L, 0L),
    `[25,100) West` = c(38L, 3L),
    `[50,100) North` = c(0L, 0L),
    `[100,250) East` = c(43L, 5L),
    `[100,500) South` = c(0L, 0L),
    `[250,500) South` = c(27L, 12L),
    `[500,1000) South` = c(44L, 0L),
    `[1000,1500) South` = c(0L, 0L),
    `[1500,3000) South` = c(0L, 0L),
    `[500,1000000] East` = c(0L, 53L),
    `[1000,1000000] South` = c(20L, 0L),
    `[3000,1000000] South` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
问题
我想修改下面的这段代码(从 strata
列中提取数字并从中创建两个新列):
# Reshape every column except `rn` and `strata` to long format, then split
# each former column name into integer `lower`/`upper` bounds. Names without a
# "<digits>,<digits>" pair (e.g. Sum_bin) give NA in both new columns.
# NOTE: pivot_longer() and extract() are tidyr functions, so tidyr has to be
# attached (library(tidyr)) in addition to dplyr.
dat_in_one %>%
  pivot_longer(-c(rn, strata)) %>%
  # Backslashes are doubled: "\d" is not a valid escape in an R string literal.
  extract(name, c("lower", "upper"), "(\\d+),(\\d+)", convert = TRUE)
# A tibble: 28 x 5
rn strata lower upper value
<chr> <list> <int> <int> <dbl>
1 Type_A <dbl [7]> 0 25 5
2 Type_A <dbl [7]> 25 50 0
3 Type_A <dbl [7]> 25 100 38
4 Type_A <dbl [7]> 50 100 0
5 Type_A <dbl [7]> 100 250 43
6 Type_A <dbl [7]> 100 500 0
# ... with 22 more rows
它目前忽略 dat_in_two
中的术语 East
、South
等。我想尝试修改此代码,使其适用于 dat_in_one
和 dat_in_two
,其中 dat_in_two
创建第三列。我试过了,但根本不起作用。
# The asker's attempt, kept as written apart from the string escaping (a bare
# "\d" or "\w" is not a valid escape in an R string literal). It still does not
# produce the desired result: the pattern requires a second comma directly
# after the upper bound, but the labels continue with ")" or "]" (e.g.
# "[0,25) East"), so no label matches and the three new columns are all NA.
dat_in_two %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "the_rest"),
    "(\\d+),(\\d+),(\\w+)",
    convert = TRUE
  )
dat_in_one
的期望结果
# A tibble: 28 x 5
rn strata lower upper value
<chr> <list> <int> <int> <dbl>
1 Type_A <dbl [7]> 0 25 5
2 Type_A <dbl [7]> 25 50 0
3 Type_A <dbl [7]> 25 100 38
4 Type_A <dbl [7]> 50 100 0
5 Type_A <dbl [7]> 100 250 43
6 Type_A <dbl [7]> 100 500 0
# ... with 22 more rows
dat_in_two
的期望结果
# A tibble: 28 x 6
rn strata lower upper rest value
<chr> <list> <int> <int> <chr> <dbl>
1 Type_A <dbl [7]> 0 25 East 5
2 Type_A <dbl [7]> 25 50 South 0
3 Type_A <dbl [7]> 25 100 West 38
4 Type_A <dbl [7]> 50 100 North 0
5 Type_A <dbl [7]> 100 250 East 43
6 Type_A <dbl [7]> 100 500 South 0
# ... with 22 more rows
这个怎么样:
# Example input whose interval labels carry no suffix: `rn` names the two
# rows, each back-quoted interval label is an integer column, `Sum_bin` holds
# the row totals of those columns and `strata` is a list-column with one
# numeric vector per row.
dat_in_one <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
# Example input whose interval labels are followed by a direction word
# (East/South/West/North): `rn` names the two rows, each back-quoted label is
# an integer column, `Sum_bin` holds the row totals of those columns and
# `strata` is a list-column with one numeric vector per row.
dat_in_two <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25) East` = c(5L, 0L),
    `[25,50) South` = c(0L, 0L),
    `[25,100) West` = c(38L, 3L),
    `[50,100) North` = c(0L, 0L),
    `[100,250) East` = c(43L, 5L),
    `[100,500) South` = c(0L, 0L),
    `[250,500) South` = c(27L, 12L),
    `[500,1000) South` = c(44L, 0L),
    `[1000,1500) South` = c(0L, 0L),
    `[1500,3000) South` = c(0L, 0L),
    `[500,1000000] East` = c(0L, 53L),
    `[1000,1000000] South` = c(20L, 0L),
    `[3000,1000000] South` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
library(dplyr)
library(tidyr)
# Predicate for where(): TRUE when `x` holds no non-missing value.
is_all_na <- function(x) {
  !any(!is.na(x))
}
# Reshape every column except `rn` and `strata` to long format, then split
# each former column name into `lower`, `upper` and `rest`. The pattern
# captures the two bounds, consumes the closing "]" or ")" and any whitespace,
# and captures the trailing word (which may be empty) as `rest`.
dat_in_two %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "rest"),
    # Backslashes are doubled: "\d", "\]", "\)", "\s" and "\w" are not valid
    # escapes in an R string literal.
    "(\\d+),(\\d+)[\\]\\)]\\s*(\\w*)",
    convert = TRUE
  ) %>%
  # Drop every column that consists only of NA.
  select(-where(is_all_na))
#> # A tibble: 28 × 6
#> rn strata lower upper rest value
#> <chr> <list> <int> <int> <chr> <dbl>
#> 1 Type_A <dbl [7]> 0 25 East 5
#> 2 Type_A <dbl [7]> 25 50 South 0
#> 3 Type_A <dbl [7]> 25 100 West 38
#> 4 Type_A <dbl [7]> 50 100 North 0
#> 5 Type_A <dbl [7]> 100 250 East 43
#> 6 Type_A <dbl [7]> 100 500 South 0
#> 7 Type_A <dbl [7]> 250 500 South 27
#> 8 Type_A <dbl [7]> 500 1000 South 44
#> 9 Type_A <dbl [7]> 1000 1500 South 0
#> 10 Type_A <dbl [7]> 1500 3000 South 0
#> # … with 18 more rows
# Same pipeline applied to the labels without a suffix. `rest` comes back
# entirely NA here, so the final select() removes it, as the printed 5-column
# result shows.
dat_in_one %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "rest"),
    # Backslashes are doubled: "\d", "\]", "\)", "\s" and "\w" are not valid
    # escapes in an R string literal.
    "(\\d+),(\\d+)[\\]\\)]\\s*(\\w*)",
    convert = TRUE
  ) %>%
  # Drop every column that consists only of NA.
  select(-where(is_all_na))
#> # A tibble: 28 × 5
#> rn strata lower upper value
#> <chr> <list> <int> <int> <dbl>
#> 1 Type_A <dbl [7]> 0 25 5
#> 2 Type_A <dbl [7]> 25 50 0
#> 3 Type_A <dbl [7]> 25 100 38
#> 4 Type_A <dbl [7]> 50 100 0
#> 5 Type_A <dbl [7]> 100 250 43
#> 6 Type_A <dbl [7]> 100 500 0
#> 7 Type_A <dbl [7]> 250 500 27
#> 8 Type_A <dbl [7]> 500 1000 44
#> 9 Type_A <dbl [7]> 1000 1500 0
#> 10 Type_A <dbl [7]> 1500 3000 0
#> # … with 18 more rows
由 reprex package (v2.0.1)
于 2022-05-04 创建
我有如下数据:
数据
library(dplyr)
# Example input whose interval labels carry no suffix: `rn` names the two
# rows, each back-quoted interval label is an integer column, `Sum_bin` holds
# the row totals of those columns and `strata` is a list-column with one
# numeric vector per row.
dat_in_one <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
library(dplyr)
# Example input whose interval labels are followed by a direction word
# (East/South/West/North): `rn` names the two rows, each back-quoted label is
# an integer column, `Sum_bin` holds the row totals of those columns and
# `strata` is a list-column with one numeric vector per row.
dat_in_two <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25) East` = c(5L, 0L),
    `[25,50) South` = c(0L, 0L),
    `[25,100) West` = c(38L, 3L),
    `[50,100) North` = c(0L, 0L),
    `[100,250) East` = c(43L, 5L),
    `[100,500) South` = c(0L, 0L),
    `[250,500) South` = c(27L, 12L),
    `[500,1000) South` = c(44L, 0L),
    `[1000,1500) South` = c(0L, 0L),
    `[1500,3000) South` = c(0L, 0L),
    `[500,1000000] East` = c(0L, 53L),
    `[1000,1000000] South` = c(20L, 0L),
    `[3000,1000000] South` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
问题
我想修改下面的这段代码(从 strata
列中提取数字并从中创建两个新列):
# Reshape every column except `rn` and `strata` to long format, then split
# each former column name into integer `lower`/`upper` bounds. Names without a
# "<digits>,<digits>" pair (e.g. Sum_bin) give NA in both new columns.
# NOTE: pivot_longer() and extract() are tidyr functions, so tidyr has to be
# attached (library(tidyr)) in addition to dplyr.
dat_in_one %>%
  pivot_longer(-c(rn, strata)) %>%
  # Backslashes are doubled: "\d" is not a valid escape in an R string literal.
  extract(name, c("lower", "upper"), "(\\d+),(\\d+)", convert = TRUE)
# A tibble: 28 x 5
rn strata lower upper value
<chr> <list> <int> <int> <dbl>
1 Type_A <dbl [7]> 0 25 5
2 Type_A <dbl [7]> 25 50 0
3 Type_A <dbl [7]> 25 100 38
4 Type_A <dbl [7]> 50 100 0
5 Type_A <dbl [7]> 100 250 43
6 Type_A <dbl [7]> 100 500 0
# ... with 22 more rows
它目前忽略 dat_in_two
中的术语 East
、South
等。我想尝试修改此代码,使其适用于 dat_in_one
和 dat_in_two
,其中 dat_in_two
创建第三列。我试过了,但根本不起作用。
# The asker's attempt, kept as written apart from the string escaping (a bare
# "\d" or "\w" is not a valid escape in an R string literal). It still does not
# produce the desired result: the pattern requires a second comma directly
# after the upper bound, but the labels continue with ")" or "]" (e.g.
# "[0,25) East"), so no label matches and the three new columns are all NA.
dat_in_two %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "the_rest"),
    "(\\d+),(\\d+),(\\w+)",
    convert = TRUE
  )
dat_in_one
的期望结果
# A tibble: 28 x 5
rn strata lower upper value
<chr> <list> <int> <int> <dbl>
1 Type_A <dbl [7]> 0 25 5
2 Type_A <dbl [7]> 25 50 0
3 Type_A <dbl [7]> 25 100 38
4 Type_A <dbl [7]> 50 100 0
5 Type_A <dbl [7]> 100 250 43
6 Type_A <dbl [7]> 100 500 0
# ... with 22 more rows
dat_in_two
的期望结果
# A tibble: 28 x 6
rn strata lower upper rest value
<chr> <list> <int> <int> <chr> <dbl>
1 Type_A <dbl [7]> 0 25 East 5
2 Type_A <dbl [7]> 25 50 South 0
3 Type_A <dbl [7]> 25 100 West 38
4 Type_A <dbl [7]> 50 100 North 0
5 Type_A <dbl [7]> 100 250 East 43
6 Type_A <dbl [7]> 100 500 South 0
# ... with 22 more rows
这个怎么样:
# Example input whose interval labels carry no suffix: `rn` names the two
# rows, each back-quoted interval label is an integer column, `Sum_bin` holds
# the row totals of those columns and `strata` is a list-column with one
# numeric vector per row.
dat_in_one <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
# Example input whose interval labels are followed by a direction word
# (East/South/West/North): `rn` names the two rows, each back-quoted label is
# an integer column, `Sum_bin` holds the row totals of those columns and
# `strata` is a list-column with one numeric vector per row.
dat_in_two <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25) East` = c(5L, 0L),
    `[25,50) South` = c(0L, 0L),
    `[25,100) West` = c(38L, 3L),
    `[50,100) North` = c(0L, 0L),
    `[100,250) East` = c(43L, 5L),
    `[100,500) South` = c(0L, 0L),
    `[250,500) South` = c(27L, 12L),
    `[500,1000) South` = c(44L, 0L),
    `[1000,1500) South` = c(0L, 0L),
    `[1500,3000) South` = c(0L, 0L),
    `[500,1000000] East` = c(0L, 53L),
    `[1000,1000000] South` = c(20L, 0L),
    `[3000,1000000] South` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
library(dplyr)
library(tidyr)
# Predicate for where(): TRUE when `x` holds no non-missing value.
is_all_na <- function(x) {
  !any(!is.na(x))
}
# Reshape every column except `rn` and `strata` to long format, then split
# each former column name into `lower`, `upper` and `rest`. The pattern
# captures the two bounds, consumes the closing "]" or ")" and any whitespace,
# and captures the trailing word (which may be empty) as `rest`.
dat_in_two %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "rest"),
    # Backslashes are doubled: "\d", "\]", "\)", "\s" and "\w" are not valid
    # escapes in an R string literal.
    "(\\d+),(\\d+)[\\]\\)]\\s*(\\w*)",
    convert = TRUE
  ) %>%
  # Drop every column that consists only of NA.
  select(-where(is_all_na))
#> # A tibble: 28 × 6
#> rn strata lower upper rest value
#> <chr> <list> <int> <int> <chr> <dbl>
#> 1 Type_A <dbl [7]> 0 25 East 5
#> 2 Type_A <dbl [7]> 25 50 South 0
#> 3 Type_A <dbl [7]> 25 100 West 38
#> 4 Type_A <dbl [7]> 50 100 North 0
#> 5 Type_A <dbl [7]> 100 250 East 43
#> 6 Type_A <dbl [7]> 100 500 South 0
#> 7 Type_A <dbl [7]> 250 500 South 27
#> 8 Type_A <dbl [7]> 500 1000 South 44
#> 9 Type_A <dbl [7]> 1000 1500 South 0
#> 10 Type_A <dbl [7]> 1500 3000 South 0
#> # … with 18 more rows
# Same pipeline applied to the labels without a suffix. `rest` comes back
# entirely NA here, so the final select() removes it, as the printed 5-column
# result shows.
dat_in_one %>%
  pivot_longer(-c(rn, strata)) %>%
  extract(
    name,
    c("lower", "upper", "rest"),
    # Backslashes are doubled: "\d", "\]", "\)", "\s" and "\w" are not valid
    # escapes in an R string literal.
    "(\\d+),(\\d+)[\\]\\)]\\s*(\\w*)",
    convert = TRUE
  ) %>%
  # Drop every column that consists only of NA.
  select(-where(is_all_na))
#> # A tibble: 28 × 5
#> rn strata lower upper value
#> <chr> <list> <int> <int> <dbl>
#> 1 Type_A <dbl [7]> 0 25 5
#> 2 Type_A <dbl [7]> 25 50 0
#> 3 Type_A <dbl [7]> 25 100 38
#> 4 Type_A <dbl [7]> 50 100 0
#> 5 Type_A <dbl [7]> 100 250 43
#> 6 Type_A <dbl [7]> 100 500 0
#> 7 Type_A <dbl [7]> 250 500 27
#> 8 Type_A <dbl [7]> 500 1000 44
#> 9 Type_A <dbl [7]> 1000 1500 0
#> 10 Type_A <dbl [7]> 1500 3000 0
#> # … with 18 more rows
由 reprex package (v2.0.1)
于 2022-05-04 创建