根据多个条件填充多列的行
Filling rows of multiple columns based on multiple conditions
我有如下示例数据:
# Example data: 12 zip codes with gaps (NA) in areacode, type and region.
dat <- structure(list(
zipcode = c(1001, 1002, 1003, 1004, 1101, 1102, 1103, 1104, 1201, 1202, 1203, 1302),
areacode = c(4, 4, NA, 4, 4, 4, NA, 1, 4, 4, NA, 4),
# Factor built from integer codes: 1 = "clay", 2 = "sand".
type = structure(c(1L, 1L, NA, 1L, 2L, 2L, NA, 1L, 1L, 1L, NA, 1L),
.Label = c("clay", "sand"), class = "factor"),
region = c(3, 3, NA, 3, 3, 3, NA, 3, 3, 3, NA, 3),
do_not_fill = c(1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1)),
# NOTE(review): row.names declares 4 rows although every column holds 12 values.
class = c("data.table", "data.frame"), row.names = c(NA, -4L))
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 NA <NA> NA NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
我想根据以下两个条件,仅填充 areacode、type 和 region 这三列中的 NA:
- NA 前后两行的 areacode 必须相同。
- NA 前后两行的 zipcode 的前两位必须相同。
基于另外两个解决方案,我尝试了如下写法(不过也欢迎 data.table 解决方案,甚至更希望是这种方案):
library(dplyr)
# Attempted fill: in each of the first four columns, replace an NA with the
# previous row's value when the rows before and after it have the same
# areacode and the same first two zipcode digits.
# NOTE(review): this call raises the `n` / `row.names` error quoted below;
# dat's row.names attribute declares 4 rows while its columns hold 12 values.
dat |>
# type is converted to character before ifelse() is applied to it
# (presumably because ifelse() drops factor attributes).
mutate(type = as.character(type)) |>
mutate(across(1:4,
~ ifelse(is.na(.) & lag(areacode) == lead(areacode) &
lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
lag(.),
.)))
但是我在某处做错了,因为我得到:
Error:
! `n` and `row.names` must be consistent.
Run `rlang::last_error()` to see where the error occurred.
期望的输出:
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
编辑
# Same fill as the attempt above, but run on a tibble copy of dat.
# NOTE(review): `type` is assigned as.character(areacode) here, not
# as.character(type), which is why the printed `type` column below holds
# area codes (4 / 1) instead of clay / sand.
as_tibble(dat) |>
mutate(type = as.character(areacode)) |>
mutate(across(1:4,
~ ifelse(is.na(.) & lag(areacode) == lead(areacode) &
lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
lag(.),
.)))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 4 3 1
2 1002 4 4 3 NA
3 1003 4 4 3 NA
4 1004 4 4 3 1
5 1101 4 4 3 1
6 1102 4 4 3 NA
7 1103 NA NA NA NA
8 1104 1 1 3 1
9 1201 4 4 3 NA
10 1202 4 4 3 NA
11 1203 NA NA NA NA
12 1302 4 4 3 1
您需要先将其转换为 tibble。我猜这是因为该 data.table 带有额外的属性。
看看行名:
rownames(as_tibble(dat))
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
rownames(dat)
[1] "1" "2" "3" "4"
# Convert dat to a tibble, then fill gaps in the first four columns: an NA
# takes the previous row's value when the neighbouring rows agree on areacode
# and on the first two digits of zipcode.
dat |>
  as_tibble() |>
  mutate(type = as.character(type)) |>
  mutate(across(
    1:4,
    \(col) {
      zip_prefix <- as.numeric(substr(zipcode, 1, 2))
      same_area <- lag(areacode) == lead(areacode)
      same_prefix <- lag(zip_prefix) == lead(zip_prefix)
      ifelse(is.na(col) & same_area & same_prefix, lag(col), col)
    }
  ))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 clay 3 1
2 1002 4 clay 3 NA
3 1003 4 clay 3 NA
4 1004 4 clay 3 1
5 1101 4 sand 3 1
6 1102 4 sand 3 NA
7 1103 NA NA NA NA
8 1104 1 clay 3 1
9 1201 4 clay 3 NA
10 1202 4 clay 3 NA
11 1203 NA NA NA NA
12 1302 4 clay 3 1
这可以在 data.table
中使用相同的代码完成:
# Build the filled table with data.table syntax. For every column except
# do_not_fill, an NA is replaced by the previous row's value when the rows
# before and after it share the same areacode and the same first two zipcode
# digits. The test uses is.na(v), i.e. the column being filled, so a column
# without gaps (zipcode) is never overwritten.
# lag() and lead() are the dplyr helpers attached by library(dplyr) earlier.
dat[, c(
  lapply(.SD, \(v) {
    fifelse(
      is.na(v) & lag(areacode) == lead(areacode) &
        lag(as.numeric(substr(zipcode, 1, 2))) ==
          lead(as.numeric(substr(zipcode, 1, 2))),
      lag(v),
      v
    )
  }),
  # do_not_fill is excluded from .SDcols and appended unchanged.
  .SD[, .(do_not_fill)]
), .SDcols = !patterns("do_not_fill")]
zipcode areacode type region do_not_fill
<num> <num> <fctr> <num> <num>
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
我有如下示例数据:
# Example data: 12 zip codes with gaps (NA) in areacode, type and region.
dat <- structure(list(
zipcode = c(1001, 1002, 1003, 1004, 1101, 1102, 1103, 1104, 1201, 1202, 1203, 1302),
areacode = c(4, 4, NA, 4, 4, 4, NA, 1, 4, 4, NA, 4),
# Factor built from integer codes: 1 = "clay", 2 = "sand".
type = structure(c(1L, 1L, NA, 1L, 2L, 2L, NA, 1L, 1L, 1L, NA, 1L),
.Label = c("clay", "sand"), class = "factor"),
region = c(3, 3, NA, 3, 3, 3, NA, 3, 3, 3, NA, 3),
do_not_fill = c(1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1)),
# NOTE(review): row.names declares 4 rows although every column holds 12 values.
class = c("data.table", "data.frame"), row.names = c(NA, -4L))
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 NA <NA> NA NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
我想根据以下两个条件,仅填充 areacode、type 和 region 这三列中的 NA:
- NA 前后两行的 areacode 必须相同。
- NA 前后两行的 zipcode 的前两位必须相同。
基于已有的解决方案,我尝试了如下写法(不过也欢迎 data.table 解决方案,甚至更希望是这种方案):
library(dplyr)
# Attempted fill: in each of the first four columns, replace an NA with the
# previous row's value when the rows before and after it have the same
# areacode and the same first two zipcode digits.
# NOTE(review): this call raises the `n` / `row.names` error quoted below;
# dat's row.names attribute declares 4 rows while its columns hold 12 values.
dat |>
# type is converted to character before ifelse() is applied to it
# (presumably because ifelse() drops factor attributes).
mutate(type = as.character(type)) |>
mutate(across(1:4,
~ ifelse(is.na(.) & lag(areacode) == lead(areacode) &
lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
lag(.),
.)))
但是我在某处做错了,因为我得到:
Error:
! `n` and `row.names` must be consistent.
Run `rlang::last_error()` to see where the error occurred.
期望的输出:
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
编辑
# Same fill as the attempt above, but run on a tibble copy of dat.
# NOTE(review): `type` is assigned as.character(areacode) here, not
# as.character(type), which is why the printed `type` column below holds
# area codes (4 / 1) instead of clay / sand.
as_tibble(dat) |>
mutate(type = as.character(areacode)) |>
mutate(across(1:4,
~ ifelse(is.na(.) & lag(areacode) == lead(areacode) &
lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
lag(.),
.)))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 4 3 1
2 1002 4 4 3 NA
3 1003 4 4 3 NA
4 1004 4 4 3 1
5 1101 4 4 3 1
6 1102 4 4 3 NA
7 1103 NA NA NA NA
8 1104 1 1 3 1
9 1201 4 4 3 NA
10 1202 4 4 3 NA
11 1203 NA NA NA NA
12 1302 4 4 3 1
您需要先将其转换为 tibble。我猜这是因为该 data.table 带有额外的属性。
看看行名:
rownames(as_tibble(dat))
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
rownames(dat)
[1] "1" "2" "3" "4"
# Convert dat to a tibble, then fill gaps in the first four columns: an NA
# takes the previous row's value when the neighbouring rows agree on areacode
# and on the first two digits of zipcode.
dat |>
  as_tibble() |>
  mutate(type = as.character(type)) |>
  mutate(across(
    1:4,
    \(col) {
      zip_prefix <- as.numeric(substr(zipcode, 1, 2))
      same_area <- lag(areacode) == lead(areacode)
      same_prefix <- lag(zip_prefix) == lead(zip_prefix)
      ifelse(is.na(col) & same_area & same_prefix, lag(col), col)
    }
  ))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 clay 3 1
2 1002 4 clay 3 NA
3 1003 4 clay 3 NA
4 1004 4 clay 3 1
5 1101 4 sand 3 1
6 1102 4 sand 3 NA
7 1103 NA NA NA NA
8 1104 1 clay 3 1
9 1201 4 clay 3 NA
10 1202 4 clay 3 NA
11 1203 NA NA NA NA
12 1302 4 clay 3 1
这可以在 data.table
中使用相同的代码完成:
# Build the filled table with data.table syntax. For every column except
# do_not_fill, an NA is replaced by the previous row's value when the rows
# before and after it share the same areacode and the same first two zipcode
# digits. The test uses is.na(v), i.e. the column being filled, so a column
# without gaps (zipcode) is never overwritten.
# lag() and lead() are the dplyr helpers attached by library(dplyr) earlier.
dat[, c(
  lapply(.SD, \(v) {
    fifelse(
      is.na(v) & lag(areacode) == lead(areacode) &
        lag(as.numeric(substr(zipcode, 1, 2))) ==
          lead(as.numeric(substr(zipcode, 1, 2))),
      lag(v),
      v
    )
  }),
  # do_not_fill is excluded from .SDcols and appended unchanged.
  .SD[, .(do_not_fill)]
), .SDcols = !patterns("do_not_fill")]
zipcode areacode type region do_not_fill
<num> <num> <fctr> <num> <num>
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1