根据多个条件填充多列的行

Question

我有如下示例数据：

# Example data: 12 observations, built directly with structure().
dat <- structure(list(
zipcode = c(1001, 1002, 1003, 1004, 1101, 1102, 1103, 1104, 1201, 1202, 1203, 1302), 
areacode = c(4, 4, NA, 4, 4, 4, NA, 1, 4, 4, NA, 4), 
type = structure(c(1L, 1L, NA, 1L, 2L, 2L, NA, 1L, 1L, 1L, NA, 1L), 
.Label = c("clay", "sand"), class = "factor"), 
region = c(3, 3, NA, 3, 3, 3, NA, 3, 3, 3, NA, 3), 
do_not_fill = c(1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1)), 
# NOTE(review): row.names = c(NA, -4L) gives only 4 row names although every
# column holds 12 values; this mismatch is likely what triggers the
# "`n` and `row.names` must be consistent" error shown below.
class = c("data.table", "data.frame"), row.names = c(NA, -4L))

    zipcode areacode type region do_not_fill
 1:    1001        4 clay      3           1
 2:    1002        4 clay      3          NA
 3:    1003       NA <NA>     NA          NA
 4:    1004        4 clay      3           1
 5:    1101        4 sand      3           1
 6:    1102        4 sand      3          NA
 7:    1103       NA <NA>     NA          NA
 8:    1104        1 clay      3           1
 9:    1201        4 clay      3          NA
10:    1202        4 clay      3          NA
11:    1203       NA <NA>     NA          NA
12:    1302        4 clay      3           1

我想根据以下两个条件，仅填充 areacode、type 和 region 这三列：

NA 前后两行的 areacode 必须相同。
NA 前后两行的 zipcode 前两位必须相同。

基于此回答以及此解决方案，我尝试了下面的代码（不过也欢迎 data.table 的解决方案，甚至更倾向于它）：

library(dplyr)
# Attempt: in columns 1-4, replace an NA with the previous row's value when the
# rows before and after it have the same areacode and the same first two
# zipcode digits. Run on `dat` directly (a data.table), this is the call that
# produces the error quoted below.
dat |> 
  # type is a factor in `dat`; convert it to character before filling.
  mutate(type = as.character(type)) |> 
  mutate(across(1:4,
                # `.` is the column currently being processed by across().
                ~ ifelse(is.na(.) & lag(areacode) == lead(areacode) & 
                         lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
                         lag(.),
                         .)))

但是我在某处做错了，因为我得到：

Error:
! `n` and `row.names` must be consistent.
Run `rlang::last_error()` to see where the error occurred.

期望的输出：

    zipcode areacode type region do_not_fill
 1:    1001        4 clay      3           1
 2:    1002        4 clay      3          NA
 3:    1003        4 clay      3          NA
 4:    1004        4 clay      3           1
 5:    1101        4 sand      3           1
 6:    1102        4 sand      3          NA
 7:    1103       NA <NA>     NA          NA
 8:    1104        1 clay      3           1
 9:    1201        4 clay      3          NA
10:    1202        4 clay      3          NA
11:    1203       NA <NA>     NA          NA
12:    1302        4 clay      3           1

编辑

# Same fill rule as the first attempt, but starting from a tibble.
as_tibble(dat) |>
  # NOTE(review): this assigns the character form of areacode to type, which is
  # why the type column in the output below shows areacode values ("4", "1")
  # instead of clay/sand. as.character(type) was presumably intended.
  mutate(type = as.character(areacode)) |> 
  mutate(across(1:4,
                # `.` is the column currently being processed by across().
                ~ ifelse(is.na(.) & lag(areacode) == lead(areacode) & 
                           lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
                         lag(.),
                         .)))


# A tibble: 12 x 5
   zipcode areacode type  region do_not_fill
     <dbl>    <dbl> <chr>  <dbl>       <dbl>
 1    1001        4 4          3           1
 2    1002        4 4          3          NA
 3    1003        4 4          3          NA
 4    1004        4 4          3           1
 5    1101        4 4          3           1
 6    1102        4 4          3          NA
 7    1103       NA NA        NA          NA
 8    1104        1 1          3           1
 9    1201        4 4          3          NA
10    1202        4 4          3          NA
11    1203       NA NA        NA          NA
12    1302        4 4          3           1

Answer 1

您需要先将其转换为 tibble。我猜这是因为 data.table 带有额外的属性。

看看行名，

rownames(as_tibble(dat))
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
rownames(dat)
 [1] "1" "2" "3" "4"

# Convert to a tibble first, then fill NAs in columns 1-4 with the previous
# row's value when the rows on either side share the same areacode and the same
# two-digit zipcode prefix.
as_tibble(dat) |>
  mutate(type = as.character(type)) |>
  mutate(
    across(
      1:4,
      ~ ifelse(
        is.na(.x) &
          lag(areacode) == lead(areacode) &
          lag(as.numeric(substr(zipcode, 1, 2))) ==
            lead(as.numeric(substr(zipcode, 1, 2))),
        lag(.x),
        .x
      )
    )
  )

# A tibble: 12 x 5
   zipcode areacode type  region do_not_fill
     <dbl>    <dbl> <chr>  <dbl>       <dbl>
 1    1001        4 clay       3           1
 2    1002        4 clay       3          NA
 3    1003        4 clay       3          NA
 4    1004        4 clay       3           1
 5    1101        4 sand       3           1
 6    1102        4 sand       3          NA
 7    1103       NA NA        NA          NA
 8    1104        1 clay       3           1
 9    1201        4 clay       3          NA
10    1202        4 clay       3          NA
11    1203       NA NA        NA          NA
12    1302        4 clay       3           1

Answer 2

这可以在 data.table 中使用相同的代码完成：

# Apply the fill rule to every column except do_not_fill (selected through
# .SDcols), then append do_not_fill unchanged.
# lag()/lead() are presumably dplyr's, attached earlier via library(dplyr).
dat[, c(lapply(.SD, \(v) {fifelse(
  # Test the column being filled (v), mirroring is.na(.) in the dplyr version.
  # Testing is.na(areacode) here would also replace zipcode with the previous
  # row's zipcode wherever areacode is NA.
  is.na(v) & lag(areacode) == lead(areacode) &
    lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))), lag(v), v)}), 
  .SD[, .(do_not_fill)]), .SDcols = !patterns("do_not_fill")]


    zipcode areacode   type region do_not_fill
      <num>    <num> <fctr>  <num>       <num>
 1:    1001        4   clay      3           1
 2:    1002        4   clay      3          NA
 3:    1003        4   clay      3          NA
 4:    1004        4   clay      3           1
 5:    1101        4   sand      3           1
 6:    1102        4   sand      3          NA
 7:    1103       NA   <NA>     NA          NA
 8:    1104        1   clay      3           1
 9:    1201        4   clay      3          NA
10:    1202        4   clay      3          NA
11:    1203       NA   <NA>     NA          NA
12:    1302        4   clay      3           1

根据多个条件填充多列的行

Filling rows of multiple columns based on multiple conditions

r

dplyr

data.table

编辑