有条件地替换 R 中 "tree structure" 数据帧中的变量

Conditional replacement of variables in a "tree structure" dataframe in R

我有一些关于 n 年期间不同政府级别收入的数据。政府分为两级:1 级和 2 级。2 级是我在整理好数据后需要进行分析的级别,也就是说最终样本将只包含 2 级地方政府。2 级政府“隶属于”相应的 1 级政府:例如 DE1 是 1 级,DE1x 是 DE1 之下的 2 级次区域政府。这种模式在数据集中是一致的(例如 CZ1x 属于 CZ1,IT3x 属于 IT3 等等),因此可以认为数据具有树状结构。国家代码也可用,即区域代码(Region)的前两个字母。数据如下所示:

# Sample panel: 10 regions x 3 years (2000-2002). Every column ends up as
# text because cbind() builds a character matrix; missing revenues are the
# literal string "NA". The random draws are unseeded, so values differ per run.
data <- as.data.frame(cbind(
  # Region code, three consecutive rows (one per year) for each region
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code: 21 German rows followed by 9 Czech rows
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the region codes
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year, cycling 2000-2002 within every region
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with fixed values and "NA" markers
  c(
    runif(n = 3, min = 1300, max = 21220),
    "NA", "NA", 16000,
    runif(n = 12, min = 1300, max = 21220),
    "NA", 18000, "NA",
    runif(n = 6, min = 1300, max = 21220),
    "NA", "NA", "NA"
  )
))

names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")

data
Region Country Gvt Lvl Time          Revenue
DE1       DE       1   2000 16858.6538477242
DE1       DE       1   2001  7788.3873622492
DE1       DE       1   2002 19988.1219627894
DE11      DE       2   2000               NA
DE11      DE       2   2001               NA
DE11      DE       2   2002            16000
DE12      DE       2   2000 6660.73037594557
DE12      DE       2   2001 9005.15880053863
DE12      DE       2   2002 2322.38461054862
DE2       DE       1   2000 16887.0197726786
DE2       DE       1   2001 11184.8074057698
DE2       DE       1   2002 1442.17075794935
DE21      DE       2   2000 6902.39389214665
DE21      DE       2   2001 6562.93060332537
DE21      DE       2   2002 17302.4776424281
DE22      DE       2   2000 16508.5772226751
DE22      DE       2   2001 2753.07780653238
DE22      DE       2   2002 2198.10680534691
DE23      DE       2   2000               NA
DE23      DE       2   2001            18000
DE23      DE       2   2002               NA
CZ0       CZ       1   2000 8614.85693316907
CZ0       CZ       1   2001 9601.59771829844
CZ0       CZ       1   2002 7134.94570834562
CZ01      CZ       2   2000 8562.89313737303
CZ01      CZ       2   2001 10880.8537839726
CZ01      CZ       2   2002  6957.3313607648
CZ02      CZ       2   2000               NA
CZ02      CZ       2   2001               NA
CZ02      CZ       2   2002               NA

在某些情况下,第 2 级的数据缺失,或者是所有年份,或者只是其中一些年份。发生这种情况时,我想用相应的更高级别 (1) 观察到的那个缺失年份的值(如果有的话)替换较低级别政府 (2) 的 NA。

我想,读取 Region 的前三个字符也许可行,但我一直没能想出一个可行的解决方案。我的样本中有 9000 多个观察值(行)。

从 Region 中提取公共部分(前三个字符)后,可以用同一组、同一年份的 1 级政府的值来填补缺失值:

library(dplyr)

data %>%
  # Revenue is stored as text with literal "NA" markers; make it numeric.
  mutate(Revenue = as.numeric(na_if(Revenue, "NA"))) %>%
  # One group per level-1 region (first three characters of Region) and year.
  group_by(group = substr(Region, 1, 3), Time) %>%
  # Fill missing values with the level-1 value of the same group and year.
  # `[1]` yields NA when the group has no level-1 row, so such groups keep
  # their NA instead of failing with "replacement has length zero".
  mutate(Revenue = coalesce(Revenue, Revenue[Region == group][1])) %>%
  ungroup()

# A tibble: 30 x 6
#   Region Country `Gvt Lvl` Time  Revenue group
#   <chr>  <chr>   <chr>     <chr>   <dbl> <chr>
# 1 DE1    DE      1         2000   13259. DE1  
# 2 DE1    DE      1         2001   16229. DE1  
# 3 DE1    DE      1         2002   19929. DE1  
# 4 DE11   DE      2         2000   13259. DE1  
# 5 DE11   DE      2         2001   16229. DE1  
# 6 DE11   DE      2         2002   16000  DE1  
# 7 DE12   DE      2         2000    2793. DE1  
# 8 DE12   DE      2         2001    3491. DE1  
# 9 DE12   DE      2         2002   14854. DE1  
#10 DE2    DE      1         2000    3976. DE2  
# … with 20 more rows
library(data.table)
#> Warning: package 'data.table' was built under R version 4.0.4

# Fixed seed so the random revenues below are reproducible.
set.seed(42)

# Sample panel: 10 regions x 3 years (2000-2002). cbind() builds a character
# matrix, so every column (including Revenue) is text and the missing
# revenues become NA_character_.
data <- as.data.frame(cbind(
  # Region code, three consecutive rows (one per year) for each region
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code: 21 German rows followed by 9 Czech rows
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the region codes
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year, cycling 2000-2002 within every region
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with fixed values and missing entries
  c(
    runif(n = 3, min = 1300, max = 21220),
    NA, NA, 16000,
    runif(n = 12, min = 1300, max = 21220),
    NA, 18000, NA,
    runif(n = 6, min = 1300, max = 21220),
    NA, NA, NA
  )
))

names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")

# Convert the data frame to a data.table by reference.
setDT(data)

# Add the join key: the first three characters of Region, which the printed
# table shows to be the code of the level-1 region each row belongs to.
set(data, j = "level1", value = substr(data$Region, 1, 3))
data[]
#>     Region Country Gvt Lvl Time          Revenue level1
#>  1:    DE1      DE       1 2000 19522.9363864474    DE1
#>  2:    DE1      DE       1 2001 19966.5422328934    DE1
#>  3:    DE1      DE       1 2002 6999.89953294396    DE1
#>  4:   DE11      DE       2 2000             <NA>    DE1
#>  5:   DE11      DE       2 2001             <NA>    DE1
#>  6:   DE11      DE       2 2002            16000    DE1
#>  7:   DE12      DE       2 2000 17842.5167112611    DE1
#>  8:   DE12      DE       2 2001 14083.5707363486    DE1
#>  9:   DE12      DE       2 2002 11640.3913066722    DE1
#> 10:    DE2      DE       1 2000  15972.839227654    DE2
#> 11:    DE2      DE       1 2001 3982.55861697719    DE2
#> 12:    DE2      DE       1 2002 14387.2864248045    DE2
#> 13:   DE21      DE       2 2000  15344.890498016    DE2
#> 14:   DE21      DE       2 2001 10418.2161828689    DE2
#> 15:   DE21      DE       2 2002 15624.7160529159    DE2
#> 16:   DE22      DE       2 2000 19918.6711632833    DE2
#> 17:   DE22      DE       2 2001 6388.14218087122    DE2
#> 18:   DE22      DE       2 2002 10508.8730250672    DE2
#> 19:   DE23      DE       2 2000             <NA>    DE2
#> 20:   DE23      DE       2 2001            18000    DE2
#> 21:   DE23      DE       2 2002             <NA>    DE2
#> 22:    CZ0      CZ       1 2000 20025.0892932899    CZ0
#> 23:    CZ0      CZ       1 2001 20786.2704534456    CZ0
#> 24:    CZ0      CZ       1 2002 3640.34824416041    CZ0
#> 25:   CZ01      CZ       2 2000 10761.9418646954    CZ0
#> 26:   CZ01      CZ       2 2001 12461.8283051997    CZ0
#> 27:   CZ01      CZ       2 2002 19308.3052349649    CZ0
#> 28:   CZ02      CZ       2 2000             <NA>    CZ0
#> 29:   CZ02      CZ       2 2001             <NA>    CZ0
#> 30:   CZ02      CZ       2 2002             <NA>    CZ0
#>     Region Country Gvt Lvl Time          Revenue level1

# Update join: match every row to the level-1 row with the same level1 code
# and year, and keep the row's own Revenue unless it is missing, in which
# case take the level-1 value (i.Revenue).
data[
  data[`Gvt Lvl` == 1],
  on = c("level1", "Time"),
  Revenue := fcoalesce(Revenue, i.Revenue)
]
data[]
#>     Region Country Gvt Lvl Time          Revenue level1
#>  1:    DE1      DE       1 2000 19522.9363864474    DE1
#>  2:    DE1      DE       1 2001 19966.5422328934    DE1
#>  3:    DE1      DE       1 2002 6999.89953294396    DE1
#>  4:   DE11      DE       2 2000 19522.9363864474    DE1
#>  5:   DE11      DE       2 2001 19966.5422328934    DE1
#>  6:   DE11      DE       2 2002            16000    DE1
#>  7:   DE12      DE       2 2000 17842.5167112611    DE1
#>  8:   DE12      DE       2 2001 14083.5707363486    DE1
#>  9:   DE12      DE       2 2002 11640.3913066722    DE1
#> 10:    DE2      DE       1 2000  15972.839227654    DE2
#> 11:    DE2      DE       1 2001 3982.55861697719    DE2
#> 12:    DE2      DE       1 2002 14387.2864248045    DE2
#> 13:   DE21      DE       2 2000  15344.890498016    DE2
#> 14:   DE21      DE       2 2001 10418.2161828689    DE2
#> 15:   DE21      DE       2 2002 15624.7160529159    DE2
#> 16:   DE22      DE       2 2000 19918.6711632833    DE2
#> 17:   DE22      DE       2 2001 6388.14218087122    DE2
#> 18:   DE22      DE       2 2002 10508.8730250672    DE2
#> 19:   DE23      DE       2 2000  15972.839227654    DE2
#> 20:   DE23      DE       2 2001            18000    DE2
#> 21:   DE23      DE       2 2002 14387.2864248045    DE2
#> 22:    CZ0      CZ       1 2000 20025.0892932899    CZ0
#> 23:    CZ0      CZ       1 2001 20786.2704534456    CZ0
#> 24:    CZ0      CZ       1 2002 3640.34824416041    CZ0
#> 25:   CZ01      CZ       2 2000 10761.9418646954    CZ0
#> 26:   CZ01      CZ       2 2001 12461.8283051997    CZ0
#> 27:   CZ01      CZ       2 2002 19308.3052349649    CZ0
#> 28:   CZ02      CZ       2 2000 20025.0892932899    CZ0
#> 29:   CZ02      CZ       2 2001 20786.2704534456    CZ0
#> 30:   CZ02      CZ       2 2002 3640.34824416041    CZ0
#>     Region Country Gvt Lvl Time          Revenue level1

由 reprex 包 (v1.0.0) 于 2021 年 3 月 16 日创建