有条件地替换 R 中 "tree structure" 数据帧中的变量
Conditional replacement of variables in a "tree structure" dataframe in R
我有一些关于 n 年期间不同政府级别收入的数据。有两级政府,1 和 2。 Level 2 是我在设置数据后需要进行分析的级别,这意味着最终样本将仅包括 2 级地方政府。二级政府“属于”相应的一级政府:例如DE1 是 1 级,DE1x 是 DE1 伞下的 2 级次区域政府。这种模式在数据集中是一致的(例如 CZ1x 属于 CZ1,IT3x 属于 IT3 等等)。数据可以被认为具有树结构。国家代码也可用,由政府级别的前两个字母给出。数据如下所示:
# Build the example panel: 10 regions x 3 years.
# cbind() coerces all of its inputs to a single character matrix, so Revenue
# holds the numbers as text and the gaps as the literal string "NA".
data <- as.data.frame(cbind(
  # Region code, repeated once per year
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the codes above
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with gaps and two fixed values
  c(runif(n = 3, min = 1300, max = 21220), "NA", "NA", 16000,
    runif(n = 12, min = 1300, max = 21220), "NA", 18000, "NA",
    runif(n = 6, min = 1300, max = 21220),
    "NA", "NA", "NA")
))
names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")
data
Region Country Gvt Lvl Time Revenue
DE1 DE 1 2000 16858.6538477242
DE1 DE 1 2001 7788.3873622492
DE1 DE 1 2002 19988.1219627894
DE11 DE 2 2000 NA
DE11 DE 2 2001 NA
DE11 DE 2 2002 16000
DE12 DE 2 2000 6660.73037594557
DE12 DE 2 2001 9005.15880053863
DE12 DE 2 2002 2322.38461054862
DE2 DE 1 2000 16887.0197726786
DE2 DE 1 2001 11184.8074057698
DE2 DE 1 2002 1442.17075794935
DE21 DE 2 2000 6902.39389214665
DE21 DE 2 2001 6562.93060332537
DE21 DE 2 2002 17302.4776424281
DE22 DE 2 2000 16508.5772226751
DE22 DE 2 2001 2753.07780653238
DE22 DE 2 2002 2198.10680534691
DE23 DE 2 2000 NA
DE23 DE 2 2001 18000
DE23 DE 2 2002 NA
CZ0 CZ 1 2000 8614.85693316907
CZ0 CZ 1 2001 9601.59771829844
CZ0 CZ 1 2002 7134.94570834562
CZ01 CZ 2 2000 8562.89313737303
CZ01 CZ 2 2001 10880.8537839726
CZ01 CZ 2 2002 6957.3313607648
CZ02 CZ 2 2000 NA
CZ02 CZ 2 2001 NA
CZ02 CZ 2 2002 NA
在某些情况下,第 2 级的数据缺失,或者是所有年份,或者只是其中一些年份。发生这种情况时,我想用相应的更高级别 (1) 观察到的那个缺失年份的值(如果有的话)替换较低级别政府 (2) 的 NA。
我想,读取 Region 的前三个字符也许可行,但我一直没能想出一个可行的解决方案。我的样本中有 9000 多个观测值(行)。
从 Region 中提取公共部分(前三个字符)后,您可以按该公共部分和年份分组,并用同组中 1 级政府的值填充缺失值:
library(dplyr)
# Fill missing level-2 revenue with the level-1 parent's revenue for the
# same year.
data %>%
  # Revenue arrives as text with literal "NA" strings; turn those into real
  # missing values before converting to numeric.
  mutate(Revenue = as.numeric(na_if(Revenue, "NA"))) %>%
  # The first three characters of Region are the level-1 parent's code, so
  # each group holds one parent and its children for a single year.
  group_by(group = substr(Region, 1, 3), Time) %>%
  # match() gives NA when the group has no level-1 row, so those rows simply
  # keep their NA instead of failing with a zero-length replacement.
  mutate(Revenue = coalesce(Revenue, Revenue[match(group, Region)])) %>%
  ungroup()
# A tibble: 30 x 6
# Region Country `Gvt Lvl` Time Revenue group
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 DE1 DE 1 2000 13259. DE1
# 2 DE1 DE 1 2001 16229. DE1
# 3 DE1 DE 1 2002 19929. DE1
# 4 DE11 DE 2 2000 13259. DE1
# 5 DE11 DE 2 2001 16229. DE1
# 6 DE11 DE 2 2002 16000 DE1
# 7 DE12 DE 2 2000 2793. DE1
# 8 DE12 DE 2 2001 3491. DE1
# 9 DE12 DE 2 2002 14854. DE1
#10 DE2 DE 1 2000 3976. DE2
# … with 20 more rows
library(data.table)
#> Warning: package 'data.table' was built under R version 4.0.4
# Fix the RNG so the printed revenues are reproducible.
set.seed(42)

# Build the example panel: 10 regions x 3 years.
# cbind() coerces all of its inputs to a single character matrix, so Revenue
# ends up as text; the gaps are real NA values here, not "NA" strings.
data <- as.data.frame(cbind(
  # Region code, repeated once per year
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the codes above
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with gaps and two fixed values
  c(runif(n = 3, min = 1300, max = 21220), NA, NA, 16000,
    runif(n = 12, min = 1300, max = 21220), NA, 18000, NA,
    runif(n = 6, min = 1300, max = 21220),
    NA, NA, NA)
))
names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")
# Convert to a data.table in place.
setDT(data)
# The first three characters of Region are the level-1 parent's code; keep
# them as a join key. The trailing [] prints the updated table.
data[, level1 := substr(Region, start = 1, stop = 3)][]
#> Region Country Gvt Lvl Time Revenue level1
#> 1: DE1 DE 1 2000 19522.9363864474 DE1
#> 2: DE1 DE 1 2001 19966.5422328934 DE1
#> 3: DE1 DE 1 2002 6999.89953294396 DE1
#> 4: DE11 DE 2 2000 <NA> DE1
#> 5: DE11 DE 2 2001 <NA> DE1
#> 6: DE11 DE 2 2002 16000 DE1
#> 7: DE12 DE 2 2000 17842.5167112611 DE1
#> 8: DE12 DE 2 2001 14083.5707363486 DE1
#> 9: DE12 DE 2 2002 11640.3913066722 DE1
#> 10: DE2 DE 1 2000 15972.839227654 DE2
#> 11: DE2 DE 1 2001 3982.55861697719 DE2
#> 12: DE2 DE 1 2002 14387.2864248045 DE2
#> 13: DE21 DE 2 2000 15344.890498016 DE2
#> 14: DE21 DE 2 2001 10418.2161828689 DE2
#> 15: DE21 DE 2 2002 15624.7160529159 DE2
#> 16: DE22 DE 2 2000 19918.6711632833 DE2
#> 17: DE22 DE 2 2001 6388.14218087122 DE2
#> 18: DE22 DE 2 2002 10508.8730250672 DE2
#> 19: DE23 DE 2 2000 <NA> DE2
#> 20: DE23 DE 2 2001 18000 DE2
#> 21: DE23 DE 2 2002 <NA> DE2
#> 22: CZ0 CZ 1 2000 20025.0892932899 CZ0
#> 23: CZ0 CZ 1 2001 20786.2704534456 CZ0
#> 24: CZ0 CZ 1 2002 3640.34824416041 CZ0
#> 25: CZ01 CZ 2 2000 10761.9418646954 CZ0
#> 26: CZ01 CZ 2 2001 12461.8283051997 CZ0
#> 27: CZ01 CZ 2 2002 19308.3052349649 CZ0
#> 28: CZ02 CZ 2 2000 <NA> CZ0
#> 29: CZ02 CZ 2 2001 <NA> CZ0
#> 30: CZ02 CZ 2 2002 <NA> CZ0
#> Region Country Gvt Lvl Time Revenue level1
# Update join: look up each row's level-1 parent for the same year and keep
# the row's own Revenue unless it is NA, in which case take the parent's
# (i.Revenue). The trailing [] prints the updated table.
data[
  data[`Gvt Lvl` == 1],
  on = .(level1, Time),
  Revenue := fcoalesce(Revenue, i.Revenue)
][]
#> Region Country Gvt Lvl Time Revenue level1
#> 1: DE1 DE 1 2000 19522.9363864474 DE1
#> 2: DE1 DE 1 2001 19966.5422328934 DE1
#> 3: DE1 DE 1 2002 6999.89953294396 DE1
#> 4: DE11 DE 2 2000 19522.9363864474 DE1
#> 5: DE11 DE 2 2001 19966.5422328934 DE1
#> 6: DE11 DE 2 2002 16000 DE1
#> 7: DE12 DE 2 2000 17842.5167112611 DE1
#> 8: DE12 DE 2 2001 14083.5707363486 DE1
#> 9: DE12 DE 2 2002 11640.3913066722 DE1
#> 10: DE2 DE 1 2000 15972.839227654 DE2
#> 11: DE2 DE 1 2001 3982.55861697719 DE2
#> 12: DE2 DE 1 2002 14387.2864248045 DE2
#> 13: DE21 DE 2 2000 15344.890498016 DE2
#> 14: DE21 DE 2 2001 10418.2161828689 DE2
#> 15: DE21 DE 2 2002 15624.7160529159 DE2
#> 16: DE22 DE 2 2000 19918.6711632833 DE2
#> 17: DE22 DE 2 2001 6388.14218087122 DE2
#> 18: DE22 DE 2 2002 10508.8730250672 DE2
#> 19: DE23 DE 2 2000 15972.839227654 DE2
#> 20: DE23 DE 2 2001 18000 DE2
#> 21: DE23 DE 2 2002 14387.2864248045 DE2
#> 22: CZ0 CZ 1 2000 20025.0892932899 CZ0
#> 23: CZ0 CZ 1 2001 20786.2704534456 CZ0
#> 24: CZ0 CZ 1 2002 3640.34824416041 CZ0
#> 25: CZ01 CZ 2 2000 10761.9418646954 CZ0
#> 26: CZ01 CZ 2 2001 12461.8283051997 CZ0
#> 27: CZ01 CZ 2 2002 19308.3052349649 CZ0
#> 28: CZ02 CZ 2 2000 20025.0892932899 CZ0
#> 29: CZ02 CZ 2 2001 20786.2704534456 CZ0
#> 30: CZ02 CZ 2 2002 3640.34824416041 CZ0
#> Region Country Gvt Lvl Time Revenue level1
由 reprex package (v1.0.0)
于 2021 年 3 月 16 日创建
我有一些关于 n 年期间不同政府级别收入的数据。有两级政府,1 和 2。 Level 2 是我在设置数据后需要进行分析的级别,这意味着最终样本将仅包括 2 级地方政府。二级政府“属于”相应的一级政府:例如DE1 是 1 级,DE1x 是 DE1 伞下的 2 级次区域政府。这种模式在数据集中是一致的(例如 CZ1x 属于 CZ1,IT3x 属于 IT3 等等)。数据可以被认为具有树结构。国家代码也可用,由政府级别的前两个字母给出。数据如下所示:
# Build the example panel: 10 regions x 3 years.
# cbind() coerces all of its inputs to a single character matrix, so Revenue
# holds the numbers as text and the gaps as the literal string "NA".
data <- as.data.frame(cbind(
  # Region code, repeated once per year
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the codes above
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with gaps and two fixed values
  c(runif(n = 3, min = 1300, max = 21220), "NA", "NA", 16000,
    runif(n = 12, min = 1300, max = 21220), "NA", 18000, "NA",
    runif(n = 6, min = 1300, max = 21220),
    "NA", "NA", "NA")
))
names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")
data
Region Country Gvt Lvl Time Revenue
DE1 DE 1 2000 16858.6538477242
DE1 DE 1 2001 7788.3873622492
DE1 DE 1 2002 19988.1219627894
DE11 DE 2 2000 NA
DE11 DE 2 2001 NA
DE11 DE 2 2002 16000
DE12 DE 2 2000 6660.73037594557
DE12 DE 2 2001 9005.15880053863
DE12 DE 2 2002 2322.38461054862
DE2 DE 1 2000 16887.0197726786
DE2 DE 1 2001 11184.8074057698
DE2 DE 1 2002 1442.17075794935
DE21 DE 2 2000 6902.39389214665
DE21 DE 2 2001 6562.93060332537
DE21 DE 2 2002 17302.4776424281
DE22 DE 2 2000 16508.5772226751
DE22 DE 2 2001 2753.07780653238
DE22 DE 2 2002 2198.10680534691
DE23 DE 2 2000 NA
DE23 DE 2 2001 18000
DE23 DE 2 2002 NA
CZ0 CZ 1 2000 8614.85693316907
CZ0 CZ 1 2001 9601.59771829844
CZ0 CZ 1 2002 7134.94570834562
CZ01 CZ 2 2000 8562.89313737303
CZ01 CZ 2 2001 10880.8537839726
CZ01 CZ 2 2002 6957.3313607648
CZ02 CZ 2 2000 NA
CZ02 CZ 2 2001 NA
CZ02 CZ 2 2002 NA
在某些情况下,第 2 级的数据缺失,或者是所有年份,或者只是其中一些年份。发生这种情况时,我想用相应的更高级别 (1) 观察到的那个缺失年份的值(如果有的话)替换较低级别政府 (2) 的 NA。
我想,读取 Region 的前三个字符也许可行,但我一直没能想出一个可行的解决方案。我的样本中有 9000 多个观测值(行)。
从 Region 中提取公共部分(前三个字符)后,您可以按该公共部分和年份分组,并用同组中 1 级政府的值填充缺失值:
library(dplyr)
# Fill missing level-2 revenue with the level-1 parent's revenue for the
# same year.
data %>%
  # Revenue arrives as text with literal "NA" strings; turn those into real
  # missing values before converting to numeric.
  mutate(Revenue = as.numeric(na_if(Revenue, "NA"))) %>%
  # The first three characters of Region are the level-1 parent's code, so
  # each group holds one parent and its children for a single year.
  group_by(group = substr(Region, 1, 3), Time) %>%
  # match() gives NA when the group has no level-1 row, so those rows simply
  # keep their NA instead of failing with a zero-length replacement.
  mutate(Revenue = coalesce(Revenue, Revenue[match(group, Region)])) %>%
  ungroup()
# A tibble: 30 x 6
# Region Country `Gvt Lvl` Time Revenue group
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 DE1 DE 1 2000 13259. DE1
# 2 DE1 DE 1 2001 16229. DE1
# 3 DE1 DE 1 2002 19929. DE1
# 4 DE11 DE 2 2000 13259. DE1
# 5 DE11 DE 2 2001 16229. DE1
# 6 DE11 DE 2 2002 16000 DE1
# 7 DE12 DE 2 2000 2793. DE1
# 8 DE12 DE 2 2001 3491. DE1
# 9 DE12 DE 2 2002 14854. DE1
#10 DE2 DE 1 2000 3976. DE2
# … with 20 more rows
library(data.table)
#> Warning: package 'data.table' was built under R version 4.0.4
# Fix the RNG so the printed revenues are reproducible.
set.seed(42)

# Build the example panel: 10 regions x 3 years.
# cbind() coerces all of its inputs to a single character matrix, so Revenue
# ends up as text; the gaps are real NA values here, not "NA" strings.
data <- as.data.frame(cbind(
  # Region code, repeated once per year
  rep(c("DE1", "DE11", "DE12", "DE2", "DE21", "DE22", "DE23",
        "CZ0", "CZ01", "CZ02"), each = 3),
  # Country code
  rep(c("DE", "CZ"), times = c(21, 9)),
  # Government level of each region, in the same order as the codes above
  rep(c("1", "2", "2", "1", "2", "2", "2", "1", "2", "2"), each = 3),
  # Year
  rep(c("2000", "2001", "2002"), times = 10),
  # Revenue: random draws interleaved with gaps and two fixed values
  c(runif(n = 3, min = 1300, max = 21220), NA, NA, 16000,
    runif(n = 12, min = 1300, max = 21220), NA, 18000, NA,
    runif(n = 6, min = 1300, max = 21220),
    NA, NA, NA)
))
names(data) <- c("Region", "Country", "Gvt Lvl", "Time", "Revenue")
# Convert to a data.table in place.
setDT(data)
# The first three characters of Region are the level-1 parent's code; keep
# them as a join key. The trailing [] prints the updated table.
data[, level1 := substr(Region, start = 1, stop = 3)][]
#> Region Country Gvt Lvl Time Revenue level1
#> 1: DE1 DE 1 2000 19522.9363864474 DE1
#> 2: DE1 DE 1 2001 19966.5422328934 DE1
#> 3: DE1 DE 1 2002 6999.89953294396 DE1
#> 4: DE11 DE 2 2000 <NA> DE1
#> 5: DE11 DE 2 2001 <NA> DE1
#> 6: DE11 DE 2 2002 16000 DE1
#> 7: DE12 DE 2 2000 17842.5167112611 DE1
#> 8: DE12 DE 2 2001 14083.5707363486 DE1
#> 9: DE12 DE 2 2002 11640.3913066722 DE1
#> 10: DE2 DE 1 2000 15972.839227654 DE2
#> 11: DE2 DE 1 2001 3982.55861697719 DE2
#> 12: DE2 DE 1 2002 14387.2864248045 DE2
#> 13: DE21 DE 2 2000 15344.890498016 DE2
#> 14: DE21 DE 2 2001 10418.2161828689 DE2
#> 15: DE21 DE 2 2002 15624.7160529159 DE2
#> 16: DE22 DE 2 2000 19918.6711632833 DE2
#> 17: DE22 DE 2 2001 6388.14218087122 DE2
#> 18: DE22 DE 2 2002 10508.8730250672 DE2
#> 19: DE23 DE 2 2000 <NA> DE2
#> 20: DE23 DE 2 2001 18000 DE2
#> 21: DE23 DE 2 2002 <NA> DE2
#> 22: CZ0 CZ 1 2000 20025.0892932899 CZ0
#> 23: CZ0 CZ 1 2001 20786.2704534456 CZ0
#> 24: CZ0 CZ 1 2002 3640.34824416041 CZ0
#> 25: CZ01 CZ 2 2000 10761.9418646954 CZ0
#> 26: CZ01 CZ 2 2001 12461.8283051997 CZ0
#> 27: CZ01 CZ 2 2002 19308.3052349649 CZ0
#> 28: CZ02 CZ 2 2000 <NA> CZ0
#> 29: CZ02 CZ 2 2001 <NA> CZ0
#> 30: CZ02 CZ 2 2002 <NA> CZ0
#> Region Country Gvt Lvl Time Revenue level1
# Update join: look up each row's level-1 parent for the same year and keep
# the row's own Revenue unless it is NA, in which case take the parent's
# (i.Revenue). The trailing [] prints the updated table.
data[
  data[`Gvt Lvl` == 1],
  on = .(level1, Time),
  Revenue := fcoalesce(Revenue, i.Revenue)
][]
#> Region Country Gvt Lvl Time Revenue level1
#> 1: DE1 DE 1 2000 19522.9363864474 DE1
#> 2: DE1 DE 1 2001 19966.5422328934 DE1
#> 3: DE1 DE 1 2002 6999.89953294396 DE1
#> 4: DE11 DE 2 2000 19522.9363864474 DE1
#> 5: DE11 DE 2 2001 19966.5422328934 DE1
#> 6: DE11 DE 2 2002 16000 DE1
#> 7: DE12 DE 2 2000 17842.5167112611 DE1
#> 8: DE12 DE 2 2001 14083.5707363486 DE1
#> 9: DE12 DE 2 2002 11640.3913066722 DE1
#> 10: DE2 DE 1 2000 15972.839227654 DE2
#> 11: DE2 DE 1 2001 3982.55861697719 DE2
#> 12: DE2 DE 1 2002 14387.2864248045 DE2
#> 13: DE21 DE 2 2000 15344.890498016 DE2
#> 14: DE21 DE 2 2001 10418.2161828689 DE2
#> 15: DE21 DE 2 2002 15624.7160529159 DE2
#> 16: DE22 DE 2 2000 19918.6711632833 DE2
#> 17: DE22 DE 2 2001 6388.14218087122 DE2
#> 18: DE22 DE 2 2002 10508.8730250672 DE2
#> 19: DE23 DE 2 2000 15972.839227654 DE2
#> 20: DE23 DE 2 2001 18000 DE2
#> 21: DE23 DE 2 2002 14387.2864248045 DE2
#> 22: CZ0 CZ 1 2000 20025.0892932899 CZ0
#> 23: CZ0 CZ 1 2001 20786.2704534456 CZ0
#> 24: CZ0 CZ 1 2002 3640.34824416041 CZ0
#> 25: CZ01 CZ 2 2000 10761.9418646954 CZ0
#> 26: CZ01 CZ 2 2001 12461.8283051997 CZ0
#> 27: CZ01 CZ 2 2002 19308.3052349649 CZ0
#> 28: CZ02 CZ 2 2000 20025.0892932899 CZ0
#> 29: CZ02 CZ 2 2001 20786.2704534456 CZ0
#> 30: CZ02 CZ 2 2002 3640.34824416041 CZ0
#> Region Country Gvt Lvl Time Revenue level1
由 reprex package (v1.0.0)
于 2021 年 3 月 16 日创建