for循环清理多个数据帧
for loop to clean multiple dataframes
这是我的代码:
library(PNADcIBGE)
# DOWNLOAD DATABASES #
# Variables requested from every get_pnadc() call below.
pnadc_vars <- c(
  "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
  "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
  "V403322", "VD4019", "VD4020"
)
# One data frame per 2018 quarter. Objects are named data_<year>q<quarter>
# (with the underscore) because that is the spelling the cleaning code uses.
# The deflator period (defperiod) is set to the same number as the quarter.
data_2018q1 <- get_pnadc(
  year = 2018, quarter = 1, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1
)
data_2018q2 <- get_pnadc(
  year = 2018, quarter = 2, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 2
)
data_2018q3 <- get_pnadc(
  year = 2018, quarter = 3, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 3
)
data_2018q4 <- get_pnadc(
  year = 2018, quarter = 4, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 4
)
# CLEAN DATABASES #
# Collect the quarterly data frames, 2018 Q1 through 2021 Q3, into a single
# unnamed list in chronological order (one row of arguments per year).
data_list <- list(
  data_2018q1, data_2018q2, data_2018q3, data_2018q4,
  data_2019q1, data_2019q2, data_2019q3, data_2019q4,
  data_2020q1, data_2020q2, data_2020q3, data_2020q4,
  data_2021q1, data_2021q2, data_2021q3
)
我正在尝试对 data_list 内所有数据框的变量 VD4020 取 log10。取 log10 之后,我还想做其他事情,比如重命名列、添加列以及其他清理工作。我尝试过使用 lapply 但没有成功,也写不出可用的 for 循环。我目前的做法是手动处理每个数据框并取其子集,例如:
# Drop the rows where VD4020 is missing. Each data frame is subset from
# ITSELF (not from data_2018q2). The `!= "NA"` test is kept as written: for a
# missing value the comparison yields NA, and subset() treats NA as FALSE.
data_2018q1 <- subset(data_2018q1, VD4020 != "NA")
data_2018q2 <- subset(data_2018q2, VD4020 != "NA")
data_2018q3 <- subset(data_2018q3, VD4020 != "NA")
# Base-10 logarithm of VD4020 (log() alone would be the natural logarithm).
data_2018q1$VD4020 <- log10(data_2018q1$VD4020)
data_2018q2$VD4020 <- log10(data_2018q2$VD4020)
data_2018q3$VD4020 <- log10(data_2018q3$VD4020)
以此类推每个命令。这真的很耗时,因为我需要处理 15 个数据帧。
两个想法。
你没有问,但这样加载数据可能更容易:
# Download the four 2018 quarters in one pass. lapply() keeps the names of its
# input, so the result is a list named data_2018q1 ... data_2018q4.
datalist <- lapply(setNames(1:4, paste0("data_2018q", 1:4)), function(qtr) {
  get_pnadc(
    year = 2018, quarter = qtr,
    vars = c(
      "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
      "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
      "V403322", "VD4019", "VD4020"
    ),
    labels = FALSE, deflator = TRUE, design = FALSE,
    defyear = 2018,
    # The deflator period follows the quarter, as in the one-call-per-quarter
    # version of this download.
    defperiod = qtr
  )
})
对单列取 log10。仅供参考,!= "NA" 进行的是字符串比较,您可能需要的是 !is.na(.)。
# Apply log10 to the non-missing VD4020 values of every data frame in
# `datalist`; missing entries are left as they are. Returns a new list.
datalist2 <- lapply(datalist, function(dat) {
  observed <- !is.na(dat$VD4020)
  dat$VD4020[observed] <- log10(dat$VD4020[observed])
  dat
})
或者直接把结果重新赋值给 datalist(即 datalist <- lapply(...)),但在学习或调试时,我通常喜欢先分开写,直到我对整个过程有把握。
编辑:对于多个年份,您可以这样自动生成列表:
# Every (year, quarter) pair for 2018-2021. expand.grid() varies its first
# argument fastest, so `q` is listed first and the columns are then put back
# in (y, q) order.
eg <- expand.grid(q = seq_len(4L), y = 2018:2021)[, c("y", "q")]
eg
# y q
# 1 2018 1
# 2 2018 2
# 3 2018 3
# 4 2018 4
# 5 2019 1
# 6 2019 2
# 7 2019 3
# 8 2019 4
# 9 2020 1
# 10 2020 2
# 11 2020 3
# 12 2020 4
# 13 2021 1
# 14 2021 2
# 15 2021 3
# 16 2021 4
你说你只需要到 2021 年第 3 季度,所以我们可以删除第 16 行:
# Names of the form data_<year>q<quarter> for every row of `eg` except row 16.
nms <- sprintf("data_%sq%s", eg$y[-16], eg$q[-16])
nms
# [1] "data_2018q1" "data_2018q2" "data_2018q3" "data_2018q4" "data_2019q1" "data_2019q2" "data_2019q3" "data_2019q4" "data_2020q1"
# [10] "data_2020q2" "data_2020q3" "data_2020q4" "data_2021q1" "data_2021q2" "data_2021q3"
# Download one data frame per (year, quarter) pair and name the list with
# `nms`. Row 16 of `eg` is excluded here too, so the inputs line up one-to-one
# with the 15 names; iterating over all 16 rows would download an unwanted
# extra quarter and leave it with an NA name.
datalist <- setNames(Map(function(yr, qtr) {
  get_pnadc(
    year = yr, quarter = qtr,
    vars = c(
      "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
      "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
      "V403322", "VD4019", "VD4020"
    ),
    labels = FALSE, deflator = TRUE, design = FALSE,
    defyear = yr,
    # The deflator period follows the quarter, as in the one-call-per-quarter
    # version of this download.
    defperiod = qtr
  )
}, eg$y[-16], eg$q[-16]), nms)
这是我的代码:
library(PNADcIBGE)
# DOWNLOAD DATABASES #
# Variables requested from every get_pnadc() call below.
pnadc_vars <- c(
  "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
  "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
  "V403322", "VD4019", "VD4020"
)
# One data frame per 2018 quarter. Objects are named data_<year>q<quarter>
# (with the underscore) because that is the spelling the cleaning code uses.
# The deflator period (defperiod) is set to the same number as the quarter.
data_2018q1 <- get_pnadc(
  year = 2018, quarter = 1, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1
)
data_2018q2 <- get_pnadc(
  year = 2018, quarter = 2, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 2
)
data_2018q3 <- get_pnadc(
  year = 2018, quarter = 3, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 3
)
data_2018q4 <- get_pnadc(
  year = 2018, quarter = 4, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 4
)
# CLEAN DATABASES #
# Collect the quarterly data frames, 2018 Q1 through 2021 Q3, into a single
# unnamed list in chronological order (one row of arguments per year).
data_list <- list(
  data_2018q1, data_2018q2, data_2018q3, data_2018q4,
  data_2019q1, data_2019q2, data_2019q3, data_2019q4,
  data_2020q1, data_2020q2, data_2020q3, data_2020q4,
  data_2021q1, data_2021q2, data_2021q3
)
我正在尝试对 data_list 内所有数据框的变量 VD4020 取 log10。取 log10 之后,我还想做其他事情,比如重命名列、添加列以及其他清理工作。我尝试过使用 lapply 但没有成功,也写不出可用的 for 循环。我目前的做法是手动处理每个数据框并取其子集,例如:
# Drop the rows where VD4020 is missing. Each data frame is subset from
# ITSELF (not from data_2018q2). The `!= "NA"` test is kept as written: for a
# missing value the comparison yields NA, and subset() treats NA as FALSE.
data_2018q1 <- subset(data_2018q1, VD4020 != "NA")
data_2018q2 <- subset(data_2018q2, VD4020 != "NA")
data_2018q3 <- subset(data_2018q3, VD4020 != "NA")
# Base-10 logarithm of VD4020 (log() alone would be the natural logarithm).
data_2018q1$VD4020 <- log10(data_2018q1$VD4020)
data_2018q2$VD4020 <- log10(data_2018q2$VD4020)
data_2018q3$VD4020 <- log10(data_2018q3$VD4020)
以此类推每个命令。这真的很耗时,因为我需要处理 15 个数据帧。
两个想法。
你没有问,但这样加载数据可能更容易:
# Download the four 2018 quarters in one pass. lapply() keeps the names of its
# input, so the result is a list named data_2018q1 ... data_2018q4.
datalist <- lapply(setNames(1:4, paste0("data_2018q", 1:4)), function(qtr) {
  get_pnadc(
    year = 2018, quarter = qtr,
    vars = c(
      "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
      "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
      "V403322", "VD4019", "VD4020"
    ),
    labels = FALSE, deflator = TRUE, design = FALSE,
    defyear = 2018,
    # The deflator period follows the quarter, as in the one-call-per-quarter
    # version of this download.
    defperiod = qtr
  )
})
对单列取 log10。仅供参考,!= "NA" 进行的是字符串比较,您可能需要的是 !is.na(.)。
# Apply log10 to the non-missing VD4020 values of every data frame in
# `datalist`; missing entries are left as they are. Returns a new list.
# Each statement sits on its own line: fused onto a single line without
# separators, the function body does not parse.
datalist2 <- lapply(datalist, function(dat) {
  observed <- !is.na(dat$VD4020)
  dat$VD4020[observed] <- log10(dat$VD4020[observed])
  dat
})
或者直接把结果重新赋值给 datalist(即 datalist <- lapply(...)),但在学习或调试时,我通常喜欢先分开写,直到我对整个过程有把握。
编辑:对于多个年份,您可以这样自动生成列表:
# Every (year, quarter) pair for 2018-2021. expand.grid() varies its first
# argument fastest, so `q` is listed first and the columns are then put back
# in (y, q) order.
eg <- expand.grid(q = seq_len(4L), y = 2018:2021)[, c("y", "q")]
eg
# y q
# 1 2018 1
# 2 2018 2
# 3 2018 3
# 4 2018 4
# 5 2019 1
# 6 2019 2
# 7 2019 3
# 8 2019 4
# 9 2020 1
# 10 2020 2
# 11 2020 3
# 12 2020 4
# 13 2021 1
# 14 2021 2
# 15 2021 3
# 16 2021 4
你说你只需要到 2021 年第 3 季度,所以我们可以删除第 16 行:
# Names of the form data_<year>q<quarter> for every row of `eg` except row 16.
nms <- sprintf("data_%sq%s", eg$y[-16], eg$q[-16])
nms
# [1] "data_2018q1" "data_2018q2" "data_2018q3" "data_2018q4" "data_2019q1" "data_2019q2" "data_2019q3" "data_2019q4" "data_2020q1"
# [10] "data_2020q2" "data_2020q3" "data_2020q4" "data_2021q1" "data_2021q2" "data_2021q3"
# Download one data frame per (year, quarter) pair and name the list with
# `nms`. Row 16 of `eg` is excluded here too, so the inputs line up one-to-one
# with the 15 names; iterating over all 16 rows would download an unwanted
# extra quarter and leave it with an NA name.
datalist <- setNames(Map(function(yr, qtr) {
  get_pnadc(
    year = yr, quarter = qtr,
    vars = c(
      "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
      "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
      "V403322", "VD4019", "VD4020"
    ),
    labels = FALSE, deflator = TRUE, design = FALSE,
    defyear = yr,
    # The deflator period follows the quarter, as in the one-call-per-quarter
    # version of this download.
    defperiod = qtr
  )
}, eg$y[-16], eg$q[-16]), nms)