使用 for 循环清理多个数据框

for loop to clean multiple dataframes

这是我的代码:

library(PNADcIBGE)
# DOWNLOAD DATABASES ----

# Variables requested from every quarterly file (same set for each call).
pnadc_vars <- c(
  "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A", "VD3004",
  "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010", "V4002", "V4012",
  "V403322", "VD4019", "VD4020"
)

# One download per quarter of 2018. The objects are named
# data_<year>q<quarter> (with an underscore) because that is the spelling the
# cleaning code further down reads; the previous data2018q1-style names were
# never picked up by it.
data_2018q1 <- get_pnadc(
  year = 2018, quarter = 1, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 1
)
data_2018q2 <- get_pnadc(
  year = 2018, quarter = 2, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 2
)
data_2018q3 <- get_pnadc(
  year = 2018, quarter = 3, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 3
)
data_2018q4 <- get_pnadc(
  year = 2018, quarter = 4, vars = pnadc_vars, labels = FALSE,
  deflator = TRUE, design = FALSE, defyear = 2018, defperiod = 4
)

#CLEAN DATABASES#

# Collect the fifteen quarterly data frames (2018 Q1 through 2021 Q3) in a
# single unnamed list, one year per line.
data_list <- list(
  data_2018q1, data_2018q2, data_2018q3, data_2018q4,
  data_2019q1, data_2019q2, data_2019q3, data_2019q4,
  data_2020q1, data_2020q2, data_2020q3, data_2020q4,
  data_2021q1, data_2021q2, data_2021q3
)

我正在尝试对 data_list 中所有数据框的变量 VD4020 取 log10。取完 log10 之后,我还想做其他事情,比如重命名列、添加列以及其他清理工作。我尝试过使用 lapply 但没有成功,也写不出可用的 for 循环。我现在的做法是手动逐个处理每个数据框并取子集,例如:

# Drop the rows without a VD4020 value, one data frame at a time. Each frame
# is filtered from itself; previously all three were built from data_2018q2,
# which overwrote q1 and q3 with q2's rows.
# NOTE(review): `!= "NA"` compares against the string "NA". Rows with a
# missing VD4020 are still removed, because subset() discards rows whose
# condition evaluates to NA, but !is.na(VD4020) states the intent directly.
# The comparison is kept as asked because the answer below comments on it.
data_2018q1 <- subset(data_2018q1, VD4020 != "NA")
data_2018q2 <- subset(data_2018q2, VD4020 != "NA")
data_2018q3 <- subset(data_2018q3, VD4020 != "NA")
 
# Replace VD4020 by its base-10 logarithm. log10() is used rather than log(),
# which is the natural logarithm, because the stated goal is a log10 transform.
data_2018q1$VD4020 <- log10(data_2018q1$VD4020)
data_2018q2$VD4020 <- log10(data_2018q2$VD4020)
data_2018q3$VD4020 <- log10(data_2018q3$VD4020)

每条命令都要这样逐个重复。这真的很耗时,因为我需要处理 15 个数据框。

两个想法。

  1. 你没有问,但这样加载数据可能更容易:

    # Download the four quarters of 2018 into one named list
    # (data_2018q1 ... data_2018q4) instead of four separate objects.
    datalist <- lapply(setNames(1:4, paste0("data_2018q", 1:4)), function(qtr) {
      get_pnadc(
        year = 2018, quarter = qtr,
        vars = c(
          "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A",
          "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010",
          "V4002", "V4012", "V403322", "VD4019", "VD4020"
        ),
        labels = FALSE, deflator = TRUE, design = FALSE,
        # defperiod follows the quarter, exactly as in the one-call-per-quarter
        # version this replaces (it was fixed at 1 before).
        defyear = 2018, defperiod = qtr
      )
    })
    
    
  2. 对某一列取 log10。仅供参考,!= "NA" 做的是字符串比较,您可能需要的是 !is.na(.)。

    # Apply log10 to VD4020 in every data frame of the list. Only the
    # non-missing entries are transformed; missing ones are left as NA.
    datalist2 <- lapply(datalist, function(quarter_df) {
      has_value <- !is.na(quarter_df$VD4020)
      quarter_df$VD4020[has_value] <- log10(quarter_df$VD4020[has_value])
      quarter_df
    })
    

    也可以直接写回原变量:datalist <- lapply(...)。但在学习/调试阶段,我通常喜欢把结果存到新的变量里,直到我对整个过程有把握为止。


编辑:如果要处理多个年份,可以用下面的方法自动生成列表:

# Grid of every year/quarter combination. `q` is passed first so that the
# quarter varies fastest within each year; the columns are then picked by name
# to show the year first.
eg <- expand.grid(q = 1:4, y = 2018:2021)[, c("y", "q")]
eg
#       y q
# 1  2018 1
# 2  2018 2
# 3  2018 3
# 4  2018 4
# 5  2019 1
# 6  2019 2
# 7  2019 3
# 8  2019 4
# 9  2020 1
# 10 2020 2
# 11 2020 3
# 12 2020 4
# 13 2021 1
# 14 2021 2
# 15 2021 3
# 16 2021 4

你说你只需要到 2021 年第 3 季度,所以我们可以删除第 16 行:

# One list name per retained year/quarter pair, e.g. "data_2018q1".
# Row 16 (2021 Q4) is left out.
nms <- sprintf("data_%sq%s", eg$y[-16], eg$q[-16])
nms
#  [1] "data_2018q1" "data_2018q2" "data_2018q3" "data_2018q4" "data_2019q1" "data_2019q2" "data_2019q3" "data_2019q4" "data_2020q1"
# [10] "data_2020q2" "data_2020q3" "data_2020q4" "data_2021q1" "data_2021q2" "data_2021q3"
# Download one data frame per (year, quarter) pair and name the list elements
# with `nms`. Row 16 (2021 Q4) is dropped from the inputs as well, so the 15
# downloads line up one-to-one with the 15 names; iterating over all 16 rows
# requested an unwanted quarter and left its element without a valid name.
datalist <- setNames(Map(function(yr, qtr) {
  get_pnadc(
    year = yr, quarter = qtr,
    vars = c(
      "UF", "Capital", "V1022", "V2007", "V2009", "V2010", "V3009A",
      "VD3004", "VD4001", "VD4004", "V4001", "V4003", "V4005", "V4010",
      "V4002", "V4012", "V403322", "VD4019", "VD4020"
    ),
    labels = FALSE, deflator = TRUE, design = FALSE,
    # defperiod follows the quarter, matching the per-quarter calls in the
    # question (it was fixed at 1 before).
    # NOTE(review): defyear = yr extends the 2018 pattern to later years;
    # confirm this is the deflator reference you want.
    defyear = yr, defperiod = qtr
  )
}, eg$y[-16], eg$q[-16]), nms)