用户定义的函数,用于自动处理多个 csv 文件的现有争论过程

User defined function to automate an existing wrangling process for multiple csv files

Objective:编写一个 user defined function 来自动执行多个 .csv 文件的 wrangling process

我有 3 个 .csv files,即 Finland_caucasian.csvMexico_hispanic.csvJamaica_black.csv

Finland_caucasian.csv为例,数据框如下所示:

date <- rep('1999',9)
cityID <- c(1001,1001,1001,1002,1002,1002,1005,1005,1005)
country <- rep('FIN'9)
name <- rep('caucasian',9)
prov_code <- c(1,1,1,2,2,3,3,2,3)
class <- c(1,2,5,1,2,5,1,2,5)
gender <- c('M','M','F','F','F','M','F','M','M')
savings_month1 <-c(100.12,105.11,109.11,94.13,34.00,201.91,154.39,212.76,233.48)
savings_month2 <-c(110.15,115.12,193.21,194.13,334.50,21.13,164.19,292.67,235.85)
savings_month3 <-c(109.15,159.12,199.22,199.18,339.59,291.95,169.94,299.78,233.22)
data<- data.frame(date,cityID,name,country,prov_code,class,gender,savings_month1,savings_month2,savings_month3)
data


原始 w运行gling 过程(没有用户定义的函数)如下所示:

# template for the user-defined function- should import csv files in a folder
#############################################################################
data_wrangler <- function(input_multi_csvfiles){
   # import each csv file
   for(files in filenames){
  # wrangle process
  }
}

# test code 
data_wrangler(files in folder path)

################################################################################################
# code for the wrangling process
library(dplyr)
#Step I: import all three files in R
 data = read.csv("filepath//file_name.csv") # 'Finland_caucasian.csv','Mexico_hispanic.csv' and 'Jamaica_black.csv'

#Step II : select the desired column names 
df <- data%>%select(date:gender,savings_month1:savings_month3) # I used column index `:` because the other csv files might have undesired columns in between. 

#Step III: check the unique prov_code, country,name, class
print(unique(df$prov_code))
print(unique(df$country))
print(unique(df$name))
print(unique(df$class))

#Step IV: filter the df based on 'class'
# Here, I will like to automate to reflect the unique countries ( FIN, MEX and JAM) instead of manually entering the countries three times, likewise for the name column- caucasian, hispanic and black
country_name_df <-subset(df,country == 'FIN' & name == 'caucasian' & class >= 1 & class <= 5) # in the other csv files class ranges from 0 to 8, but I want to filter for ONLY class ranging from 1 to 5

#Step V:  Paste filtered class on each of the column names
df10 <- filter(country_name_df,class == 1)
#paste the filtered class condition to the column names 
colnames(df10) <- paste('1',colnames(df10),sep = '_')
df11 <- filter(country_name_df,class == 2)
#paste the filtered class condition to the column names 
colnames(df11) <- paste('2',colnames(df11),sep = '_')
df12 <- filter(country_name_df,class == 5)
#paste the filtered class condition to the column names 
colnames(df12) <- paste('5',colnames(df12),sep = '_')

#Step VI:  column bind or merge df10, df11 and df12
 df15 <- cbind(df10,df11,df12)
#check for content of class column of df15
unique(df15$`5_prov_code`)

#Step VII: final wrangle for the dataframe
df20<- df15%>%select(`1_date`:`1_gender`)
df30 <-select(df15,contains("month")) # select all columns containing string 'month'  example '1_savings_month1', '3_savings_month2' etc
target <- df15$`1_prov_code`
final_df <- cbind(df20,df30,target)

#Step VIII: export each of the final_df as csv files 
# wrangled_Finland+caucasian.csv, wrangled_Mexico+hispanic.csv,wrangled_Jamaica+black.csv
write.csv(final_df,'wrangled_FIN+caucasian.csv',row.names = FALSE)
 

我对其他想法持开放态度。谢谢

阿克伦更新

我运行你的代码(见下文)。

我遇到了错误

更新 2:gender_1 之后的列名称序列应如下所示 savings_month1_1savings_month1_2savings_month1_5savings_month2_1savings_month2_2,savings_month2_5,savings_month3_1,savings_month3_2,savings_month3_5

更新 3:更改后出错 savings_month1:savings_month3

如果工作目录中有 3 个文件,请使用 map 读取 list 中的文件,执行 selectfilter 步骤并使用 [= 'class' 或 split 上的 17=] 变成 data.frames

list
library(dplyr)
library(purrr)
library(readr)
files <- list.files(pattern = "\.csv", full.names = TRUE)
df_list <- map(files, ~ read_csv(.x) %>%
               select(date:gender, savings_month1:savings_month3) %>%
               filter(country == 'FIN', name == 'caucasian', 
                    class >= 1 , class <= 5) %>%
               group_by(class))

如果我们需要带参数的函数

f1 <- function(filenm, cntrynm, racenm, class_low, class_high) {
      out <- readr::read_csv(filenm) %>%
       dplyr::select(date:gender, savings_month1_1:savings_month3_5) %>%
           dplyr::filter(country == cntrynm, name == racenm, 
                    class >= class_low , class <= class_high) %>%

        dplyr::mutate(rn = data.table::rowid(class)) %>% 
        tidyr::pivot_wider(names_from = class, 
           values_from = setdiff(names(.), c("rn", "class"))) %>%
       dplyr::select(-rn) %>% 
       dplyr::select(ends_with('_1'), contains('month')) %>% 
       dplyr::rename(target = prov_code_1)               
       readr::write_csv(out, glue::glue("wrangled_{cntrynm}+{racenm}.csv"))
       return(out)
}

然后,我们用pmap

中的参数应用函数
pmap(list(filenm = files, cntrynm = c("FIN", "MEX", "JAM"),
           racenm = c("caucasian", "hispanic", "jamaican"),
              class_low  = c(1, 1, 1), class_high = c(5, 5, 5)), f1)