如何合并映射文件后环境中的文件列表

How to merge a list of file that are in the environment following a mapping file

我的 R 环境中有一个文件列表。我想使用映射文件将其中一些合并在一起。

映射文件名为map_rule1,如下所示。

map_rule1
# A tibble: 8 x 4
  EDC_file_name Tab             DatasetName         GroupVar1
  <chr>         <chr>           <chr>               <chr>    
1 e1            Demographics    Demographics Merged Subject  
2 e2            Demographics    NA                  NA       
3 e3            PatientRegister Patient Register    Subject  
4 e4            PatientRegister NA                  NA       
5 e5            PatientRegister NA                  NA       
6 e6            PatientRegister NA                  NA       
7 e7            PatientConsent  Patient Consent     NA       
8 e8            PatientConsent  NA                  NA      

数据列中列出的项目是我当前 r 环境中的文件。我想通过Group_V1中列出的变量和New_data_Name中列出的新数据名称将归类为同一域的那些合并到一个文件中。我有 100 多个文件需要合并。这就是为什么我想创建一个循环方法或任何其他方式来自动合并这些文件的原因。

示例数据和 Map_Rule 可以使用代码构建:

e1<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), SEX = structure(c(2L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Female", "Male"), class = "factor")), class = "data.frame", row.names = c(NA, 
 -27L))


e2<-
structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), RACE = structure(c(2L, 
 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 
 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c("Black (including African, Caribbean descent)", 
 "Caucasian"), class = "factor")), class = "data.frame", row.names = c(NA, 
 -27L)) 
e3<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), ETHNIC_STD = c(2L, 
 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 
 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L)), class = "data.frame", row.names = c(NA, 
 -27L)) 


e4<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), subjectId = c(168L, 
 171L, 174L, 175L, 196L, 199L, 207L, 208L, 213L, 209L, 210L, 212L, 
 283L, 325L, 329L, 527L, 315L, 316L, 320L, 334L, 339L, 582L, 319L, 
 523L, 526L, 601L, 532L)), class = "data.frame", row.names = c(NA, 
 -27L))

e5<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), siteid = c(9L, 9L, 
 9L, 9L, 9L, 9L, 9L, 9L, 9L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 
 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 17L)), class = "data.frame", row.names = c(NA, 
 -27L))
e5<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), siteid = c(9L, 9L, 
 9L, 9L, 9L, 9L, 9L, 9L, 9L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 
 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 17L)), class = "data.frame", row.names = c(NA, 
 -27L))

e7<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0007", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), Location = structure(c(2L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Urban", "Ural"), class = "factor")), class = "data.frame", row.names = c(NA, 
 -27L))
e8<-structure(list(Subject = structure(c(1L, 2L, 3L, 5L, 6L, 4L, 
 7L, 8L, 9L, 21L, 22L, 23L, 24L, 25L, 27L, 26L, 10L, 11L, 12L, 
 13L, 14L, 15L, 17L, 19L, 18L, 20L, 16L), .Label = c("300-0001", 
 "300-0002", "300-0003", "300-0004", "300-0005", "300-0006", "300-0007", 
 "300-0008", "300-0009", "301-0001", "301-0002", "301-0003", "301-0004", 
 "301-0005", "301-0006", "302-0001", "303-0001", "303-0002", "303-0003", 
 "303-0004", "304-0001", "304-0002", "304-0003", "304-0004", "304-0005", 
 "304-0006", "304-0007"), class = "factor"), SEX = structure(c(2L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Female", "Male"), class = "factor")), class = "data.frame", row.names = c(NA, 
 -27L))

map_rule1<-structure(list(EDC_file_name = c("e1", "e2", "e3", 
 "e4", "e5", "e6", "e7", "e8"), Tab = c("Demographics", 
 "Demographics", "PatientRegister", "PatientRegister", "PatientRegister", 
 "PatientRegister", "PatientConsent", "PatientConsent"), DatasetName = c("Demographics Merged", 
 NA, "Patient Register", NA, NA,  NA, "Patient Consent", NA), GroupVar1 = c( "Subject", 
  NA, "Subject",  NA, NA,  NA, 
  NA,  NA)), row.names = c(NA, -8L), class = c("tbl_df", 
 "tbl", "data.frame"))

有什么建议吗?谢谢

以下是我认为可能有效的方法。在 map_rule1 规则集的清理版本上进行测试:它有两个错误来源,您可能需要对其进行陷阱或预清理:1) e6 未定义,以及 2) 我决定弄清楚如何处理丢失的 merge-by 列是我感觉不到的额外复杂程度:

 temp  <- lapply( split(map_rule1, map_rule1$Tab) , 
             # breaks into groups by Domain
                   function( d){ assign( d$DatasetName[1], 
                                        # names= first items in col
                   # I don't generally use assign but seems reasonable here
                     Reduce( function(x,y){ merge(x,y, by=d$GroupVar1[1])}, 
                                              lapply(d$EDC_file_name, get) ) ,
                                            #use first item as named by-argument 
                                 envir=globalenv() )}
             # named objects need to  appear outside this function
                )
#need to run this before calculating `temp`
map_rule1 <- 
structure(list(EDC_file_name = c("e1", "e2", "e3", "e4", "e5"
), Tab = c("Demographics", "Demographics", "PatientRegister", 
"PatientRegister", "PatientRegister"), DatasetName = c("Demographics Merged", 
NA, "Patient Register", NA, NA), GroupVar1 = c("Subject", NA, 
"Subject", NA, NA)), row.names = c(NA, -5L), class = c("tbl_df", 
"tbl", "data.frame")) 

------------结果------

# First what was in temp
str(temp)
List of 2
 $ Demographics   :'data.frame':    27 obs. of  3 variables:
  ..$ Subject: Factor w/ 27 levels "300-0001","300-0002",..: 1 2 3 4 5 6 7 8 9 10 ...
  ..$ SEX    : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 1 2 2 2 2 ...
  ..$ RACE   : Factor w/ 2 levels "Black (including African, Caribbean descent)",..: 2 2 2 2 2 1 2 2 2 2 ...
 $ PatientRegister:'data.frame':    27 obs. of  4 variables:
  ..$ Subject   : Factor w/ 27 levels "300-0001","300-0002",..: 1 2 3 4 5 6 7 8 9 10 ...
  ..$ ETHNIC_STD: int [1:27] 2 2 2 2 2 2 2 2 2 2 ...
  ..$ subjectId : int [1:27] 168 171 174 199 175 196 207 208 213 315 ...
  ..$ siteid    : int [1:27] 9 9 9 9 9 9 9 9 9 15 ...

 # Second the results in the global environment 
 # with the weird un-Rish names containing spaces

 `Demographics Merged`
    Subject    SEX                                         RACE
1  300-0001   Male                                    Caucasian
2  300-0002 Female                                    Caucasian
3  300-0003   Male                                    Caucasian
4  300-0004 Female                                    Caucasian
5  300-0005   Male                                    Caucasian
6  300-0006 Female Black (including African, Caribbean descent)
7  300-0007   Male                                    Caucasian
8  300-0008   Male                                    Caucasian
9  300-0009   Male                                    Caucasian
10 301-0001   Male                                    Caucasian
11 301-0002 Female                                    Caucasian
12 301-0003   Male                                    Caucasian
13 301-0004   Male                                    Caucasian
14 301-0005   Male Black (including African, Caribbean descent)
15 301-0006   Male                                    Caucasian
16 302-0001   Male                                    Caucasian
17 303-0001   Male                                    Caucasian
18 303-0002   Male Black (including African, Caribbean descent)
19 303-0003   Male                                    Caucasian
20 303-0004   Male                                    Caucasian
21 304-0001   Male                                    Caucasian
22 304-0002   Male                                    Caucasian
23 304-0003 Female Black (including African, Caribbean descent)
24 304-0004   Male Black (including African, Caribbean descent)
25 304-0005   Male Black (including African, Caribbean descent)
26 304-0006 Female                                    Caucasian
27 304-0007   Male                                    Caucasian

您可以通过 运行 lapply 代码在您的工作区中获得 unRish-named-results 而无需将其结果分配给 temp