R:合并前查找不匹配的列名
R: finding mismatched column names before merging
我有大量数据框要合并。每个都有几百列。在执行此操作之前,我想确定所有不匹配的列名。到目前为止,我可以生成一个不匹配列表,但格式很糟糕,我不太清楚如何判断它们来自哪个数据框。
# Create example data: four small data frames. df1 and df2 use the column
# names col1/Age/Name; df3 and df4 each contain names that differ from those.
df1 <- data.frame(col1 = 3:4, Age = c(22, 16), Name = c("James", "Jim"))
df2 <- data.frame(col1 = 3:4, Age = c(18, 19), Name = c("Mike", "Mia"))
df3 <- data.frame(
  mismatch_col_name_1 = 1:2,
  Age = c(21, 15),
  name = c("John", "Dora")
)
df4 <- data.frame(
  mismatch_col_name_2 = 1:2,
  Age = c(21, 15),
  Name = c("John", "Dora")
)
files <- list(df1, df2, df3, df4)

# Find mismatched column names.
# Each data frame is compared with the next one in `files`, and the last one
# is compared with the first. `successor[i]` is the index of the data frame
# that element i is compared against. Building it with seq_along() keeps the
# code valid for lists with fewer than two elements, where a `1:(n - 1)` loop
# would index out of bounds.
successor <- c(seq_along(files)[-1], 1L)

# Collect the per-pair differences in one pass instead of growing a vector
# with c() inside a loop. The result is a flat character vector, so it does
# not record which data frame each name came from.
mismatches <- unlist(lapply(seq_along(files), function(i) {
  setdiff(colnames(files[[i]]), colnames(files[[successor[i]]]))
}))
print(mismatches)
[1] "col1" "Name" "mismatch_col_name_1" "name"
[5] "mismatch_col_name_2"
期望的输出类似于:
"df3" "mismatch_col_name_1" "name"
"df4" "mismatch_col_name_2" "Name"
甚至只输出数据框名称和列号也可以。我对任何解决方案或更好的方法都感兴趣。
这里有一种方法,可以得到一个列表(R 意义上的 list),其中包含每个文件的不匹配列名。它基于这样一个假设:您已经知道一组“正确”的列名,并要将每个文件的列名与之比较。
# For each data frame in `files`, collect the column names that are not in the
# reference set `master`. The result is a list with one element per data
# frame, in the same order as `files`: NULL when every column name is
# expected, otherwise a character vector of the unexpected names.
lapply(files, function(x) {
  # Vector of desired names.
  master <- c("col1", "Age", "Name")
  # Use match() to compare this df's names to the master; the order of the
  # columns does not matter. For each name in x, match() returns its position
  # in master, or NA when the name does not appear in master.
  comparison <- match(names(x), master)
  unmatched <- is.na(comparison)
  if (any(unmatched)) {
    # At least one name in x is missing from master: return those names.
    # which(unmatched) would give their column positions if that is helpful.
    names(x)[unmatched]
  } else {
    # Every name in x appears in master, so there is nothing to report.
    NULL
  }
})
结果:
[[1]]
NULL
[[2]]
NULL
[[3]]
[1] "mismatch_col_name_1" "name"
[[4]]
[1] "mismatch_col_name_2"
您可以通过多种方式组织或总结此列表的内容,但这主要是格式问题。
我有大量数据框要合并。每个都有几百列。在执行此操作之前,我想确定所有不匹配的列名。到目前为止,我可以生成一个不匹配列表,但格式很糟糕,我不太清楚如何判断它们来自哪个数据框。
# Create example data: four small data frames. df1 and df2 use the column
# names col1/Age/Name; df3 and df4 each contain names that differ from those.
df1 <- data.frame(col1 = 3:4, Age = c(22, 16), Name = c("James", "Jim"))
df2 <- data.frame(col1 = 3:4, Age = c(18, 19), Name = c("Mike", "Mia"))
df3 <- data.frame(
  mismatch_col_name_1 = 1:2,
  Age = c(21, 15),
  name = c("John", "Dora")
)
df4 <- data.frame(
  mismatch_col_name_2 = 1:2,
  Age = c(21, 15),
  Name = c("John", "Dora")
)
files <- list(df1, df2, df3, df4)

# Find mismatched column names.
# Each data frame is compared with the next one in `files`, and the last one
# is compared with the first. `successor[i]` is the index of the data frame
# that element i is compared against. Building it with seq_along() keeps the
# code valid for lists with fewer than two elements, where a `1:(n - 1)` loop
# would index out of bounds.
successor <- c(seq_along(files)[-1], 1L)

# Collect the per-pair differences in one pass instead of growing a vector
# with c() inside a loop. The result is a flat character vector, so it does
# not record which data frame each name came from.
mismatches <- unlist(lapply(seq_along(files), function(i) {
  setdiff(colnames(files[[i]]), colnames(files[[successor[i]]]))
}))
print(mismatches)
[1] "col1" "Name" "mismatch_col_name_1" "name"
[5] "mismatch_col_name_2"
期望的输出类似于:
"df3" "mismatch_col_name_1" "name"
"df4" "mismatch_col_name_2" "Name"
甚至只输出数据框名称和列号也可以。我对任何解决方案或更好的方法都感兴趣。
这里有一种方法,可以得到一个列表(R 意义上的 list),其中包含每个文件的不匹配列名。它基于这样一个假设:您已经知道一组“正确”的列名,并要将每个文件的列名与之比较。
# For each data frame in `files`, collect the column names that are not in the
# reference set `master`. The result is a list with one element per data
# frame, in the same order as `files`: NULL when every column name is
# expected, otherwise a character vector of the unexpected names.
lapply(files, function(x) {
  # Vector of desired names.
  master <- c("col1", "Age", "Name")
  # Use match() to compare this df's names to the master; the order of the
  # columns does not matter. For each name in x, match() returns its position
  # in master, or NA when the name does not appear in master.
  comparison <- match(names(x), master)
  unmatched <- is.na(comparison)
  if (any(unmatched)) {
    # At least one name in x is missing from master: return those names.
    # which(unmatched) would give their column positions if that is helpful.
    names(x)[unmatched]
  } else {
    # Every name in x appears in master, so there is nothing to report.
    NULL
  }
})
结果:
[[1]]
NULL
[[2]]
NULL
[[3]]
[1] "mismatch_col_name_1" "name"
[[4]]
[1] "mismatch_col_name_2"
您可以通过多种方式组织或总结此列表的内容,但这主要是格式问题。