如何识别 R 中两个数据集之间的不匹配?

How to identify mismatch between two data sets in R?

我有两个数据集。数据集1和数据集2如下:

数据集 1:-

family_id     house_id   number_family_member
1             1052        2
2             5042        3
3             1111        2

数据集2:-

family_id     house_id   age   gender
1             1052       24    male
1             1052       25    female
2             5042       23    male
2             5042       20    female
3             1111       1     male
3             1111       20    female
3             1111       21    female

数据集 1 中登记的家庭成员数量与数据集 2 中录入的个人明细条数不一致。例如家庭 ID 2:数据集 1 中的家庭成员数量为 3,但数据集 2 中只有 2 条成员记录。如何识别两个数据集之间的这类不匹配?

我们可以使用 count 来统计每个家庭的成员数量并创建一个新的数据框 df3,然后使用 setequal 来比较 df1 和 df3。

library(dplyr)

# Tally the rows df2 holds for each (family_id, house_id) pair, then give the
# tally the same column name that df1 uses so the two frames are comparable.
df3 <- rename(
  count(df2, family_id, house_id),
  number_family_member = n
)

# Compare the two frames as sets of rows; the result recorded for the sample
# data is shown below.
setequal(df1, df3)
# FALSE: Rows in x but not y: 2, 3. Rows in y but not x: 2, 3. 

数据

# Sample dataset 1: family_id, house_id and the declared number of family
# members, parsed from whitespace-separated text with a header row.
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "family_id     house_id   number_family_member
1             1052        2
                  2             5042        3
                  3             1111        2"
)

# Sample dataset 2: one row per recorded person (age, gender), keyed by
# family_id and house_id. Strings are kept as character, not factor.
df2 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "family_id     house_id   age   gender
1             1052       24    male
1             1052       25    female
2             5042       23    male
2             5042       20    female
3             1111       1     male
3             1111       20    female
3             1111       21    female"
)

这可以通过 aggregate 和 merge 来完成。

# Count how many rows dataset2 holds for each family.
agg <- aggregate(
  list(n_rows = dataset2$family_id),
  by = list(family_id = dataset2$family_id),
  FUN = length
)

# Keep every family listed in dataset1 (all.x = TRUE): a family without any
# row in dataset2 gets an observed count of 0 instead of being dropped.
mrg <- merge(
  dataset1[c("family_id", "number_family_member")],
  agg,
  by = "family_id",
  all.x = TRUE
)
mrg$n_rows[is.na(mrg$n_rows)] <- 0L

# merge() returns rows ordered by key, not in dataset1's order, so look each
# dataset1 family up by value rather than comparing by row position.
observed <- mrg$n_rows[match(dataset1$family_id, mrg$family_id)]

# One row per dataset1 family, in dataset1's order, flagged match/mismatch.
result <- data.frame(family_id = dataset1$family_id)
result$Match <- ifelse(
  dataset1$number_family_member == observed,
  "match",
  "mismatch"
)
result
#  family_id     Match
#1         1     match
#2         2  mismatch
#3         3  mismatch

rm(agg, mrg, observed)    # final clean up

数据。

# Sample dataset 1: family_id, house_id and the declared number of family
# members. The text starts and ends with a blank line.
dataset1 <- read.table(
  header = TRUE,
  text = "
family_id     house_id   number_family_member
1             1052        2
2             5042        3
3             1111        2
"
)

# Sample dataset 2: one row per recorded person (age, gender), keyed by
# family_id and house_id. The text starts and ends with a blank line.
dataset2 <- read.table(
  header = TRUE,
  text = "
family_id     house_id   age   gender
1             1052       24    male
1             1052       25    female
2             5042       23    male
2             5042       20    female
3             1111       1     male
3             1111       20    female
3             1111       21    female
"
)

下面这两种方法可能对您有所帮助:

# Row-level view: annotate every person row in dataset2 with the number of
# rows recorded for its family (n), attach the declared size from dataset1,
# and flag whether the two agree.
# The join keys are spelled out so the join does not depend on whichever
# column names the two tables happen to share.
dataset2 %>%
  add_count(family_id) %>%
  inner_join(dataset1, by = c("family_id", "house_id")) %>%
  mutate(match = n == number_family_member)

# Output recorded for the sample data:
# # A tibble: 7 x 7
#   family_id house_id   age gender     n number_family_member match
#       <int>    <int> <int> <fctr> <int>                <int> <lgl>
# 1         1     1052    24   male     2                    2  TRUE
# 2         1     1052    25 female     2                    2  TRUE
# 3         2     5042    23   male     2                    3 FALSE
# 4         2     5042    20 female     2                    3 FALSE
# 5         3     1111     1   male     3                    2 FALSE
# 6         3     1111    20 female     3                    2 FALSE
# 7         3     1111    21 female     3                    2 FALSE

# Family-level view: one row per family with the number of rows recorded in
# dataset2 (n) next to the declared size from dataset1.
# right_join() keeps every family listed in dataset1; a family with no rows
# in dataset2 would be dropped by an inner join, so its missing count is
# turned into 0 and it is reported as a mismatch instead.
dataset2 %>%
  count(family_id) %>%
  right_join(dataset1, by = "family_id") %>%
  mutate(
    n = coalesce(n, 0L),
    match = n == number_family_member
  )

# Output recorded for the sample data:
# # A tibble: 3 x 5
#   family_id     n house_id number_family_member match
#       <int> <int>    <int>                <int> <lgl>
# 1         1     2     1052                    2  TRUE
# 2         2     2     5042                    3 FALSE
# 3         3     3     1111                    2 FALSE