如何在 R 中为两个数据框之间相互匹配的观测值分配相同的唯一 ID?

How to assign identical unique IDs to matching observations between two data frames in R?


我遇到一个实际问题:我有两个(或更多)数据框,想为每个数据集内部以及跨数据集的所有匹配观测值分配相同的唯一 ID,例如:

#1. Create dataframe df1:

# Build the first example data frame column by column.
a1 <- rep(c(1, 2, 1), times = c(4, 4, 2))
b1 <- c(1, 5, 3, 2, 3, 4, 5, 1, 5, 2)
c1 <- c(
  "white", "red", "black", "white", "red",
  "white", "black", "silver", "red", "green"
)
# Column names are spelled out; they match the vector names.
df1 <- data.frame(a1 = a1, b1 = b1, c1 = c1)
df1

   a1 b1     c1
1   1  1  white
2   1  5    red
3   1  3  black
4   1  2  white
5   2  3    red
6   2  4  white
7   2  5  black
8   2  1 silver
9   1  5    red
10  1  2  green

#2. Create dataframe df2:

# Build the second example data frame column by column.
a2 <- rep(c(2, 1, 2), times = c(2, 2, 6))
b2 <- c(3, 1, 3, 2, 1, 3, 4, 5, 3, 5)
c2 <- c(
  "black", "blue", "black", "white", "silver",
  "green", "green", "red", "blue", "white"
)
# Column names are spelled out; they match the vector names.
df2 <- data.frame(a2 = a2, b2 = b2, c2 = c2)
df2

   a2 b2     c2
1   2  3  black
2   2  1   blue
3   1  3  black
4   1  2  white
5   2  1 silver
6   2  3  green
7   2  4  green
8   2  5    red
9   2  3   blue
10  2  5  white

#3. Assign unique IDs to each observation in df1:

library(data.table)
# Copy df1 into a data.table and key it on all three columns; setting the key
# sorts the rows by (a1, b1, c1).
df1.2 <- as.data.table(df1)
setkeyv(df1.2, c("a1", "b1", "c1"))
# .GRP is data.table's group counter, so each distinct key combination gets
# one integer id that duplicate rows share.
df1.2[, id := .GRP, by = key(df1.2)]
df1.2 <- as.data.frame(df1.2)
df1.2

   a1 b1     c1 id
1   1  1  white  1
2   1  2  green  2
3   1  2  white  3
4   1  3  black  4
5   1  5    red  5
6   1  5    red  5
7   2  1 silver  6
8   2  3    red  7
9   2  4  white  8
10  2  5  black  9

#4. The problematic part!! Assign identical unique IDs to matching observations of df2 as compared to df1.2 
#and assign other unique IDs to all other non-matching obs of df2. 
#Name the resulting dataframe as df2.2 
#My expected result will ideally look as follows:

df2.2

   a2 b2     c2 id
1   2  3  black 10 
2   2  1   blue 11
3   1  3  black  4
4   1  2  white  3
5   2  1 silver  6
6   2  3  green 12
7   2  4  green 13
8   2  5    red 14
9   2  3   blue 15
10  2  5  white 16

任何关于如何获得 df2.2 的帮助将不胜感激。谢谢。

您可以编写一个生成唯一 ID 的函数,然后将其应用于 df1 和 df2 合并后的数据,从而得到想要的结果。

# Inspiration: 
# Map each distinct value of x to a number: equal values share a number, and
# the numbers follow the order of the factor levels (the sorted distinct
# values). NA stays NA.
unique.id <- function(x) {
  codes <- factor(x)
  as.numeric(codes)
}

# Build one string key per row from the three key columns only. Selecting the
# columns by name keeps the keys stable when this snippet is re-run after `id`
# has been added (otherwise the old id becomes part of the key), and dropping
# the names stops a column name from being matched to paste()'s `sep` or
# `collapse` arguments.
(df1.info <- do.call(paste, unname(as.list(df1[c("a1", "b1", "c1")]))))
#  [1] "1 1 white"  "1 5 red"    "1 3 black"  "1 2 white"  "2 3 red"
#  [6] "2 4 white"  "2 5 black"  "2 1 silver" "1 5 red"    "1 2 green"
df2.info <- do.call(paste, unname(as.list(df2[c("a2", "b2", "c2")])))
# Number the keys of both data frames together so that equal rows share an id.
ids <- unique.id(c(df1.info, df2.info))
# The first nrow(df1) ids belong to df1, the last nrow(df2) ids to df2.
df1$id <- head(ids, nrow(df1))
df1
#    a1 b1     c1 id
# 1   1  1  white  1
# 2   1  5    red  5
# 3   1  3  black  4
# 4   1  2  white  3
# 5   2  3    red 11
# 6   2  4  white 13
# 7   2  5  black 14
# 8   2  1 silver  7
# 9   1  5    red  5
# 10  1  2  green  2
df2$id <- tail(ids, nrow(df2))
df2
#    a2 b2     c2 id
# 1   2  3  black  8
# 2   2  1   blue  6
# 3   1  3  black  4
# 4   1  2  white  3
# 5   2  1 silver  7
# 6   2  3  green 10
# 7   2  4  green 12
# 8   2  5    red 15
# 9   2  3   blue  9
# 10  2  5  white 16

解决这个问题的一个简单方法是创建哈希:

library(dplyr)
library(digest)

# Hash the pasted key values of every row into an md5 string. Rows with the
# same key values get the same id, within and across data frames.
# `...` are the key columns; they are pasted element-wise with a space, and
# each resulting string is hashed separately, as in the row-wise original.
hash_id <- function(...) {
  vapply(paste(...), digest, character(1), algo = "md5", USE.NAMES = FALSE)
}

# mutate() replaces the superseded rowwise() %>% do(data.frame(...)) pattern,
# which built one data frame per row.
df1 %>%
  mutate(id = hash_id(a1, b1, c1))

df2 %>%
  mutate(id = hash_id(a2, b2, c2))

这将为 df1 生成以下内容:

   a1 b1     c1                               id
1   1  1  white b86fbb78b27f7db2ee50af2d68cce452
2   1  5    red 68d47f544832989834517630e4a2764c
3   1  3  black 724e37192140cb2009cf3d982f2be1e4
4   1  2  white f731b8b38255b8c312543283f8e1c634
5   2  3    red 2d50b86902056a51faad04d2c566faf2
6   2  4  white 9396667cd51d1e1b61b0b22a7767d3d9
7   2  5  black 9ba1f3e04c61c006d3c5382fcad098e6
8   2  1 silver 38dcd29d200c8b33cd38ac78ef9dd751
9   1  5    red 68d47f544832989834517630e4a2764c
10  1  2  green 7d9b1aadfd79de142b234b83d7867b9b

以及 df2 的以下内容:

   a2 b2     c2                               id
1   2  3  black d285febc8ab08e99b11609b98f077e66
2   2  1   blue bfa0405276406ac4bc596daf957dfa11
3   1  3  black 724e37192140cb2009cf3d982f2be1e4
4   1  2  white f731b8b38255b8c312543283f8e1c634
5   2  1 silver 38dcd29d200c8b33cd38ac78ef9dd751
6   2  3  green 67eefe9ee2d82486ded30a268289296b
7   2  4  green d773f58cf144eab15ef459e326494a2f
8   2  5    red 0724318a9f59d3960edfe4e90f9c4eff
9   2  3   blue 6883420cc137ba45b773f642176e9ce6
10  2  5  white 5dea9e63b5fbfb31fb81260cb5a5f41c

假设您的列都相同,您最简单的方法可能是:

# rbind() on data frames matches columns by name and fails when the names
# differ (df1 has a1/b1/c1, df2 has a2/b2/c2), so give df2 the column names
# of df1 (by position) before stacking. This is a no-op when the names
# already agree.
stopifnot(ncol(df1) == ncol(df2))
df.all <- rbind(df1, setNames(df2, names(df1)))

(您可能需要先将两个数据框的列名重命名为一致。)

现在,在整个数据集上使用您已经用 data.table 实现的相同技巧,然后再把数据集拆分回去:

# The first nrow(df1) rows of df.all came from df1, the rest from df2.
# A logical mask replaces 1:nrow(df1) and -(1:nrow(df1)), which select the
# wrong rows when df1 has no rows (1:0 is c(1, 0)). drop = FALSE keeps a data
# frame even when there is only one column.
n1 <- nrow(df1)
from_df1 <- seq_len(nrow(df.all)) <= n1
df1 <- df.all[from_df1, , drop = FALSE]
df2 <- df.all[!from_df1, , drop = FALSE]

注意:我并不是说 data.table 的这个技巧是为唯一组合生成编号的理想方式!不过既然您已经写出来了,直接沿用即可。