如何为 R 中此 'df' 中的重复行分配唯一代码?

How to assign a unique code for duplicate rows in this 'df' in R?

我有这个数据框df

df <- data.frame(stringsAsFactors=FALSE,
          id = c(1L, 2L, 3L, 4L, 5L, 6L),
     Country = c("ESP", "ESP", "ESP", "ITA", "ITA", "ITA"),
        Year = c(1965L, 1965L, 1965L, 1965L, 1965L, 1965L),
   Time.step = c("Month", "Month", "Month", "Month", "Month", "Month"),
    GSA.numb = c("GSA 5", "GSA 5", "GSA 5", "GSA 17", "GSA 17", "GSA 17"),
     Species = c("Mullus", "Mullus", "Mullus", "Eledone", "Eledone", "Eledone"),
    Quantity = c(500L, 200L, 200L, 350L, 350L, 125L)
                )

df

   id  Country   Year    Time.step    GSA.numb  Species   Quantity
    1    ESP     1965     Month       GSA 5      Mullus     500   
    2    ESP     1965     Month       GSA 5      Mullus     200  
    3    ESP     1965     Month       GSA 5      Mullus     200 
    4    ITA     1965     Month       GSA 17     Eledone    350
    5    ITA     1965     Month       GSA 17     Eledone    350 
    6    ITA     1965     Month       GSA 17     Eledone    125

我有一些重复的行,如:3 和 5。 当行重复时,我可以为 F 或 T 逻辑值创建一列:

df$dup <- duplicated(df[,2:7]) #No id! 

结果:

id  Country   Year    Time.step    GSA.numb  Species   Quantity dup
 1    ESP     1965     Month       GSA 5      Mullus     500   FALSE
 2    ESP     1965     Month       GSA 5      Mullus     200   FALSE
 3    ESP     1965     Month       GSA 5      Mullus     200   TRUE
 4    ITA     1965     Month       GSA 17     Eledone    350   FALSE
 5    ITA     1965     Month       GSA 17     Eledone    350   TRUE
 6    ITA     1965     Month       GSA 17     Eledone    125   FALSE

现在,我想要一个新列(以动态方式,我的真实 df 非常大,有很多行、列和变量),可以在其中查看重复行的数量正确,像这样:

aspected.df

id  Country Year  Time.step  GSA.numb  Species   Quantity dup  ref  
 1  ESP     1965  Month      GSA 5      Mullus     500   FALSE NA
 2  ESP     1965  Month      GSA 5      Mullus     200   FALSE NA
 3  ESP     1965  Month      GSA 5      Mullus     200   TRUE  =id2
 4  ITA     1965  Month      GSA 17     Eledone    350   FALSE NA
 5  ITA     1965  Month      GSA 17     Eledone    350   TRUE  =id4
 6  ITA     1965  Month      GSA 17     Eledone    125   FALSE NA

我试过:

with(df, ave(as.character(Species), df[,2:6], FUN = make.unique)) 

但结果是:

[1] "Mullus"    "Mullus.1"  "Mullus.2"  "Eledone"   "Eledone.1" "Eledone.2"

我想我需要更多的代码输入。哪些功能有用? (duplicated,make.unit, row.names 等等...)

您可以使用 tidyverse 函数快速识别重复项

df$dup <- duplicated(df[,2:7]) #No id! 

library(tidyverse)

df %>% 
 group_by(dup) %>% 
 mutate(ref=ifelse(dup, paste0("id",1:n()), NA_character_))

#> # A tibble: 6 x 9
#> # Groups:   dup [2]
#>      id Country  Year Time.step GSA.numb Species Quantity dup   ref  
#>   <int> <chr>   <int> <chr>     <chr>    <chr>      <int> <lgl> <chr>
#> 1     1 ESP      1965 Month     GSA 5    Mullus       500 FALSE NA   
#> 2     2 ESP      1965 Month     GSA 5    Mullus       200 FALSE NA   
#> 3     3 ESP      1965 Month     GSA 5    Mullus       200 TRUE  id1  
#> 4     4 ITA      1965 Month     GSA 17   Eledone      350 FALSE NA   
#> 5     5 ITA      1965 Month     GSA 17   Eledone      350 TRUE  id2  
#> 6     6 ITA      1965 Month     GSA 17   Eledone      125 FALSE NA 

此示例使用基数 R 并将找到的重复值与原始值匹配。如果您的单行也有多个重复项,这会很有帮助。

示例数据(使用 dput(control = NULL) 因此 characters/factors 被转换为数字)

df <- data.frame(id = c(1, 1, 1, 2, 2, 2), 
           Country = c(1965, 1965, 1965, 1965, 1965, 1965), 
           Year = c(1, 1, 1, 1, 1, 1), 
           Time.step = c(1, 1, 1, 1, 1, 1), 
           GSA.numb = c(5, 5, 5, 17, 17, 17), 
           Species = c(2, 2, 2, 1, 1, 1), Quantity = c(500, 200, 200, 350, 350, 125))

代码是矢量化的,因此,尽管有外部循环,它应该 运行 在您的大型数据帧上相当快。

df$dup <- duplicated(df)
dupes <- df[df$dup,]
df$ref <- NA # initialize 
for(i in 1:nrow(dupes)){
  z=which(df[,1] == dupes[i,1]&
          df[,2] == dupes[i,2]&
          df[,3] == dupes[i,3]&
          df[,4] == dupes[i,4]&
          df[,5] == dupes[i,5]&
          df[,6] == dupes[i,6]&
          df[,7] == dupes[i,7]) # make sure not to include that $dup column!
  df$ref[z[-1]] <- paste0("=id",min(z))
}
df
#  id Country Year Time.step GSA.numb Species Quantity   dup  ref
#1  1    1965    1         1        5       2      500 FALSE <NA>
#2  1    1965    1         1        5       2      200 FALSE <NA>
#3  1    1965    1         1        5       2      200  TRUE =id2
#4  2    1965    1         1       17       1      350 FALSE <NA>
#5  2    1965    1         1       17       1      350  TRUE =id4
#6  2    1965    1         1       17       1      125 FALSE <NA>

即使您可以使用应用函数来收紧它,这也会 运行 更快。

使用tidyverse:

df %>%
  group_by_at(vars(-id)) %>% #Group by all variables except of id
  mutate(n = n(), #Identifying the duplicate rows
         dup = ifelse(seq_along(n) > 1, TRUE, FALSE), #Coding the first unique row as TRUE and others as FALSE
         ref = ifelse(dup == TRUE, paste0("=id", first(id[dup == FALSE])), NA_character_)) %>% #Pasting the id of the first unique row
 select(-n)

     id Country  Year Time.step GSA.numb Species Quantity dup   ref  
  <int> <chr>   <int> <chr>     <chr>    <chr>      <int> <lgl> <chr>
1     1 ESP      1965 Month     GSA 5    Mullus       500 FALSE <NA> 
2     2 ESP      1965 Month     GSA 5    Mullus       200 FALSE <NA> 
3     3 ESP      1965 Month     GSA 5    Mullus       200 TRUE  =id2 
4     4 ITA      1965 Month     GSA 17   Eledone      350 FALSE <NA> 
5     5 ITA      1965 Month     GSA 17   Eledone      350 TRUE  =id4 
6     6 ITA      1965 Month     GSA 17   Eledone      125 FALSE <NA> 

一种data.table方法,从初始文件开始:

library(data.table)

setDT(df)[, `:=` (dup = seq_len(.N) > 1, ref = paste0("id", first(id))), 
          by = .(Country, Year, Time.step, GSA.numb, Species, Quantity)][dup == FALSE, ref := NA]

输出:

   id Country Year Time.step GSA.numb Species Quantity   dup  ref
1:  1     ESP 1965     Month     GSA5  Mullus      500 FALSE <NA>
2:  2     ESP 1965     Month     GSA5  Mullus      200 FALSE <NA>
3:  3     ESP 1965     Month     GSA5  Mullus      200  TRUE  id2
4:  4     ITA 1965     Month    GSA17 Eledone      350 FALSE <NA>
5:  5     ITA 1965     Month    GSA17 Eledone      350  TRUE  id4
6:  6     ITA 1965     Month    GSA17 Eledone      125 FALSE <NA>

一个tidyverse方法(dup之前已经创建):

library(tidyverse)

df %>% 
  group_by_at(vars(2:7)) %>% 
  mutate(ref = ifelse(dup, paste0("id", first(id)), NA_character_))

输出:

     id Country  Year Time.step GSA.numb Species Quantity dup   ref  
  <int> <chr>   <int> <chr>     <chr>    <chr>      <int> <lgl> <chr>
1     1 ESP      1965 Month     GSA5     Mullus       500 FALSE NA   
2     2 ESP      1965 Month     GSA5     Mullus       200 FALSE NA   
3     3 ESP      1965 Month     GSA5     Mullus       200 TRUE  id2  
4     4 ITA      1965 Month     GSA17    Eledone      350 FALSE NA   
5     5 ITA      1965 Month     GSA17    Eledone      350 TRUE  id4  
6     6 ITA      1965 Month     GSA17    Eledone      125 FALSE NA

如果您想在语句中创建 dup 列:

df %>% 
  group_by_at(vars(2:7)) %>% 
  mutate(
    dup = row_number() > 1,
    ref = ifelse(dup, paste0("id", first(id)), NA_character_))

输出:

     id Country  Year Time.step GSA.numb Species Quantity dup   ref  
  <int> <chr>   <int> <chr>     <chr>    <chr>      <int> <lgl> <chr>
1     1 ESP      1965 Month     GSA5     Mullus       500 FALSE NA   
2     2 ESP      1965 Month     GSA5     Mullus       200 FALSE NA   
3     3 ESP      1965 Month     GSA5     Mullus       200 TRUE  id2  
4     4 ITA      1965 Month     GSA17    Eledone      350 FALSE NA   
5     5 ITA      1965 Month     GSA17    Eledone      350 TRUE  id4  
6     6 ITA      1965 Month     GSA17    Eledone      125 FALSE NA