使用另一个数据框完成并填充组中缺失的数据

Question

如何按组补全缺失值？

我有一个带有推荐和排名的 df，当我没有至少 4 个时，我需要插入默认推荐。

输入示例：

library(tidyverse)

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# content_id rank recomendation_id  name
# 1         1     1                 recomendation_1
# 1         2     2                 recomendation_2
# 2         1     3                 recomendation_3
# 3         1     4                 recomendation_4
# 3         2     5                 recomendation_5
# 3         3     6                 recomendation_6
# 3         4     7                 recomendation_7
# 3         5     8                 recomendation_8
# 3         6     9                 recomendation_9

我已经尝试用 complete/fill 来做到这一点，但它不尊重组，它还会削减排名范围之外的值。

df %>% 
  complete(content_id, rank = 1:4, 
           fill = list(
            recomendation_id = fixed_recomendations$recomendation_id,
            name = fixed_recomendations$name
            ))

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     52              recomendation_52
# 2           3     53              recomendation_53
# 2           4     54              recomendation_54
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7

期望的输出：

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     50              recomendation_50
# 2           3     51              recomendation_51
# 2           4     52              recomendation_52
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7
# 3           5      8               recomendation_8
# 3           6      9               recomendation_9

Answer 1

我使用的软件包套件与您使用的不同，但这是我在几分钟内提出的使用 foreach 的解决方案。我本可以使用基本的 for 循环来完成它，但是 foreach 中的 .combine 功能很有用，而且它可以并行化，以防你的实际问题很大。

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)


tab <- table(content_id)
library(foreach)
defaults <- foreach (i = 1:length(tab), .combine = rbind) %do%{
  if (tab[i]<4){
    foreach( j = 1:(4-tab[i]), .combine = rbind) %do% {
      data.frame(names(tab)[i], j + tab[i], fixed_recomendations$recomendation_id[j], fixed_recomendations$name[j])
    }
  }
}
colnames(defaults) <- colnames(df)
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)),]
df.new

   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
12          1    3               50 recomendation_50
11          1    4               51 recomendation_51
3           2    1                3  recomendation_3
23          2    2               50 recomendation_50
21          2    3               51 recomendation_51
22          2    4               52 recomendation_52
4           3    1                4  recomendation_4
5           3    2                5  recomendation_5
6           3    3                6  recomendation_6
7           3    4                7  recomendation_7
8           3    5                8  recomendation_8
9           3    6                9  recomendation_9

Answer 2

我不知道这是否是最好的方法，但我分两步解决了它，首先我使用 complete 创建了一个带有 NA 值的 tibble，然后我过滤了 NA 值并使用 [=13= 更新它的组]:

add_missing <- function(x){
  for(i in 1:nrow(x)){
    x[i,]$recomendation_id = fixed_recomendations[i,]$recomendation_id
    x[i,]$name = fixed_recomendations[i,]$name     
  }
  x
}

df_missing <- df %>% 
  complete(content_id, rank = 1:4) %>% 
  filter(is.na(recomendation_id)) %>% 
  split(.$content_id) %>% 
  map_df(add_missing)

rbind(df, df_missing) %>% arrange(content_id, rank)

>   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
3           1    3               50 recomendation_50
4           1    4               51 recomendation_51
5           2    1                3  recomendation_3
6           2    2               50 recomendation_50
7           2    3               51 recomendation_51
8           2    4               52 recomendation_52
9           3    1                4  recomendation_4
10          3    2                5  recomendation_5
11          3    3                6  recomendation_6
12          3    4                7  recomendation_7
13          3    5                8  recomendation_8
14          3    6                9  recomendation_9

使用另一个数据框完成并填充组中缺失的数据

Complete and fill missing data in groups using another data frame

r

dplyr

tidyr

tidyverse