使用另一个数据框完成并填充组中缺失的数据
Complete and fill missing data in groups using another data frame
如何按组补全缺失值?
我有一个带有推荐和排名的 df,当我没有至少 4 个时,我需要插入默认推荐。
输入示例:
library(tidyverse)

# Fallback recommendations (ids 50 to 54) available for padding short groups.
fixed_recomendations <- data.frame(
  recomendation_id = 50:54,
  name = paste("recomendation", 50:54, sep = "_"),
  stringsAsFactors = FALSE
)

# Example input: three content groups holding 2, 1 and 6 ranked
# recommendations respectively.
content_id <- rep(c(1, 2, 3), times = c(2, 1, 6))
rank <- c(1, 2, 1, seq_len(6))
recomendation_id <- 1:9
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(
  content_id = content_id,
  rank = rank,
  recomendation_id = recomendation_id,
  name = name,
  stringsAsFactors = FALSE
)
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 2 1 3 recomendation_3
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
我尝试过用 `complete()` 及其 `fill` 参数来实现,但它不会按组分别填充,而且还会丢弃 rank 超出 1:4 范围的行。
# Attempted approach (does not produce the desired result): expand every
# content_id to ranks 1:4 and pass the whole default vectors as `fill` values.
# NOTE(review): per the output reported below, the defaults are consumed across
# all missing rows rather than restarting per group (content_id 2 receives
# 52-54 instead of 50-52), and ranks 5-6 of content_id 3 are absent.
df %>%
complete(content_id, rank = 1:4,
fill = list(
recomendation_id = fixed_recomendations$recomendation_id,
name = fixed_recomendations$name
))
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 52 recomendation_52
# 2 3 53 recomendation_53
# 2 4 54 recomendation_54
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
期望的输出:
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 50 recomendation_50
# 2 3 51 recomendation_51
# 2 4 52 recomendation_52
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
我使用的软件包与您的不同,但这是我在几分钟内想出的、使用 `foreach` 的解决方案。我本可以用基本的 `for` 循环来完成,但 `foreach` 的 `.combine` 参数很有用,而且在实际数据量很大时还可以并行化。
# Default recommendations used to pad groups that have fewer than 4 entries.
fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)
content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# Number of existing recommendations per content_id.
tab <- table(content_id)
# Group ids in the same (sorted) order as `tab`, but keeping their original
# type; names(tab) is always character and would turn content_id into a
# character column after rbind(), making order() sort lexically.
group_ids <- sort(unique(content_id))

library(foreach)

# For every group with fewer than 4 rows, build the missing rows: ranks
# continue after the group's last rank, and defaults are taken from the top of
# fixed_recomendations. Groups that are already complete contribute NULL.
defaults <- foreach(i = seq_along(tab), .combine = rbind) %do% {
  if (tab[i] < 4) {
    foreach(j = 1:(4 - tab[i]), .combine = rbind) %do% {
      data.frame(
        group_ids[i],
        j + tab[i],
        fixed_recomendations$recomendation_id[j],
        fixed_recomendations$name[j],
        stringsAsFactors = FALSE
      )
    }
  }
}

# `defaults` is NULL when no group needs padding; rbind(df, NULL) is still fine,
# but assigning column names to NULL would fail.
if (!is.null(defaults)) {
  colnames(defaults) <- colnames(df)
}
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)), ]
df.new
content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
12 1 3 50 recomendation_50
11 1 4 51 recomendation_51
3 2 1 3 recomendation_3
23 2 2 50 recomendation_50
21 2 3 51 recomendation_51
22 2 4 52 recomendation_52
4 3 1 4 recomendation_4
5 3 2 5 recomendation_5
6 3 3 6 recomendation_6
7 3 4 7 recomendation_7
8 3 5 8 recomendation_8
9 3 6 9 recomendation_9
我不知道这是否是最好的方法,但我分两步解决了它:首先使用 `complete()` 创建一个带有 NA 值的 tibble,然后筛选出这些 NA 行,并使用 `split()` 和 `map_df()` 按组填充默认推荐:
#' Fill a block of missing rows with the default recommendations.
#'
#' The k-th row of `x` receives the k-th default recommendation, so every
#' group passed in starts again from the first default.
#'
#' @param x Data frame (or tibble) of rows to fill; must have
#'   `recomendation_id` and `name` columns.
#' @param defaults Data frame of default recommendations with
#'   `recomendation_id` and `name` columns. Defaults to the global
#'   `fixed_recomendations`.
#' @return `x` with `recomendation_id` and `name` overwritten. Rows beyond
#'   `nrow(defaults)` receive `NA`.
add_missing <- function(x, defaults = fixed_recomendations) {
  # seq_len() yields an empty index for a zero-row input, unlike 1:nrow(x),
  # which would iterate over c(1, 0).
  idx <- seq_len(nrow(x))
  x$recomendation_id <- defaults$recomendation_id[idx]
  x$name <- defaults$name[idx]
  x
}
# Build the rows that are missing from ranks 1:4 in each content_id group,
# then let add_missing() fill each group separately.
df_missing <- df %>%
  complete(content_id, rank = 1:4) %>%
  filter(is.na(recomendation_id)) %>%
  {
    split(., .$content_id)
  } %>%
  map_df(add_missing)

# Append the filled rows to the original data and sort by group and rank.
arrange(rbind(df, df_missing), content_id, rank)
> content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
3 1 3 50 recomendation_50
4 1 4 51 recomendation_51
5 2 1 3 recomendation_3
6 2 2 50 recomendation_50
7 2 3 51 recomendation_51
8 2 4 52 recomendation_52
9 3 1 4 recomendation_4
10 3 2 5 recomendation_5
11 3 3 6 recomendation_6
12 3 4 7 recomendation_7
13 3 5 8 recomendation_8
14 3 6 9 recomendation_9
如何按组补全缺失值?
我有一个带有推荐和排名的 df,当我没有至少 4 个时,我需要插入默认推荐。
输入示例:
library(tidyverse)

# Fallback recommendations (ids 50 to 54) available for padding short groups.
fixed_recomendations <- data.frame(
  recomendation_id = 50:54,
  name = paste("recomendation", 50:54, sep = "_"),
  stringsAsFactors = FALSE
)

# Example input: three content groups holding 2, 1 and 6 ranked
# recommendations respectively.
content_id <- rep(c(1, 2, 3), times = c(2, 1, 6))
rank <- c(1, 2, 1, seq_len(6))
recomendation_id <- 1:9
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(
  content_id = content_id,
  rank = rank,
  recomendation_id = recomendation_id,
  name = name,
  stringsAsFactors = FALSE
)
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 2 1 3 recomendation_3
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
我尝试过用 `complete()` 及其 `fill` 参数来实现,但它不会按组分别填充,而且还会丢弃 rank 超出 1:4 范围的行。
# Attempted approach (does not produce the desired result): expand every
# content_id to ranks 1:4 and pass the whole default vectors as `fill` values.
# NOTE(review): per the output reported below, the defaults are consumed across
# all missing rows rather than restarting per group (content_id 2 receives
# 52-54 instead of 50-52), and ranks 5-6 of content_id 3 are absent.
df %>%
complete(content_id, rank = 1:4,
fill = list(
recomendation_id = fixed_recomendations$recomendation_id,
name = fixed_recomendations$name
))
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 52 recomendation_52
# 2 3 53 recomendation_53
# 2 4 54 recomendation_54
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
期望的输出:
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 50 recomendation_50
# 2 3 51 recomendation_51
# 2 4 52 recomendation_52
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
我使用的软件包与您的不同,但这是我在几分钟内想出的、使用 `foreach` 的解决方案。我本可以用基本的 `for` 循环来完成,但 `foreach` 的 `.combine` 参数很有用,而且在实际数据量很大时还可以并行化。
# Default recommendations used to pad groups that have fewer than 4 entries.
fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)
content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# Number of existing recommendations per content_id.
tab <- table(content_id)
# Group ids in the same (sorted) order as `tab`, but keeping their original
# type; names(tab) is always character and would turn content_id into a
# character column after rbind(), making order() sort lexically.
group_ids <- sort(unique(content_id))

library(foreach)

# For every group with fewer than 4 rows, build the missing rows: ranks
# continue after the group's last rank, and defaults are taken from the top of
# fixed_recomendations. Groups that are already complete contribute NULL.
defaults <- foreach(i = seq_along(tab), .combine = rbind) %do% {
  if (tab[i] < 4) {
    foreach(j = 1:(4 - tab[i]), .combine = rbind) %do% {
      data.frame(
        group_ids[i],
        j + tab[i],
        fixed_recomendations$recomendation_id[j],
        fixed_recomendations$name[j],
        stringsAsFactors = FALSE
      )
    }
  }
}

# `defaults` is NULL when no group needs padding; rbind(df, NULL) is still fine,
# but assigning column names to NULL would fail.
if (!is.null(defaults)) {
  colnames(defaults) <- colnames(df)
}
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)), ]
df.new
content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
12 1 3 50 recomendation_50
11 1 4 51 recomendation_51
3 2 1 3 recomendation_3
23 2 2 50 recomendation_50
21 2 3 51 recomendation_51
22 2 4 52 recomendation_52
4 3 1 4 recomendation_4
5 3 2 5 recomendation_5
6 3 3 6 recomendation_6
7 3 4 7 recomendation_7
8 3 5 8 recomendation_8
9 3 6 9 recomendation_9
我不知道这是否是最好的方法,但我分两步解决了它:首先使用 `complete()` 创建一个带有 NA 值的 tibble,然后筛选出这些 NA 行,并使用 `split()` 和 `map_df()` 按组填充默认推荐:
#' Fill a block of missing rows with the default recommendations.
#'
#' The k-th row of `x` receives the k-th default recommendation, so every
#' group passed in starts again from the first default.
#'
#' @param x Data frame (or tibble) of rows to fill; must have
#'   `recomendation_id` and `name` columns.
#' @param defaults Data frame of default recommendations with
#'   `recomendation_id` and `name` columns. Defaults to the global
#'   `fixed_recomendations`.
#' @return `x` with `recomendation_id` and `name` overwritten. Rows beyond
#'   `nrow(defaults)` receive `NA`.
add_missing <- function(x, defaults = fixed_recomendations) {
  # seq_len() yields an empty index for a zero-row input, unlike 1:nrow(x),
  # which would iterate over c(1, 0).
  idx <- seq_len(nrow(x))
  x$recomendation_id <- defaults$recomendation_id[idx]
  x$name <- defaults$name[idx]
  x
}
# Build the rows that are missing from ranks 1:4 in each content_id group,
# then let add_missing() fill each group separately.
df_missing <- df %>%
  complete(content_id, rank = 1:4) %>%
  filter(is.na(recomendation_id)) %>%
  {
    split(., .$content_id)
  } %>%
  map_df(add_missing)

# Append the filled rows to the original data and sort by group and rank.
arrange(rbind(df, df_missing), content_id, rank)
> content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
3 1 3 50 recomendation_50
4 1 4 51 recomendation_51
5 2 1 3 recomendation_3
6 2 2 50 recomendation_50
7 2 3 51 recomendation_51
8 2 4 52 recomendation_52
9 3 1 4 recomendation_4
10 3 2 5 recomendation_5
11 3 3 6 recomendation_6
12 3 4 7 recomendation_7
13 3 5 8 recomendation_8
14 3 6 9 recomendation_9