通过优化创建因素组合
Create a combination of factors with optimization
library(dplyr)
library(tidyr)
df <- data.frame(
First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8"),
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
)
df <- df %>%
mutate(
ID = row_number(),
lmt = n_distinct(ID)
) %>%
separate_rows(Second, sep = "; ") %>%
group_by(ID) %>%
mutate(
wgt = row_number()
) %>% ungroup()
假设对于每个 ID,我只想保留 First
和 Second
的 1 个组合(即 df
中唯一 ID 的长度应始终等于 lmt
).
但是,我想通过优化某些参数来做到这一点。解决方案的设计方式应为:
与wgt
的组合应尽可能选择1,或者也应选择2,但应避免3(即wgt
的总和应最小);
Second
中某个值的频率与First
中的频率之差应该接近于0。
关于如何在 R 中处理这个问题有什么想法吗?
上述情况的预期输出是:
ID First Second wgt lmt
1 1 MW3 MW4 1 8
2 2 MW3 MW7 3 8
3 3 MW4 MW7 2 8
4 4 MW5 MW5 1 8
5 5 MW6 MW3 1 8
6 6 MW7 MW8 2 8
7 7 MW7 MW3 1 8
8 8 MW8 MW6 1 8
为什么?仅仅是因为在这种组合中,右侧 (Second
) 的任何元素都没有左侧 (First
) 多。例如,右侧和左侧有两个 MW3 元素。
然而,这里要付出的代价是wgt
并不总是1(wgt
的总和不是8而是12)。
澄清:如果两个标准不能同时最小化,则应优先考虑第二个标准(频率之间的差异)的最小化。
我解决了这个问题,我可以使用 minconflicts 算法的变体来分享解决方案。这里的关键是找到一个结合你的要求的评分函数。下面的实施遵循您的建议“假设 objective 应该优先考虑第二个标准(频率之间的差异)”的最小化。在您的实际数据上试验其他评分函数,让我们看看您能走多远。
根据您的原始数据(8 个 ID),我得到的解决方案与您发布的解决方案一样好:
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 2 2 0
2: MW4 1 1 0
3: MW5 1 1 0
4: MW6 1 1 0
5: MW7 2 2 0
6: MW8 1 1 0
[1] "Total freq diff: 0"
[1] "Total wgt: 12"
对于具有 10000 个 ID 的随机数据,该算法能够找到 First/Second 频率没有差异的解决方案(但 wgt 的总和大于最小值):
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 1660 1660 0
2: MW4 1762 1762 0
3: MW5 1599 1599 0
4: MW6 1664 1664 0
5: MW7 1646 1646 0
6: MW8 1669 1669 0
[1] "Total freq diff: 0"
[1] "Total wgt: 19521"
代码如下:
library(data.table)
df <- as.data.table(df)
df <- df[, .(ID, First, Second, wgt)]
# PLAY AROUND WITH THIS PARAMETER
freq_weight <- 0.9
wgt_min <- df[, uniqueN(ID)]
wgt_max <- df[, uniqueN(ID) * 3]
freq_min <- 0
freq_max <- df[, uniqueN(ID) * 2] #verify if this is the worst case scenario
score <- function(solution){
# compute raw scores
current_wgt <- solution[, sum(wgt)]
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
current_freq <- compare[, sum(diff)]
# normalize
wgt_score <- (current_wgt - wgt_min) / (wgt_max - wgt_min)
freq_score <- (current_freq - freq_min) / (freq_max - freq_min)
#combine
score <- (freq_weight * freq_score) + ((1 - freq_weight) * wgt_score)
return(score)
}
#initialize random solution
current_solution <- df[, .SD[sample(.N, 1)], by = ID]
#get freq of First (this does not change)
First_freq <- current_solution[, .(FirstCount = .N), by = First]
names(First_freq)[1] <- "Name"
#get mincoflict to be applied on each iteration
minconflict <- function(df, solution){
#pick ID
change <- solution[, sample(unique(ID), 1)]
#get permissible values
values <- df[ID == change, .(Second, wgt)]
#assign scores
values[, score := NA_real_]
for (i in 1:nrow(values)) {
solution[ID == change, c("Second", "wgt") := values[i, .(Second, wgt)]]
set(values, i, "score", score(solution))
}
#return the best combination
scores <<- c(scores, values[, min(score)])
solution[ID == change, c("Second", "wgt") := values[which.min(score), .(Second, wgt)]]
}
#optimize
scores <- 1
iter <- 0
while(TRUE){
minconflict(df, current_solution)
iter <- iter + 1
#SET MAX NUMBER OF ITERATIONS HERE
if(scores[length(scores)] == 0 | iter >= 1000) break
}
# summarize obtained solution
solution_summary <- function(solution){
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
print(compare)
print(paste("Total freq diff: ", compare[, sum(diff)]))
print(paste("Total wgt: ", solution[, sum(wgt)]))
}
solution_summary(current_solution)
这基本上是一个二分图匹配问题,因此可以通过最大流或线性规划 (bipartite graph matching to match two sets) 在合理的时间内准确解决。
library(lpSolve)
MISMATCH.COST <- 1000
.create.row <- function(row.names, first) {
row <- vector(mode="numeric", length=length(first))
for (i in 1:length(row.names))
row = row + (-MISMATCH.COST+i)*(row.names[i]==first)
return(row)
}
find.pairing <- function(First, Second) {
row.names = sapply(Second, strsplit, "; ")
# Create cost matrix for assignment
mat = sapply(row.names, .create.row, First)
assignment <- lp.assign(mat)
print("Total cost:")
print(assignment$objval+length(First)*MISMATCH.COST)
solution <- lp.assign(mat)$solution
pairs <- which(solution>0, arr.ind=T)
matches = First[pairs[,1]]
# Find out where a mismatch has occured, and replace match
for (i in 1:length(matches)) {
if (!(matches[i] %in% row.names[[i]])) {
matches[i] = row.names[[i]][1]
}
}
result = data.frame(
First[pairs[,2]],
matches)
return(result)
}
运行 它在您的示例中给出了最佳解决方案(它应该始终如此)
> First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8")
> Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
+ "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
+ "MW3; MW4; MW5", "MW6; MW3; MW7")
> find.pairing(First, Second)
[1] "Total cost:"
[1] 12
First.pairs...2.. matches
1 MW3 MW4
2 MW3 MW3
3 MW4 MW7
4 MW5 MW5
5 MW6 MW7
6 MW7 MW8
7 MW7 MW3
8 MW8 MW6
library(dplyr)
library(tidyr)
df <- data.frame(
First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8"),
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
)
df <- df %>%
mutate(
ID = row_number(),
lmt = n_distinct(ID)
) %>%
separate_rows(Second, sep = "; ") %>%
group_by(ID) %>%
mutate(
wgt = row_number()
) %>% ungroup()
假设对于每个 ID,我只想保留 First
和 Second
的 1 个组合(即 df
中唯一 ID 的长度应始终等于 lmt
).
但是,我想通过优化某些参数来做到这一点。解决方案的设计方式应为:
与
wgt
的组合应尽可能选择1,或者也应选择2,但应避免3(即wgt
的总和应最小);Second
中某个值的频率与First
中的频率之差应该接近于0。
关于如何在 R 中处理这个问题有什么想法吗?
上述情况的预期输出是:
ID First Second wgt lmt
1 1 MW3 MW4 1 8
2 2 MW3 MW7 3 8
3 3 MW4 MW7 2 8
4 4 MW5 MW5 1 8
5 5 MW6 MW3 1 8
6 6 MW7 MW8 2 8
7 7 MW7 MW3 1 8
8 8 MW8 MW6 1 8
为什么?仅仅是因为在这种组合中,右侧 (Second
) 的任何元素都没有左侧 (First
) 多。例如,右侧和左侧有两个 MW3 元素。
然而,这里要付出的代价是wgt
并不总是1(wgt
的总和不是8而是12)。
澄清:如果两个标准不能同时最小化,则应优先考虑第二个标准(频率之间的差异)的最小化。
我解决了这个问题,我可以使用 minconflicts 算法的变体来分享解决方案。这里的关键是找到一个结合你的要求的评分函数。下面的实施遵循您的建议“假设 objective 应该优先考虑第二个标准(频率之间的差异)”的最小化。在您的实际数据上试验其他评分函数,让我们看看您能走多远。
根据您的原始数据(8 个 ID),我得到的解决方案与您发布的解决方案一样好:
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 2 2 0
2: MW4 1 1 0
3: MW5 1 1 0
4: MW6 1 1 0
5: MW7 2 2 0
6: MW8 1 1 0
[1] "Total freq diff: 0"
[1] "Total wgt: 12"
对于具有 10000 个 ID 的随机数据,该算法能够找到 First/Second 频率没有差异的解决方案(但 wgt 的总和大于最小值):
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 1660 1660 0
2: MW4 1762 1762 0
3: MW5 1599 1599 0
4: MW6 1664 1664 0
5: MW7 1646 1646 0
6: MW8 1669 1669 0
[1] "Total freq diff: 0"
[1] "Total wgt: 19521"
代码如下:
library(data.table)
df <- as.data.table(df)
df <- df[, .(ID, First, Second, wgt)]
# PLAY AROUND WITH THIS PARAMETER
freq_weight <- 0.9
wgt_min <- df[, uniqueN(ID)]
wgt_max <- df[, uniqueN(ID) * 3]
freq_min <- 0
freq_max <- df[, uniqueN(ID) * 2] #verify if this is the worst case scenario
score <- function(solution){
# compute raw scores
current_wgt <- solution[, sum(wgt)]
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
current_freq <- compare[, sum(diff)]
# normalize
wgt_score <- (current_wgt - wgt_min) / (wgt_max - wgt_min)
freq_score <- (current_freq - freq_min) / (freq_max - freq_min)
#combine
score <- (freq_weight * freq_score) + ((1 - freq_weight) * wgt_score)
return(score)
}
#initialize random solution
current_solution <- df[, .SD[sample(.N, 1)], by = ID]
#get freq of First (this does not change)
First_freq <- current_solution[, .(FirstCount = .N), by = First]
names(First_freq)[1] <- "Name"
#get mincoflict to be applied on each iteration
minconflict <- function(df, solution){
#pick ID
change <- solution[, sample(unique(ID), 1)]
#get permissible values
values <- df[ID == change, .(Second, wgt)]
#assign scores
values[, score := NA_real_]
for (i in 1:nrow(values)) {
solution[ID == change, c("Second", "wgt") := values[i, .(Second, wgt)]]
set(values, i, "score", score(solution))
}
#return the best combination
scores <<- c(scores, values[, min(score)])
solution[ID == change, c("Second", "wgt") := values[which.min(score), .(Second, wgt)]]
}
#optimize
scores <- 1
iter <- 0
while(TRUE){
minconflict(df, current_solution)
iter <- iter + 1
#SET MAX NUMBER OF ITERATIONS HERE
if(scores[length(scores)] == 0 | iter >= 1000) break
}
# summarize obtained solution
solution_summary <- function(solution){
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
print(compare)
print(paste("Total freq diff: ", compare[, sum(diff)]))
print(paste("Total wgt: ", solution[, sum(wgt)]))
}
solution_summary(current_solution)
这基本上是一个二分图匹配问题,因此可以通过最大流或线性规划 (bipartite graph matching to match two sets) 在合理的时间内准确解决。
library(lpSolve)
MISMATCH.COST <- 1000
.create.row <- function(row.names, first) {
row <- vector(mode="numeric", length=length(first))
for (i in 1:length(row.names))
row = row + (-MISMATCH.COST+i)*(row.names[i]==first)
return(row)
}
find.pairing <- function(First, Second) {
row.names = sapply(Second, strsplit, "; ")
# Create cost matrix for assignment
mat = sapply(row.names, .create.row, First)
assignment <- lp.assign(mat)
print("Total cost:")
print(assignment$objval+length(First)*MISMATCH.COST)
solution <- lp.assign(mat)$solution
pairs <- which(solution>0, arr.ind=T)
matches = First[pairs[,1]]
# Find out where a mismatch has occured, and replace match
for (i in 1:length(matches)) {
if (!(matches[i] %in% row.names[[i]])) {
matches[i] = row.names[[i]][1]
}
}
result = data.frame(
First[pairs[,2]],
matches)
return(result)
}
运行 它在您的示例中给出了最佳解决方案(它应该始终如此)
> First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8")
> Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
+ "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
+ "MW3; MW4; MW5", "MW6; MW3; MW7")
> find.pairing(First, Second)
[1] "Total cost:"
[1] 12
First.pairs...2.. matches
1 MW3 MW4
2 MW3 MW3
3 MW4 MW7
4 MW5 MW5
5 MW6 MW7
6 MW7 MW8
7 MW7 MW3
8 MW8 MW6