R - 在 data.frame 中计算成对比较组中的相似值
R - count similar values in pair-wise comparing groups in data.frame
我有篮球运动员的观察数据。每个ID代表一个玩家。
df <- data.frame(id = c("A", "B", "c"),
V1 = c(1, 3, 2),
V2 = c(1, 2, 2),
V3 = c(3, 1, NA))
df
id V1 V2 V3
1 A 1 1 3
2 B 3 2 1
3 c 2 2 NA
我想对所有玩家进行成对比较,并计算他们变量之间的相似度。
值是否位于不同的列中并不重要。请注意,有些玩家在某些领域有 NA
。
期望的结果应该是这样的:
desired <- data.frame(id_x = c("A", "A", "B"),
id_y = c("B", "C", "C"),
similar = c(2, 0, 1))
desired
id_x id_y similar
1 A B 2
2 A C 0
3 B C 1
真实数据由数以万计的玩家组成,所以性能也很重要。
非常感谢任何指点。
将给出两种不同的方法:
A=lapply(apply(df[-1],1,list),unlist)
combn(A,2,function(x)sum(unique(na.omit(x[[1]]))%in%unique(na.omit(x[[2]]))))
[1] 2 0 1
或
B=apply(df[-1],1,function(x)apply(df[-1],1,function(y)sum(unique(na.omit(x))%in%unique(na.omit(y)))))
B[lower.tri(B)]
[1] 2 0 1
我正在对 Onyambu 提供的出色答案进行基准测试。
制作更大的测试样本:
df2 <- data.frame(ID = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(2:16, 10e2, replace = TRUE),
V3 = sample(3:17, 10e2, replace = TRUE))
运行 基准:
library(microbenchmark)
bench <- microbenchmark(
# option A
A=lapply(apply(df2[-1],1,list),unlist),
A1=combn(A,2,function(x)sum(unique(x[[1]])%in%unique(x[[2]]))),
# option B
B=apply(my.df2[-1],1,function(x)apply(df2[-1],1,function(y)sum(unique(x)%in%uni
que(y)))),
B2= B[lower.tri(B)],
# repeat 5 times
times=5)
生产:
bench
Unit: milliseconds
expr min lq mean median uq max neval cld
A 10.44847 10.83849 11.79438 11.33756 11.34568 15.00171 5 a
A1 25420.53573 25735.88333 26721.22973 25802.89428 26658.98114 29987.85417 5 b
B 52173.85540 52519.34839 53327.35931 52661.64372 54508.70321 54773.24582 5 c
B2 33.43663 34.16278 34.91674 35.19001 35.81182 35.98246 5 a
原始数据较大。
是否有性能更好的选项?
我们可以列出每对行并使用它来找到它们的交集。往下看:
toCheck <- combn(rownames(df), 2, simplify = FALSE)
names(toCheck) <-
sapply(toCheck, paste, collapse = "&")
sapply(toCheck, function(x){
length(base::intersect(as.list(df[x[1],-1]), as.list(df[x[2],-1])))
})
# 1&2 1&3 2&3
# 2 0 1
正在您的较大数据集上进行测试:
set.seed(45)
df2 <- data.frame(ID = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(1:16, 10e2, replace = TRUE),
V3 = sample(1:17, 10e2, replace = TRUE))
M_M_approach <- function(mdf) {
Check <- combn(rownames(mdf), 2, simplify = FALSE)
names(Check) <- sapply(Check, paste, collapse = "&")
sapply(Check, function(x){
length(base::intersect(as.list(mdf[x[1],-1]), as.list(mdf[x[2],-1]))) })
}
M_M_approach(df2)
# 1&2 1&3 2&3
# 1 1 2
microbenchmark::microbenchmark(M_M_approach = M_M_approach(df2), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# M_M_approach 225.6985 228.1924 248.5623 250.4814 255.1007 283.3385 5
也许你也可以用proxy
来解决这个问题:
library(proxy)
df <- data.frame(id = c("A", "B", "c"),
V1 = c(1, 3, 2),
V2 = c(1, 2, 2),
V3 = c(3, 1, NA))
myfun <- function(x, y) {
sum(unique(setdiff(x, NA)) %in% y)
}
pr_DB$set_entry(FUN=myfun, names="myfun", distance=FALSE, loop=TRUE)
similar <- proxy::simil(df[, -1L], method="myfun")
res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)
print(res)
id_x id_y similar
1 B A 2
2 c A 0
3 c B 1
它在我的机器上似乎确实快了一些:
df <- data.frame(id = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(2:16, 10e2, replace = TRUE),
V3 = sample(3:17, 10e2, replace = TRUE))
system.time({
similar <- proxy::simil(df[, -1L], method="myfun")
res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)
})
user system elapsed
7.84 0.05 7.92
我有篮球运动员的观察数据。每个ID代表一个玩家。
df <- data.frame(id = c("A", "B", "c"),
V1 = c(1, 3, 2),
V2 = c(1, 2, 2),
V3 = c(3, 1, NA))
df
id V1 V2 V3
1 A 1 1 3
2 B 3 2 1
3 c 2 2 NA
我想对所有玩家进行成对比较,并计算他们变量之间的相似度。
值是否位于不同的列中并不重要。请注意,有些玩家在某些领域有 NA
。
期望的结果应该是这样的:
desired <- data.frame(id_x = c("A", "A", "B"),
id_y = c("B", "C", "C"),
similar = c(2, 0, 1))
desired
id_x id_y similar
1 A B 2
2 A C 0
3 B C 1
真实数据由数以万计的玩家组成,所以性能也很重要。
非常感谢任何指点。
将给出两种不同的方法:
A=lapply(apply(df[-1],1,list),unlist)
combn(A,2,function(x)sum(unique(na.omit(x[[1]]))%in%unique(na.omit(x[[2]]))))
[1] 2 0 1
或
B=apply(df[-1],1,function(x)apply(df[-1],1,function(y)sum(unique(na.omit(x))%in%unique(na.omit(y)))))
B[lower.tri(B)]
[1] 2 0 1
我正在对 Onyambu 提供的出色答案进行基准测试。
制作更大的测试样本:
df2 <- data.frame(ID = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(2:16, 10e2, replace = TRUE),
V3 = sample(3:17, 10e2, replace = TRUE))
运行 基准:
library(microbenchmark)
bench <- microbenchmark(
# option A
A=lapply(apply(df2[-1],1,list),unlist),
A1=combn(A,2,function(x)sum(unique(x[[1]])%in%unique(x[[2]]))),
# option B
B=apply(my.df2[-1],1,function(x)apply(df2[-1],1,function(y)sum(unique(x)%in%uni
que(y)))),
B2= B[lower.tri(B)],
# repeat 5 times
times=5)
生产:
bench
Unit: milliseconds
expr min lq mean median uq max neval cld
A 10.44847 10.83849 11.79438 11.33756 11.34568 15.00171 5 a
A1 25420.53573 25735.88333 26721.22973 25802.89428 26658.98114 29987.85417 5 b
B 52173.85540 52519.34839 53327.35931 52661.64372 54508.70321 54773.24582 5 c
B2 33.43663 34.16278 34.91674 35.19001 35.81182 35.98246 5 a
原始数据较大。
是否有性能更好的选项?
我们可以列出每对行并使用它来找到它们的交集。往下看:
toCheck <- combn(rownames(df), 2, simplify = FALSE)
names(toCheck) <-
sapply(toCheck, paste, collapse = "&")
sapply(toCheck, function(x){
length(base::intersect(as.list(df[x[1],-1]), as.list(df[x[2],-1])))
})
# 1&2 1&3 2&3
# 2 0 1
正在您的较大数据集上进行测试:
set.seed(45)
df2 <- data.frame(ID = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(1:16, 10e2, replace = TRUE),
V3 = sample(1:17, 10e2, replace = TRUE))
M_M_approach <- function(mdf) {
Check <- combn(rownames(mdf), 2, simplify = FALSE)
names(Check) <- sapply(Check, paste, collapse = "&")
sapply(Check, function(x){
length(base::intersect(as.list(mdf[x[1],-1]), as.list(mdf[x[2],-1]))) })
}
M_M_approach(df2)
# 1&2 1&3 2&3
# 1 1 2
microbenchmark::microbenchmark(M_M_approach = M_M_approach(df2), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# M_M_approach 225.6985 228.1924 248.5623 250.4814 255.1007 283.3385 5
也许你也可以用proxy
来解决这个问题:
library(proxy)
df <- data.frame(id = c("A", "B", "c"),
V1 = c(1, 3, 2),
V2 = c(1, 2, 2),
V3 = c(3, 1, NA))
myfun <- function(x, y) {
sum(unique(setdiff(x, NA)) %in% y)
}
pr_DB$set_entry(FUN=myfun, names="myfun", distance=FALSE, loop=TRUE)
similar <- proxy::simil(df[, -1L], method="myfun")
res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)
print(res)
id_x id_y similar
1 B A 2
2 c A 0
3 c B 1
它在我的机器上似乎确实快了一些:
df <- data.frame(id = sample(10e2),
V1 = sample(1:15, 10e2, replace = TRUE),
V2 = sample(2:16, 10e2, replace = TRUE),
V3 = sample(3:17, 10e2, replace = TRUE))
system.time({
similar <- proxy::simil(df[, -1L], method="myfun")
res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)
})
user system elapsed
7.84 0.05 7.92