高效的 jaccard 相似度 DocumentTermMatrix
Efficient jaccard similarity DocumentTermMatrix
我想要一种高效计算 tm::DocumentTermMatrix 中各文档之间 Jaccard 相似度的方法。对于余弦相似度,我可以借助 slam 包做类似的事情,如这个回答所示。我还在 CrossValidated 上看到了另一个问答,它是针对 R 的,但讲的是矩阵代数,不一定是最高效的途径。我尝试用更高效的 slam 函数来实现该解决方案,但得到的结果与效率较低的做法(先把 DTM 强制转换为普通矩阵,再使用 proxy::dist)不一致。
如何有效计算 R 中大型 DocumentTermMatrix 文档之间的 Jaccard 相似度?
#数据和包
library(Matrix);library(proxy);library(tm);library(slam);library(Matrix)
mat <- structure(list(i = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 1L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), j = c(1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 4L, 5L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), v = c(1,
1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1), nrow = 4L,
ncol = 12L, dimnames = structure(list(Docs = c("1", "2",
"3", "4"), Terms = c("computer", "is", "fun", "not", "too",
"no", "it's", "dumb", "what", "should", "we", "do")), .Names = c("Docs",
"Terms"))), .Names = c("i", "j", "v", "nrow", "ncol", "dimnames"
), class = c("DocumentTermMatrix", "simple_triplet_matrix"), weighting = c("term frequency",
"tf"))
#低效计算(预期输出)
proxy::dist(as.matrix(mat), method = 'jaccard')
## 1 2 3
## 2 0.000
## 3 0.875 0.875
## 4 1.000 1.000 1.000
#我的尝试
A <- slam::tcrossprod_simple_triplet_matrix(mat)
im <- which(A > 0, arr.ind=TRUE)
b <- slam::row_sums(mat)
Aim <- A[im]
stats::as.dist(Matrix::sparseMatrix(
i = im[,1],
j = im[,2],
x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
dims = dim(A)
))
## 1 2 3
## 2 2.0
## 3 0.1 0.1
## 4 0.0 0.0 0.0
输出不匹配。
仅供参考原文如下:
c("Computer is fun. Not too fun.", "Computer is fun. Not too fun.",
"No it's not, it's dumb.", "What should we do?")
如 proxy::dist
解决方案。
编辑
请注意,即使在中等大小的 DTM 上,矩阵也会变得很大。下面是使用 vegan 包的示例。注意它耗时约 4 分钟,而在同样的数据上计算余弦相似度只需约 5 秒。
library(qdap); library(quanteda);library(vegan);library(slam)
x <- quanteda::convert(quanteda::dfm(rep(pres_debates2012$dialogue), stem = FALSE,
verbose = FALSE, removeNumbers = FALSE), to = 'tm')
## <<DocumentTermMatrix (documents: 2912, terms: 3368)>>
## Non-/sparse entries: 37836/9769780
## Sparsity : 100%
## Maximal term length: 16
## Weighting : term frequency (tf)
tic <- Sys.time()
jaccard_dist_mat <- vegan::vegdist(as.matrix(x), method = 'jaccard')
Sys.time() - tic #Time difference of 4.01837 mins
tic <- Sys.time()
tdm <- t(x)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(tdm)/(sqrt(col_sums(tdm^2) %*% t(col_sums(tdm^2))))
Sys.time() - tic #Time difference of 5.024992 secs
vegan
包中的 vegdist()
怎么样?
它使用 C 代码实现,大约比 proxy 快 10 倍:
library(vegan)
vegdist(as.matrix(mat), method = 'jaccard')
## 1 2 3
## 2 0.0
## 3 0.9 0.9
## 4 1.0 1.0 1.0
library(microbenchmark)
matt <- as.matrix(mat)
microbenchmark(proxy::dist(matt, method = 'jaccard'),
vegdist(matt, method = 'jaccard'))
## Unit: microseconds
## expr min lq mean
## proxy::dist(matt, method = "jaccard") 4879.338 4995.2755 5133.9305
## vegdist(matt, method = "jaccard") 587.935 633.2625 703.8335
## median uq max neval
## 5069.203 5157.520 7549.346 100
## 671.466 723.569 1305.357 100
使用 stringdist
包中的 stringdistmatrix
并使用 nthread
选项并行运行,可以大大加快速度。平均而言,它比余弦相似度测试慢约 6 秒。
library(qdap)
library(slam)
library(stringdist)
data(pres_debates2012)
x <- quanteda::convert(quanteda::dfm(rep(pres_debates2012$dialogue), stem = FALSE,
verbose = FALSE, removeNumbers = FALSE), to = 'tm')
tic <- Sys.time()
tdm <- t(x)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(tdm)/(sqrt(col_sums(tdm^2) %*% t(col_sums(tdm^2))))
Sys.time() - tic #Time difference of 4.069233 secs
tic <- Sys.time()
t <- stringdistmatrix(pres_debates2012$dialogue, method = "jaccard", nthread = 4)
Sys.time() - tic #Time difference of 10.18158 secs
Jaccard 度量是集合(set)之间的度量,因此输入矩阵应为二值(binary)矩阵。代码的第一行写的是:
## common values:
A = tcrossprod(m)
但对于词袋(bag-of-words)形式的 DTM,
这并不是公共值的数量!
library(text2vec)
library(magrittr)
library(Matrix)
# Pairwise Jaccard similarity between the rows of a binary matrix.
#
# m: matrix with one row per document and one column per term. Entries must
#    be 0/1 (e.g. sign(dtm)); with raw term counts tcrossprod(m) no longer
#    counts the shared terms and the result is not a Jaccard index.
#
# Returns a sparse matrix (built with sparseMatrix()) of dimension
# nrow(m) x nrow(m). Row pairs that share no term are not stored, i.e. they
# are structural zeros.
jaccard_similarity <- function(m) {
  # Size of the intersection for every pair of rows.
  A <- tcrossprod(m)
  # (row, column) positions of the pairs with a non-empty intersection.
  im <- which(A > 0, arr.ind = TRUE, useNames = FALSE)
  # Size of each row's own set.
  b <- rowSums(m)
  Aim <- A[im]
  sparseMatrix(
    i = im[, 1],
    j = im[, 2],
    # intersection / union, with union = |a| + |b| - intersection
    x = Aim / (b[im[, 1]] + b[im[, 2]] - Aim),
    dims = dim(A)
  )
}
# Pairwise Jaccard distance between the rows of m, computed as the complement
# of the similarity returned by jaccard_similarity().
jaccard_distance <- function(m) {
  similarity <- jaccard_similarity(m)
  1 - similarity
}
# Pairwise cosine similarity between the rows of m.
#
# m: numeric matrix (dense or sparse) with one row per document.
#
# Returns the nrow(m) x nrow(m) matrix of dot products between the
# L2-normalised rows. A row made only of zeros has similarity 0 with every
# row, itself included.
cosine <- function(m) {
  row_norms <- sqrt(rowSums(m ^ 2))
  # An all-zero row has norm 0; dividing by it would fill the corresponding
  # row and column of the result with NaN. Dividing by 1 instead leaves the
  # row at zero and does not affect any other row.
  row_norms[which(row_norms == 0)] <- 1
  # row_norms has one entry per row, so recycling down the columns divides
  # every element by the norm of its own row.
  m_normalized <- m / row_norms
  tcrossprod(m_normalized)
}
基准:
data("movie_review")
tokens <- movie_review$review %>% tolower %>% word_tokenizer
dtm <- create_dtm(itoken(tokens), hash_vectorizer(hash_size = 2**16))
dim(dtm)
# 5000 65536
system.time(dmt_cos <- cosine(dtm))
# user system elapsed
# 2.524 0.169 2.693
system.time( {
dtm_binary <- transform_binary(dtm)
# or simply
# dtm_binary <- sign(dtm)
dtm_jac <- jaccard_similarity(dtm_binary)
})
# user system elapsed
# 11.398 1.599 12.996
max(dtm_jac)
# 1
dim(dtm_jac)
# 5000 5000
编辑 2016-07-01:
请参阅 text2vec 0.4 中更快的版本:在不需要从 dgCMatrix 转换为 dgTMatrix 时约快 2.85 倍,在需要列优先(column-major)的 dgCMatrix 时约快 1.75 倍。
# Pairwise Jaccard distance between the rows of x (or between the rows of x
# and the rows of y).
#
# x:      sparse matrix (inherits from 'sparseMatrix'), one row per document.
#         Entries are expected to be 0/1 so that tcrossprod() counts shared
#         terms.
# y:      optional second sparse matrix with the same columns as x. When NULL
#         the distances are computed among the rows of x.
# format: class of the returned matrix, e.g. 'dgCMatrix' or 'dgTMatrix'.
#
# Returns a matrix of class `format` with nrow(x) rows and nrow(x) (or
# nrow(y)) columns holding 1 - |a & b| / |a | b|. Row pairs with an empty
# intersection get distance 1; this includes an all-zero row compared with
# itself. Because most distances are non-zero, the result is stored almost
# densely even in a sparse format.
#
# Stops with an error if x (or a supplied y) is not a sparse matrix.
jaccard_dist_text2vec_04 <- function(x, y = NULL, format = 'dgCMatrix') {
  if (!inherits(x, 'sparseMatrix'))
    stop("at the moment jaccard distance defined only for sparse matrices")
  # set sizes for the rows of x
  rs_x <- rowSums(x)
  if (is.null(y)) {
    # intersection sizes among the rows of x
    RESULT <- tcrossprod(x)
    rs_y <- rs_x
  } else {
    if (!inherits(y, 'sparseMatrix'))
      stop("at the moment jaccard distance defined only for sparse matrices")
    # intersection sizes between rows of x and rows of y
    RESULT <- tcrossprod(x, y)
    # set sizes for the rows of y
    rs_y <- rowSums(y)
  }
  RESULT <- as(RESULT, 'dgTMatrix')
  # Similarity on the stored entries only: intersection / union.
  # add 1 to indices because of zero-based indices in sparse matrices
  RESULT@x <- RESULT@x / (rs_x[RESULT@i + 1L] + rs_y[RESULT@j + 1L] - RESULT@x)
  # Take the complement over the WHOLE matrix, not just the stored entries:
  # a pair with no shared term is a structural zero of the similarity and must
  # become distance 1, not stay at 0.
  RESULT <- 1 - RESULT
  if (!inherits(RESULT, format))
    RESULT <- as(RESULT, format)
  RESULT
}
# Time binarisation of the DTM plus the Jaccard computation, requesting the
# result in triplet ('dgTMatrix') format.
# NOTE(review): this calls jaccard_dist(), whereas the function defined in this
# document is jaccard_dist_text2vec_04() - confirm which one was timed.
system.time( {
dtm_binary <- transform_binary(dtm)
dtm_jac <-jaccard_dist(dtm_binary, format = 'dgTMatrix')
})
# Recorded output of the call above:
# user system elapsed
# 4.075 0.517 4.593
# Same measurement, requesting the result in 'dgCMatrix' format.
system.time( {
dtm_binary <- transform_binary(dtm)
dtm_jac <-jaccard_dist(dtm_binary, format = 'dgCMatrix')
})
# Recorded output of the call above:
# user system elapsed
# 6.571 0.939 7.516
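我想要一种高效计算 tm::DocumentTermMatrix 中各文档之间 Jaccard 相似度的方法。对于余弦相似度,我可以借助 slam 包做类似的事情,如这个回答所示。我还在 CrossValidated 上看到了另一个问答,它是针对 R 的,但讲的是矩阵代数,不一定是最高效的途径。我尝试用更高效的 slam 函数来实现该解决方案,但得到的结果与效率较低的做法(先把 DTM 强制转换为普通矩阵,再使用 proxy::dist)不一致。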
如何有效计算 R 中大型 DocumentTermMatrix 文档之间的 Jaccard 相似度?
#数据和包
library(Matrix);library(proxy);library(tm);library(slam);library(Matrix)
mat <- structure(list(i = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 1L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), j = c(1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 4L, 5L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), v = c(1,
1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1), nrow = 4L,
ncol = 12L, dimnames = structure(list(Docs = c("1", "2",
"3", "4"), Terms = c("computer", "is", "fun", "not", "too",
"no", "it's", "dumb", "what", "should", "we", "do")), .Names = c("Docs",
"Terms"))), .Names = c("i", "j", "v", "nrow", "ncol", "dimnames"
), class = c("DocumentTermMatrix", "simple_triplet_matrix"), weighting = c("term frequency",
"tf"))
#低效计算(预期输出)
proxy::dist(as.matrix(mat), method = 'jaccard')
## 1 2 3
## 2 0.000
## 3 0.875 0.875
## 4 1.000 1.000 1.000
#我的尝试
A <- slam::tcrossprod_simple_triplet_matrix(mat)
im <- which(A > 0, arr.ind=TRUE)
b <- slam::row_sums(mat)
Aim <- A[im]
stats::as.dist(Matrix::sparseMatrix(
i = im[,1],
j = im[,2],
x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
dims = dim(A)
))
## 1 2 3
## 2 2.0
## 3 0.1 0.1
## 4 0.0 0.0 0.0
输出不匹配。
仅供参考原文如下:
c("Computer is fun. Not too fun.", "Computer is fun. Not too fun.",
"No it's not, it's dumb.", "What should we do?")
如 proxy::dist
解决方案。
编辑
请注意,即使在中等大小的 DTM 上,矩阵也会变得很大。下面是使用 vegan 包的示例。注意它耗时约 4 分钟,而在同样的数据上计算余弦相似度只需约 5 秒。
library(qdap); library(quanteda);library(vegan);library(slam)
x <- quanteda::convert(quanteda::dfm(rep(pres_debates2012$dialogue), stem = FALSE,
verbose = FALSE, removeNumbers = FALSE), to = 'tm')
## <<DocumentTermMatrix (documents: 2912, terms: 3368)>>
## Non-/sparse entries: 37836/9769780
## Sparsity : 100%
## Maximal term length: 16
## Weighting : term frequency (tf)
tic <- Sys.time()
jaccard_dist_mat <- vegan::vegdist(as.matrix(x), method = 'jaccard')
Sys.time() - tic #Time difference of 4.01837 mins
tic <- Sys.time()
tdm <- t(x)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(tdm)/(sqrt(col_sums(tdm^2) %*% t(col_sums(tdm^2))))
Sys.time() - tic #Time difference of 5.024992 secs
vegan
包中的 vegdist()
怎么样?
它使用 C 代码实现,大约比 proxy 快 10 倍:
library(vegan)
vegdist(as.matrix(mat), method = 'jaccard')
## 1 2 3
## 2 0.0
## 3 0.9 0.9
## 4 1.0 1.0 1.0
library(microbenchmark)
matt <- as.matrix(mat)
microbenchmark(proxy::dist(matt, method = 'jaccard'),
vegdist(matt, method = 'jaccard'))
## Unit: microseconds
## expr min lq mean
## proxy::dist(matt, method = "jaccard") 4879.338 4995.2755 5133.9305
## vegdist(matt, method = "jaccard") 587.935 633.2625 703.8335
## median uq max neval
## 5069.203 5157.520 7549.346 100
## 671.466 723.569 1305.357 100
使用 stringdist
包中的 stringdistmatrix
并使用 nthread
选项并行运行,可以大大加快速度。平均而言,它比余弦相似度测试慢约 6 秒。
library(qdap)
library(slam)
library(stringdist)
data(pres_debates2012)
x <- quanteda::convert(quanteda::dfm(rep(pres_debates2012$dialogue), stem = FALSE,
verbose = FALSE, removeNumbers = FALSE), to = 'tm')
tic <- Sys.time()
tdm <- t(x)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(tdm)/(sqrt(col_sums(tdm^2) %*% t(col_sums(tdm^2))))
Sys.time() - tic #Time difference of 4.069233 secs
tic <- Sys.time()
t <- stringdistmatrix(pres_debates2012$dialogue, method = "jaccard", nthread = 4)
Sys.time() - tic #Time difference of 10.18158 secs
Jaccard 度量是集合(set)之间的度量,因此输入矩阵应为二值(binary)矩阵。代码的第一行写的是:
## common values:
A = tcrossprod(m)
但对于词袋(bag-of-words)形式的 DTM,
这并不是公共值的数量!
library(text2vec)
library(magrittr)
library(Matrix)
# Pairwise Jaccard similarity between the rows of a binary matrix.
#
# m: matrix with one row per document and one column per term. Entries must
#    be 0/1 (e.g. sign(dtm)); with raw term counts tcrossprod(m) no longer
#    counts the shared terms and the result is not a Jaccard index.
#
# Returns a sparse matrix (built with sparseMatrix()) of dimension
# nrow(m) x nrow(m). Row pairs that share no term are not stored, i.e. they
# are structural zeros.
jaccard_similarity <- function(m) {
  # Size of the intersection for every pair of rows.
  A <- tcrossprod(m)
  # (row, column) positions of the pairs with a non-empty intersection.
  im <- which(A > 0, arr.ind = TRUE, useNames = FALSE)
  # Size of each row's own set.
  b <- rowSums(m)
  Aim <- A[im]
  sparseMatrix(
    i = im[, 1],
    j = im[, 2],
    # intersection / union, with union = |a| + |b| - intersection
    x = Aim / (b[im[, 1]] + b[im[, 2]] - Aim),
    dims = dim(A)
  )
}
# Pairwise Jaccard distance between the rows of m, computed as the complement
# of the similarity returned by jaccard_similarity().
jaccard_distance <- function(m) {
  similarity <- jaccard_similarity(m)
  1 - similarity
}
# Pairwise cosine similarity between the rows of m.
#
# m: numeric matrix (dense or sparse) with one row per document.
#
# Returns the nrow(m) x nrow(m) matrix of dot products between the
# L2-normalised rows. A row made only of zeros has similarity 0 with every
# row, itself included.
cosine <- function(m) {
  row_norms <- sqrt(rowSums(m ^ 2))
  # An all-zero row has norm 0; dividing by it would fill the corresponding
  # row and column of the result with NaN. Dividing by 1 instead leaves the
  # row at zero and does not affect any other row.
  row_norms[which(row_norms == 0)] <- 1
  # row_norms has one entry per row, so recycling down the columns divides
  # every element by the norm of its own row.
  m_normalized <- m / row_norms
  tcrossprod(m_normalized)
}
基准:
data("movie_review")
tokens <- movie_review$review %>% tolower %>% word_tokenizer
dtm <- create_dtm(itoken(tokens), hash_vectorizer(hash_size = 2**16))
dim(dtm)
# 5000 65536
system.time(dmt_cos <- cosine(dtm))
# user system elapsed
# 2.524 0.169 2.693
system.time( {
dtm_binary <- transform_binary(dtm)
# or simply
# dtm_binary <- sign(dtm)
dtm_jac <- jaccard_similarity(dtm_binary)
})
# user system elapsed
# 11.398 1.599 12.996
max(dtm_jac)
# 1
dim(dtm_jac)
# 5000 5000
编辑 2016-07-01:
请参阅 text2vec 0.4 中更快的版本:在不需要从 dgCMatrix 转换为 dgTMatrix 时约快 2.85 倍,在需要列优先(column-major)的 dgCMatrix 时约快 1.75 倍。
# Pairwise Jaccard distance between the rows of x (or between the rows of x
# and the rows of y).
#
# x:      sparse matrix (inherits from 'sparseMatrix'), one row per document.
#         Entries are expected to be 0/1 so that tcrossprod() counts shared
#         terms.
# y:      optional second sparse matrix with the same columns as x. When NULL
#         the distances are computed among the rows of x.
# format: class of the returned matrix, e.g. 'dgCMatrix' or 'dgTMatrix'.
#
# Returns a matrix of class `format` with nrow(x) rows and nrow(x) (or
# nrow(y)) columns holding 1 - |a & b| / |a | b|. Row pairs with an empty
# intersection get distance 1; this includes an all-zero row compared with
# itself. Because most distances are non-zero, the result is stored almost
# densely even in a sparse format.
#
# Stops with an error if x (or a supplied y) is not a sparse matrix.
jaccard_dist_text2vec_04 <- function(x, y = NULL, format = 'dgCMatrix') {
  if (!inherits(x, 'sparseMatrix'))
    stop("at the moment jaccard distance defined only for sparse matrices")
  # set sizes for the rows of x
  rs_x <- rowSums(x)
  if (is.null(y)) {
    # intersection sizes among the rows of x
    RESULT <- tcrossprod(x)
    rs_y <- rs_x
  } else {
    if (!inherits(y, 'sparseMatrix'))
      stop("at the moment jaccard distance defined only for sparse matrices")
    # intersection sizes between rows of x and rows of y
    RESULT <- tcrossprod(x, y)
    # set sizes for the rows of y
    rs_y <- rowSums(y)
  }
  RESULT <- as(RESULT, 'dgTMatrix')
  # Similarity on the stored entries only: intersection / union.
  # add 1 to indices because of zero-based indices in sparse matrices
  RESULT@x <- RESULT@x / (rs_x[RESULT@i + 1L] + rs_y[RESULT@j + 1L] - RESULT@x)
  # Take the complement over the WHOLE matrix, not just the stored entries:
  # a pair with no shared term is a structural zero of the similarity and must
  # become distance 1, not stay at 0.
  RESULT <- 1 - RESULT
  if (!inherits(RESULT, format))
    RESULT <- as(RESULT, format)
  RESULT
}
# Time binarisation of the DTM plus the Jaccard computation, requesting the
# result in triplet ('dgTMatrix') format.
# NOTE(review): this calls jaccard_dist(), whereas the function defined in this
# document is jaccard_dist_text2vec_04() - confirm which one was timed.
system.time( {
dtm_binary <- transform_binary(dtm)
dtm_jac <-jaccard_dist(dtm_binary, format = 'dgTMatrix')
})
# Recorded output of the call above:
# user system elapsed
# 4.075 0.517 4.593
# Same measurement, requesting the result in 'dgCMatrix' format.
system.time( {
dtm_binary <- transform_binary(dtm)
dtm_jac <-jaccard_dist(dtm_binary, format = 'dgCMatrix')
})
# Recorded output of the call above:
# user system elapsed
# 6.571 0.939 7.516