在 R 中向量化这些嵌套的 for 循环
Vectorize these nested for loops in R
我通常只要稍加思考就能弄清楚如何进行矢量化,但尽管阅读了一堆 Stack Overflow 的问答,我仍然感到困惑!
我想用合适的 apply 函数替换这些嵌套的 for 循环,但是如果对我遗漏的整个问题有一些明显不同的方法,请随时告诉我!
在测试的上下文中思考这个例子,其中第一行是关键,随后的每一行都是学生的答案。作为输出,我想要一个数组,其中每个正确答案为 1,每个错误答案为 0。 for 循环可以工作,但是当您扩展到数千行和几列时会非常慢。
这是我的可重现示例,在此先感谢您的帮助!
#build sample data
# Build sample data: row 1 will be the answer key, rows 2-10 student answers.
# matrix() states the 2-D intent directly (array(dim=c(9,6)) works but is
# less idiomatic) and paste0() replaces paste(..., sep = "").
dat <- matrix(nrow = 9, ncol = 6)
for (n in 1:9) {
  dat[n, 1:6] <- c(paste0("ID00", n),
                   sample(c("A", "B", "C", "D"), size = 5, replace = TRUE))
}
dat[3, 4] <- NA                        # one missing answer for the NA case
key <- c("key", "A", "B", "B", "C", "D")
dat <- rbind(key, dat)                 # answer key becomes row 1
>dat
[,1] [,2] [,3] [,4] [,5] [,6]
"key" "A" "B" "B" "C" "D"
"ID001" "B" "A" "D" "B" "C"
"ID002" "C" "C" "C" "B" "B"
"ID003" "A" "C" NA "D" "D"
"ID004" "D" "B" "D" "A" "A"
"ID005" "A" "C" "A" "C" "A"
"ID006" "D" "D" "B" "B" "A"
"ID007" "B" "D" "A" "D" "A"
"ID008" "D" "D" "B" "D" "A"
"ID009" "D" "C" "B" "D" "D"
#score file
# Slow reference implementation the question asks to vectorize:
# rows 2-10 of dat are students, columns 2-6 are questions, dat[1, ] is the
# key. Each cell is scored 1 (correct), 0 (wrong), or NA (missing answer).
dat2 <- array(dim=c(9,5))
for (row in 2:10){
for (column in 2:6){
# NA answers propagate as NA scores rather than counting as wrong
if (is.na(dat[row,column])){
p <- NA
}else if (dat[row,column]==dat[1,column]){
p <- 1
}else p <- 0
# shift indices by 1: dat2 has no key row and no ID column
dat2[row-1,column-1]<-p
}
}
> dat2
[,1] [,2] [,3] [,4] [,5]
[1,] 0 0 0 0 0
[2,] 0 0 0 0 0
[3,] 1 0 NA 0 1
[4,] 0 1 0 0 0
[5,] 1 0 0 1 0
[6,] 0 0 1 0 0
[7,] 0 0 0 0 0
[8,] 0 0 1 0 0
[9,] 0 0 1 0 1
设置可重复性种子:
# Reproducible sample-data build: set.seed() makes every sample() draw (and
# therefore all printed results below) repeatable. Same fixes as above:
# matrix() for 2-D intent, paste0() instead of paste(..., sep = "").
set.seed(1)
dat <- matrix(nrow = 9, ncol = 6)
for (n in 1:9) {
  dat[n, 1:6] <- c(paste0("ID00", n),
                   sample(c("A", "B", "C", "D"), size = 5, replace = TRUE))
}
dat[3, 4] <- NA
key <- c("key", "A", "B", "B", "C", "D")
dat <- rbind(key, dat)
这将完成工作:
# Expand the key row so it lines up element-wise with the answer matrix,
# then do one vectorized comparison; "+ 0L" coerces TRUE/FALSE to 1/0
# while keeping NA answers as NA.
key <- rep(dat[1, -1], each = nrow(dat) - 1L) ## expand "key" row
dummy <- (dat[-1, -1] == key) + 0L ## vectorized / element-wise "=="
基本上我们想要一个向量化的"=="
。但我们需要先将 dat[1,-1]
扩展到 dat[-1,-1]
的相同维度。最后 + 0L
将 TRUE / FALSE
矩阵强制转换为 1 / 0
矩阵。
# [,1] [,2] [,3] [,4] [,5]
# 0 1 0 0 0
# 0 0 0 1 0
# 1 0 NA 0 1
# 0 0 0 0 1
# 0 0 0 0 0
# 0 0 1 0 0
# 0 0 1 0 1
# 0 0 0 1 0
# 0 0 0 1 0
我还没有检查 Gregor 的基准测试脚本。但这是我的。
# Benchmark both vectorized approaches on a 1000 x 1000 matrix.
# Bug fix: the original sampled LETTERS[4] (only "D") for dat, so its
# alphabet did not match key's A-D draw; LETTERS[1:4] is clearly intended.
set.seed(1)
dat <- matrix(sample(LETTERS[1:4], 1000 * 1000, TRUE), 1000)
key <- sample(LETTERS[1:4], 1000, TRUE)
microbenchmark(rep(key, each = 1000) == dat, t(t(dat) == key))
#Unit: milliseconds
# expr min lq mean median uq
# rep(key, each = 1000) == dat 32.16888 34.01138 42.61639 35.57526 40.27944
# t(t(dat) == key) 50.93348 52.96008 63.74475 56.04706 60.38750
# max neval cld
# 81.96044 100 a
# 106.54916 100 b
我的方法与 Gregor 的方法之间的唯一区别是 rep(, each)
扩展 vs. rep_len
展开。两种扩展都消耗相同数量的内存,扩展后,"=="
以列方式完成。我预测额外的开销将由两个 t()
引起,基准测试结果似乎证明了这一点。希望结果不依赖于平台。
这和哲园的回答基本一致(先向量化==
再强制转回数值),我只是先转置矩阵,没有展开key
由于矩阵 stored/operated 按列而不是按行排列,如果键是列并且每个学生也是列,向量回收就可以了。
在生成数据之前使用 set.seed(1)
...
# Transpose so each student becomes a column; R's column-major recycling
# then compares every column against the key vector without an explicit
# rep(). The final t() restores students-as-rows; "+ 0L" gives 1/0.
key = dat[1, -1]
tdat = t(dat[-1, -1])
t((tdat == key) + 0L)
# [,1] [,2] [,3] [,4] [,5]
# 0 1 0 0 0
# 0 0 0 1 0
# 1 0 NA 0 1
# 0 0 0 0 1
# 0 0 0 0 0
# 0 0 1 0 0
# 0 0 1 0 1
# 0 0 0 1 0
# 0 0 0 1 0
如果您改为将第一列更改为行名称,则可以轻松保留它们,而不会因为学生 ID 不是 'key'
而将其标记为不正确的风险。这也使得最后的总结更好:
# Move student IDs into row names first: they survive the scoring step and
# can no longer be accidentally compared against the "key" label.
row.names(dat) = dat[, 1]
dat = dat[, -1]
key = dat[1, ]
tdat = t(dat[-1, ])
# compare column-wise via recycling, coerce logical to numeric, flip back
result = t((tdat == key) + 0)
result
# [,1] [,2] [,3] [,4] [,5]
# ID001 0 1 0 0 0
# ID002 0 0 0 1 0
# ID003 1 0 NA 0 1
# ID004 0 0 0 0 1
# ID005 0 0 0 0 0
# ID006 0 0 1 0 0
# ID007 0 0 1 0 1
# ID008 0 0 0 1 0
# ID009 0 0 0 1 0
rowSums(result)
# ID001 ID002 ID003 ID004 ID005 ID006 ID007 ID008 ID009
# 1 1 NA 1 0 1 2 1 1
简化输入和运行中等规模数据的基准测试,两者都非常快。双转置要快一点。
gregor = function(key, dat) {
  ## Flip dat so answers run down columns, let vector recycling compare each
  ## column against key, then flip the logical matrix back.
  flipped <- t(dat)
  t(flipped == key)
}
zheyuan = function(key, dat) {
  ## Repeat each key entry nrow(dat) times so it aligns with dat's
  ## column-major layout, then compare element-wise.
  expanded <- rep(key, each = nrow(dat))
  dat == expanded
}
# Medium-sized benchmark of both implementations plus an equivalence check.
library(microbenchmark)
nr = 10000
nc = 1000
key = sample(1:10, nc, replace = TRUE)  # TRUE, not the reassignable alias T
dat = matrix(sample(1:10, nr * nc, replace = TRUE), nrow = nr)
print(microbenchmark(gregor(key, dat), zheyuan(key, dat)), signif = 4)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# gregor(key, dat) 104.5 113.2 135.5970 128.2 144.5 336.2 100 a
# zheyuan(key, dat) 196.0 202.8 215.7822 207.0 224.9 394.4 100 b
# Fixed typo: "zheyan" -> "zheyuan" so the equivalence check actually runs.
identical(gregor(key, dat), zheyuan(key, dat))
# [1] TRUE
如果你想在一行中不使用 for
或 apply
,请尝试
dat2 <- matrix(as.numeric(dat==rep(dat[1,],each=nrow(dat))),nrow=nrow(dat))[-1,-1]
我通常只要稍加思考就能弄清楚如何进行矢量化,但尽管阅读了一堆 Stack Overflow 的问答,我仍然感到困惑! 我想用合适的 apply 函数替换这些嵌套的 for 循环,但是如果对我遗漏的整个问题有一些明显不同的方法,请随时告诉我!
在测试的上下文中思考这个例子,其中第一行是关键,随后的每一行都是学生的答案。作为输出,我想要一个数组,其中每个正确答案为 1,每个错误答案为 0。 for 循环可以工作,但是当您扩展到数千行和几列时会非常慢。
这是我的可重现示例,在此先感谢您的帮助!
#build sample data
# Build sample data: row 1 will be the answer key, rows 2-10 student answers.
# matrix() states the 2-D intent directly (array(dim=c(9,6)) works but is
# less idiomatic) and paste0() replaces paste(..., sep = "").
dat <- matrix(nrow = 9, ncol = 6)
for (n in 1:9) {
  dat[n, 1:6] <- c(paste0("ID00", n),
                   sample(c("A", "B", "C", "D"), size = 5, replace = TRUE))
}
dat[3, 4] <- NA                        # one missing answer for the NA case
key <- c("key", "A", "B", "B", "C", "D")
dat <- rbind(key, dat)                 # answer key becomes row 1
>dat
[,1] [,2] [,3] [,4] [,5] [,6]
"key" "A" "B" "B" "C" "D"
"ID001" "B" "A" "D" "B" "C"
"ID002" "C" "C" "C" "B" "B"
"ID003" "A" "C" NA "D" "D"
"ID004" "D" "B" "D" "A" "A"
"ID005" "A" "C" "A" "C" "A"
"ID006" "D" "D" "B" "B" "A"
"ID007" "B" "D" "A" "D" "A"
"ID008" "D" "D" "B" "D" "A"
"ID009" "D" "C" "B" "D" "D"
#score file
# Slow reference implementation the question asks to vectorize:
# rows 2-10 of dat are students, columns 2-6 are questions, dat[1, ] is the
# key. Each cell is scored 1 (correct), 0 (wrong), or NA (missing answer).
dat2 <- array(dim=c(9,5))
for (row in 2:10){
for (column in 2:6){
# NA answers propagate as NA scores rather than counting as wrong
if (is.na(dat[row,column])){
p <- NA
}else if (dat[row,column]==dat[1,column]){
p <- 1
}else p <- 0
# shift indices by 1: dat2 has no key row and no ID column
dat2[row-1,column-1]<-p
}
}
> dat2
[,1] [,2] [,3] [,4] [,5]
[1,] 0 0 0 0 0
[2,] 0 0 0 0 0
[3,] 1 0 NA 0 1
[4,] 0 1 0 0 0
[5,] 1 0 0 1 0
[6,] 0 0 1 0 0
[7,] 0 0 0 0 0
[8,] 0 0 1 0 0
[9,] 0 0 1 0 1
设置可重复性种子:
# Reproducible sample-data build: set.seed() makes every sample() draw (and
# therefore all printed results below) repeatable. Same fixes as above:
# matrix() for 2-D intent, paste0() instead of paste(..., sep = "").
set.seed(1)
dat <- matrix(nrow = 9, ncol = 6)
for (n in 1:9) {
  dat[n, 1:6] <- c(paste0("ID00", n),
                   sample(c("A", "B", "C", "D"), size = 5, replace = TRUE))
}
dat[3, 4] <- NA
key <- c("key", "A", "B", "B", "C", "D")
dat <- rbind(key, dat)
这将完成工作:
# Expand the key row so it lines up element-wise with the answer matrix,
# then do one vectorized comparison; "+ 0L" coerces TRUE/FALSE to 1/0
# while keeping NA answers as NA.
key <- rep(dat[1, -1], each = nrow(dat) - 1L) ## expand "key" row
dummy <- (dat[-1, -1] == key) + 0L ## vectorized / element-wise "=="
基本上我们想要一个向量化的"=="
。但我们需要先将 dat[1,-1]
扩展到 dat[-1,-1]
的相同维度。最后 + 0L
将 TRUE / FALSE
矩阵强制转换为 1 / 0
矩阵。
# [,1] [,2] [,3] [,4] [,5]
# 0 1 0 0 0
# 0 0 0 1 0
# 1 0 NA 0 1
# 0 0 0 0 1
# 0 0 0 0 0
# 0 0 1 0 0
# 0 0 1 0 1
# 0 0 0 1 0
# 0 0 0 1 0
我还没有检查 Gregor 的基准测试脚本。但这是我的。
# Benchmark both vectorized approaches on a 1000 x 1000 matrix.
# Bug fix: the original sampled LETTERS[4] (only "D") for dat, so its
# alphabet did not match key's A-D draw; LETTERS[1:4] is clearly intended.
set.seed(1)
dat <- matrix(sample(LETTERS[1:4], 1000 * 1000, TRUE), 1000)
key <- sample(LETTERS[1:4], 1000, TRUE)
microbenchmark(rep(key, each = 1000) == dat, t(t(dat) == key))
#Unit: milliseconds
# expr min lq mean median uq
# rep(key, each = 1000) == dat 32.16888 34.01138 42.61639 35.57526 40.27944
# t(t(dat) == key) 50.93348 52.96008 63.74475 56.04706 60.38750
# max neval cld
# 81.96044 100 a
# 106.54916 100 b
我的方法与 Gregor 的方法之间的唯一区别是 rep(, each)
扩展 vs. rep_len
展开。两种扩展都消耗相同数量的内存,扩展后,"=="
以列方式完成。我预测额外的开销将由两个 t()
引起,基准测试结果似乎证明了这一点。希望结果不依赖于平台。
这和哲园的回答基本一致(先向量化==
再强制转回数值),我只是先转置矩阵,没有展开key
由于矩阵 stored/operated 按列而不是按行排列,如果键是列并且每个学生也是列,向量回收就可以了。
在生成数据之前使用 set.seed(1)
...
# Transpose so each student becomes a column; R's column-major recycling
# then compares every column against the key vector without an explicit
# rep(). The final t() restores students-as-rows; "+ 0L" gives 1/0.
key = dat[1, -1]
tdat = t(dat[-1, -1])
t((tdat == key) + 0L)
# [,1] [,2] [,3] [,4] [,5]
# 0 1 0 0 0
# 0 0 0 1 0
# 1 0 NA 0 1
# 0 0 0 0 1
# 0 0 0 0 0
# 0 0 1 0 0
# 0 0 1 0 1
# 0 0 0 1 0
# 0 0 0 1 0
如果您改为将第一列更改为行名称,则可以轻松保留它们,而不会因为学生 ID 不是 'key'
而将其标记为不正确的风险。这也使得最后的总结更好:
# Move student IDs into row names first: they survive the scoring step and
# can no longer be accidentally compared against the "key" label.
row.names(dat) = dat[, 1]
dat = dat[, -1]
key = dat[1, ]
tdat = t(dat[-1, ])
# compare column-wise via recycling, coerce logical to numeric, flip back
result = t((tdat == key) + 0)
result
# [,1] [,2] [,3] [,4] [,5]
# ID001 0 1 0 0 0
# ID002 0 0 0 1 0
# ID003 1 0 NA 0 1
# ID004 0 0 0 0 1
# ID005 0 0 0 0 0
# ID006 0 0 1 0 0
# ID007 0 0 1 0 1
# ID008 0 0 0 1 0
# ID009 0 0 0 1 0
rowSums(result)
# ID001 ID002 ID003 ID004 ID005 ID006 ID007 ID008 ID009
# 1 1 NA 1 0 1 2 1 1
简化输入和运行中等规模数据的基准测试,两者都非常快。双转置要快一点。
gregor = function(key, dat) {
  ## Flip dat so answers run down columns, let vector recycling compare each
  ## column against key, then flip the logical matrix back.
  flipped <- t(dat)
  t(flipped == key)
}
zheyuan = function(key, dat) {
  ## Repeat each key entry nrow(dat) times so it aligns with dat's
  ## column-major layout, then compare element-wise.
  expanded <- rep(key, each = nrow(dat))
  dat == expanded
}
# Medium-sized benchmark of both implementations plus an equivalence check.
library(microbenchmark)
nr = 10000
nc = 1000
key = sample(1:10, nc, replace = TRUE)  # TRUE, not the reassignable alias T
dat = matrix(sample(1:10, nr * nc, replace = TRUE), nrow = nr)
print(microbenchmark(gregor(key, dat), zheyuan(key, dat)), signif = 4)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# gregor(key, dat) 104.5 113.2 135.5970 128.2 144.5 336.2 100 a
# zheyuan(key, dat) 196.0 202.8 215.7822 207.0 224.9 394.4 100 b
# Fixed typo: "zheyan" -> "zheyuan" so the equivalence check actually runs.
identical(gregor(key, dat), zheyuan(key, dat))
# [1] TRUE
如果你想在一行中不使用 for
或 apply
,请尝试
dat2 <- matrix(as.numeric(dat==rep(dat[1,],each=nrow(dat))),nrow=nrow(dat))[-1,-1]