尝试使用 Apply 处理 144k 行,但结果太慢
Trying to process 144k Rows Using Apply, but results too slow
我正在用 R 编写朴素贝叶斯的自定义修改版本,并且由于正在处理的数据量太大而遇到了运行时间问题。我需要处理 ~145k 行,每行包含 95 个元素。我目前正在使用以下函数来完成朴素贝叶斯的第一步。
probGen <- function(x, i)
{
  # Per-feature Gaussian (normal) density of feature vector x under class i,
  # using the per-class per-feature parameters in the global tables
  # meanBreakdown / sdBreakdown (row i = class i).
  # BUG FIX: the original wrapped exp(...) inside the denominator,
  # i.e. 1/(sqrt(...)*exp(-d)), which flips the exponent sign and yields
  # exp(+d)/sqrt(...) instead of the normal density exp(-d)/sqrt(...).
  return(1/sqrt(2*pi*sdBreakdown[i,]^2)
         * exp(-((x - meanBreakdown[i,])^2)/(2*(sdBreakdown[i,]^2))))
}
在此函数中,sdBreakdown 和 meanBreakdown 是每个可能分类的聚合值。每次运行 apply 时,我们都会得到每个给定列的概率。apply 是按如下方式在矩阵上运行的,其中每一行是我们试图分类的另一个元素。
# Per-class probabilities: build test.1 .. test.9 in one loop instead of
# nine copy-pasted statements (assign() creates the same variable names).
for (i in 1:9) {
  assign(paste0("test.", i), t(apply(temp, MARGIN = 1, FUN = probGen, i)))
}
这是我目前调用每个应用程序的方式。这为每个可能的分类 1-9 给出了每个元素的概率。我不想使用开箱即用的朴素贝叶斯,因为我试图更好地理解 R 并且我想尝试一些潜在的准确性改进。
我不确定如何让它运行得更快;按照目前的写法需要几个小时,如果我同时在积极从事其他项目,可能需要 7 或 8 个小时才能运行完。
编辑:
澄清本例中的数据。
temp 是 145kx95 矩阵,其中每一行是一个要分类的项目,每一列是用数字表示的质量。
meanBreakdown 是一个 9x95 矩阵,每一行是一个不同的分类,每一列对应于该分类的平均质量。
sdBreakdown 与 meanBreakdown 相同,除了存储标准差而不是平均值。
并行处理似乎可行,但我认为(显然我错了)数据集大到没有必要。
编辑 2:这是完整的代码。如果它是特别糟糕的 R 代码,请原谅我。我一直是 C 开发人员,所以 R 是一个很大的思维转变,我只用 R 做了一些小项目来学习来龙去脉。
# BUG FIX: in 'data\train.csv' the '\t' is parsed as a TAB escape, producing
# the path "data<TAB>rain.csv". Use forward slashes (portable in R on all OSes).
training <- read.csv(file = 'data/train.csv', sep=',', header=T)
# Fraction of zero entries in x: cap every element at 1 (so any positive
# count becomes 1) and subtract the mean from one.
negativeOne <- function(x) {
  capped <- pmin(1, x)
  1 - mean(capped)
}
# Proportion of entries in x that equal zero.
# mean of a logical vector == fraction of TRUE values, so no ifelse() needed.
pullZeros <- function(x) {
  mean(x == 0)
}
# Mean of the nonzero entries of x (zeros are treated as missing).
# Logical subsetting replaces the ifelse()-to-NA round trip; NA inputs are
# still dropped by na.rm = TRUE, matching the original behavior.
trainingSet <- function(x) {
  mean(x[x != 0], na.rm = TRUE)
}
# Standard deviation of the nonzero entries of x (zeros treated as missing),
# mirroring trainingSet() but with sd() instead of mean().
trainingSetSd <- function(x) {
  sd(x[x != 0], na.rm = TRUE)
}
# Per-class summaries over the "feat" columns of the training data, grouped
# by the target label (one output row per class):
#   positiveBreakDown   - mean of the nonzero entries of each feature
positiveBreakDown <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=trainingSet)
#   positiveBreakDownSd - sd of the nonzero entries of each feature
positiveBreakDownSd <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=trainingSetSd)
#   negativeBreakDown   - fraction of zero entries per feature (after pmin cap)
negativeBreakDown <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=negativeOne)
# Keep only the feature columns (drops aggregate's Group.1 label column),
# giving the class-by-feature mean and sd tables used by probGen().
meanBreakdown <- positiveBreakDown[,colnames(positiveBreakDown)[grepl("feat",colnames(positiveBreakDown))]]
sdBreakdown <- positiveBreakDownSd[,colnames(positiveBreakDownSd)[grepl("feat",colnames(positiveBreakDownSd))]]
probGen <- function(x, i)
{
  # Per-feature Gaussian density of feature vector x under class i, using the
  # class-by-feature tables meanBreakdown / sdBreakdown built above.
  # BUG FIX: the original had exp(...) inside the denominator, i.e.
  # 1/(sqrt(...)*exp(-d)) == exp(+d)/sqrt(...), which inverts the exponent
  # sign of the normal density.
  return(1/sqrt(2*pi*sdBreakdown[i,]^2)
         * exp(-((x - meanBreakdown[i,])^2)/(2*(sdBreakdown[i,]^2))))
}
# BUG FIX: '\t' in 'data\test.csv' is parsed as a TAB escape character;
# use forward slashes, which R accepts on every platform.
test <- read.csv(file = 'data/test.csv', sep=',', header=T)
# Feature columns of the test set: one row per item to classify.
PosTest <- test[,colnames(test)[grepl("feat",colnames(test))]]
# Per-id fraction of zero-valued entries in each feature column.
NegTest <- aggregate(x=test[,colnames(test)[grepl("feat",colnames(test))]],
by=list(test$id), FUN=pullZeros)
NegTest$Group.1 <- NULL
temp <- PosTest
# Gaussian feature probabilities of every test row under each class 1..9,
# written to sweepTest.1 .. sweepTest.9 (same names as before, built in a
# loop via assign() instead of nine copy-pasted statements).
for (i in 1:9) {
  assign(paste0("sweepTest.", i), t(apply(temp, MARGIN = 1, FUN = probGen, i)))
}
# Switch temp to the zero-fraction table and drop its grouping column.
temp <- NegTest
temp$Group.1 <- NULL
# Scale each column of the zero-indicator matrix by the corresponding
# per-class zero fraction (row i of negativeBreakDown), producing
# N.sweepTest.1 .. N.sweepTest.9 -- one loop replaces nine sweep() calls.
for (i in 1:9) {
  assign(paste0("N.sweepTest.", i),
         sweep(as.matrix(temp), MARGIN = 2,
               as.numeric(negativeBreakDown[i, grepl("feat", colnames(positiveBreakDown))]),
               `*`))
}
# Blend the Gaussian probability S with the zero-probability N per entry:
# (1 - N)*S + N, which is algebraically identical to (-1*(N - 1)*S) + N.
for (i in 1:9) {
  assign(paste0("sweepTest.", i),
         (1 - get(paste0("N.sweepTest.", i))) * get(paste0("sweepTest.", i)) +
           get(paste0("N.sweepTest.", i)))
}
# Free the intermediate matrices, same set as the original rm() call.
rm(list = paste0("N.sweepTest.", 1:9))
# Class prior counts: how many training rows carry each target label
# "Class_1" .. "Class_9" (vapply replaces the preallocate-and-fill loop).
dist <- vapply(1:9,
               function(k) nrow(training[training$target == paste0("Class_", k), ]),
               integer(1))
# Unnormalized posterior per class: prior count times the product of the
# per-feature probabilities of each row. apply(X, 1, prod) over rows is
# identical to the original apply(t(X), 2, prod) over transposed columns.
for (i in 1:9) {
  assign(paste0("res", i),
         dist[i] * apply(get(paste0("sweepTest.", i)), MARGIN = 1, FUN = prod))
}
rm(list = paste0("sweepTest.", 1:9))
# Collect the nine per-class scores, normalize each row so the class
# probabilities sum to one, and write the id + probabilities table to a
# user-chosen CSV file.
interRes <- data.frame(Class_1 = res1, Class_2 = res2, Class_3 = res3,
                       Class_4 = res4, Class_5 = res5, Class_6 = res6,
                       Class_7 = res7, Class_8 = res8, Class_9 = res9)
rm(list = paste0("res", 1:9))
temp <- rowSums(interRes)   # per-row normalizing constant (== colSums of t(interRes))
tempRes <- interRes / temp  # each column divided elementwise by temp
data <- cbind(data.frame(id = test$id), tempRes)
fname <- file.choose()
write.table(data, fname, row.names = FALSE, sep = ",")
检查 parallel
包和 mcmapply
或 mclapply
来并行运行 apply
调用。如所写,您的代码是顺序运行的(即您必须先完成所有 1 分类,然后才能进入 2,依此类推)。
据我了解,您对相同的数据运行相同的函数,只是参数不同。与其进行多次 apply
调用,不如重组函数以允许使用 mcmapply
- 这允许您使用 apply
功能,但迭代多个参数。
您需要正确矢量化您的代码。有了这么简单的功能,就不需要使用 apply
了,它基本上只是一个 for
循环。
首先我们生成一些虚假数据:
# Self-contained benchmark setup.
# NOTE(review): rm(list = ls()) wipes the whole workspace -- acceptable in a
# throwaway demo, but avoid it in reusable scripts.
rm(list = ls())
set.seed(1)  # reproducible faux data
# Dimensions of data and some faux data
n <- 144000
m <- 95
temp <- matrix(rnorm(n*m), nrow = n, ncol = m)
meanBreakdown <- matrix(seq(-1, 1, l = 9*m), 9, m) # Matrix of means
sdBreakdown <- matrix(seq(1, 2, l = 9*m), 9, m) # Matrix of std. deviations
让我们为您的版本计时 i = 1
。我冒昧地让它更具可读性。另外,我想我发现了一个错误(如果函数只是高斯密度)。无论如何,
probGen <- function(x, means, sds) {
  # Normal density of each element of x given per-column means and sds.
  # stats::dnorm computes exactly 1/sqrt(2*pi*sds^2) * exp(-(x - means)^2 /
  # (2*sds^2)), so this is the same formula written with the stdlib call.
  dnorm(x, mean = means, sd = sds)
}
# Benchmark the row-wise apply() version for a single class (i = 1);
# the pasted #-lines below are the output observed by the answer's author.
i <- 1
t1 <- system.time({
res1 <- t(apply(temp, 1, probGen, mean = meanBreakdown[i,],
sds = sdBreakdown[i,]))
})
print(res1[1:5, 1:7])
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#[1,] 0.3720575 0.38038806 0.385805475 0.36747185 0.32253028 0.3008070 0.37473829
#[2,] 0.1980087 0.02837476 0.019424716 0.03520653 0.25872889 0.2223151 0.05506068
#[3,] 0.3935892 0.24920567 0.116377580 0.13580043 0.07012818 0.1682480 0.35898510
#[4,] 0.0137505 0.37288236 0.002338961 0.21928922 0.36341271 0.0250388 0.05103852
#[5,] 0.1648476 0.32981193 0.031723978 0.12681473 0.25509082 0.1959218 0.35277957
print(t1)
# user system elapsed
# 3.452 0.205 3.662
这是一个替代版本,我们利用矩阵以列为主的方式与 R 的复制规则一起存储:
probGen2 <- function(x, means, sds) {
  # Column-wise normal density over the whole matrix in one vectorized call:
  # transposing x makes R's recycling apply means/sds along the columns of x,
  # then the result is transposed back to x's original orientation.
  t(dnorm(t(x), mean = means, sd = sds))
}
# Benchmark the fully vectorized version: one call on the whole matrix;
# the pasted #-lines below are the output observed by the answer's author
# (identical values, ~7x faster elapsed time).
i <- 1
t2 <- system.time({
res2 <- probGen2(x = temp, means = meanBreakdown[i, ],
sds = sdBreakdown[i, ])
})
print(res2[1:5, 1:7])
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#[1,] 0.3720575 0.38038806 0.385805475 0.36747185 0.32253028 0.3008070 0.37473829
#[2,] 0.1980087 0.02837476 0.019424716 0.03520653 0.25872889 0.2223151 0.05506068
#[3,] 0.3935892 0.24920567 0.116377580 0.13580043 0.07012818 0.1682480 0.35898510
#[4,] 0.0137505 0.37288236 0.002338961 0.21928922 0.36341271 0.0250388 0.05103852
#[5,] 0.1648476 0.32981193 0.031723978 0.12681473 0.25509082 0.1959218 0.35277957
print(t2)
# user system elapsed
# 0.499 0.014 0.515
如您所见,我们已经对一些非常简单的更改进行了相当大的加速。
您显然可以将其与并行计算结合起来以获得进一步的速度提升。
最后,让我们检查一下是否确实是一样的:
# Confirm the vectorized result matches the apply()-based result exactly.
all.equal(res1, res2)
# [1] TRUE
我正在用 R 编写朴素贝叶斯的自定义修改版本,并且由于正在处理的数据量太大而遇到了运行时间问题。我需要处理 ~145k 行,每行包含 95 个元素。我目前正在使用以下函数来完成朴素贝叶斯的第一步。
probGen <- function(x, i)
{
  # Per-feature Gaussian (normal) density of feature vector x under class i,
  # using the global tables meanBreakdown / sdBreakdown (row i = class i).
  # BUG FIX: the original wrapped exp(...) inside the denominator,
  # i.e. 1/(sqrt(...)*exp(-d)) == exp(+d)/sqrt(...), inverting the exponent
  # sign of the normal density.
  return(1/sqrt(2*pi*sdBreakdown[i,]^2)
         * exp(-((x - meanBreakdown[i,])^2)/(2*(sdBreakdown[i,]^2))))
}
在此函数中,sdBreakdown 和 meanBreakdown 是每个可能解决方案的聚合值。每次应用 运行 时,我们都会得到每个给定列的概率。应用是 运行 如下矩阵,其中每一行是我们试图分类的另一个元素。
test.1 <- t(apply(temp,MARGIN=1,FUN=probGen, 1))
test.2 <- t(apply(temp,MARGIN=1,FUN=probGen, 2))
test.3 <- t(apply(temp,MARGIN=1,FUN=probGen, 3))
test.4 <- t(apply(temp,MARGIN=1,FUN=probGen, 4))
test.5 <- t(apply(temp,MARGIN=1,FUN=probGen, 5))
test.6 <- t(apply(temp,MARGIN=1,FUN=probGen, 6))
test.7 <- t(apply(temp,MARGIN=1,FUN=probGen, 7))
test.8 <- t(apply(temp,MARGIN=1,FUN=probGen, 8))
test.9 <- t(apply(temp,MARGIN=1,FUN=probGen, 9))
这是我目前调用每个应用程序的方式。这为每个可能的分类 1-9 给出了每个元素的概率。我不想使用开箱即用的朴素贝叶斯,因为我试图更好地理解 R 并且我想尝试一些潜在的准确性改进。
我不确定如何让它运行得更快;按照目前的写法需要几个小时,如果我同时在积极从事其他项目,可能需要 7 或 8 个小时才能运行完。
编辑:
澄清本例中的数据。
temp 是 145kx95 矩阵,其中每一行是一个要分类的项目,每一列是用数字表示的质量。
meanBreakdown 是一个 9x95 矩阵,每一行是一个不同的分类,每一列对应于该分类的平均质量。
sdBreakdown 与 meanBreakdown 相同,除了存储标准差而不是平均值。
并行处理似乎可行,但我认为(显然我错了)数据集大到没有必要。
编辑 2:这是完整的代码。如果它是特别糟糕的 R 代码,请原谅我。我一直是 C 开发人员,所以 R 是一个很大的思维转变,我只用 R 做了一些小项目来学习来龙去脉。
# BUG FIX: in 'data\train.csv' the '\t' is parsed as a TAB escape, producing
# the path "data<TAB>rain.csv". Use forward slashes (portable in R on all OSes).
training <- read.csv(file = 'data/train.csv', sep=',', header=T)
# Fraction of zero entries in x: cap every element at 1 (so any positive
# count becomes 1) and subtract the mean from one.
negativeOne <- function(x) {
  capped <- pmin(1, x)
  1 - mean(capped)
}
# Proportion of entries in x that equal zero; the mean of a logical vector
# is the fraction of TRUE values, so no ifelse() round trip is needed.
pullZeros <- function(x) {
  mean(x == 0)
}
# Mean of the nonzero entries of x (zeros treated as missing); logical
# subsetting replaces the ifelse()-to-NA conversion, NA inputs still dropped.
trainingSet <- function(x) {
  mean(x[x != 0], na.rm = TRUE)
}
# Standard deviation of the nonzero entries of x (zeros treated as missing),
# mirroring trainingSet() with sd() instead of mean().
trainingSetSd <- function(x) {
  sd(x[x != 0], na.rm = TRUE)
}
positiveBreakDown <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=trainingSet)
positiveBreakDownSd <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=trainingSetSd)
negativeBreakDown <- aggregate(x=training[,colnames(training)[grepl("feat",colnames(training))]],
by=list(training$target), FUN=negativeOne)
meanBreakdown <- positiveBreakDown[,colnames(positiveBreakDown)[grepl("feat",colnames(positiveBreakDown))]]
sdBreakdown <- positiveBreakDownSd[,colnames(positiveBreakDownSd)[grepl("feat",colnames(positiveBreakDownSd))]]
probGen <- function(x, i)
{
  # Per-feature Gaussian density of feature vector x under class i, using the
  # class-by-feature tables meanBreakdown / sdBreakdown built above.
  # BUG FIX: the original had exp(...) inside the denominator, i.e.
  # 1/(sqrt(...)*exp(-d)) == exp(+d)/sqrt(...), which inverts the exponent
  # sign of the normal density.
  return(1/sqrt(2*pi*sdBreakdown[i,]^2)
         * exp(-((x - meanBreakdown[i,])^2)/(2*(sdBreakdown[i,]^2))))
}
# BUG FIX: '\t' in 'data\test.csv' is parsed as a TAB escape character;
# use forward slashes, which R accepts on every platform.
test <- read.csv(file = 'data/test.csv', sep=',', header=T)
PosTest <- test[,colnames(test)[grepl("feat",colnames(test))]]
NegTest <- aggregate(x=test[,colnames(test)[grepl("feat",colnames(test))]],
by=list(test$id), FUN=pullZeros)
NegTest$Group.1 <- NULL
temp <- PosTest
sweepTest.1 <- t(apply(temp,MARGIN=1,FUN=probGen, 1))
sweepTest.2 <- t(apply(temp,MARGIN=1,FUN=probGen, 2))
sweepTest.3 <- t(apply(temp,MARGIN=1,FUN=probGen, 3))
sweepTest.4 <- t(apply(temp,MARGIN=1,FUN=probGen, 4))
sweepTest.5 <- t(apply(temp,MARGIN=1,FUN=probGen, 5))
sweepTest.6 <- t(apply(temp,MARGIN=1,FUN=probGen, 6))
sweepTest.7 <- t(apply(temp,MARGIN=1,FUN=probGen, 7))
sweepTest.8 <- t(apply(temp,MARGIN=1,FUN=probGen, 8))
sweepTest.9 <- t(apply(temp,MARGIN=1,FUN=probGen, 9))
temp <- NegTest
temp$Group.1 <- NULL
N.sweepTest.1 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[1, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.2 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[2, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.3 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[3, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.4 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[4, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.5 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[5, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.6 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[6, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.7 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[7, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.8 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[8, grepl("feat",colnames(positiveBreakDown))]),`*`)
N.sweepTest.9 <- sweep(as.matrix(temp),MARGIN=2,
as.numeric(negativeBreakDown[9, grepl("feat",colnames(positiveBreakDown))]),`*`)
sweepTest.1 <- (-1*(N.sweepTest.1 - 1)*sweepTest.1) + N.sweepTest.1
sweepTest.2 <- (-1*(N.sweepTest.2 - 1)*sweepTest.2) + N.sweepTest.2
sweepTest.3 <- (-1*(N.sweepTest.3 - 1)*sweepTest.3) + N.sweepTest.3
sweepTest.4 <- (-1*(N.sweepTest.4 - 1)*sweepTest.4) + N.sweepTest.4
sweepTest.5 <- (-1*(N.sweepTest.5 - 1)*sweepTest.5) + N.sweepTest.5
sweepTest.6 <- (-1*(N.sweepTest.6 - 1)*sweepTest.6) + N.sweepTest.6
sweepTest.7 <- (-1*(N.sweepTest.7 - 1)*sweepTest.7) + N.sweepTest.7
sweepTest.8 <- (-1*(N.sweepTest.8 - 1)*sweepTest.8) + N.sweepTest.8
sweepTest.9 <- (-1*(N.sweepTest.9 - 1)*sweepTest.9) + N.sweepTest.9
rm(N.sweepTest.1,N.sweepTest.2,N.sweepTest.3,N.sweepTest.4,N.sweepTest.5,N.sweepTest.6,N.sweepTest.7,N.sweepTest.8,N.sweepTest.9)
dist <- 1:9
for(i in 1:9)
{
dist[i] <- nrow(training[training$target == paste0("Class_",i),])
}
res1 <- dist[1]*apply(t(sweepTest.1), MARGIN=2, FUN=prod)
res2 <- dist[2]*apply(t(sweepTest.2), MARGIN=2, FUN=prod)
res3 <- dist[3]*apply(t(sweepTest.3), MARGIN=2, FUN=prod)
res4 <- dist[4]*apply(t(sweepTest.4), MARGIN=2, FUN=prod)
res5 <- dist[5]*apply(t(sweepTest.5), MARGIN=2, FUN=prod)
res6 <- dist[6]*apply(t(sweepTest.6), MARGIN=2, FUN=prod)
res7 <- dist[7]*apply(t(sweepTest.7), MARGIN=2, FUN=prod)
res8 <- dist[8]*apply(t(sweepTest.8), MARGIN=2, FUN=prod)
res9 <- dist[9]*apply(t(sweepTest.9), MARGIN=2, FUN=prod)
rm(sweepTest.1,sweepTest.2,sweepTest.3,sweepTest.4,sweepTest.5,sweepTest.6,sweepTest.7,sweepTest.8,sweepTest.9)
interRes <- data.frame(Class_1 = res1, Class_2 = res2,Class_3 = res3,
Class_4 = res4,Class_5 = res5,Class_6 = res6,
Class_7 = res7,Class_8 = res8,Class_9 = res9)
rm(res1,res2,res3,res4,res5,res6,res7,res8,res9)
temp <- apply(t(interRes), MARGIN=2, FUN=sum)
tempRes <- interRes/temp
data<- data.frame(id=test$id)
data <- cbind(data,tempRes)
fname <- file.choose()
write.table(data, fname, row.names=FALSE, sep=",")
检查 parallel
包和 mcmapply
或 mclapply
来并行运行 apply
调用。如所写,您的代码是顺序运行的(即您必须先完成所有 1 分类,然后才能进入 2,依此类推)。
据我了解,您对相同的数据运行相同的函数,只是参数不同。与其进行多次 apply
调用,不如重组函数以允许使用 mcmapply
- 这允许您使用 apply
功能,但迭代多个参数。
您需要正确矢量化您的代码。有了这么简单的功能,就不需要使用 apply
了,它基本上只是一个 for
循环。
首先我们生成一些虚假数据:
rm(list = ls())
set.seed(1)
# Dimensions of data and some faux data
n <- 144000
m <- 95
temp <- matrix(rnorm(n*m), nrow = n, ncol = m)
meanBreakdown <- matrix(seq(-1, 1, l = 9*m), 9, m) # Matrix of means
sdBreakdown <- matrix(seq(1, 2, l = 9*m), 9, m) # Matrix of std. deviations
让我们为您的版本计时 i = 1
。我冒昧地让它更具可读性。另外,我想我发现了一个错误(如果函数只是高斯密度)。无论如何,
probGen <- function(x, means, sds) {
  # Normal density of each element of x given per-column means and sds.
  # stats::dnorm computes exactly 1/sqrt(2*pi*sds^2) * exp(-(x - means)^2 /
  # (2*sds^2)), so this is the same formula written with the stdlib call.
  dnorm(x, mean = means, sd = sds)
}
i <- 1
t1 <- system.time({
res1 <- t(apply(temp, 1, probGen, mean = meanBreakdown[i,],
sds = sdBreakdown[i,]))
})
print(res1[1:5, 1:7])
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#[1,] 0.3720575 0.38038806 0.385805475 0.36747185 0.32253028 0.3008070 0.37473829
#[2,] 0.1980087 0.02837476 0.019424716 0.03520653 0.25872889 0.2223151 0.05506068
#[3,] 0.3935892 0.24920567 0.116377580 0.13580043 0.07012818 0.1682480 0.35898510
#[4,] 0.0137505 0.37288236 0.002338961 0.21928922 0.36341271 0.0250388 0.05103852
#[5,] 0.1648476 0.32981193 0.031723978 0.12681473 0.25509082 0.1959218 0.35277957
print(t1)
# user system elapsed
# 3.452 0.205 3.662
这是一个替代版本,我们利用矩阵以列为主的方式与 R 的复制规则一起存储:
probGen2 <- function(x, means, sds) {
  # Column-wise normal density over the whole matrix in one vectorized call:
  # transposing x makes R's recycling apply means/sds along the columns of x,
  # then the result is transposed back to x's original orientation.
  t(dnorm(t(x), mean = means, sd = sds))
}
i <- 1
t2 <- system.time({
res2 <- probGen2(x = temp, means = meanBreakdown[i, ],
sds = sdBreakdown[i, ])
})
print(res2[1:5, 1:7])
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#[1,] 0.3720575 0.38038806 0.385805475 0.36747185 0.32253028 0.3008070 0.37473829
#[2,] 0.1980087 0.02837476 0.019424716 0.03520653 0.25872889 0.2223151 0.05506068
#[3,] 0.3935892 0.24920567 0.116377580 0.13580043 0.07012818 0.1682480 0.35898510
#[4,] 0.0137505 0.37288236 0.002338961 0.21928922 0.36341271 0.0250388 0.05103852
#[5,] 0.1648476 0.32981193 0.031723978 0.12681473 0.25509082 0.1959218 0.35277957
print(t2)
# user system elapsed
# 0.499 0.014 0.515
如您所见,我们已经对一些非常简单的更改进行了相当大的加速。 您显然可以将其与并行计算结合起来以获得进一步的速度提升。
最后,让我们检查一下是否确实是一样的:
all.equal(res1, res2)
# [1] TRUE