如何在 SparkR 中加速此 R 代码
how to speed this R code in SparkR
我有以下代码可在常规 R 中运行,但我想移植到 SparkR 以加快速度:
sample_data<- structure(list(idnty_frst_nm = structure(c(3L, 2L), .Label = c("","I", "Ima"), class = "factor"), idnty_last_nm = structure(c(2L,2L), .Label = c("", "NINJA"), class = "factor"), PERSON_IDENTIFIER_VALUE = c(9021099834,9021099834), RECEIPT_NUMBER = structure(c(2L, 2L), .Label = c("","MISC1234567"), class = "factor"), a_nbr.1 = structure(c(2L,2L), .Label = c("", "A079577094"), class = "factor"), ssn = structure(c(2L,2L), .Label = c("", "123-456-7891"), class = "factor"), idnty_dob = structure(c(2L,2L), .Label = c("", "1/2/2020"), class = "factor"), a_nbr = structure(c(2L,2L), .Label = c("", "AO79577094"), class = "factor")), .Names = c("idnty_frst_nm","idnty_last_nm", "PERSON_IDENTIFIER_VALUE", "RECEIPT_NUMBER","a_nbr.1", "ssn", "idnty_dob", "a_nbr"), row.names = 1:2, class = "data.frame")
sample_data$name<-paste(sample_data$idnty_frst_nm, sample_data$idnty_last_nm)
innov.df<-sample_data
rownames(innov.df)<-1:nrow(innov.df)
#Checking for duplicate names, using 'agrep' function and storing intermediate
#results in table 'p'
p<-data.frame(a=integer(),b=integer())
k<-1
for(i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if(i!=j)
{
if((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3]))
{
p[k,1]<-min(i,j)
p[k,2]<-max(i,j)
k<-k+1
}
}
}
}
p<-unique(p)
任何关于如何加快速度的想法都将不胜感激!
这里只是一些您可以如何加快该过程的小示例,当然还有更好的示例。我试图维护您的循环结构并对其进行了优化。
我不会在循环中附加到数据帧,而是在开头创建一个列表并为其赋值,最后将它们绑定在一起。这已经为您带来了大约 40% 的加速。
下一个函数只是在第二个 if 语句中使用 as.character()
,因为它们是其他因素。这又带来了 10%。
最后一个函数使用 矩阵而不是 data.frame 并将值附加到它。最后一个实现了上面的所有优点:)
到现在为止,没有使用 data.tables 或应用函数甚至并行处理,这肯定会进一步加快代码速度,但我相信其他人对这些更有经验并且可能会表现得更好做法。
fdf <- function(){
p <- data.frame(a=integer(),b=integer())
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3])) {
p[k,1] <- min(i,j)
p[k,2] <- max(i,j)
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
flist <- function(){
p <- list()
for (i in 1:nrow(innov.df)) {
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2])==(innov.df[j,2])&&
(innov.df[i,3])==(innov.df[j,3]))
{
p[[k]] <- cbind(a=min(i,j), b=max(i,j))
k <- k+1
}
}
}
}
p <- data.frame(Filter(Negate(is.null), p))
colnames(p) <- c("a","b")
p
}
flistOpti <- function(){
p <- list()
for (i in 1:nrow(innov.df)) {
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if (as.character(innov.df[i,"idnty_last_nm"])==as.character(innov.df[j,"idnty_last_nm"]) &&
(as.character(innov.df[i,"PERSON_IDENTIFIER_VALUE"])==as.character(innov.df[j,"PERSON_IDENTIFIER_VALUE"]))) {
p[[k]] <- cbind(a=min(i,j), b=max(i,j))
k <- k+1
}
}
}
}
p <- data.frame(Filter(Negate(is.null), p))
colnames(p) <- c("a","b")
p
}
fma <- function(){
p <- matrix(c(a=integer(),b=integer()), ncol=2)
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3])) {
p <- c(p, cbind(min(i,j),max(i,j)))
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
fmaOptim <- function(){
p <- matrix(c(a=integer(),b=integer()), ncol=2)
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if (as.character(innov.df[i,"idnty_last_nm"])==as.character(innov.df[j,"idnty_last_nm"]) &&
(as.character(innov.df[i,"PERSON_IDENTIFIER_VALUE"])==as.character(innov.df[j,"PERSON_IDENTIFIER_VALUE"]))) {
p <- c(p, cbind(min(i,j),max(i,j)))
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
library(microbenchmark)
mc <- microbenchmark(times = 1000,
fdfR = fdf(),
flistR = flist(),
flistOptiR = flistOpti(),
fmaR = fma(),
fmaOptimR = fmaOptim()
)
mc
Unit: microseconds
expr min lq mean median uq max neval cld
fdfR 782.725 831.056 1059.789348 883.3505 1018.7130 24732.664 1000 c
flistR 449.700 470.089 576.289776 489.5335 545.9820 23600.677 1000 b
flistOptiR 394.949 417.227 581.230449 435.1620 492.9325 45357.250 1000 b
fmaR 370.784 389.664 484.542990 402.5015 452.3420 24713.785 1000 ab
fmaOptimR 313.015 335.670 431.991723 349.8295 393.0620 25033.973 1000 a
您也可以将 nrow(innov.df)
放在 for(i in 1:nrow(innov.df))
之外,否则它会计算每个循环中的行数。这并没有加快时间,但示例中的行数仅为 2,因此显示一些改进可能太小了。这个小例子表明,随着行数的增加,需要进行一些小的性能优化。
df <- data.frame(
x = runif(100000, 1,10),
id = 1:100000
)
mc <- microbenchmark(times = 1000,
Nrow = {for (i in 1:nrow(df)) {}},
NoNrow = for (i in 1:100000) {}
)
mc
Unit: milliseconds
expr min lq mean median uq max neval cld
Nrow 3.895500 4.060691 4.311444202 4.1584845 4.3051750 10.454830 1000 b
NoNrow 3.384633 3.516409 3.734800425 3.6017420 3.7303085 36.621923 1000 a
我有以下代码可在常规 R 中运行,但我想移植到 SparkR 以加快速度:
sample_data<- structure(list(idnty_frst_nm = structure(c(3L, 2L), .Label = c("","I", "Ima"), class = "factor"), idnty_last_nm = structure(c(2L,2L), .Label = c("", "NINJA"), class = "factor"), PERSON_IDENTIFIER_VALUE = c(9021099834,9021099834), RECEIPT_NUMBER = structure(c(2L, 2L), .Label = c("","MISC1234567"), class = "factor"), a_nbr.1 = structure(c(2L,2L), .Label = c("", "A079577094"), class = "factor"), ssn = structure(c(2L,2L), .Label = c("", "123-456-7891"), class = "factor"), idnty_dob = structure(c(2L,2L), .Label = c("", "1/2/2020"), class = "factor"), a_nbr = structure(c(2L,2L), .Label = c("", "AO79577094"), class = "factor")), .Names = c("idnty_frst_nm","idnty_last_nm", "PERSON_IDENTIFIER_VALUE", "RECEIPT_NUMBER","a_nbr.1", "ssn", "idnty_dob", "a_nbr"), row.names = 1:2, class = "data.frame")
sample_data$name<-paste(sample_data$idnty_frst_nm, sample_data$idnty_last_nm)
innov.df<-sample_data
rownames(innov.df)<-1:nrow(innov.df)
#Checking for duplicate names, using 'agrep' function and storing intermediate
#results in table 'p'
p<-data.frame(a=integer(),b=integer())
k<-1
for(i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if(i!=j)
{
if((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3]))
{
p[k,1]<-min(i,j)
p[k,2]<-max(i,j)
k<-k+1
}
}
}
}
p<-unique(p)
任何关于如何加快速度的想法都将不胜感激!
这里只是一些您可以如何加快该过程的小示例,当然还有更好的示例。我试图维护您的循环结构并对其进行了优化。
我不会在循环中附加到数据帧,而是在开头创建一个列表并为其赋值,最后将它们绑定在一起。这已经为您带来了大约 40% 的加速。
下一个函数只是在第二个 if 语句中使用 as.character()
,因为它们是其他因素。这又带来了 10%。
最后一个函数使用 矩阵而不是 data.frame 并将值附加到它。最后一个实现了上面的所有优点:)
到现在为止,没有使用 data.tables 或应用函数甚至并行处理,这肯定会进一步加快代码速度,但我相信其他人对这些更有经验并且可能会表现得更好做法。
fdf <- function(){
p <- data.frame(a=integer(),b=integer())
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3])) {
p[k,1] <- min(i,j)
p[k,2] <- max(i,j)
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
flist <- function(){
p <- list()
for (i in 1:nrow(innov.df)) {
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2])==(innov.df[j,2])&&
(innov.df[i,3])==(innov.df[j,3]))
{
p[[k]] <- cbind(a=min(i,j), b=max(i,j))
k <- k+1
}
}
}
}
p <- data.frame(Filter(Negate(is.null), p))
colnames(p) <- c("a","b")
p
}
flistOpti <- function(){
p <- list()
for (i in 1:nrow(innov.df)) {
for (j in agrep(innov.df$name[i], innov.df$name,
ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if (as.character(innov.df[i,"idnty_last_nm"])==as.character(innov.df[j,"idnty_last_nm"]) &&
(as.character(innov.df[i,"PERSON_IDENTIFIER_VALUE"])==as.character(innov.df[j,"PERSON_IDENTIFIER_VALUE"]))) {
p[[k]] <- cbind(a=min(i,j), b=max(i,j))
k <- k+1
}
}
}
}
p <- data.frame(Filter(Negate(is.null), p))
colnames(p) <- c("a","b")
p
}
fma <- function(){
p <- matrix(c(a=integer(),b=integer()), ncol=2)
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if ((innov.df[i,2]==innov.df[j,2])&&(innov.df[i,3]==innov.df[j,3])) {
p <- c(p, cbind(min(i,j),max(i,j)))
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
fmaOptim <- function(){
p <- matrix(c(a=integer(),b=integer()), ncol=2)
for (i in 1:nrow(innov.df)){
for (j in agrep(innov.df$name[i], innov.df$name, ignore.case=TRUE, value=FALSE,
max.distance = 0.07, useBytes = TRUE)) {
if (i!=j) {
if (as.character(innov.df[i,"idnty_last_nm"])==as.character(innov.df[j,"idnty_last_nm"]) &&
(as.character(innov.df[i,"PERSON_IDENTIFIER_VALUE"])==as.character(innov.df[j,"PERSON_IDENTIFIER_VALUE"]))) {
p <- c(p, cbind(min(i,j),max(i,j)))
k <- k+1
}
}
}
}
p <- data.frame(matrix(p[!is.na(p)],ncol=2, byrow = T))
colnames(p) <- c("a","b")
p
}
library(microbenchmark)
mc <- microbenchmark(times = 1000,
fdfR = fdf(),
flistR = flist(),
flistOptiR = flistOpti(),
fmaR = fma(),
fmaOptimR = fmaOptim()
)
mc
Unit: microseconds
expr min lq mean median uq max neval cld
fdfR 782.725 831.056 1059.789348 883.3505 1018.7130 24732.664 1000 c
flistR 449.700 470.089 576.289776 489.5335 545.9820 23600.677 1000 b
flistOptiR 394.949 417.227 581.230449 435.1620 492.9325 45357.250 1000 b
fmaR 370.784 389.664 484.542990 402.5015 452.3420 24713.785 1000 ab
fmaOptimR 313.015 335.670 431.991723 349.8295 393.0620 25033.973 1000 a
您也可以将 nrow(innov.df)
放在 for(i in 1:nrow(innov.df))
之外,否则它会计算每个循环中的行数。这并没有加快时间,但示例中的行数仅为 2,因此显示一些改进可能太小了。这个小例子表明,随着行数的增加,需要进行一些小的性能优化。
df <- data.frame(
x = runif(100000, 1,10),
id = 1:100000
)
mc <- microbenchmark(times = 1000,
Nrow = {for (i in 1:nrow(df)) {}},
NoNrow = for (i in 1:100000) {}
)
mc
Unit: milliseconds
expr min lq mean median uq max neval cld
Nrow 3.895500 4.060691 4.311444202 4.1584845 4.3051750 10.454830 1000 b
NoNrow 3.384633 3.516409 3.734800425 3.6017420 3.7303085 36.621923 1000 a