通过内部函数调用提高循环性能
Improving loop performance with function call inside
library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);
dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
AsOfDate,
CashFlow FROM dbo.Accounts')
resdatatable = as.data.table(res)
odbcCloseAll();
sppv <- function(i, n) {
return((1 + i / 100) ^ (-n))
}
npv <- function(x, i) {
npv = c()
for (k in 1:length(i)) {
pvs = x * sppv(i[k], 1:length(x))
npv = c(npv, sum(pvs))
}
return(npv)
}
xirr <- function(cashflow, dates) {
if (length(cashflow) != length(dates)) {
stop("length(cashflow) != length(dates)")
}
cashflow_adj <- c(cashflow[1])
for (i in 1:(length(cashflow) - 1)) {
d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")
# There are no checks about the monotone values of dates
# put a check in here if the interval is negative
interval <- as.integer(d2 - d1)
if (length(interval) > 0 && !is.na(interval)) {
cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
}
}
left = -10
right = 10
epsilon = 1e-8
while (abs(right - left) > 2 * epsilon) {
midpoint = (right + left) / 2
if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
left = midpoint
} else {
right = midpoint
}
}
irr = (right + left) / 2 / 100
irr <- irr * 365
# Annualized yield (return) reflecting compounding effect of daily returns
irr <- (1 + irr / 365) ^ 365 - 1
irr
}
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));
datalist = result <- vector("list", length(groupedCompanyNames));
for (i in groupedCompanyNames) {
datesForCompany <- groupedDatesPerCompany[i];
dates <- datesForCompany[[i]];
cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
cashFlows <- cashFlowsForCompany[[i]];
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
datalist[[i]] <- newRow;
}
resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);
print(finalDataFrame);
因此,为了提供上下文,我正在尝试执行以下操作:
- 使用 RODBC 连接从数据库中获取数据
- 获取唯一的公司名称
- 拆分每个公司的现金流量和日期
- 用已知行数初始化数据table,这样它就不需要
逐步增长。
- 遍历唯一的公司名称并调用函数在列表中得到 xirr
公司的现金流量和日期。
- 将包含公司名称和 XIRR 值的每一行添加到新数据table。
- 使用 rbindlist。
这是我正在使用的源数据示例
Company_ID CashFlow AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D 368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -1529180.0000 2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6791.0000 2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -187375.0000 2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -215902.0000 2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -217409.0000 2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -191830.0000 2012-05-31 00:00:00.000
我是 R 的新手 - 大约有 2000 个独特的公司名称,平均 50 个日期,每个现金流量组合 = 100000 个记录,循环处理大约需要 28 秒。
我研究过使用 asParallel 库并使用 foreach 但这似乎对速度没有任何影响。如果我去掉函数 xirr 的调用,那么循环就会被处理并立即完成。
xirr 需要异常处理,因为有时无法迭代计算 xirr 值。
我知道在 R 中循环并不是真正的最佳实践 - 关于如何对其进行向量化以获得更好的性能有什么建议吗?
为了提高性能,我使用了doParallel库。
library(doParallel)
cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
registerDoParallel(cl)
而不是 for 循环,我将逻辑放入 foreach
resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {
company_id <- groupedCompanyNames[n];
datesForCompany <- groupedDatesPerCompany[n];
dates <- unsplit(datesForCompany, company_id);
cashFlowsForCompany <- groupedCashFlowsPerCompany[n];
cashFlows <- unsplit(cashFlowsForCompany, company_id);
#now calculate the xirr for the values
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2));
}
registerDoSEQ();
当我 运行 将我的完整数据集(4000 家公司)及其日期和现金流量放入其中时。总共 400000 行原始循环花费了大约 10 分钟。使用 foreach 循环并利用机器中的额外内核,操作耗时 60 秒。
我希望有人能够在此基础上提出进一步的性能提升建议,但我认为这是一个很好的改进。
library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);
dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
AsOfDate,
CashFlow FROM dbo.Accounts')
resdatatable = as.data.table(res)
odbcCloseAll();
sppv <- function(i, n) {
return((1 + i / 100) ^ (-n))
}
npv <- function(x, i) {
npv = c()
for (k in 1:length(i)) {
pvs = x * sppv(i[k], 1:length(x))
npv = c(npv, sum(pvs))
}
return(npv)
}
xirr <- function(cashflow, dates) {
if (length(cashflow) != length(dates)) {
stop("length(cashflow) != length(dates)")
}
cashflow_adj <- c(cashflow[1])
for (i in 1:(length(cashflow) - 1)) {
d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")
# There are no checks about the monotone values of dates
# put a check in here if the interval is negative
interval <- as.integer(d2 - d1)
if (length(interval) > 0 && !is.na(interval)) {
cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
}
}
left = -10
right = 10
epsilon = 1e-8
while (abs(right - left) > 2 * epsilon) {
midpoint = (right + left) / 2
if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
left = midpoint
} else {
right = midpoint
}
}
irr = (right + left) / 2 / 100
irr <- irr * 365
# Annualized yield (return) reflecting compounding effect of daily returns
irr <- (1 + irr / 365) ^ 365 - 1
irr
}
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));
datalist = result <- vector("list", length(groupedCompanyNames));
for (i in groupedCompanyNames) {
datesForCompany <- groupedDatesPerCompany[i];
dates <- datesForCompany[[i]];
cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
cashFlows <- cashFlowsForCompany[[i]];
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
datalist[[i]] <- newRow;
}
resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);
print(finalDataFrame);
因此,为了提供上下文,我正在尝试执行以下操作:
- 使用 RODBC 连接从数据库中获取数据
- 获取唯一的公司名称
- 拆分每个公司的现金流量和日期
- 用已知行数初始化数据table,这样它就不需要 逐步增长。
- 遍历唯一的公司名称并调用函数在列表中得到 xirr 公司的现金流量和日期。
- 将包含公司名称和 XIRR 值的每一行添加到新数据table。
- 使用 rbindlist。
这是我正在使用的源数据示例
Company_ID CashFlow AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D 368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -1529180.0000 2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6791.0000 2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -187375.0000 2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -215902.0000 2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -217409.0000 2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -191830.0000 2012-05-31 00:00:00.000
我是 R 的新手 - 大约有 2000 个独特的公司名称,平均 50 个日期,每个现金流量组合 = 100000 个记录,循环处理大约需要 28 秒。
我研究过使用 asParallel 库并使用 foreach 但这似乎对速度没有任何影响。如果我去掉函数 xirr 的调用,那么循环就会被处理并立即完成。
xirr 需要异常处理,因为有时无法迭代计算 xirr 值。
我知道在 R 中循环并不是真正的最佳实践 - 关于如何对其进行向量化以获得更好的性能有什么建议吗?
为了提高性能,我使用了doParallel库。
library(doParallel)
cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
registerDoParallel(cl)
而不是 for 循环,我将逻辑放入 foreach
resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {
company_id <- groupedCompanyNames[n];
datesForCompany <- groupedDatesPerCompany[n];
dates <- unsplit(datesForCompany, company_id);
cashFlowsForCompany <- groupedCashFlowsPerCompany[n];
cashFlows <- unsplit(cashFlowsForCompany, company_id);
#now calculate the xirr for the values
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2));
}
registerDoSEQ();
当我 运行 将我的完整数据集(4000 家公司)及其日期和现金流量放入其中时。总共 400000 行原始循环花费了大约 10 分钟。使用 foreach 循环并利用机器中的额外内核,操作耗时 60 秒。
我希望有人能够在此基础上提出进一步的性能提升建议,但我认为这是一个很好的改进。