使用 Parallel/Multi-core 个进程的回归

Question

我正尝试在 Linux 集群上运行 R 中的以下代码。我想要使用全部处理能力（指定 cores/nodes/memory）。代码基本上低于运行s 基于 GAM 回归的预测并保存结果作为多组数据的CSV文件（这里我只显示两个）。

我可以通过 Rmpi/snow/doParallel 等并行包传递此代码吗？谢谢！

数据

dput(head(cont))
structure(list(citycode = c(101L, 101L, 101L, 101L, 101L, 101L
), year = c(1970L, 1970L, 1970L, 1970L, 1970L, 1970L), week = 1:6, 
    wk = 1:6, month = c(1L, 1L, 1L, 1L, 2L, 2L), day = c(10L, 
    17L, 24L, 31L, 7L, 14L), countall = c(17L, 25L, 32L, 25L, 
    23L, 25L), city = structure(c(10L, 10L, 10L, 10L, 10L, 10L
    ), .Label = c("AKRON", "ALBANY", "ALBUQUERQUE", "ALLENTOWN", 
    "ATLANTA", "AUSTIN", "BALTIMORE", "BERKELEY", "BIRMINGHAM", 
    "BOSTON", "BRIDGEPORT", "BUFFALO", "CAMBRIDGE", "CAMDEN", 
    "CANTON", "CHARLOTTE", "CHATTANOOGA", "CHICAGO", "CINCINNATI", 
    "CLEVELAND", "COLUMBUS", "DALLAS", "DAYTON", "DENVER", "DETROIT", 
    "DULUTH", "ELIZABETH", "ERIE", "EVANSVILLE", "FLINT", "FRESNO", 
    "GARY", "GLENDALE", "HARTFORD", "HONOLULU", "HOUSTON", "INDIANAPOLIS", 
    "JACKSONVILLE", "KNOXVILLE", "LINCOLN", "LOUISVILLE", "LOWELL", 
    "LYNN", "MADISON", "MEMPHIS", "MIAMI", "MILWAUKEE", "MINNEAPOLIS", 
    "MOBILE", "MONTGOMERY", "NASHVILLE", "NEWARK", "NORFOLK", 
    "OAKLAND", "OGDEN", "OMAHA", "PASADENA", "PATERSON", "PEORIA", 
    "PHILADELPHIA", "PHOENIX", "PITTSBURG", "PORTLAND", "PROVIDENCE", 
    "PUEBLO", "READING", "RICHMOND", "ROCHESTER", "ROCKFORD", 
    "SACRAMENTO", "SAVANNAH", "SCHENECTADY", "SCRANTON", "SEATTLE", 
    "SHREVEPORT", "SOMERVILLE", "SPOKANE", "SPRINGFIELD", "SYRACUSE", 
    "TACOMA", "TAMPA", "TOLEDO", "TRENTON", "TUCSON", "TULSA", 
    "UTICA", "WASHINGTON", "WATERBURY", "WICHITA", "WILMINGTON", 
    "WORCESTER", "YONKERS", "YOUNGSTOWN"), class = "factor"), 
    tmax = c(-4.92166666666663, -5.63428571428569, -2.52714285714284, 
    -4.97571428571427, 0.498571428571446, 1.29857142857147), 
    tmin = c(-13.8816666666667, -13.8142857142857, -12.9971428571429, 
    -14.36, -9.03142857142855, -9.3342857142857), tmean = c(-9.57770833333332, 
    -9.87196428571428, -8.20696428571427, -8.79892857142856, 
    -3.58249999999998, -4.0480357142857), hmax = c(0.0021666666666667, 
    0.0021428571428571, 0.0028571428571429, 0.0025714285714286, 
    0.0044285714285714, 0.0047142857142857), hmin = c(0.0011666666666667, 
    0.001, 0.001, 0.0007142857142857, 0.0012857142857143, 0.0017142857142857
    ), hmean = c(0.001625, 0.0014464285714286, 0.0017678571428571, 
    0.0018214285714286, 0.0028928571428571, 0.0029107142857143
    ), population = c(3900000L, 3900000L, 3900000L, 3900000L, 
    3900000L, 3900000L), lncount = c(2.83321334405622, 3.2188758248682, 
    3.46573590279973, 3.2188758248682, 3.13549421592915, 3.2188758248682
    ), lnpop = c(15.176487, 15.176487, 15.176487, 15.176487, 
    15.176487, 15.176487), count_pop = c(4.35897435897e-06, 6.41025641026e-06, 
    8.20512820513e-06, 6.41025641026e-06, 5.89743589744e-06, 
    6.41025641026e-06), lncount_pop = c(-12.3432737670437, -11.9576112862317, 
    -11.7107512083001, -11.9576112862317, -12.0409928951707, 
    -11.9576112862317), uniqueid = c(43L, 43L, 43L, 43L, 43L, 
    43L), msax = structure(c(9L, 9L, 9L, 9L, 9L, 9L), .Label = c("Akron, OH", 
    "Albany-Schenectady-Troy, NY", "Albuquerque, NM", "Allentown-Bethlehem-Easton, PA-NJ", 
    "Atlanta-Sandy Springs-Roswell, GA", "Austin-Round Rock, TX", 
    "Baltimore-Columbia-Towson, MD", "Birmingham-Hoover, AL", 
    "Boston-Cambridge-Newton, MA-NH", "Bridgeport-Stamford-Norwalk, CT", 
    "Buffalo-Cheektowaga-Niagara Falls, NY", "Canton-Massillon, OH", 
    "Charlotte-Concord-Gastonia, NC-SC", "Chattanooga, TN-GA", 
    "Chicago-Naperville-Elgin, IL-IN-WI", "Cincinnati, OH-KY-IN", 
    "Cleveland-Elyria, OH", "Columbus, OH", "Dallas-Fort Worth-Arlington, TX", 
    "Dayton, OH", "Denver-Aurora-Lakewood, CO", "Detroit-Warren-Dearborn, MI", 
    "Duluth, MN-WI", "Erie, PA", "Evansville, IN-KY", "Flint, MI", 
    "Fresno, CA", "Hartford-West Hartford-East Hartford, CT", 
    "Houston-The Woodlands-Sugar Land, TX", "Indianapolis-Carmel-Anderson, IN", 
    "Jacksonville, FL", "Knoxville, TN", "Lincoln, NE", "Los Angeles-Long Beach-Anaheim, CA", 
    "Louisville/Jefferson County, KY-IN", "Madison, WI", "Memphis, TN-MS-AR", 
    "Miami-Fort Lauderdale-West Palm Beach, FL", "Milwaukee-Waukesha-West Allis, WI", 
    "Minneapolis-St. Paul-Bloomington, MN-WI", "Mobile, AL", 
    "Montgomery, AL", "Nashville-Davidson--Murfreesboro--Franklin, TN", 
    "New Haven-Milford, CT", "New York-Newark-Jersey City, NY-NJ-PA", 
    "Ogden-Clearfield, UT", "Omaha-Council Bluffs, NE-IA", "Peoria, IL", 
    "Philadelphia-Camden-Wilmington, PA-NJ-DE-MD", "Phoenix-Mesa-Scottsdale, AZ", 
    "Pittsburgh, PA", "Portland-Vancouver-Hillsboro, OR-WA", 
    "Providence-Warwick, RI-MA", "Pueblo, CO", "Reading, PA", 
    "Richmond, VA", "Rochester, NY", "Rockford, IL", "Sacramento--Roseville--Arden-Arcade, CA", 
    "San Francisco-Oakland-Hayward, CA", "Savannah, GA", "Scranton--Wilkes-Barre--Hazleton, PA", 
    "Seattle-Tacoma-Bellevue, WA", "Shreveport-Bossier City, LA", 
    "Spokane-Spokane Valley, WA", "Springfield, MA", "Syracuse, NY", 
    "Tampa-St. Petersburg-Clearwater, FL", "Toledo, OH", "Trenton, NJ", 
    "Tucson, AZ", "Tulsa, OK", "Urban Honolulu, HI", "Utica-Rome, NY", 
    "Virginia Beach-Norfolk-Newport News, VA-NC", "Washington-Arlington-Alexandria, DC-VA-MD-WV", 
    "Wichita, KS", "Worcester, MA-CT", "Youngstown-Warren-Boardman, OH-PA"
    ), class = "factor"), income = c(4686L, 4686L, 4686L, 4686L, 
    4686L, 4686L), deflator = c(22.805, 22.805, 22.805, 22.805, 
    22.805, 22.805), real_inc = c(20548.131, 20548.131, 20548.131, 
    20548.131, 20548.131, 20548.131), lnincome = c(9.9305248, 
    9.9305248, 9.9305248, 9.9305248, 9.9305248, 9.9305248), latitude = c(42.358433, 
    42.358433, 42.358433, 42.358433, 42.358433, 42.358433), longitude = c(-71.059776, 
    -71.059776, -71.059776, -71.059776, -71.059776, -71.059776
    ), tmax_tmean = c(0.51386683488134, 0.570736030967926, 0.307926630257402, 
    0.565490928278604, -0.139168577410035, -0.320790506859599
    ), tmin_tmean = c(1.44937245774694, 1.39934518748982, 1.58367240366414, 
    1.63201688517271, 2.52098494666535, 2.30588027703031), hmax_hmean = c(1.33333333333334, 
    1.48148148148148, 1.61616161616162, 1.41176470588235, 1.53086419753087, 
    1.61963190184049), hmin_hmean = c(0.71794871794872, 0.691358024691359, 
    0.565656565656567, 0.392156862745098, 0.444444444444446, 
    0.588957055214722), yearwk = 197001:197006, l1tmax = c(NA, 
    -4.9216666, -5.6342859, -2.5271428, -4.9757142, 0.49857143
    ), l2tmax = c(NA, NA, -4.9216666, -5.6342859, -2.5271428, 
    -4.9757142), l3tmax = c(NA, NA, NA, -4.9216666, -5.6342859, 
    -2.5271428), l1tmin = c(NA, -13.881667, -13.814285, -12.997143, 
    -14.36, -9.0314283), l2tmin = c(NA, NA, -13.881667, -13.814285, 
    -12.997143, -14.36), l3tmin = c(NA, NA, NA, -13.881667, -13.814285, 
    -12.997143), l1tmean = c(NA, -9.5777082, -9.8719645, -8.2069645, 
    -8.7989283, -3.5825), l2tmean = c(NA, NA, -9.5777082, -9.8719645, 
    -8.2069645, -8.7989283), l3tmean = c(NA, NA, NA, -9.5777082, 
    -9.8719645, -8.2069645), l1hmax = c(NA, 0.0021666666, 0.0021428571, 
    0.0028571428, 0.0025714287, 0.0044285716), l2hmax = c(NA, 
    NA, 0.0021666666, 0.0021428571, 0.0028571428, 0.0025714287
    ), l3hmax = c(NA, NA, NA, 0.0021666666, 0.0021428571, 0.0028571428
    ), l1hmin = c(NA, 0.0011666666, 0.001, 0.001, 0.00071428571, 
    0.0012857143), l2hmin = c(NA, NA, 0.0011666666, 0.001, 0.001, 
    0.00071428571), l3hmin = c(NA, NA, NA, 0.0011666666, 0.001, 
    0.001), l1hmean = c(NA, 0.001625, 0.0014464286, 0.0017678571, 
    0.0018214285, 0.0028928572), l2hmean = c(NA, NA, 0.001625, 
    0.0014464286, 0.0017678571, 0.0018214285), l3hmean = c(NA, 
    NA, NA, 0.001625, 0.0014464286, 0.0017678571), l1count_pop = c(NA, 
    4.3589744e-06, 6.4102564e-06, 8.2051283e-06, 6.4102564e-06, 
    5.897436e-06), l2count_pop = c(NA, NA, 4.3589744e-06, 6.4102564e-06, 
    8.2051283e-06, 6.4102564e-06), l3count_pop = c(NA, NA, NA, 
    4.3589744e-06, 6.4102564e-06, 8.2051283e-06), l4tmax = c(NA, 
    NA, NA, NA, -4.9216666, -5.6342859), l4tmin = c(NA, NA, NA, 
    NA, -13.881667, -13.814285), l4tmean = c(NA, NA, NA, NA, 
    -9.5777082, -9.8719645), l4hmax = c(NA, NA, NA, NA, 0.0021666666, 
    0.0021428571), l4hmean = c(NA, NA, NA, NA, 0.001625, 0.0014464286
    ), l4hmin = c(NA, NA, NA, NA, 0.0011666666, 0.001)), .Names = c("citycode", 
"year", "week", "wk", "month", "day", "countall", "city", "tmax", 
"tmin", "tmean", "hmax", "hmin", "hmean", "population", "lncount", 
"lnpop", "count_pop", "lncount_pop", "uniqueid", "msax", "income", 
"deflator", "real_inc", "lnincome", "latitude", "longitude", 
"tmax_tmean", "tmin_tmean", "hmax_hmean", "hmin_hmean", "yearwk", 
"l1tmax", "l2tmax", "l3tmax", "l1tmin", "l2tmin", "l3tmin", "l1tmean", 
"l2tmean", "l3tmean", "l1hmax", "l2hmax", "l3hmax", "l1hmin", 
"l2hmin", "l3hmin", "l1hmean", "l2hmean", "l3hmean", "l1count_pop", 
"l2count_pop", "l3count_pop", "l4tmax", "l4tmin", "l4tmean", 
"l4hmax", "l4hmean", "l4hmin"), .internal.selfref = <pointer: (nil)>, row.names = c(NA, 
6L), class = c("data.table", "data.frame"))

#####################
library(data.table)
library(mgcv)
library(reshape2)
library(dplyr)
library(tidyr)
library(lubridate)
library(DataCombine)
#
gam_max_count_wk <- gam(count_pop ~ factor(citycode) + factor(year) +
factor(week) + s(lnincome) + s(tmax) +
s(hmax),data=cont,na.action="na.omit", method="ML")

#
# Historic
temp_hist <- read.csv("/work/sd00815/giss_historic/giss_temp_hist.csv")
humid_hist <- read.csv("/work/sd00815/giss_historic/giss_hum_hist.csv")
#
temp_hist <- as.data.table(temp_hist)
humid_hist <- as.data.table(humid_hist)
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_hist, mykey)
setkeyv(humid_hist, mykey)
#
hist<- merge(temp_hist, humid_hist, by=mykey)
#
hist$X.x <- NULL
hist$X.y <- NULL
#
# Max
hist_max <- hist
hist_max$FIPS <- hist_max$year <- hist_max$month <- hist_max$tmin <-
hist_max$tmean <- hist_max$hmin <- hist_max$hmean <- NULL
#
# Adding Factors
hist_max$citycode <- rep(101,nrow(hist_max))
hist_max$year <- rep(2010,nrow(hist_max))
hist_max$lnincome <- rep(10.262,nrow(hist_max))
#
# Predictions
pred_hist_max <- predict.gam(gam_max_count_wk,hist_max)
#
pred_hist_max <- as.data.table(pred_hist_max)
pred_hist_max <- cbind(hist, pred_hist_max)
pred_hist_max$tmax <- pred_hist_max$tmean <- pred_hist_max$tmin <-
pred_hist_max$hmean <- pred_hist_max$hmax <- pred_hist_max$hmin <- NULL
#
# Aggregate by FIPS
max_hist <- pred_hist_max %>%
  group_by(FIPS) %>%
  summarise(pred_hist = mean(pred_hist_max))
#
### Future
## 4.5
# 4.5_2021_2050
temp_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2021_2050_temp.csv")
humid_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2021_2050_temp.csv")
#
# Max
temp_sim <- as.data.table(temp_sim)
setnames(temp_sim, "max", "tmax")
setnames(temp_sim, "min", "tmin")
setnames(temp_sim, "avg", "tmean")
#
humid_sim <- as.data.table(humid_sim)
setnames(humid_sim, "max", "hmax")
setnames(humid_sim, "min", "hmin")
setnames(humid_sim, "avg", "hmean")
#
temp_sim$X <- NULL
humid_sim$X <- NULL
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_sim, mykey)
setkeyv(humid_sim, mykey)
#
sim <- merge(temp_sim, humid_sim, by=mykey)
#
sim_max <- sim
#
sim_max$FIPS <- sim_max$year <- sim_max$month <- sim_max$tmin <-
sim_max$tmean <- sim_max$hmin <- sim_max$hmean <- NULL
#
# Adding Factors
sim_max$citycode <- rep(101,nrow(sim_max))
sim_max$year <- rep(2010,nrow(sim_max))
sim_max$week <- rep(1,nrow(sim_max))
sim_max$lnincome <- rep(10.262,nrow(sim_max))
#
# Predictions
pred_sim_max <- predict.gam(gam_max_count_wk,sim_max)
#
pred_sim_max <- as.data.table(pred_sim_max)
pred_sim_max <- cbind(sim, pred_sim_max)
pred_sim_max$tmax <- pred_sim_max$tmean <- pred_sim_max$tmin <-
pred_sim_max$hmean <- pred_sim_max$hmax <- pred_sim_max$hmin <- NULL
#
# Aggregate by FIPS
max_sim <- pred_sim_max %>%
  group_by(FIPS) %>%
  summarise(pred_sim = mean(pred_sim_max))
#
# Merge with Historical Data
max_hist$FIPS <- as.factor(max_hist$FIPS)
max_sim$FIPS <- as.factor(max_sim$FIPS)
#
mykey1<- c("FIPS")
setkeyv(max_hist, mykey1)
setkeyv(max_sim, mykey1)
max_change <- merge(max_hist, max_sim, by=mykey1)
max_change$change <-
((max_change$pred_sim-max_change$pred_hist)/max_change$pred_hist)*100
#
write.csv(max_change, file =
"/work/sd00815/projections_data/year_wk_fe/giss/max/giss_4.5_2021_2050.csv")



# 4.5_2081_2100
temp_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2081_2100_temp.csv")
humid_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2081_2100_temp.csv")
#
# Max
temp_sim <- as.data.table(temp_sim)
setnames(temp_sim, "max", "tmax")
setnames(temp_sim, "min", "tmin")
setnames(temp_sim, "avg", "tmean")
#
humid_sim <- as.data.table(humid_sim)
setnames(humid_sim, "max", "hmax")
setnames(humid_sim, "min", "hmin")
setnames(humid_sim, "avg", "hmean")
#
temp_sim$X <- NULL
humid_sim$X <- NULL
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_sim, mykey)
setkeyv(humid_sim, mykey)
#
sim <- merge(temp_sim, humid_sim, by=mykey)
#
sim_max <- sim
#
sim_max$FIPS <- sim_max$year <- sim_max$month <- sim_max$tmin <-
sim_max$tmean <- sim_max$hmin <- sim_max$hmean <- NULL
#
# Adding Factors
sim_max$citycode <- rep(101,nrow(sim_max))
sim_max$year <- rep(2010,nrow(sim_max))
sim_max$week <- rep(1,nrow(sim_max))
sim_max$lnincome <- rep(10.262,nrow(sim_max))
#
# Predictions
pred_sim_max <- predict.gam(gam_max_count_wk,sim_max)
#
pred_sim_max <- as.data.table(pred_sim_max)
pred_sim_max <- cbind(sim, pred_sim_max)
pred_sim_max$tmax <- pred_sim_max$tmean <- pred_sim_max$tmin <-
pred_sim_max$hmean <- pred_sim_max$hmax <- pred_sim_max$hmin <- NULL
#
# Aggregate by FIPS
max_sim <- pred_sim_max %>%
  group_by(FIPS) %>%
  summarise(pred_sim = mean(pred_sim_max))
#
# Merge with Historical Data
max_hist$FIPS <- as.factor(max_hist$FIPS)
max_sim$FIPS <- as.factor(max_sim$FIPS)
#
mykey1<- c("FIPS")
setkeyv(max_hist, mykey1)
setkeyv(max_sim, mykey1)
max_change <- merge(max_hist, max_sim, by=mykey1)
max_change$change <-
((max_change$pred_sim-max_change$pred_hist)/max_change$pred_hist)*100
#
write.csv(max_change, file =
"/work/sd00815/projections_data/year_wk_fe/giss/max/giss_4.5_2081_2100.csv")

####################

Answer 1

您知道您的示例在 gam 调用后立即在 read.csv 死亡吗？

不管怎样，我很快修改并测量了我自己的 GAM 代码，可以告诉你使用相对于单线程标准 R.

，带有 MKL 库的 Microsoft R 在 GAM 调用中为我提供了 2 到 3 倍的加速

从 here 安装它，连同 MKL，并在代码的顶部设置实际内核数（不是 HT 内核）

if (require("RevoUtilsMath")) {
    setMKLthreads(4)
}

看看进展如何

更新

可以肯定的是，它会加速线性代数相关的东西，当然，这里没有问题。但是看看你的代码，我会说它可能由 I/O、所有那些 read.csv 和 write.csv 调用主导，总加速可能微乎其微。转向未格式化的数据可能会节省更多时间...

使用 Parallel/Multi-core 个进程的回归

Regressions using Parallel/Multi-core Processes

parallel-processing

multicore

r

gam

snow

数据