如何将 for 循环转换为 lapply 函数以进行并行测试?
How to convert a for-loop to lapply function for parallel testing purposes?
我一直在研究 for 循环的 advantages/disadvantages 与 apply()
函数族的对比,答案并不明确(apply()
总是比 for-循环可能不正确,具体取决于具体情况)。所以我想根据我的实际数据测试各种选项。
下面是一个 for 循环,在我看来非常简单,但我不确定如何用 lapply()
替换它。我假设 lapply()
是正确的,因为 for 循环会生成一个列表对象。
我需要 运行 此分析所针对的实际数据是一个包含 250 万行、30 多列的数据框,因此我想 运行 针对各种选项进行速度测试。
任何解释都会很有帮助。我在网上找到的示例解释不多,或者 for 循环示例过于复杂,我希望学习使用 apply()
系列函数,因为它们看起来非常有用并且比 for 循环更易于阅读。
这是简化的 for 循环代码,带有示例数据框,运行出于示例目的是正确的:
# Set up data frame to perform migration analysis on:
data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0")
)
# Function to set-up base table:
setTable <- function(data){
df <- data.frame(matrix(NA, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(df) <- unique(data$Flags)
names(df) <- unique(data$Flags)
return(df)
}
# Function to complete migration table with for-loop:
migration <- function(data, from=1, to=3){
df <- setTable(data)
for (i in unique(data$ID)){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(df) == id_from)
row <- which(row.names(df) == id_to)
df[row, column] <- ifelse(is.na(df[row, column]), 1, df[row, column] + 1)
}
return(df)
}
# Now to run the function:
test1 <- migration(data, from=1, to=3)
编辑: 包裹在允许指定从 & 到:
的函数中
library(data.table)
DF <- data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
)
migration <- function(DT, from=1, to=3){
setDT(DT)
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(from_flag = unique_flags, to_flag = unique_flags)))
dcast(DT[, .(from_flag = Flags[Period == from], to_flag = Flags[Period == to]), by = ID][
,.N, c("from_flag", "to_flag")][
all_flags, on = c("from_flag", "to_flag")], to_flag ~ from_flag, value.var = "N")
}
migration(DF, 1, 3)
说到speed in R, you can almost always count on library(data.table):
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- dcast(DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID][
,.N, c("first_flag", "last_flag")][
all_flags, on = c("first_flag", "last_flag")], last_flag ~ first_flag, value.var = "N")
print(resultDT)
一步一步:
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID] # find relevant flags
resultDT <- resultDT[,.N, c("first_flag", "last_flag")] # count transitions
resultDT <- resultDT[all_flags, on = c("first_flag", "last_flag")] # merge all combinations
resultDT <- dcast(resultDT, last_flag ~ first_flag, value.var = "N") # dcast
print(resultDT)
关于 lapply
你可以这样做(我更喜欢 data.table
):
# Set up data frame to perform migration analysis on:
input_data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0")
)
# Function to set-up base table:
setTable <- function(data){
DF <- data.frame(matrix(NA, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(DF) <- unique(data$Flags)
names(DF) <- unique(data$Flags)
return(DF)
}
# Function to complete migration table with for-loop:
migration <- function(data, from=1, to=3){
DF <- setTable(data)
lapply(seq_along(unique(data$ID)), function(i){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF) == id_from)
row <- which(row.names(DF) == id_to)
DF[row, column] <<- ifelse(is.na(DF[row, column]), 1, DF[row, column] + 1)
})
return(DF)
}
# Now to run the function:
test1 <- migration(input_data, from=1, to=3)
我一直在研究 for 循环的 advantages/disadvantages 与 apply()
函数族的对比,答案并不明确(apply()
总是比 for-循环可能不正确,具体取决于具体情况)。所以我想根据我的实际数据测试各种选项。
下面是一个 for 循环,在我看来非常简单,但我不确定如何用 lapply()
替换它。我假设 lapply()
是正确的,因为 for 循环会生成一个列表对象。
我需要 运行 此分析所针对的实际数据是一个包含 250 万行、30 多列的数据框,因此我想 运行 针对各种选项进行速度测试。
任何解释都会很有帮助。我在网上找到的示例解释不多,或者 for 循环示例过于复杂,我希望学习使用 apply()
系列函数,因为它们看起来非常有用并且比 for 循环更易于阅读。
这是简化的 for 循环代码,带有示例数据框,运行出于示例目的是正确的:
# Set up data frame to perform migration analysis on:
data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0")
)
# Function to set-up base table:
setTable <- function(data){
df <- data.frame(matrix(NA, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(df) <- unique(data$Flags)
names(df) <- unique(data$Flags)
return(df)
}
# Function to complete migration table with for-loop:
migration <- function(data, from=1, to=3){
df <- setTable(data)
for (i in unique(data$ID)){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(df) == id_from)
row <- which(row.names(df) == id_to)
df[row, column] <- ifelse(is.na(df[row, column]), 1, df[row, column] + 1)
}
return(df)
}
# Now to run the function:
test1 <- migration(data, from=1, to=3)
编辑: 包裹在允许指定从 & 到:
的函数中library(data.table)
DF <- data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
)
migration <- function(DT, from=1, to=3){
setDT(DT)
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(from_flag = unique_flags, to_flag = unique_flags)))
dcast(DT[, .(from_flag = Flags[Period == from], to_flag = Flags[Period == to]), by = ID][
,.N, c("from_flag", "to_flag")][
all_flags, on = c("from_flag", "to_flag")], to_flag ~ from_flag, value.var = "N")
}
migration(DF, 1, 3)
说到speed in R, you can almost always count on library(data.table):
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- dcast(DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID][
,.N, c("first_flag", "last_flag")][
all_flags, on = c("first_flag", "last_flag")], last_flag ~ first_flag, value.var = "N")
print(resultDT)
一步一步:
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0", "X2","X1","X0")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID] # find relevant flags
resultDT <- resultDT[,.N, c("first_flag", "last_flag")] # count transitions
resultDT <- resultDT[all_flags, on = c("first_flag", "last_flag")] # merge all combinations
resultDT <- dcast(resultDT, last_flag ~ first_flag, value.var = "N") # dcast
print(resultDT)
关于 lapply
你可以这样做(我更喜欢 data.table
):
# Set up data frame to perform migration analysis on:
input_data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
Flags = c("X0","X1","X2","X0","X2","X0", "X2","X1","X0")
)
# Function to set-up base table:
setTable <- function(data){
DF <- data.frame(matrix(NA, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(DF) <- unique(data$Flags)
names(DF) <- unique(data$Flags)
return(DF)
}
# Function to complete migration table with for-loop:
migration <- function(data, from=1, to=3){
DF <- setTable(data)
lapply(seq_along(unique(data$ID)), function(i){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF) == id_from)
row <- which(row.names(DF) == id_to)
DF[row, column] <<- ifelse(is.na(DF[row, column]), 1, DF[row, column] + 1)
})
return(DF)
}
# Now to run the function:
test1 <- migration(input_data, from=1, to=3)