为什么应用于数据框的 lapply() 函数不会产生与其等效的 for 循环相同的结果?
Why does this lapply() function applied to a dataframe not produce the same results as its for-loop equivalent?
在下面的可重现代码中,自定义 balTransit()
函数 正确地 使用 for 循环填充值转换 table,而自定义 balTransit_1()
函数应该使用 lapply()
做同样的事情,但它不起作用。我在实施 lapply()
时做错了什么? 运行 代码,您将看到以下结果:
balTransit(正确结果):
> test
X1 X0 X2
X1 0 0 3
X0 0 50 0
X2 5 0 0
balTransit_1(不正确,全为 0):
> test_1
X1 X0 X2
X1 0 0 0
X0 0 0 0
X2 0 0 0
增强说明:
我在这里的主要 objective 是学习如何使用 apply()
系列函数,以获得它们的好处。我一直在学习简单的教程。辅助 objective 是从基础数据帧生成转换矩阵。一旦我用 lapply()
(或另一个最合适的 apply()
函数table)解决了这个问题,我将 运行 各种选项(for-loop()
, data.table()
, lapply()
, 等)对照2.5m行的实际数据集进行速度测试。
我正在做的是创建一个转换矩阵(技术上这里是一个数据框),显示在用户指定的时间段内从一个“标志”类别到另一个“标志”类别的值(余额)的流动.因此,在我的“for-loop”可重现示例中,用户指定的“From”周期为 1,“To”周期为 3。然后生成转换矩阵,如图所示 post在底部编辑。
昨天 post 解决了这个转换计数问题。这 post 处理过渡值。
可重现代码:
# Set up data frame:
data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1")
)
# Function to set-up base transition table:
transMat <- function(data){
DF <- data.frame(matrix(0, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(DF) <- unique(data$Flags)
names(DF) <- unique(data$Flags)
return(DF)
}
# Function to populate cells of transition table, using for-loop:
balTransit <- function(data, from=1, to=3){
DF <- transMat(data)
for (i in unique(data$ID)){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF) == id_from)
row <- which(row.names(DF) == id_to)
val <- (data$Values[(data$ID == i & data$Period == from)])
DF[row, column] <- val + DF[row,column]
}
return(DF)
}
# Function to populate cells of transition table, using lapply:
balTransit_1 <- function(data, from=1, to=3){
DF_1 <- transMat(data)
lapply(seq_along(unique(data$ID)), function(i){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF_1) == id_from)
row <- which(row.names(DF_1) == id_to)
val <- (data$Values[(data$ID == i & data$Period == from)])
DF_1[row, column] <- DF_1[row, column] + val
})
return(DF_1)
}
# Run the 2 functions:
test <- balTransit(data,1,3)
test
test_1 <- balTransit_1(data,1,3)
test_1
要使您的 lapply 代码正常工作,只需将 <-
替换为 <<-
:
DF_1[row, column] <<- DF_1[row, column] + val
请参阅 ?assignOps
了解更多信息。
但是,在这种情况下,我再次不推荐 lapply
(通常应避免使用 <<-
)
这是一个data.table
方法:
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- dcast(
data = DT[, .(first_flag = first(Flags), last_flag = last(Flags), first_value = first(Values)), by = ID][
all_flags, on = c("first_flag", "last_flag")],
last_flag ~ first_flag,
fun.aggregate = sum,
value.var = "first_value"
)
for (col_i in seq_len(ncol(resultDT))){
set(resultDT, which(is.na(resultDT[[col_i]])), col_i, 0)
}
print(resultDT)
结果:
last_flag X0 X1 X2
1: X0 50 0 0
2: X1 0 0 3
3: X2 0 5 0
# step by step ------------------------------------------------------------
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1", "X2","X1","X1")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- DT[, .(first_flag = first(Flags), last_flag = last(Flags), first_value = first(Values)), by = ID] # find relevant flags
resultDT <- resultDT[all_flags, on = c("first_flag", "last_flag")] # merge all combinations
resultDT <- dcast(resultDT, last_flag ~ first_flag, fun.aggregate = sum, value.var = "first_value") # dcast
for (col_i in seq_len(ncol(resultDT))){
set(resultDT, which(is.na(resultDT[[col_i]])), col_i, 0)
}
print(resultDT)
在下面的可重现代码中,自定义 balTransit()
函数 正确地 使用 for 循环填充值转换 table,而自定义 balTransit_1()
函数应该使用 lapply()
做同样的事情,但它不起作用。我在实施 lapply()
时做错了什么? 运行 代码,您将看到以下结果:
balTransit(正确结果):
> test
X1 X0 X2
X1 0 0 3
X0 0 50 0
X2 5 0 0
balTransit_1(不正确,全为 0):
> test_1
X1 X0 X2
X1 0 0 0
X0 0 0 0
X2 0 0 0
增强说明:
我在这里的主要 objective 是学习如何使用 apply()
系列函数,以获得它们的好处。我一直在学习简单的教程。辅助 objective 是从基础数据帧生成转换矩阵。一旦我用 lapply()
(或另一个最合适的 apply()
函数table)解决了这个问题,我将 运行 各种选项(for-loop()
, data.table()
, lapply()
, 等)对照2.5m行的实际数据集进行速度测试。
我正在做的是创建一个转换矩阵(技术上这里是一个数据框),显示在用户指定的时间段内从一个“标志”类别到另一个“标志”类别的值(余额)的流动.因此,在我的“for-loop”可重现示例中,用户指定的“From”周期为 1,“To”周期为 3。然后生成转换矩阵,如图所示 post在底部编辑。
昨天 post
可重现代码:
# Set up data frame:
data <-
data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1")
)
# Function to set-up base transition table:
transMat <- function(data){
DF <- data.frame(matrix(0, ncol=length(unique(data$Flags)), nrow=length(unique(data$Flags))))
row.names(DF) <- unique(data$Flags)
names(DF) <- unique(data$Flags)
return(DF)
}
# Function to populate cells of transition table, using for-loop:
balTransit <- function(data, from=1, to=3){
DF <- transMat(data)
for (i in unique(data$ID)){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF) == id_from)
row <- which(row.names(DF) == id_to)
val <- (data$Values[(data$ID == i & data$Period == from)])
DF[row, column] <- val + DF[row,column]
}
return(DF)
}
# Function to populate cells of transition table, using lapply:
balTransit_1 <- function(data, from=1, to=3){
DF_1 <- transMat(data)
lapply(seq_along(unique(data$ID)), function(i){
id_from <- as.character(data$Flags[(data$ID == i & data$Period == from)])
id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
column <- which(names(DF_1) == id_from)
row <- which(row.names(DF_1) == id_to)
val <- (data$Values[(data$ID == i & data$Period == from)])
DF_1[row, column] <- DF_1[row, column] + val
})
return(DF_1)
}
# Run the 2 functions:
test <- balTransit(data,1,3)
test
test_1 <- balTransit_1(data,1,3)
test_1
要使您的 lapply 代码正常工作,只需将 <-
替换为 <<-
:
DF_1[row, column] <<- DF_1[row, column] + val
请参阅 ?assignOps
了解更多信息。
但是,在这种情况下,我再次不推荐 lapply
(通常应避免使用 <<-
)
这是一个data.table
方法:
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- dcast(
data = DT[, .(first_flag = first(Flags), last_flag = last(Flags), first_value = first(Values)), by = ID][
all_flags, on = c("first_flag", "last_flag")],
last_flag ~ first_flag,
fun.aggregate = sum,
value.var = "first_value"
)
for (col_i in seq_len(ncol(resultDT))){
set(resultDT, which(is.na(resultDT[[col_i]])), col_i, 0)
}
print(resultDT)
结果:
last_flag X0 X1 X2
1: X0 50 0 0
2: X1 0 0 3
3: X2 0 5 0
# step by step ------------------------------------------------------------
library(data.table)
DT <- setDT(data.frame(
ID = c(1,1,1,2,2,2,3,3,3,4,4,4),
Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
Values = c(5, 10, 15, 50, 2, 4, 3, 6, 9, 3, 6, 9),
Flags = c("X1","X0","X2","X0","X2","X0", "X2","X1","X1", "X2","X1","X1")
))
unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))
resultDT <- DT[, .(first_flag = first(Flags), last_flag = last(Flags), first_value = first(Values)), by = ID] # find relevant flags
resultDT <- resultDT[all_flags, on = c("first_flag", "last_flag")] # merge all combinations
resultDT <- dcast(resultDT, last_flag ~ first_flag, fun.aggregate = sum, value.var = "first_value") # dcast
for (col_i in seq_len(ncol(resultDT))){
set(resultDT, which(is.na(resultDT[[col_i]])), col_i, 0)
}
print(resultDT)