Reading data from an Excel workbook with many thousands of tabs
I am reading data from single xlsx files in which the data is stored across 10–20 thousand separate tabs per workbook file. The first sheet holds the master data table, including links to the individual tabs that contain further data.
The column-based 'tabbed' data is summarised and transposed before being appended to the master data.
The master data table itself is large (10,000 rows x several hundred columns); the additional data tabs are individually small (a few columns by tens to a few hundred rows).
The XLConnect package already runs out of memory on the call to loadWorkbook() (R 3.4.0, RStudio 1.1.383, 64-bit, 8 GB machine), otherwise I could work along those lines.
Because I need to load from the individual tabs, I currently use a nested for() loop to load each tab's data. However, with my number of tabs each loop takes almost a minute, putting the total execution time close to a week! Nested for() loops are also clearly untidy, so I suspect there is a neater (and faster) way to do this, but I cannot see it.
I have read the links into a dedicated data frame in R (linkReferences).
The data source is not mine, so I am stuck with the input as provided.
The problem is purely about the speed of reading the sheets, which seems to grow with the number of sheets in the file (and hence with file size).
I am looking for any solution to speed this up, and have updated the question with a self-contained minimal example.
On my PC: n = 10 gives ~0.16 sec/sheet, n = 100 gives ~0.56 sec/sheet and n = 1000 gives ~3 sec/sheet, which is similar to what I see in my real data (<10 sec/sheet for 16k sheets).
library(tidyverse)
number_of_sheets <- 100
# =========================================================================
# CREATE SAMPLE FILE. Layout similar to actual data.
library(openxlsx)
my.sheets.file <- "sampleXLSX.xlsx"
linkReferences <- tibble( sheet = str_c("Data ", seq_len(number_of_sheets)) )
wb <- write.xlsx(linkReferences, file = my.sheets.file)
sample_header <- data.frame( head_name = c("head1","head2","head3","head4","head5"),
                             head_text = c("text1","text2","text3","text4","text5") )
set.seed(31415)
for (i in 1:number_of_sheets) {
  cat(i, "..")
  sheet_name_i <- paste0("Data ", i)
  addWorksheet(wb, sheetName = sheet_name_i)
  writeData(wb, sheet = sheet_name_i, sample_header, startCol = "B", startRow = 2)
  n <- ceiling( runif(1)*200 )   # each tab gets a random number of data rows
  sample_data <- tibble(A = seq_len(n),
                        B = runif(n),
                        C = sample(1:5, n, replace = TRUE))
  writeData(wb, sheet = sheet_name_i, sample_data, startCol = "B", startRow = 10)
}
saveWorkbook(wb, file = my.sheets.file, overwrite = TRUE)
#===========================================================================
# THIS IS THE ACTUAL QUESTION
# Read from file with many tabs
library(readxl)
library(stringr)
# Add empty columns for the transposed header and summary data
linkReferences <- linkReferences %>%
  mutate( Head1 = NA, Head2 = NA, Head3 = NA, Head4 = NA, Head5 = NA,
          A.1 = NA, B.1 = NA, C.1 = NA,
          A.2 = NA, B.2 = NA, C.2 = NA,
          A.3 = NA, B.3 = NA, C.3 = NA,
          A.4 = NA, B.4 = NA, C.4 = NA,
          A.5 = NA, B.5 = NA, C.5 = NA
  )
linkReferences.nrows <- nrow(linkReferences)
lRnames <- names(linkReferences)
start.row <- 1
start_time <- Sys.time()
for (i in start.row:linkReferences.nrows) {
  cat("i=", i, " / ", linkReferences.nrows, "\n")
  start_time_i <- Sys.time()
  linked_data <- read_xlsx(my.sheets.file,
                           sheet = linkReferences$sheet[i],
                           skip = 2,
                           col_types = c("text","text","text"),
                           col_names = FALSE)
  print(Sys.time() - start_time_i)  # This takes 99% of the loop time
  # spread the 5 header values across columns 2:6 (as.list keeps tibble happy)
  linkReferences[i, 2:6] <- as.list(unlist(linked_data[1:5, 2]))
  data_head_row <- which(linked_data[, 1] == "A")
  names(linked_data) <- c("A", "B", "C")
  linked_data <- linked_data[(data_head_row + 1):nrow(linked_data), ]
  # create a (rather random) sample summary
  summary_linked_data <- linked_data %>%
    group_by(C) %>%
    summarise(B = last(B), A = last(A)) %>%
    arrange(desc(C))
  # not all data has the full range of options, so use actual number
  summary_linked_data_nrows <- nrow(summary_linked_data)
  for (ii in 1:summary_linked_data_nrows) {
    linkReferences[i, match(str_c("A.", ii), lRnames):match(str_c("C.", ii), lRnames)] <-
      summary_linked_data[ii, ]
  }
  print(linkReferences[i, 2:20])
  # ________________________________________________________
  # BELOW IS ONLY FOR TEST LOOP TIMING STATS IN THIS EXAMPLE
  delta_time <- Sys.time() - start_time
  delta_time_attr <- attr(delta_time, "units")
  row_time <- delta_time/(i - start.row + 1)
  if (delta_time_attr == "mins") {
    row_time <- row_time*60
  } else if (delta_time_attr == "hours") {
    row_time <- row_time*3600
  }
  total_time <- row_time*(linkReferences.nrows - start.row + 1)/3600
  cat("Elapsed time: ", delta_time, attr(delta_time, "units"),
      " | time/row: ", round(row_time, 2), "secs.",
      " | Est total time:",
      round(total_time*60, 2), "mins (=",
      round(total_time, 2), "hours)",
      "\n---------------\n")
}
# Conversion of the data loaded as character to numeric can all happen
# outside the loop, once all data is loaded.
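For example, a minimal post-loop conversion sketch (assuming, as above, that the summary columns A.1 to C.5 hold only numbers stored as text; numCols is introduced here purely for illustration):

# Hypothetical one-pass conversion of the summary columns to numeric,
# run once after the loop instead of converting inside it.
numCols <- match("A.1", lRnames):match("C.5", lRnames)
linkReferences[numCols] <- lapply(linkReferences[numCols], as.numeric)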
After some digging: XLConnect, with its vectorised sheet-reading capability (see here), is the clear winner, provided you can hold your workbook in memory.
I had to a) reduce the size of my workbook, and b) set the XLConnect memory to 4 GB as per @Joshua's link here.
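Note that the heap setting only takes effect if it is made before rJava initialises the JVM, so in a fresh R session the option has to be set before XLConnect is loaded, exactly as in the code below:

options(java.parameters = "-Xmx4g")  # must run before the JVM starts
library(XLConnect)                   # loading XLConnect initialises the JVM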
For the 1000-sheet example from the question above:
wb <- loadWorkbook() took 15 sec,
linked_data_lst = readWorksheet() took 34 sec,
and extracting the data from the now in-memory list with for (i in 1:nr_linked_data){...} took 86 sec.
That is 0.135 sec/sheet in total (22x faster than the code above).
#============================================================================
# now read it again
library(stringr)
options(java.parameters = "-Xmx4g")  # must be set before XLConnect loads the JVM
library(XLConnect)
linkReferences <- linkReferences %>%
  mutate( Head1 = NA, Head2 = NA, Head3 = NA, Head4 = NA, Head5 = NA,
          A.1 = NA, B.1 = NA, C.1 = NA,
          A.2 = NA, B.2 = NA, C.2 = NA,
          A.3 = NA, B.3 = NA, C.3 = NA,
          A.4 = NA, B.4 = NA, C.4 = NA,
          A.5 = NA, B.5 = NA, C.5 = NA
  )
linkReferences.nrows <- nrow(linkReferences)
lRnames <- names(linkReferences)
lRcols <- match("A.1", lRnames):match("C.5", lRnames)
lRheadCols <- (lRcols[1] - 5):(lRcols[1] - 1)
start_time <- Sys.time()
wb <- loadWorkbook(my.sheets.file)   # load the whole workbook into memory once
Sys.time() - start_time
start.row <- 1
end.row <- linkReferences.nrows
start_time0 <- Sys.time()
# vectorised read: one call returns a list with one data frame per sheet
linked_data_lst <- readWorksheet(wb,
                                 sheet = linkReferences$sheet[start.row:end.row],
                                 startCol = 2,
                                 endCol = 4,
                                 startRow = 3,
                                 header = FALSE)
delta_time <- (Sys.time() - start_time0) %>% print()
nr_linked_data <- length(linked_data_lst)
start_time <- Sys.time()
for (i in 1:nr_linked_data) {
  cat("i=", i, " / ", nr_linked_data, "\n")
  linked_data <- as_tibble(linked_data_lst[[i]])
  # BELOW IS ESSENTIALLY THE SAME AS IN THE QUESTION CODE
  # =========================================================
  linkReferences[i, lRheadCols] <- as.list(unlist(linked_data[1:5, 2]))
  data_head_row <- which(linked_data[, 1] == "A")
  names(linked_data) <- c("A", "B", "C")
  linked_data <- linked_data[(data_head_row + 1):nrow(linked_data), ]
  linked_data <- linked_data %>% mutate(across(everything(), as.numeric))
  # create a (rather random) sample summary
  summary_linked_data <- linked_data %>%
    group_by(C) %>%
    summarise(B = last(B), A = last(A)) %>%
    arrange(desc(C))
  # not all data has the full range of options, so use actual number
  summary_linked_data_nrows <- nrow(summary_linked_data)
  for (ii in 1:summary_linked_data_nrows) {
    linkReferences[i, match(str_c("A.", ii), lRnames):match(str_c("C.", ii), lRnames)] <-
      summary_linked_data[ii, ]
  }
  print(linkReferences[i, lRheadCols[1]:max(lRcols)])
  delta_time <- Sys.time() - start_time
  delta_time_attr <- attr(delta_time, "units")
  row_time <- delta_time/(i - start.row + 1)
  if (delta_time_attr == "mins") {
    row_time <- row_time*60
  } else if (delta_time_attr == "hours") {
    row_time <- row_time*3600
  }
  total_time <- row_time*(linkReferences.nrows - start.row + 1)/3600
  cat("Elapsed time: ", delta_time, attr(delta_time, "units"),
      " | time/row: ", round(row_time, 2), "secs.",
      " | Est total time:",
      round(total_time*60, 2), "mins (=",
      round(total_time, 2), "hours)",
      "\n---------------\n")
}
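If Java heap space is still tight after a read of this size, it may also help to drop the large objects and ask the JVM to garbage-collect; a minimal clean-up sketch using XLConnect's xlcFreeMemory():

# Optional clean-up sketch: release the in-memory workbook and the sheet list,
# then trigger a Java garbage collection via XLConnect's xlcFreeMemory().
rm(wb, linked_data_lst)
xlcFreeMemory()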