从 R 中的 PDF 表格中提取特定数据;如何获得列标题?
Extraction of specific data from PDF tables in R; how to get column headings?
此代码从 PDF 中提取数据表,然后使用 grepl 提取具有特定关键字的数据,在本例中为 'malaria'。它提取行名称,很多错过列标题并放入 NA,我认为是因为长度不同。有没有办法获取标题?
library(tabulizer)
library(purrr)
library(dplyr)
files <- dir(path = ".", pattern = "\.pdf$", full.names = TRUE, recursive = TRUE)
mdata <- list()
for(i in files){
mdata[[i]] <- extract_tables(i)
}
col_names_list <- lapply(mdata[[1]], function(x) x[1,]) # we extract the first row (colnames)
data <- lapply(mdata[[1]], function(x) as.data.frame(x[-1, ]))
data <- map2(mdata, col_names_list, function(x,y) {colnames(x)[0] <- y[0]
x})
searchterms <-c('malaria')#, 'cases')
pattern <- paste(searchterms, collapse = "|")
mdata %>%
map(function(x) x[grepl(pattern, x[,1], ignore.case = TRUE),, drop = FALSE])-> df2
m1<-df2[sapply(df2, nrow)>0] #removes obs=0
super-difficult 拥有 PDF table 提取的通用解决方案(即使对于来自同一机构的 PDF)。
要从示例文档中获取可用形式的 table 3.1(Gdocs 也是共享 PDF 的不错方式),我会这样做:
library(tabulizer)
fil <- "~/Downloads/GBD2016_1_0915 Gambia HIS SERVICE STATISTICS REPORT 2005.pdf"
(table_3_1 <- tabulizer::extract_tables(file = fil, pages = 23)[[1]])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "" "URD" "CRD" "LRD" "NBDE" "NBDW" "WESTERN" "TOTAL"
## [2,] "Total Women Seen" "910" "1964" "749" "1,640" "1143" "8961" "15,367"
## [3,] "Total Men Seen" "936" "613" "530" "687" "150" "2334" "5,250"
## [4,] "Counselled Only" "49" "229" "71" "250" "232" "809" "1640"
## [5,] "Pills" "137" "476" "221" "398" "198" "2857" "4287"
## [6,] "Depo" "286" "725" "405" "456" "511" "3166" "5549"
## [7,] "Condoms" "888" "4247" "1143" "1934" "3300" "11952" "23464"
## [8,] "Foam" "0" "0" "10" "2" "8" "41" "61"
## [9,] "IUCD" "1" "1" "0" "0" "3" "37" "42"
## [10,] "VSC" "0" "0" "0" "0" "0" "0" "0"
## [11,] "Total New" "" "" "" "" "" "" ""
## [12,] "Acceptors" "1312" "5449" "1779" "2790" "1050" "18053" "30433"
table_3_1[1,1] <- "measure" # need to add the colname
(table_3_1 <- as.data.frame(table_3_1[-(11:12),], stringsAsFactors = FALSE))
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 measure URD CRD LRD NBDE NBDW WESTERN TOTAL
## 2 Total Women Seen 910 1964 749 1,640 1143 8961 15,367
## 3 Total Men Seen 936 613 530 687 150 2334 5,250
## 4 Counselled Only 49 229 71 250 232 809 1640
## 5 Pills 137 476 221 398 198 2857 4287
## 6 Depo 286 725 405 456 511 3166 5549
## 7 Condoms 888 4247 1143 1934 3300 11952 23464
## 8 Foam 0 0 10 2 8 41 61
## 9 IUCD 1 1 0 0 3 37 42
## 10 VSC 0 0 0 0 0 0 0
(table_3_1 <- docxtractr::assign_colnames(table_3_1, 1)) # note the docxtractr package dependency
## measure URD CRD LRD NBDE NBDW WESTERN TOTAL
## 1 Total Women Seen 910 1964 749 1,640 1143 8961 15,367
## 2 Total Men Seen 936 613 530 687 150 2334 5,250
## 3 Counselled Only 49 229 71 250 232 809 1640
## 4 Pills 137 476 221 398 198 2857 4287
## 5 Depo 286 725 405 456 511 3166 5549
## 6 Condoms 888 4247 1143 1934 3300 11952 23464
## 7 Foam 0 0 10 2 8 41 61
## 8 IUCD 1 1 0 0 3 37 42
## 9 VSC 0 0 0 0 0 0 0
table_3_1[,2:8] <- lapply(table_3_1[,2:8], readr::parse_number) # note the readr package dependency
str(table_3_1)
## 'data.frame': 9 obs. of 8 variables:
## $ measure: chr "Total Women Seen" "Total Men Seen" "Counselled Only" "Pills" ...
## $ URD : num 910 936 49 137 286 888 0 1 0
## $ CRD : num 1964 613 229 476 725 ...
## $ LRD : num 749 530 71 221 405 ...
## $ NBDE : num 1640 687 250 398 456 ...
## $ NBDW : num 1143 150 232 198 511 ...
## $ WESTERN: num 8961 2334 809 2857 3166 ...
## $ TOTAL : num 15367 5250 1640 4287 5549 ...
类似的习语可用于转换样本 PDF 第 14 页至第 17 页上的 most 中的 table:
pages_14_to_17 <- tabulizer::extract_tables(file = fil, pages = 14:17)
lapply(pages_14_to_17, function(x) {
x[1,1] <- "measure"
x <- as.data.frame(x, stringsAsFactors = FALSE)
x <- docxtractr::assign_colnames(x, 1)
x[,2:ncol(x)] <- lapply(x[,2:ncol(x)], readr::parse_number)
x
}) -> pages_14_to_17
str(pages_14_to_17, 1)
## List of 12
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 0 obs. of 7 variables:
## $ :'data.frame': 9 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 7 obs. of 8 variables:
## $ :'data.frame': 8 obs. of 8 variables:
(没有把所有的数据帧都放到一个answer里保存space).
请注意,列表元素 3 没有行,因为 table 从 14 到 15 换行。没有 "one size fits all" 方法来处理它,这将是脚本中的自定义逻辑来处理返回第 14 页并为其获取 header。
此代码从 PDF 中提取数据表,然后使用 grepl 提取具有特定关键字的数据,在本例中为 'malaria'。它提取行名称,很多错过列标题并放入 NA,我认为是因为长度不同。有没有办法获取标题?
library(tabulizer)
library(purrr)
library(dplyr)
files <- dir(path = ".", pattern = "\.pdf$", full.names = TRUE, recursive = TRUE)
mdata <- list()
for(i in files){
mdata[[i]] <- extract_tables(i)
}
col_names_list <- lapply(mdata[[1]], function(x) x[1,]) # we extract the first row (colnames)
data <- lapply(mdata[[1]], function(x) as.data.frame(x[-1, ]))
data <- map2(mdata, col_names_list, function(x,y) {colnames(x)[0] <- y[0]
x})
searchterms <-c('malaria')#, 'cases')
pattern <- paste(searchterms, collapse = "|")
mdata %>%
map(function(x) x[grepl(pattern, x[,1], ignore.case = TRUE),, drop = FALSE])-> df2
m1<-df2[sapply(df2, nrow)>0] #removes obs=0
super-difficult 拥有 PDF table 提取的通用解决方案(即使对于来自同一机构的 PDF)。
要从示例文档中获取可用形式的 table 3.1(Gdocs 也是共享 PDF 的不错方式),我会这样做:
library(tabulizer)
fil <- "~/Downloads/GBD2016_1_0915 Gambia HIS SERVICE STATISTICS REPORT 2005.pdf"
(table_3_1 <- tabulizer::extract_tables(file = fil, pages = 23)[[1]])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "" "URD" "CRD" "LRD" "NBDE" "NBDW" "WESTERN" "TOTAL"
## [2,] "Total Women Seen" "910" "1964" "749" "1,640" "1143" "8961" "15,367"
## [3,] "Total Men Seen" "936" "613" "530" "687" "150" "2334" "5,250"
## [4,] "Counselled Only" "49" "229" "71" "250" "232" "809" "1640"
## [5,] "Pills" "137" "476" "221" "398" "198" "2857" "4287"
## [6,] "Depo" "286" "725" "405" "456" "511" "3166" "5549"
## [7,] "Condoms" "888" "4247" "1143" "1934" "3300" "11952" "23464"
## [8,] "Foam" "0" "0" "10" "2" "8" "41" "61"
## [9,] "IUCD" "1" "1" "0" "0" "3" "37" "42"
## [10,] "VSC" "0" "0" "0" "0" "0" "0" "0"
## [11,] "Total New" "" "" "" "" "" "" ""
## [12,] "Acceptors" "1312" "5449" "1779" "2790" "1050" "18053" "30433"
table_3_1[1,1] <- "measure" # need to add the colname
(table_3_1 <- as.data.frame(table_3_1[-(11:12),], stringsAsFactors = FALSE))
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 measure URD CRD LRD NBDE NBDW WESTERN TOTAL
## 2 Total Women Seen 910 1964 749 1,640 1143 8961 15,367
## 3 Total Men Seen 936 613 530 687 150 2334 5,250
## 4 Counselled Only 49 229 71 250 232 809 1640
## 5 Pills 137 476 221 398 198 2857 4287
## 6 Depo 286 725 405 456 511 3166 5549
## 7 Condoms 888 4247 1143 1934 3300 11952 23464
## 8 Foam 0 0 10 2 8 41 61
## 9 IUCD 1 1 0 0 3 37 42
## 10 VSC 0 0 0 0 0 0 0
(table_3_1 <- docxtractr::assign_colnames(table_3_1, 1)) # note the docxtractr package dependency
## measure URD CRD LRD NBDE NBDW WESTERN TOTAL
## 1 Total Women Seen 910 1964 749 1,640 1143 8961 15,367
## 2 Total Men Seen 936 613 530 687 150 2334 5,250
## 3 Counselled Only 49 229 71 250 232 809 1640
## 4 Pills 137 476 221 398 198 2857 4287
## 5 Depo 286 725 405 456 511 3166 5549
## 6 Condoms 888 4247 1143 1934 3300 11952 23464
## 7 Foam 0 0 10 2 8 41 61
## 8 IUCD 1 1 0 0 3 37 42
## 9 VSC 0 0 0 0 0 0 0
table_3_1[,2:8] <- lapply(table_3_1[,2:8], readr::parse_number) # note the readr package dependency
str(table_3_1)
## 'data.frame': 9 obs. of 8 variables:
## $ measure: chr "Total Women Seen" "Total Men Seen" "Counselled Only" "Pills" ...
## $ URD : num 910 936 49 137 286 888 0 1 0
## $ CRD : num 1964 613 229 476 725 ...
## $ LRD : num 749 530 71 221 405 ...
## $ NBDE : num 1640 687 250 398 456 ...
## $ NBDW : num 1143 150 232 198 511 ...
## $ WESTERN: num 8961 2334 809 2857 3166 ...
## $ TOTAL : num 15367 5250 1640 4287 5549 ...
类似的习语可用于转换样本 PDF 第 14 页至第 17 页上的 most 中的 table:
pages_14_to_17 <- tabulizer::extract_tables(file = fil, pages = 14:17)
lapply(pages_14_to_17, function(x) {
x[1,1] <- "measure"
x <- as.data.frame(x, stringsAsFactors = FALSE)
x <- docxtractr::assign_colnames(x, 1)
x[,2:ncol(x)] <- lapply(x[,2:ncol(x)], readr::parse_number)
x
}) -> pages_14_to_17
str(pages_14_to_17, 1)
## List of 12
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 0 obs. of 7 variables:
## $ :'data.frame': 9 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 11 obs. of 8 variables:
## $ :'data.frame': 10 obs. of 8 variables:
## $ :'data.frame': 7 obs. of 8 variables:
## $ :'data.frame': 8 obs. of 8 variables:
(没有把所有的数据帧都放到一个answer里保存space).
请注意,列表元素 3 没有行,因为 table 从 14 到 15 换行。没有 "one size fits all" 方法来处理它,这将是脚本中的自定义逻辑来处理返回第 14 页并为其获取 header。