在 R 中使用 pdftools 在字符串后提取特定的 table
Using pdftools in R to extract specific table after a string
我有几个 pdf,我想提取股东 table。如何指定仅提取字符串 'TWENTY LARGEST SHAREHOLDERS' 后出现的 table?
我试过了,但不太确定功能部分。
library("pdftools")
library("tidyverse")
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table <- function(table){
table <- str_split(table, "\n", simplify = TRUE)
table_start <- stringr::str_which(table, "TWENTY LARGEST SHAREHOLDERS")
table <- table[1, (table_start +1 ):(table_end - 1)]
table <- str_replace_all(table, "\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
}
shares <- map_df(raw_text, clean_table)
试试这个。除了一些小问题外,主要的变化是我首先获得了包含所需 table 的页面。顺便说一句:您必须搜索“二十大股东”而不是“二十大股东”。
library(pdftools)
library(tidyverse)
# download pdf
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table1 <- function(raw) {
# Split the single pages
raw <- map(raw, ~ str_split(.x, "\n") %>% unlist())
# Concatenate the splitted pages
raw <- reduce(raw, c)
table_start <- stringr::str_which(tolower(raw), "twenty largest shareholders")
table_end <- stringr::str_which(tolower(raw), "total")
table_end <- table_end[min(which(table_end > table_start))]
table <- raw[(table_start + 3 ):(table_end - 1)]
table <- str_replace_all(table, "\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
data_table
}
shares <- map_df(raw_text, clean_table1)
head(shares)
#> Name Number of Shares
#> 1 J P Morgan Nominees Australia Pty Limited 109,500,852
#> 2 Citicorp Nominees Pty Limited 57,714,777
#> 3 Mr Chris Morris 32,231,000
#> 4 National Nominees Limited 19,355,892
#> 5 Welas Pty Ltd 18,950,000
#> 6 BNP Paribas Nominees Pty Ltd <Agency Lending DRP A/C> 11,520,882
#> Percentage
#> 1 20.17
#> 2 10.63
#> 3 5.94
#> 4 3.56
#> 5 3.49
#> 6 2.12
我有几个 pdf,我想提取股东 table。如何指定仅提取字符串 'TWENTY LARGEST SHAREHOLDERS' 后出现的 table?
我试过了,但不太确定功能部分。
library("pdftools")
library("tidyverse")
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table <- function(table){
table <- str_split(table, "\n", simplify = TRUE)
table_start <- stringr::str_which(table, "TWENTY LARGEST SHAREHOLDERS")
table <- table[1, (table_start +1 ):(table_end - 1)]
table <- str_replace_all(table, "\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
}
shares <- map_df(raw_text, clean_table)
试试这个。除了一些小问题外,主要的变化是我首先获得了包含所需 table 的页面。顺便说一句:您必须搜索“二十大股东”而不是“二十大股东”。
library(pdftools)
library(tidyverse)
# download pdf
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table1 <- function(raw) {
# Split the single pages
raw <- map(raw, ~ str_split(.x, "\n") %>% unlist())
# Concatenate the splitted pages
raw <- reduce(raw, c)
table_start <- stringr::str_which(tolower(raw), "twenty largest shareholders")
table_end <- stringr::str_which(tolower(raw), "total")
table_end <- table_end[min(which(table_end > table_start))]
table <- raw[(table_start + 3 ):(table_end - 1)]
table <- str_replace_all(table, "\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
data_table
}
shares <- map_df(raw_text, clean_table1)
head(shares)
#> Name Number of Shares
#> 1 J P Morgan Nominees Australia Pty Limited 109,500,852
#> 2 Citicorp Nominees Pty Limited 57,714,777
#> 3 Mr Chris Morris 32,231,000
#> 4 National Nominees Limited 19,355,892
#> 5 Welas Pty Ltd 18,950,000
#> 6 BNP Paribas Nominees Pty Ltd <Agency Lending DRP A/C> 11,520,882
#> Percentage
#> 1 20.17
#> 2 10.63
#> 3 5.94
#> 4 3.56
#> 5 3.49
#> 6 2.12