如果没有 table 标签,如何在 R 中使用 html_table 抓取 table?
How can I scrape a table using html_table in R if there is no table tag?
我一直在尝试从yahoo.finance中抓取tables,当我检查并找到所需的部分时,代码中没有table标签。我可以使用 html_text 函数提取数据,但它不适用于 html_table 函数。
Income Statement
[
# Build the financials URL for the second ticker stored in `tic`.
link <- paste0(
  "https://finance.yahoo.com/quote/", tic[2], "/financials?p=", tic[2]
)

# Open a browsing session on that page.
wahis.session <- html_session(link)

# Select the <div> addressed by the XPath below; per the question text,
# this part of the page contains no <table> tag.
p <- html_nodes(
  wahis.session,
  xpath = '//*[@id="Col1-1-Financials-Proxy"]/section/div[3]'
)

# Attempt to parse the selection as a table (the step the question
# reports as not working).
p <- html_table(p, header = FALSE, trim = TRUE, fill = TRUE)
]2
“[
library(rvest)
library(tidyverse)

# Ticker symbol whose financials page is scraped.
tic <- "AAPL"

# Build the financials URL and open a session on it.
link <- "https://finance.yahoo.com/quote/"
link <- paste0(link, tic, "/financials?p=", tic)
wahis.session <- html_session(link)
p <- wahis.session

# Every node with class "fi-row" is treated as one row of the result.
nodes <- p %>% html_nodes(".fi-row")

# Convert each row node into a one-row data frame, then bind them in a
# single call instead of growing `df` with rbind() inside a loop.
rows <- lapply(nodes, function(node) {
  # Cell texts: elements carrying a `title` attribute plus the
  # `data-test='fin-col'` elements of this row.
  cells <- node %>%
    html_nodes("[title],[data-test='fin-col']") %>%
    html_text()
  as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
})
df <- do.call(rbind, rows)

# Pull the date-like strings (d/m/yyyy pattern) out of the container text
# to use as column headers. The backslashes must be doubled inside an R
# string literal, and the page object is `p` (no `p1` is defined here).
matches <- str_match_all(
  p %>% html_node('#Col1-1-Financials-Proxy') %>% html_text(),
  '\\d{1,2}/\\d{1,2}/\\d{4}'
)

# NOTE(review): assumes the number of matched dates plus the two fixed
# labels equals ncol(df) -- confirm against the live page.
headers <- c('Breakdown','TTM', matches[[1]][,1])
names(df) <- headers
我一直在尝试从yahoo.finance中抓取tables,当我检查并找到所需的部分时,代码中没有table标签。我可以使用 html_text 函数提取数据,但它不适用于 html_table 函数。 Income Statement
[
# Build the financials URL for the second ticker stored in `tic`.
link <- paste0(
  "https://finance.yahoo.com/quote/", tic[2], "/financials?p=", tic[2]
)

# Open a browsing session on that page.
wahis.session <- html_session(link)

# Select the <div> addressed by the XPath below; per the question text,
# this part of the page contains no <table> tag.
p <- html_nodes(
  wahis.session,
  xpath = '//*[@id="Col1-1-Financials-Proxy"]/section/div[3]'
)

# Attempt to parse the selection as a table (the step the question
# reports as not working).
p <- html_table(p, header = FALSE, trim = TRUE, fill = TRUE)
]2
library(rvest)
library(tidyverse)

# Ticker symbol whose financials page is scraped.
tic <- "AAPL"

# Build the financials URL and open a session on it.
link <- "https://finance.yahoo.com/quote/"
link <- paste0(link, tic, "/financials?p=", tic)
wahis.session <- html_session(link)
p <- wahis.session

# Every node with class "fi-row" is treated as one row of the result.
nodes <- p %>% html_nodes(".fi-row")

# Convert each row node into a one-row data frame, then bind them in a
# single call instead of growing `df` with rbind() inside a loop.
rows <- lapply(nodes, function(node) {
  # Cell texts: elements carrying a `title` attribute plus the
  # `data-test='fin-col'` elements of this row.
  cells <- node %>%
    html_nodes("[title],[data-test='fin-col']") %>%
    html_text()
  as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
})
df <- do.call(rbind, rows)

# Pull the date-like strings (d/m/yyyy pattern) out of the container text
# to use as column headers. The backslashes must be doubled inside an R
# string literal, and the page object is `p` (no `p1` is defined here).
matches <- str_match_all(
  p %>% html_node('#Col1-1-Financials-Proxy') %>% html_text(),
  '\\d{1,2}/\\d{1,2}/\\d{4}'
)

# NOTE(review): assumes the number of matched dates plus the two fixed
# labels equals ncol(df) -- confirm against the live page.
headers <- c('Breakdown','TTM', matches[[1]][,1])
names(df) <- headers