使用 R 的 'rvest' 包抓取在线报纸数据
Online newspaper data scraping with R, 'rvest' package
我的课程任务是从新闻媒体中抓取数据并进行分析。
这是我第一次使用 R 进行抓取,我在获取数据、检查各种指南方面陷入了数周的困境,所有这些都以有限的输出或错误告终。
首先,我尝试了 Analyticsvidhya 的指南,这是我获得的最清晰的代码。我开始只从报纸的档案中抓取一页:
library('rvest')
library('xml2')
library(dplyr)
# Archive page for a single day.
url <- "https://en.trend.az/archive/2021-11-03"
library("rvest")
# Download and parse the page.
html <- read_html(url)
# Select the headline nodes; SelectorGadget reports 144 articles for this
# selector.
headline_html <- html %>%
  html_nodes(".category-article .article-title")
headline <- headline_html %>%
  html_text()
# print(headline)
# Number of headlines actually extracted.
length(headline)
我已经为其他 CSS 选择器尝试过类似的代码,但我无法获得超过 9 个结果。
我认为问题可能出在 URL,因此决定从存档中覆盖几天的一组子页面中抓取。
这是根据另一份在线教程编写的代码:
# Scrape the archive page of every day in the range and stack the per-day
# results into a single data frame `all_df`.
# NOTE: str_c() and str_trim() come from stringr, which has to be attached
# (library(stringr)) before this snippet runs.
arch_date <- seq(as.Date("2021-11-03"), as.Date("2021-11-13"), by = "days")
# One slot per day; the slots are bound together once after the loop instead
# of growing a data frame on every iteration.
daily_df <- vector("list", length(arch_date))
# Iterate over the positions of arch_date. The previous `for (i in 'rchdate')`
# looped once over a literal string and never touched the dates.
for (i in seq_along(arch_date)) {
  # read_html() accepts exactly one URL. Passing the URLs of all days at once
  # is what raised "`x` must be a string of length 1", so only the i-th date
  # is used here.
  url_fonq <- str_c(
    "https://en.trend.az", "/archive/", as.character(arch_date[i])
  )
  webpage_fonq <- read_html(url_fonq)
  headline_html <- html_nodes(
    webpage_fonq, ".category-article .article-title"
  )
  # Headline text with surrounding white space removed.
  headline <- str_trim(html_text(headline_html))
  # ... (similar commands for the other nodes are omitted here)
  # row_number, date, time and cat are expected to be produced by the
  # omitted commands above.
  fonq.df <- data.frame(
    Num = row_number,
    Date = date,
    Time = time,
    Title = headline,
    Category = cat
  )
  daily_df[[i]] <- fonq.df
}
all_df <- bind_rows(daily_df)
这是一个我无法修复的错误:
Error: `x` must be a string of length 1
7. stop("`x` must be a string of length 1", call. = FALSE)
6. read_xml.character(x, encoding = encoding, ..., as_html = TRUE, options = options)
5. read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options)
4. withCallingHandlers(expr, warning = function(w) if (inherits(w, classes)) tryInvokeRestart("muffleWarning"))
3. suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
2. read_html.default(url_fonq)
- read_html(url_fonq)
此外,我还尝试了 DataCamp 那份更详细但讲解比较含糊的初学者指南,结果同样以未能解决的错误告终。
# Archive page for a single day.
url <- 'https://en.trend.az/archive/2021-11-03'
# Despite its name, headline_html holds the whole parsed page returned by
# read_html(), not a set of headline nodes.
headline_html <- read_html(url)
# Extract the headline text of every article on a parsed page.
#
# html: parsed page to search.
# Returns the trimmed headline strings, flattened with unlist().
get_headline <- function(html) {
  # Nodes matching the headline selector.
  title_nodes <- html_nodes(html, ".category-article .article-title")
  # Node text with surrounding white space removed.
  titles <- str_trim(html_text(title_nodes))
  unlist(titles)
}
...(此处省略其他节点的类似命令)
# Build a table of headlines and timestamps for one parsed archive page.
#
# html:         parsed page handed to get_headline() and get_time().
# company_name: value written into the Trend.AZ column of every row.
# Returns a tibble with the columns Trend.AZ, Abstract and Date.
get_data_table <- function(html, company_name) {
  headline <- get_headline(html)
  time <- get_time(html)
  # The tibble used to be stored as `combine_data` but read back as
  # `combined_data`, which fails with "object 'combined_data' not found".
  # A single name is used for both now.
  combined_data <- tibble(
    Abstract = headline,
    Date = time
  )
  combined_data %>%
    mutate(Trend.AZ = company_name) %>%
    select(Trend.AZ, Abstract, Date)
}
# Download one page and convert it into a data table.
#
# url:          address passed to read_html().
# company_name: forwarded unchanged to get_data_table().
# Returns whatever get_data_table() returns for the parsed page.
get_data_from_url <- function(url, company_name) {
  page <- read_html(url)
  get_data_table(page, company_name)
}
# Scrape the daily archive pages from 2021-10-01 through 2021-11-01 and write
# the combined rows to "<company_name>.tsv".
#
# url:          overwritten with the site root on the first line of the body,
#               so the value supplied by the caller is not used.
# company_name: forwarded to get_data_from_url() and used as the file stem.
scrape_write_table <- function(url, company_name) {
  url <- "https://en.trend.az"
  days <- seq(as.Date("2021-10-01"), as.Date("2021-11-01"), by = "days")
  # One archive URL per day.
  day_urls <- str_c(url, "/archive/", days)
  # Scrape each day, then stack the per-day tables.
  per_day <- map(day_urls, get_data_from_url, company_name)
  combined <- bind_rows(per_day)
  write_tsv(combined, str_c(company_name, ".tsv"))
}
# Run the scraper; it writes its result to str_c(company_name, ".tsv").
scrape_write_table(url, "Trend.AZ")
# !!!The error was after here!!!
# Read back the file that was actually written ("Trend.AZ.tsv", not
# "Trend.AZ") and inspect the table that was just loaded (the previous
# `amz_tbl` was never defined).
trend_az_tbl <- read_tsv("Trend.AZ.tsv")
tail(trend_az_tbl, 11)
错误:
Error in html_elements(...) : object 'tmp' not found
15. html_elements(...)
14. html_nodes(., ".category-article .article-date")
13. `*tmp*` %>% html_nodes(".category-article .article-date")
12. get_time(html)
11. get_data_table(html, company_name)
10. .f(.x[[i]], ...)
9. map(., get_data_from_url, company_name)
8. list2(...)
7. bind_rows(.)
6. is.data.frame(x)
5. stopifnot(is.data.frame(x))
4. write_delim(x, file, delim = "\t", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress)
3. write_tsv(., str_c(company_name, ".tsv"))
2. list_of_url %>% map(get_data_from_url, company_name) %>% bind_rows() %>% write_tsv(str_c(company_name, ".tsv"))
- scrape_write_table(url, "Trend.AZ")
对于这 3 个代码中的任何一个的任何评论或建议,我将非常感谢。
我真的很急于转到项目的分析部分,以便能够在课程结束时生成报告。
网页是动态加载的,向下滚动时才会加载新文章。因此,您需要同时使用 RSelenium 和 rvest 来提取所需的数据。
启动浏览器
library(rvest)
library(RSelenium)
# Archive page to open in the browser.
url <- "https://en.trend.az/archive/2021-11-02"
# Start a Selenium-driven Firefox session and take its client handle.
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
remDr$navigate(url)
# Click on the page heading (an empty area) before sending keys.
remDr$findElement(
  using = "xpath",
  value = "/html/body/div[1]/div/div[1]/h1"
)$clickElement()
webElem <- remDr$findElement("css", "body")
# Press End 17 times, pausing 2 seconds before each press, so that the
# remaining articles get loaded.
for (i in seq_len(17)) {
  Sys.sleep(2)
  webElem$sendKeysToElement(list(key = "end"))
}
获取文章标题
# Parse the page source held by the browser and extract the text of every
# article title.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-title") %>%
  html_text()
[1] "Chelsea defeats Malmö with minimum score"
[2] "Iran’s import of COVID-19 vaccine exceeds 146mn doses: IRICA"
[3] "Sadyr Zhaparov, Fumio Kishida discuss topical issues of Kyrgyz-Japanese relations"
[4] "We will definitely see new names at World Championships and World Age Group Competitions in Trampoline Gymnastics in Baku - Farid Gayibov"
[5] "Declaration on forest protection, land use adopted by 105 countries"
[6] "Russian Security Council's chief, CIA director meet in Moscow"
[7] "Israel to exhibit for 1st time at Dubai Airshow"
[8] "Azerbaijan's General Prosecutor's Office continues to take measures on appeal against Armenia"
[9] "Azerbaijani, Russian FMs discuss activity of working group for restoration of communications in South Caucasus"
[10] "Russia holds tenth meeting of joint Azerbaijani-Russian Demarcation Commission"
[11] "Only external reasons cause inflation in Azerbaijan - Gazprombank"
[12] "State Oil Fund of Azerbaijan launches tender for technical vendor support"
获取文章链接
# Collect the link nodes of all articles from the page source held by the
# browser.
lin <- remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-news-wrapper") %>%
  html_nodes(".article-link")
获取文章类别、日期和时间
# Extract the raw meta text (category, date and time) of every article from
# the page source held by the browser.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-meta") %>%
  html_text()
[1] "\n Other News\n 2 November 23:55\n "
[2] "\n Society\n 2 November 23:14\n "
[3] "\n Kyrgyzstan\n 2 November 22:55\n "
[4] "\n Society\n 2 November 22:51\n "
[5] "\n Other News\n 2 November 22:26\n "
[6] "\n Russia\n 2 November 21:50\n "
[7] "\n Israel\n 2 November 21:24\n "
[8] "\n Politics\n 2 November 20:50\n "
[9] "\n Politics\n 2 November 20:25\n "
[10] "\n Politics\n 2 November 20:16\n "
我的课程任务是从新闻媒体中抓取数据并进行分析。 这是我第一次使用 R 进行抓取,我在获取数据、检查各种指南方面陷入了数周的困境,所有这些都以有限的输出或错误告终。
首先,我尝试了 Analyticsvidhya 的指南,这是我获得的最清晰的代码。我开始只从报纸的档案中抓取一页:
library('rvest')
library('xml2')
library(dplyr)
# Archive page for a single day.
url <- "https://en.trend.az/archive/2021-11-03"
library("rvest")
# Download and parse the page.
html <- read_html(url)
# Select the headline nodes; SelectorGadget reports 144 articles for this
# selector.
headline_html <- html %>%
  html_nodes(".category-article .article-title")
headline <- headline_html %>%
  html_text()
# print(headline)
# Number of headlines actually extracted.
length(headline)
我已经为其他 CSS 选择器尝试过类似的代码,但我无法获得超过 9 个结果。
我认为问题可能出在 URL,因此决定从存档中覆盖几天的一组子页面中抓取。
这是根据另一份在线教程编写的代码:
# Scrape the archive page of every day in the range and stack the per-day
# results into a single data frame `all_df`.
# NOTE: str_c() and str_trim() come from stringr, which has to be attached
# (library(stringr)) before this snippet runs.
arch_date <- seq(as.Date("2021-11-03"), as.Date("2021-11-13"), by = "days")
# One slot per day; the slots are bound together once after the loop instead
# of growing a data frame on every iteration.
daily_df <- vector("list", length(arch_date))
# Iterate over the positions of arch_date. The previous `for (i in 'rchdate')`
# looped once over a literal string and never touched the dates.
for (i in seq_along(arch_date)) {
  # read_html() accepts exactly one URL. Passing the URLs of all days at once
  # is what raised "`x` must be a string of length 1", so only the i-th date
  # is used here.
  url_fonq <- str_c(
    "https://en.trend.az", "/archive/", as.character(arch_date[i])
  )
  webpage_fonq <- read_html(url_fonq)
  headline_html <- html_nodes(
    webpage_fonq, ".category-article .article-title"
  )
  # Headline text with surrounding white space removed.
  headline <- str_trim(html_text(headline_html))
  # ... (similar commands for the other nodes are omitted here)
  # row_number, date, time and cat are expected to be produced by the
  # omitted commands above.
  fonq.df <- data.frame(
    Num = row_number,
    Date = date,
    Time = time,
    Title = headline,
    Category = cat
  )
  daily_df[[i]] <- fonq.df
}
all_df <- bind_rows(daily_df)
这是一个我无法修复的错误:
Error: `x` must be a string of length 1
7. stop("`x` must be a string of length 1", call. = FALSE)
6. read_xml.character(x, encoding = encoding, ..., as_html = TRUE, options = options)
5. read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options)
4. withCallingHandlers(expr, warning = function(w) if (inherits(w, classes)) tryInvokeRestart("muffleWarning"))
3. suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
2. read_html.default(url_fonq)
- read_html(url_fonq)
此外,我还尝试了 DataCamp 那份更详细但讲解比较含糊的初学者指南,结果同样以未能解决的错误告终。
# Archive page for a single day.
url <- 'https://en.trend.az/archive/2021-11-03'
# Despite its name, headline_html holds the whole parsed page returned by
# read_html(), not a set of headline nodes.
headline_html <- read_html(url)
# Extract the headline text of every article on a parsed page.
#
# html: parsed page to search.
# Returns the trimmed headline strings, flattened with unlist().
get_headline <- function(html) {
  # Nodes matching the headline selector.
  title_nodes <- html_nodes(html, ".category-article .article-title")
  # Node text with surrounding white space removed.
  titles <- str_trim(html_text(title_nodes))
  unlist(titles)
}
...(此处省略其他节点的类似命令)
# Build a table of headlines and timestamps for one parsed archive page.
#
# html:         parsed page handed to get_headline() and get_time().
# company_name: value written into the Trend.AZ column of every row.
# Returns a tibble with the columns Trend.AZ, Abstract and Date.
get_data_table <- function(html, company_name) {
  headline <- get_headline(html)
  time <- get_time(html)
  # The tibble used to be stored as `combine_data` but read back as
  # `combined_data`, which fails with "object 'combined_data' not found".
  # A single name is used for both now.
  combined_data <- tibble(
    Abstract = headline,
    Date = time
  )
  combined_data %>%
    mutate(Trend.AZ = company_name) %>%
    select(Trend.AZ, Abstract, Date)
}
# Download one page and convert it into a data table.
#
# url:          address passed to read_html().
# company_name: forwarded unchanged to get_data_table().
# Returns whatever get_data_table() returns for the parsed page.
get_data_from_url <- function(url, company_name) {
  page <- read_html(url)
  get_data_table(page, company_name)
}
# Scrape the daily archive pages from 2021-10-01 through 2021-11-01 and write
# the combined rows to "<company_name>.tsv".
#
# url:          overwritten with the site root on the first line of the body,
#               so the value supplied by the caller is not used.
# company_name: forwarded to get_data_from_url() and used as the file stem.
scrape_write_table <- function(url, company_name) {
  url <- "https://en.trend.az"
  days <- seq(as.Date("2021-10-01"), as.Date("2021-11-01"), by = "days")
  # One archive URL per day.
  day_urls <- str_c(url, "/archive/", days)
  # Scrape each day, then stack the per-day tables.
  per_day <- map(day_urls, get_data_from_url, company_name)
  combined <- bind_rows(per_day)
  write_tsv(combined, str_c(company_name, ".tsv"))
}
# Run the scraper; it writes its result to str_c(company_name, ".tsv").
scrape_write_table(url, "Trend.AZ")
# !!!The error was after here!!!
# Read back the file that was actually written ("Trend.AZ.tsv", not
# "Trend.AZ") and inspect the table that was just loaded (the previous
# `amz_tbl` was never defined).
trend_az_tbl <- read_tsv("Trend.AZ.tsv")
tail(trend_az_tbl, 11)
错误:
Error in html_elements(...) : object 'tmp' not found
15. html_elements(...)
14. html_nodes(., ".category-article .article-date")
13. `*tmp*` %>% html_nodes(".category-article .article-date")
12. get_time(html)
11. get_data_table(html, company_name)
10. .f(.x[[i]], ...)
9. map(., get_data_from_url, company_name)
8. list2(...)
7. bind_rows(.)
6. is.data.frame(x)
5. stopifnot(is.data.frame(x))
4. write_delim(x, file, delim = "\t", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress)
3. write_tsv(., str_c(company_name, ".tsv"))
2. list_of_url %>% map(get_data_from_url, company_name) %>% bind_rows() %>% write_tsv(str_c(company_name, ".tsv"))
- scrape_write_table(url, "Trend.AZ")
对于这 3 个代码中的任何一个的任何评论或建议,我将非常感谢。 我真的很急于转到项目的分析部分,以便能够在课程结束时生成报告。
网页是动态加载的,向下滚动时才会加载新文章。因此,您需要同时使用 RSelenium 和 rvest 来提取所需的数据。
启动浏览器
library(rvest)
library(RSelenium)
# Archive page to open in the browser.
url <- "https://en.trend.az/archive/2021-11-02"
# Start a Selenium-driven Firefox session and take its client handle.
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
remDr$navigate(url)
# Click on the page heading (an empty area) before sending keys.
remDr$findElement(
  using = "xpath",
  value = "/html/body/div[1]/div/div[1]/h1"
)$clickElement()
webElem <- remDr$findElement("css", "body")
# Press End 17 times, pausing 2 seconds before each press, so that the
# remaining articles get loaded.
for (i in seq_len(17)) {
  Sys.sleep(2)
  webElem$sendKeysToElement(list(key = "end"))
}
获取文章标题
# Parse the page source held by the browser and extract the text of every
# article title.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-title") %>%
  html_text()
[1] "Chelsea defeats Malmö with minimum score"
[2] "Iran’s import of COVID-19 vaccine exceeds 146mn doses: IRICA"
[3] "Sadyr Zhaparov, Fumio Kishida discuss topical issues of Kyrgyz-Japanese relations"
[4] "We will definitely see new names at World Championships and World Age Group Competitions in Trampoline Gymnastics in Baku - Farid Gayibov"
[5] "Declaration on forest protection, land use adopted by 105 countries"
[6] "Russian Security Council's chief, CIA director meet in Moscow"
[7] "Israel to exhibit for 1st time at Dubai Airshow"
[8] "Azerbaijan's General Prosecutor's Office continues to take measures on appeal against Armenia"
[9] "Azerbaijani, Russian FMs discuss activity of working group for restoration of communications in South Caucasus"
[10] "Russia holds tenth meeting of joint Azerbaijani-Russian Demarcation Commission"
[11] "Only external reasons cause inflation in Azerbaijan - Gazprombank"
[12] "State Oil Fund of Azerbaijan launches tender for technical vendor support"
获取文章链接
# Collect the link nodes of all articles from the page source held by the
# browser.
lin <- remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-news-wrapper") %>%
  html_nodes(".article-link")
获取文章类别、日期和时间
# Extract the raw meta text (category, date and time) of every article from
# the page source held by the browser.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-meta") %>%
  html_text()
[1] "\n Other News\n 2 November 23:55\n "
[2] "\n Society\n 2 November 23:14\n "
[3] "\n Kyrgyzstan\n 2 November 22:55\n "
[4] "\n Society\n 2 November 22:51\n "
[5] "\n Other News\n 2 November 22:26\n "
[6] "\n Russia\n 2 November 21:50\n "
[7] "\n Israel\n 2 November 21:24\n "
[8] "\n Politics\n 2 November 20:50\n "
[9] "\n Politics\n 2 November 20:25\n "
[10] "\n Politics\n 2 November 20:16\n "