R function is looping over the same data in webscraper
Here is the program I wrote:
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# Getting the number of pages
getPageNumber <- function(URL){
  parsedDocument = read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pageNumbers al-pageNumbers")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}
# Getting all articles based off of their DOI
getAllArticles <- function(URL){
  parsedDocument = read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber = trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}
Title <- function(parsedDocument){
  Sort1 <- html_nodes(parsedDocument, 'h1')
  Title <- gsub("<h1>\n|\n</h1>", "", Sort1)
  return(Title)
}
# main function with input as parameter year
findURL <- function(year_chosen){
  if(year_chosen >= 1994){
    noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl = "&fl_SiteID=5275&startpage="
    URL = paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)
    Page2 <- 0
    while(Page < Page2 | Page != Page2){
      Page <- Page2
      URL3 <- paste(URL, Page - 1, sep = "")
      Page2 <- getPageNumber(URL3)
    }
    R_Data <- data.frame()
    for(i in 1:Page){ # 0:Page-1
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for(j in 1:length(URL2)){
        parsedDocument <- read_html(URL2[j])
        print(URL2[j])
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        R_Data <- rbind(R_Data, R)
      }
    }
    paste(URL2)
    suppressWarnings(write.csv(R_Data, "DNAresearch.csv", row.names = FALSE, sep = "\t"))
    # return(R_Data)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}
findURL(2003)
My code's output is as follows:
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
I am trying to scrape a journal, with year as a parameter. I have scraped one page, but when my loop should move on to the next page, it just goes back to the top of the page and loops over the same data. My code should be correct, so I don't understand why this is happening. Thanks in advance.
It is not reading the same URL. You have selected the wrong nodes, which happens to yield duplicated information. As I mentioned in your previous question, you need to rework your Title function. The Title rewrite below extracts the actual article title based on the class name and a single-node match. Note that your sep arg has been removed. Some other areas of the code look like they could be simplified in terms of logic.
Title function:
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\r\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}
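To see why the single-node class match matters, here is a minimal, self-contained sketch (the two-heading HTML snippet is hypothetical, not taken from the journal page): html_nodes('h1') collects every h1 element on the page, which is what produced your duplicate rows, while html_node('.article-title-main') returns exactly one node.

library(rvest)
# Hypothetical page with two <h1> elements: an article title and a site banner
doc <- read_html('<h1 class="article-title-main">Real title</h1>
                  <h1 class="site-banner">DNA Research</h1>')
html_text(html_nodes(doc, "h1"))                  # both headings
html_text(html_node(doc, ".article-title-main"))  # the article title only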
R:
library(rvest)
library(XML)
library(stringr)

# Getting the number of pages
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}

# Getting all articles based off of their DOI
getAllArticles <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}

Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\r\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}

# main function with input as parameter year
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)
    if (Page == 5) {
      Page2 <- 0
      while (Page < Page2 | Page != Page2) {
        Page <- Page2
        URL3 <- paste(URL, Page - 1, sep = "")
        Page2 <- getPageNumber(URL3)
      }
    }
    R_Data <- data.frame()
    for (i in 1:Page) {
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:length(URL2)) {
        parsedDocument <- read_html(URL2[j])
        # print(URL2[j])
        # print(Title(parsedDocument))
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        # print(R)
        R_Data <- rbind(R_Data, R)
      }
    }
    write.csv(R_Data, "Group4.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}
findURL(2003)
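As an aside on the simplification mentioned above, here is a hedged sketch of how the two nested loops could be flattened. It is not part of the tested answer; it assumes getAllArticles and Title behave as defined, and scrapeTitles is a hypothetical helper name.

# Hypothetical rework of the page/article loops; same output assumed
scrapeTitles <- function(URL, Page) {
  # One vector of DOI URLs per results page, flattened into a single vector
  article_urls <- unlist(lapply(seq_len(Page), function(i) {
    getAllArticles(paste0(URL, i))
  }))
  # One title per article URL
  titles <- vapply(article_urls,
                   function(u) Title(read_html(u)),
                   character(1))
  data.frame(Title = titles, stringsAsFactors = FALSE)
}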