使用 R 在 html_table 中的每个单元格中提取链接而不是字符
Extraction of links instead of characters within each cell in a html_table using R
我尝试使用脚本在 R 中使用 rvest 包提取多个 html_table:
library(rvest)
library(dplyr)
library('xml2')
library(tidyverse)
# Build the URLs to scrape: .../horse/foals/1 and .../horse/foals/2.
jump <- seq(1, 2, by = 1)
urls <- paste0("https://asbdavani.org/horse/foals/", jump)

# html_table() on a whole document returns a *list* of tables, so the results
# are collected in a list (one slot per URL) rather than a character vector.
out <- vector("list", length = length(urls))
for (i in seq_along(urls)) {
  derby <- read_html(urls[i], encoding = "UTF-8")
  tables <- derby %>%
    html_table(fill = TRUE)
  # Keep the first table of the page. Using `[[<-` with a single table avoids
  # the "number of items to replace" warning that `out[i] <- tables` raises
  # when a page holds more than one table (which silently kept only the first).
  out[[i]] <- tables[[1]]
}

# The table scraped from the first URL.
first_table <- out[[1]]
在这里,我将其中一个表提取为 first_table:
我想知道如何获取第 2、6 和 7 列中每个单元格文本所对应的链接,如下所示:
这是一个相当不整洁的实现,但原则上是可行的。你当然可以让它更简洁,我没有根据你的确切示例重新定位和重命名列。
library(rvest)
library(dplyr)
library('xml2')
library(tidyverse)
# Build the URLs to scrape: .../horse/foals/1 and .../horse/foals/2.
jump <- seq(1, 2, by = 1)
urls <- paste0("https://asbdavani.org/horse/foals/", jump)

# One slot per page; the pieces are row-bound once after the loop instead of
# re-copying a growing data frame on every iteration.
pages <- vector("list", length(urls))
for (i in seq_along(urls)) {
  html <- read_html(urls[i], encoding = "UTF-8")

  # href attribute of every child element of every <td>
  # (NA for children that carry no href).
  derby <- html %>%
    html_elements("td") %>%
    html_children() %>%
    html_attr("href")

  # Reshape the flat href vector into one row of three links per table row.
  # NOTE(review): assumes exactly three href-bearing cells per row, so that
  # length(derby) is a multiple of 3 -- confirm against the page layout.
  links <- matrix(
    derby,
    nrow = length(derby) / 3,
    ncol = 3,
    byrow = TRUE
  ) %>%
    as.data.frame()

  # Append the three link columns to the parsed table(s) of this page.
  combined <- html %>%
    html_table(fill = TRUE) %>%
    bind_cols(., tibble(
      اسب = links$V1,
      سیلمی = links$V2,
      مادیان = links$V3
    ))

  pages[[i]] <- combined
}

# Seeding with an empty data.frame() keeps the result a plain data.frame,
# as in the incremental version.
out <- bind_rows(data.frame(), pages)
这是一个棘手的问题,因为您需要选取表格中的特定列。XPath 选择器或 CSS 的 “nth-child” 选择器提供了这种能力。
为了简化解释,下面的代码只针对单个页面上的单个表格演示解决方案。把它复制并粘贴到您的代码中应该相对容易。
# Fetch and parse a single foals page.
url <- "https://asbdavani.org/horse/foals/6404"
page <- read_html(url)

# Collect every <table> node found on the page.
tables <- html_elements(page, "table")

# Work on the second table and split it into its <tr> rows.
rows <- html_elements(tables[2], "tr")

# The first row is the heading, so only the remaining rows are used.
data_rows <- rows[-1]

# For each remaining row: locate the <a> tag inside the 2nd cell and read its
# href. html_element() (singular) yields one result per row, so a row without
# a link in that cell gives NA.
col2Links <- html_attr(html_element(data_rows, "td:nth-child(2) a"), "href")

# Same extraction for the 6th and 7th cells.
col6Links <- html_attr(html_element(data_rows, "td:nth-child(6) a"), "href")
col7Links <- html_attr(html_element(data_rows, "td:nth-child(7) a"), "href")

# The hrefs are site-relative; prefix them with "https://asbdavani.org" to get
# full URLs, e.g.:
# paste0("https://asbdavani.org", col2Links)

# Assemble the three link vectors into one data frame and print it.
answer <- data.frame(
  col2Links = col2Links,
  col6Links = col6Links,
  col7Links = col7Links
)
answer
col2Links col6Links col7Links
1 /horse/performance/13993 /horse/performance/6404 <NA>
2 /horse/performance/13873 /horse/performance/6404 <NA>
3 /horse/performance/533 /horse/performance/6404 /horse/performance/10958
4 /horse/performance/5277 /horse/performance/6404 /horse/performance/11051
5 /horse/performance/5461 /horse/performance/6404 /horse/performance/11049
6 /horse/performance/5602 /horse/performance/6404 /horse/performance/11084
7 /horse/performance/6466 /horse/performance/6404 /horse/performance/11097
8 /horse/performance/11004 /horse/performance/6404 /horse/performance/10994
9 /horse/performance/11113 /horse/performance/6404 /horse/performance/11097
10 /horse/performance/11114 /horse/performance/6404 /horse/performance/11097
11 /horse/performance/11126 /horse/performance/6404 /horse/performance/11119
我尝试使用脚本在 R 中使用 rvest 包提取多个 html_table:
library(rvest)
library(dplyr)
library('xml2')
library(tidyverse)
# Build the URLs to scrape: .../horse/foals/1 and .../horse/foals/2.
jump <- seq(1, 2, by = 1)
urls <- paste0("https://asbdavani.org/horse/foals/", jump)

# html_table() on a whole document returns a *list* of tables, so the results
# are collected in a list (one slot per URL) rather than a character vector.
out <- vector("list", length = length(urls))
for (i in seq_along(urls)) {
  derby <- read_html(urls[i], encoding = "UTF-8")
  tables <- derby %>%
    html_table(fill = TRUE)
  # Keep the first table of the page. Using `[[<-` with a single table avoids
  # the "number of items to replace" warning that `out[i] <- tables` raises
  # when a page holds more than one table (which silently kept only the first).
  out[[i]] <- tables[[1]]
}

# The table scraped from the first URL.
first_table <- out[[1]]
在这里,我将其中一个表提取为 first_table:
我想知道如何获取第 2、6 和 7 列中每个单元格文本所对应的链接,如下所示:
这是一个相当不整洁的实现,但原则上是可行的。你当然可以让它更简洁,我没有根据你的确切示例重新定位和重命名列。
library(rvest)
library(dplyr)
library('xml2')
library(tidyverse)
# Build the URLs to scrape: .../horse/foals/1 and .../horse/foals/2.
jump <- seq(1, 2, by = 1)
urls <- paste0("https://asbdavani.org/horse/foals/", jump)

# One slot per page; the pieces are row-bound once after the loop instead of
# re-copying a growing data frame on every iteration.
pages <- vector("list", length(urls))
for (i in seq_along(urls)) {
  html <- read_html(urls[i], encoding = "UTF-8")

  # href attribute of every child element of every <td>
  # (NA for children that carry no href).
  derby <- html %>%
    html_elements("td") %>%
    html_children() %>%
    html_attr("href")

  # Reshape the flat href vector into one row of three links per table row.
  # NOTE(review): assumes exactly three href-bearing cells per row, so that
  # length(derby) is a multiple of 3 -- confirm against the page layout.
  links <- matrix(
    derby,
    nrow = length(derby) / 3,
    ncol = 3,
    byrow = TRUE
  ) %>%
    as.data.frame()

  # Append the three link columns to the parsed table(s) of this page.
  combined <- html %>%
    html_table(fill = TRUE) %>%
    bind_cols(., tibble(
      اسب = links$V1,
      سیلمی = links$V2,
      مادیان = links$V3
    ))

  pages[[i]] <- combined
}

# Seeding with an empty data.frame() keeps the result a plain data.frame,
# as in the incremental version.
out <- bind_rows(data.frame(), pages)
这是一个棘手的问题,因为您需要选取表格中的特定列。XPath 选择器或 CSS 的 “nth-child” 选择器提供了这种能力。
为了简化解释,下面的代码只针对单个页面上的单个表格演示解决方案。把它复制并粘贴到您的代码中应该相对容易。
# Fetch and parse a single foals page.
url <- "https://asbdavani.org/horse/foals/6404"
page <- read_html(url)

# Collect every <table> node found on the page.
tables <- html_elements(page, "table")

# Work on the second table and split it into its <tr> rows.
rows <- html_elements(tables[2], "tr")

# The first row is the heading, so only the remaining rows are used.
data_rows <- rows[-1]

# For each remaining row: locate the <a> tag inside the 2nd cell and read its
# href. html_element() (singular) yields one result per row, so a row without
# a link in that cell gives NA.
col2Links <- html_attr(html_element(data_rows, "td:nth-child(2) a"), "href")

# Same extraction for the 6th and 7th cells.
col6Links <- html_attr(html_element(data_rows, "td:nth-child(6) a"), "href")
col7Links <- html_attr(html_element(data_rows, "td:nth-child(7) a"), "href")

# The hrefs are site-relative; prefix them with "https://asbdavani.org" to get
# full URLs, e.g.:
# paste0("https://asbdavani.org", col2Links)

# Assemble the three link vectors into one data frame and print it.
answer <- data.frame(
  col2Links = col2Links,
  col6Links = col6Links,
  col7Links = col7Links
)
answer
col2Links col6Links col7Links
1 /horse/performance/13993 /horse/performance/6404 <NA>
2 /horse/performance/13873 /horse/performance/6404 <NA>
3 /horse/performance/533 /horse/performance/6404 /horse/performance/10958
4 /horse/performance/5277 /horse/performance/6404 /horse/performance/11051
5 /horse/performance/5461 /horse/performance/6404 /horse/performance/11049
6 /horse/performance/5602 /horse/performance/6404 /horse/performance/11084
7 /horse/performance/6466 /horse/performance/6404 /horse/performance/11097
8 /horse/performance/11004 /horse/performance/6404 /horse/performance/10994
9 /horse/performance/11113 /horse/performance/6404 /horse/performance/11097
10 /horse/performance/11114 /horse/performance/6404 /horse/performance/11097
11 /horse/performance/11126 /horse/performance/6404 /horse/performance/11119