使用 rowspan 值提取 html table
Extracting html table with rowspan values
我使用以下代码创建的数据框(使用 RCurl 和 XML 包)将三个字母团队缩写仅在其跨越的第一行中。我可以添加另一个包或附加代码来将数据保存在正确的列中吗?
library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])
这是一个答案。我冒昧的上了数据
library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)
initial_data =
"https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
readLines(encoding = "UTF-8") %>%
readHTMLTable(stringsAsFactors = F) %>%
`[[`(2) %>%
mutate(ID = 1:n(),
test =
V1 %>%
stri_detect_regex("^[A-Z]{2,3}$"))
variable_names = c("franchise",
"years",
"initial_postseason_appearances")
shifts =
initial_data %>%
filter(test %>% `!`) %>%
setNames(c(variable_names,
"trash",
"ID",
"test"))
team_initial =
initial_data %>%
filter(test) %>%
setNames(c("initial_abbreviation",
variable_names,
"ID",
"test")) %>%
bind_rows(shifts) %>%
arrange(ID) %>%
separate(years, c("start", "end")) %>%
mutate(abbreviation = initial_abbreviation %>% na.locf,
split_postseason_appearances =
initial_postseason_appearances %>%
plyr::mapvalues("–", NA) %>%
stri_split_fixed(", ") )
appearance =
team_initial %>%
select(franchise,
split_postseason_appearances) %>%
unnest(split_postseason_appearances) %>%
mutate(postseason_appearance =
split_postseason_appearances %>%
extract_numeric) %>%
select(-split_postseason_appearances)
team =
team_initial %>%
select(abbreviation,
franchise,
start,
end)
考虑一个 XML 包解决方案,需要使用 xpathSApply() with for
loop and if/then
logic. To capture the row-spanned table records, various XPath string functions are used: string-length(), concat(), and substring():
的各种 XPath 表达式
library(XML)
# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()
# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))
for (i in (1:numberofteams+1)) {
# TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {
code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
} else {
# TR NODES W/O LETTER TEAM ABBREVIATION
code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
//table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
}
}
# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code),
team = unlist(team),
year = unlist(year),
postseason = unlist(postseason))
你其实很接近。您唯一需要做的就是在正确的列和行中获取数据,因为某些行已向左移动。您可以按如下方式实现(在 data.table 和 zoo 包的帮助下):
# your original code
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])
# assigning proper names to the columns
names(playoffs) <- c("shortcode","franchise","years","appearances")
# 1. shift the dat columnwise for the rows in which there is no shortcode
# 2. fill the resulting NA's with the last observation
# 3. only keep the last shortcode when the previous ones are the same
# because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
franchise = shortcode,
years = franchise,
appearances = years)
][, shortcode := na.locf(shortcode)
][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]
试试 htmltab:
install.packages("htmltab")
library(htmltab)
purl <- htmlParse(url_source)
htmltab(purl, which = 2)
我使用以下代码创建的数据框(使用 RCurl 和 XML 包)将三个字母团队缩写仅在其跨越的第一行中。我可以添加另一个包或附加代码来将数据保存在正确的列中吗?
library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])
这是一个答案。我冒昧的上了数据
library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)
initial_data =
"https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
readLines(encoding = "UTF-8") %>%
readHTMLTable(stringsAsFactors = F) %>%
`[[`(2) %>%
mutate(ID = 1:n(),
test =
V1 %>%
stri_detect_regex("^[A-Z]{2,3}$"))
variable_names = c("franchise",
"years",
"initial_postseason_appearances")
shifts =
initial_data %>%
filter(test %>% `!`) %>%
setNames(c(variable_names,
"trash",
"ID",
"test"))
team_initial =
initial_data %>%
filter(test) %>%
setNames(c("initial_abbreviation",
variable_names,
"ID",
"test")) %>%
bind_rows(shifts) %>%
arrange(ID) %>%
separate(years, c("start", "end")) %>%
mutate(abbreviation = initial_abbreviation %>% na.locf,
split_postseason_appearances =
initial_postseason_appearances %>%
plyr::mapvalues("–", NA) %>%
stri_split_fixed(", ") )
appearance =
team_initial %>%
select(franchise,
split_postseason_appearances) %>%
unnest(split_postseason_appearances) %>%
mutate(postseason_appearance =
split_postseason_appearances %>%
extract_numeric) %>%
select(-split_postseason_appearances)
team =
team_initial %>%
select(abbreviation,
franchise,
start,
end)
考虑一个 XML 包解决方案,需要使用 xpathSApply() with for
loop and if/then
logic. To capture the row-spanned table records, various XPath string functions are used: string-length(), concat(), and substring():
library(XML)
# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()
# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))
for (i in (1:numberofteams+1)) {
# TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {
code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
} else {
# TR NODES W/O LETTER TEAM ABBREVIATION
code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
//table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
}
}
# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code),
team = unlist(team),
year = unlist(year),
postseason = unlist(postseason))
你其实很接近。您唯一需要做的就是在正确的列和行中获取数据,因为某些行已向左移动。您可以按如下方式实现(在 data.table 和 zoo 包的帮助下):
# your original code
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])
# assigning proper names to the columns
names(playoffs) <- c("shortcode","franchise","years","appearances")
# 1. shift the dat columnwise for the rows in which there is no shortcode
# 2. fill the resulting NA's with the last observation
# 3. only keep the last shortcode when the previous ones are the same
# because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
franchise = shortcode,
years = franchise,
appearances = years)
][, shortcode := na.locf(shortcode)
][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]
试试 htmltab:
install.packages("htmltab")
library(htmltab)
purl <- htmlParse(url_source)
htmltab(purl, which = 2)