在 R 中从列表提取 href(部分条目存在缺失值)
Extracting hrefs from a list in R with some missing values
我想把一组 Jekyll 主题的源代码链接和演示链接提取到一个 data.frame 中:
library(rvest)
# Download the Jekyll themes wiki page and select every <li> matched by the
# CSS selector " #wiki-body li".
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")
data <- html_nodes(info, " #wiki-body li")
# Inspect the resulting node set.
data
{xml_nodeset (115)}
[11] <li>Typewriter - (<a href="https://github.com/alixedi/typewriter">source</a>, <a href="http://alixedi.github.io/typewriter">demo</a>)</li>
[12] <li>block-log - (<a href="https://github.com/anandubajith/block-log">source</a>), <a href="https://anandu.net/demo/block-log/">demo</a>)</li>
[13] <li>Otter Pop - (<a href="https://github.com/tybenz/otter-pop">source</a>)</li>
所以我想要一个有 3 列的 data.frame(df),例如
name source demo
Typewriter https://github.com/alixedi/typewriter http://alixedi.github.io/typewriter
我能够把所有 href 提取为一个向量,但是如 [13] 所示,有些站点没有演示链接,因此我遇到了困难
有没有简单的方法可以从数据创建 df?可能使用 purrr 库
# Build one row per <li>: its visible text plus the raw HTML of its first two
# child nodes. Items with a single child yield NA for the second link.
rows <- lapply(seq_along(data), function(i) {
  # Query the children once per item instead of once per column.
  links <- as.character(html_children(data[[i]]))
  data.frame(
    name = html_text(data[i]),
    source = links[1],
    demo = links[2]
  )
})
# Bind all rows in one call rather than growing the data frame in a loop.
data_out <- do.call(rbind, rows)
# Strip the " - (source, demo)" label and the surrounding anchor markup.
data_out$name <- gsub(" - [(]source, demo[)]", "", data_out$name)
data_out$source <- gsub("<a href=\"|\">source</a>", "", data_out$source)
data_out$demo <- gsub("<a href=\"|\">demo</a>", "", data_out$demo)
有demo数据的和没有demo数据的可以用xpath分开两组:
# Partition the list items with XPath: both groups contain the text "source",
# and they differ only in whether the item's text also contains "demo".
withDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and contains(., 'demo')]"
)
withoutDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and not(contains(.,'demo'))]"
)
然后,使用源和演示链接为集合创建数据框:
# Take the href of every child of the items in withDemo and fold the flat
# vector row-wise into two columns.
# NOTE(review): assumes each item has exactly two children (source, then demo).
sourceNdemo <- matrix(
  html_attr(html_children(withDemo), "href"),
  ncol = 2,
  byrow = TRUE
)
# Prepend the item text as the first column, then label the three columns.
sourceNdemo <- data.frame(html_text(withDemo), sourceNdemo)
names(sourceNdemo) <- c("name", "source", "demo")
然后,为只有源数据的那些创建数据框
# href of every child of the items that have no demo link.
source <- html_attr(html_children(withoutDemo), "href")
# Add an all-NA demo column so the columns match sourceNdemo for rbind().
source <- data.frame(
  name = html_text(withoutDemo),
  source = source,
  demo = NA
)
用 rbind 合并两个数据框:
# Stack the two data frames: items with a demo link first, then those without.
allInfo <- do.call(rbind, list(sourceNdemo, source))
"name" 列现在包含 "Jalpc - (source, demo)" 和 "Bitwiser-Material (source, demo)" 等条目。您可以用 sub 去掉多余的部分:
# Drop everything from the optional " - " separator and the opening parenthesis
# to the end of the name, e.g. "Jalpc - (source, demo)" -> "Jalpc".
# Backslashes are doubled because "\s" and "\(" are not valid escapes in an R
# string literal; the regex engine then receives \s and \(.
allInfo$name <- sub("\\s(-\\s)?\\(.+$", "", allInfo$name, perl = TRUE)
下面是一个基于 purrr 的解法:
library(rvest)
library(purrr)
library(dplyr)
# Download the wiki page, then select the <li> elements that are grandchildren
# of a <div class="markdown-body">.
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")
themes <- info %>%
  html_nodes(xpath = ".//div[@class='markdown-body']/*/li")
# Return NA for a zero-length input, otherwise the first element of x.
#
# A plain if/else replaces the scalar ifelse() call: ifelse() is meant for
# vectorised conditions, and with a length-one test it implicitly kept only
# the first element of x. That truncation is now explicit.
zero_to_na <- function(x) {
  if (length(x) == 0) {
    NA
  } else {
    x[[1]]
  }
}
# Build one row per theme. The name is the item's text with the trailing
# " - (source, demo)" label removed; the parenthesis is written "\\(" because
# "\(" is not a valid escape in an R string literal.
# tibble() replaces the deprecated data_frame(). Both link lookups go through
# zero_to_na() so map_chr() always receives exactly one value per item.
df <- tibble(
  name = gsub(" [- ]*\\(.*$", "", html_text(themes)),
  source = map_chr(
    themes,
    ~ zero_to_na(html_attr(html_nodes(., xpath = ".//a[contains(., 'source')]"), "href"))
  ),
  demo = map_chr(
    themes,
    ~ zero_to_na(html_attr(html_nodes(., xpath = ".//a[contains(., 'demo')]"), "href"))
  )
)
glimpse(df)
## Observations: 115
## Variables: 3
## $ name <chr> "Jalpc", "Pixyll", "Jekyll Metro", "Midnight", "Leap Day", "F...
## $ source <chr> "https://github.com/Jack614/jalpc_jekyll_theme", "https://git...
## $ demo <chr> "http://www.jack003.com", "http://pixyll.com/", "http://blog-...
或者:
# Alternative: build a one-row tibble per theme and row-bind the results.
map_df(themes, function(x) {
  source_nodes <- html_nodes(x, xpath = ".//a[contains(., 'source')]")
  demo_nodes <- html_nodes(x, xpath = ".//a[contains(., 'demo')]")
  tibble(
    # "\\(" because "\(" is not a valid escape in an R string literal.
    name = gsub(" [- ]*\\(.*$", "", html_text(x)),
    # zero_to_na() yields a single value (NA when the link is absent), so every
    # theme contributes exactly one row.
    source = zero_to_na(html_attr(source_nodes, "href")),
    demo = zero_to_na(html_attr(demo_nodes, "href"))
  )
})
可以用 gsub/sub 等函数去掉 "name" 中您不想要的任何部分。
我想把一组 Jekyll 主题的源代码链接和演示链接提取到一个 data.frame 中:
library(rvest)
# Download the Jekyll themes wiki page and select every <li> matched by the
# CSS selector " #wiki-body li".
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")
data <- html_nodes(info, " #wiki-body li")
# Inspect the resulting node set.
data
{xml_nodeset (115)}
[11] <li>Typewriter - (<a href="https://github.com/alixedi/typewriter">source</a>, <a href="http://alixedi.github.io/typewriter">demo</a>)</li>
[12] <li>block-log - (<a href="https://github.com/anandubajith/block-log">source</a>), <a href="https://anandu.net/demo/block-log/">demo</a>)</li>
[13] <li>Otter Pop - (<a href="https://github.com/tybenz/otter-pop">source</a>)</li>
所以我想要一个有 3 列的 data.frame(df),例如
name source demo
Typewriter https://github.com/alixedi/typewriter http://alixedi.github.io/typewriter
我能够把所有 href 提取为一个向量,但是如 [13] 所示,有些站点没有演示链接,因此我遇到了困难
有没有简单的方法可以从数据创建 df?可能使用 purrr 库
# Build one row per <li>: its visible text plus the raw HTML of its first two
# child nodes. Items with a single child yield NA for the second link.
rows <- lapply(seq_along(data), function(i) {
  # Query the children once per item instead of once per column.
  links <- as.character(html_children(data[[i]]))
  data.frame(
    name = html_text(data[i]),
    source = links[1],
    demo = links[2]
  )
})
# Bind all rows in one call rather than growing the data frame in a loop.
data_out <- do.call(rbind, rows)
# Strip the " - (source, demo)" label and the surrounding anchor markup.
data_out$name <- gsub(" - [(]source, demo[)]", "", data_out$name)
data_out$source <- gsub("<a href=\"|\">source</a>", "", data_out$source)
data_out$demo <- gsub("<a href=\"|\">demo</a>", "", data_out$demo)
有demo数据的和没有demo数据的可以用xpath分开两组:
# Partition the list items with XPath: both groups contain the text "source",
# and they differ only in whether the item's text also contains "demo".
withDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and contains(., 'demo')]"
)
withoutDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and not(contains(.,'demo'))]"
)
然后,使用源和演示链接为集合创建数据框:
# Take the href of every child of the items in withDemo and fold the flat
# vector row-wise into two columns.
# NOTE(review): assumes each item has exactly two children (source, then demo).
sourceNdemo <- matrix(
  html_attr(html_children(withDemo), "href"),
  ncol = 2,
  byrow = TRUE
)
# Prepend the item text as the first column, then label the three columns.
sourceNdemo <- data.frame(html_text(withDemo), sourceNdemo)
names(sourceNdemo) <- c("name", "source", "demo")
然后,为只有源数据的那些创建数据框
# href of every child of the items that have no demo link.
source <- html_attr(html_children(withoutDemo), "href")
# Add an all-NA demo column so the columns match sourceNdemo for rbind().
source <- data.frame(
  name = html_text(withoutDemo),
  source = source,
  demo = NA
)
用 rbind 合并两个数据框:
# Stack the two data frames: items with a demo link first, then those without.
allInfo <- do.call(rbind, list(sourceNdemo, source))
"name" 列现在包含 "Jalpc - (source, demo)" 和 "Bitwiser-Material (source, demo)" 等条目。您可以用 sub 去掉多余的部分:
# Drop everything from the optional " - " separator and the opening parenthesis
# to the end of the name, e.g. "Jalpc - (source, demo)" -> "Jalpc".
# Backslashes are doubled because "\s" and "\(" are not valid escapes in an R
# string literal; the regex engine then receives \s and \(.
allInfo$name <- sub("\\s(-\\s)?\\(.+$", "", allInfo$name, perl = TRUE)
下面是一个基于 purrr 的解法:
library(rvest)
library(purrr)
library(dplyr)
# Download the wiki page, then select the <li> elements that are grandchildren
# of a <div class="markdown-body">.
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")
themes <- info %>%
  html_nodes(xpath = ".//div[@class='markdown-body']/*/li")
# Return NA for a zero-length input, otherwise the first element of x.
#
# A plain if/else replaces the scalar ifelse() call: ifelse() is meant for
# vectorised conditions, and with a length-one test it implicitly kept only
# the first element of x. That truncation is now explicit.
zero_to_na <- function(x) {
  if (length(x) == 0) {
    NA
  } else {
    x[[1]]
  }
}
# Build one row per theme. The name is the item's text with the trailing
# " - (source, demo)" label removed; the parenthesis is written "\\(" because
# "\(" is not a valid escape in an R string literal.
# tibble() replaces the deprecated data_frame(). Both link lookups go through
# zero_to_na() so map_chr() always receives exactly one value per item.
df <- tibble(
  name = gsub(" [- ]*\\(.*$", "", html_text(themes)),
  source = map_chr(
    themes,
    ~ zero_to_na(html_attr(html_nodes(., xpath = ".//a[contains(., 'source')]"), "href"))
  ),
  demo = map_chr(
    themes,
    ~ zero_to_na(html_attr(html_nodes(., xpath = ".//a[contains(., 'demo')]"), "href"))
  )
)
glimpse(df)
## Observations: 115
## Variables: 3
## $ name <chr> "Jalpc", "Pixyll", "Jekyll Metro", "Midnight", "Leap Day", "F...
## $ source <chr> "https://github.com/Jack614/jalpc_jekyll_theme", "https://git...
## $ demo <chr> "http://www.jack003.com", "http://pixyll.com/", "http://blog-...
或者:
# Alternative: build a one-row tibble per theme and row-bind the results.
map_df(themes, function(x) {
  source_nodes <- html_nodes(x, xpath = ".//a[contains(., 'source')]")
  demo_nodes <- html_nodes(x, xpath = ".//a[contains(., 'demo')]")
  tibble(
    # "\\(" because "\(" is not a valid escape in an R string literal.
    name = gsub(" [- ]*\\(.*$", "", html_text(x)),
    # zero_to_na() yields a single value (NA when the link is absent), so every
    # theme contributes exactly one row.
    source = zero_to_na(html_attr(source_nodes, "href")),
    demo = zero_to_na(html_attr(demo_nodes, "href"))
  )
})
可以用 gsub/sub 等函数去掉 "name" 中您不想要的任何部分。