在 R 中使用 Selenium 包抓取锚定网站
Scrape Anchored Website with Selenium Package in R
我是 R 的新手,无法从福布斯网站提取数据。
我当前的功能是:
url =
http://www.forbes.com/global2000/list/#page:1_sort:0_direction:asc_search:_filter:All%20industries_filter:All%20countries_filter:All%20states
data = readHTMLTable(url)
然而,福布斯网站的链接中带有“#”锚点。我下载了 RSelenium 包来解析我想要的数据,但是我对 RSelenium 不是很了解。
有人对 RSelenium 有建议或经验吗?我如何使用 RSelenium 从 Forbes 提取数据?理想情况下,我想从网站的第 1、2 等页面提取数据。
谢谢!
有点老套,但这是我使用 rvest 和 read.delim 的解决方案:
# Scrape the Forbes Global 2000 list by pulling the text of the
# "#thelist" node and re-parsing it as a tab-delimited table.
# NOTE(review): requires network access; depends on the 2014-era page layout.
library(rvest)

url <- "http://www.forbes.com/global2000/list/#page:1_sort:0_direction:asc_search:_filter:All%20industries_filter:All%20countries_filter:All%20states"

# read_html() replaces the deprecated rvest::html()
a <- read_html(url) %>%
  html_nodes("#thelist") %>%
  html_text()

# Re-read the extracted text as tab-separated data, skipping the
# 12 header lines that precede the table body.
con <- textConnection(a)
df <- read.delim(con, sep = "\t", header = FALSE, skip = 12,
                 stringsAsFactors = FALSE)
close(con)

# Where V1 came out blank, the value landed in V3 instead; pull it back,
# then drop the helper columns and the remaining empty rows.
df$V1[df$V1 == ""] <- df$V3[df$V1 == ""]
df$V2 <- df$V3 <- NULL
df <- subset(df, V1 != "")
df$index <- seq_len(nrow(df))

# Each company occupies 6 consecutive rows; split them out by position.
df2 <- data.frame(
  company      = df$V1[df$index %% 6 == 1],
  country      = df$V1[df$index %% 6 == 2],
  sales        = df$V1[df$index %% 6 == 3],
  profits      = df$V1[df$index %% 6 == 4],
  assets       = df$V1[df$index %% 6 == 5],
  market_value = df$V1[df$index %% 6 == 0],
  stringsAsFactors = FALSE
)
或者使用 API 的另一种方式来填充网页。这会一次性下载所有 2000 家公司。
# Alternative: call the AJAX endpoint the page itself uses to populate
# the table. This downloads all 2000 companies in one request.
# NOTE(review): requires network access.
library(httr)
library(RJSONIO)

url <- "http://www.forbes.com/ajax/load_list/"
query <- list("type" = "organization",
              "uri"  = "global2000",
              "year" = "2014")

response <- httr::GET(url, query = query)
httr::stop_for_status(response)  # fail loudly on HTTP errors

# content(as = "text") is the documented way to read the body as a string
# (as(response, "character") is not part of httr's API).
dat_string <- httr::content(response, as = "text")
dat_list <- RJSONIO::fromJSON(dat_string, asText = TRUE)

# Each list element is one company record; fields are picked by position.
df <- data.frame(
  rank         = sapply(dat_list, "[[", 1),
  company      = sapply(dat_list, "[[", 3),
  country      = sapply(dat_list, "[[", 10),
  sales        = sapply(dat_list, "[[", 6),
  profits      = sapply(dat_list, "[[", 7),
  assets       = sapply(dat_list, "[[", 8),
  market_value = sapply(dat_list, "[[", 9),
  stringsAsFactors = FALSE
)
df <- df[order(df$rank), ]
我是 R 的新手,无法从福布斯网站提取数据。
我当前的功能是:
url =
http://www.forbes.com/global2000/list/#page:1_sort:0_direction:asc_search:_filter:All%20industries_filter:All%20countries_filter:All%20states
data = readHTMLTable(url)
然而,福布斯网站的链接中带有“#”锚点。我下载了 RSelenium 包来解析我想要的数据,但是我对 RSelenium 不是很了解。
有人对 RSelenium 有建议或经验吗?我如何使用 RSelenium 从 Forbes 提取数据?理想情况下,我想从网站的第 1、2 等页面提取数据。
谢谢!
有点老套,但这是我使用 rvest 和 read.delim 的解决方案:
library(rvest)
# Scrape the Forbes Global 2000 list by pulling the text of the
# "#thelist" node and re-parsing it as a tab-delimited table.
# NOTE(review): requires network access; depends on the 2014-era page layout.
url <- "http://www.forbes.com/global2000/list/#page:1_sort:0_direction:asc_search:_filter:All%20industries_filter:All%20countries_filter:All%20states"

# read_html() replaces the deprecated rvest::html()
a <- read_html(url) %>%
  html_nodes("#thelist") %>%
  html_text()

# Re-read the extracted text as tab-separated data, skipping the
# 12 header lines that precede the table body.
con <- textConnection(a)
df <- read.delim(con, sep = "\t", header = FALSE, skip = 12,
                 stringsAsFactors = FALSE)
close(con)

# Where V1 came out blank, the value landed in V3 instead; pull it back,
# then drop the helper columns and the remaining empty rows.
df$V1[df$V1 == ""] <- df$V3[df$V1 == ""]
df$V2 <- df$V3 <- NULL
df <- subset(df, V1 != "")
df$index <- seq_len(nrow(df))

# Each company occupies 6 consecutive rows; split them out by position.
df2 <- data.frame(
  company      = df$V1[df$index %% 6 == 1],
  country      = df$V1[df$index %% 6 == 2],
  sales        = df$V1[df$index %% 6 == 3],
  profits      = df$V1[df$index %% 6 == 4],
  assets       = df$V1[df$index %% 6 == 5],
  market_value = df$V1[df$index %% 6 == 0],
  stringsAsFactors = FALSE
)
或者使用 API 的另一种方式来填充网页。这会一次性下载所有 2000 家公司。
# Alternative: call the AJAX endpoint the page itself uses to populate
# the table. This downloads all 2000 companies in one request.
# NOTE(review): requires network access.
library(httr)
library(RJSONIO)

url <- "http://www.forbes.com/ajax/load_list/"
query <- list("type" = "organization",
              "uri"  = "global2000",
              "year" = "2014")

response <- httr::GET(url, query = query)
httr::stop_for_status(response)  # fail loudly on HTTP errors

# content(as = "text") is the documented way to read the body as a string
# (as(response, "character") is not part of httr's API).
dat_string <- httr::content(response, as = "text")
dat_list <- RJSONIO::fromJSON(dat_string, asText = TRUE)

# Each list element is one company record; fields are picked by position.
df <- data.frame(
  rank         = sapply(dat_list, "[[", 1),
  company      = sapply(dat_list, "[[", 3),
  country      = sapply(dat_list, "[[", 10),
  sales        = sapply(dat_list, "[[", 6),
  profits      = sapply(dat_list, "[[", 7),
  assets       = sapply(dat_list, "[[", 8),
  market_value = sapply(dat_list, "[[", 9),
  stringsAsFactors = FALSE
)
df <- df[order(df$rank), ]