使用 rvest 进行网页抓取失败:检测不到表格
Webscraping Rvest not working, tables not detected
我正在尝试从 https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps 中抓取数据来制作一个数据框,其中包含所有球员的姓名和他们的统计数据(总体评分、位置、pac、sho、pas、dri、def、phy),但是我的rvest 无法将信息检测为 table.
我试过了:
# Asker's first attempt. The loop runs ten times, but the URL is hard-coded to
# page=1 and `i` is never used, so `page` is overwritten with the same document
# on every iteration.
for(i in 1:10) {
page <- read_html(paste("https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps",sep=""))
}
# html_table() is applied to the whole document; according to the surrounding
# text this yields an empty list() for this page.
StatsTable <- page %>%
html_table(fill=TRUE)
head(StatsTable)
这导致打印出 list() 而不是 table。
如何修改这个 for 循环,使 read_html 和 html_table 能够检测到网站上的数据,从而创建包含球员统计数据的数据框?
第一页我也试过这样:
# Asker's second attempt, first page only.
# NOTE(review): `sep=""` is passed to read_html() itself (there is no paste()
# call here), so it presumably has no effect - confirm.
first <- read_html("https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps",sep="")
first
# Select every node carrying the class "padding-0" and extract its text.
tab <- first %>%
html_nodes(".padding-0") %>%
html_text()
tab
### Removes every space, then replaces each \n with a single space
tab <- gsub(" ", "", tab)
tab <- gsub("\n", " ", tab)
tab
这样我就得到了第一页的所有数据,但是所有的信息都被放进了字符串里。是否可以从这些字符串中提取姓名和统计数据,再把它们放入数据框中?
这要怎么实现?
我更新了代码,因此您可以立即将前十个子页面抓取到一个数据框中。请注意,抓取代码来自@Otto_Kässi 回答,所以所有功劳都归于他!!!
library(rvest)
library(stringr)
library(tidyverse)

# URLs of the first ten result pages.
pages <- str_c(
  "https://www.futhead.com/22/players/?page=", 1:10,
  "&level=gold_nif&bin_platform=ps"
)

# Column names of the six per-player stats, in the order they appear on a row.
stat_names <- c("pac", "sho", "pas", "dri", "def", "phy")

# Scrape one result page and return a tibble with one row per player
# (columns: player, overall, pac, sho, pas, dri, def, phy; all character).
scrape_page <- function(page_url) {
  # Download and parse the page once; both extractions reuse the document.
  page <- read_html(page_url)

  player_names <- page %>%
    html_nodes("[class='list-group list-group-table player-group-table']") %>%
    html_nodes("[class='player-info']") %>%
    html_nodes("[class='player-image']") %>%
    html_attr("alt")

  stat_values <- page %>%
    html_nodes("[class='player-right text-center hidden-xs']") %>%
    html_nodes("[class='value']") %>%
    html_text()

  # Every player must contribute exactly one value per stat; otherwise
  # matrix() would recycle or misalign the values without failing.
  if (length(stat_values) != length(player_names) * length(stat_names)) {
    stop("Unexpected number of stat values on ", page_url, call. = FALSE)
  }

  # Naming the matrix columns up front avoids tibble's name-repair warning.
  player_stats <- matrix(
    stat_values,
    ncol = length(stat_names),
    byrow = TRUE,
    dimnames = list(NULL, stat_names)
  )

  bind_cols(
    tibble(
      player = player_names,
      # The last two characters of the alt text are taken as the overall rating.
      overall = str_sub(player_names, -2L, -1L)
    ),
    as_tibble(player_stats)
  )
}

# Build one tibble per page and bind them once, instead of growing a data
# frame with rbind() inside a loop.
df <- pages %>%
  lapply(scrape_page) %>%
  bind_rows() %>%
  mutate(
    # Remove the rating digits embedded in the name and the space they leave.
    player = str_trim(str_replace_all(player, "[:digit:]", "")),
    # Convert every rating column, including `phy`.
    across(overall:phy, as.numeric)
  )
如果您一次性运行整段代码,它应该可以正常工作!
我不认为你可以使用 html_table 完成你想要的。您尝试抓取的页面上的 table 不是 html table 元素。
您会注意到,看起来像表格的部分实际上是 <ul class="list-group list-group-table player-group-table">。因此,您需要改用 html_nodes() 调用来获取您想要的信息,即:
# Player names are read from the `alt` attribute of each player image inside
# the player list.
player_names <- page %>%
  html_nodes("[class='list-group list-group-table player-group-table']") %>%
  html_nodes("[class='player-info']") %>%
  html_nodes("[class='player-image']") %>%
  html_attr("alt")
和
# Collect the text of every stat value and arrange it as one row per player
# with six columns. Relies on `player_names` from the previous step for the
# number of rows.
player_stats <- page %>%
  html_nodes("[class='player-right text-center hidden-xs']") %>%
  html_nodes("[class='value']") %>%
  html_text() %>%
  # TRUE rather than T: T is an ordinary variable and can be reassigned.
  matrix(nrow = length(player_names), ncol = 6, byrow = TRUE)
获取球员位置的一种方法是使用 gsub(),从 class 为 player-club-league-name 的节点中提取 <strong> 和 </strong> 之间的字符串。
# Extract each player's position: the text between <strong> and </strong>
# inside the player-club-league-name node.
positions <- page %>%
  html_nodes("[class='list-group list-group-table player-group-table']") %>%
  html_nodes("[class='player-club-league-name']") %>%
  # Work on the nodes' HTML source so the <strong> tags are visible to gsub().
  as.character() %>%
  # "\\1" is the backreference to the captured group; a single-backslash "\1"
  # is the control character \001 in an R string and would discard the match.
  gsub(".*<strong>(.+)</strong>.*", "\\1", x = .)
最后把所有的东西都做成data.frame:
library(tidyverse)

# Name the six stat columns of the scraped matrix.
stats_tbl <- setNames(
  as_tibble(player_stats),
  c('pac', 'sho', 'pas', 'dri', 'def', 'phy')
)

# One row per player: the scraped name plus its last two characters, which
# are used as the overall score.
names_tbl <- tibble(
  player = player_names,
  overall = str_sub(player_names, -2L, -1L)
)

# Bind everything together, then drop the intermediate objects.
players <- bind_cols(names_tbl, stats_tbl)
rm(player_names, player_stats, names_tbl, stats_tbl)
结果:
> players
# A tibble: 48 x 8
player overall pac sho pas dri def phy
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Lionel Messi 93 93 85 92 91 95 34 65
2 Robert Lewandowski 92 92 78 92 79 86 44 82
3 C. Ronaldo dos Santos Aveiro 91 91 87 93 82 88 34 75
4 Kevin De Bruyne 91 91 76 86 93 88 64 78
5 Neymar da Silva Santos Jr. 91 91 91 83 86 94 37 63
6 Kylian Mbappé 91 91 97 88 80 92 36 77
7 Harry Kane 90 90 70 91 83 83 47 83
8 N'Golo Kanté 90 90 78 66 75 82 87 83
9 Mohamed Salah 89 89 90 87 81 90 45 75
10 Karim Benzema 89 89 76 86 81 87 39 77
# … with 38 more rows
我正在尝试从 https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps 中抓取数据来制作一个数据框,其中包含所有球员的姓名和他们的统计数据(总体评分、位置、pac、sho、pas、dri、def、phy),但是我的rvest 无法将信息检测为 table.
我试过了:
# Asker's first attempt. The loop runs ten times, but the URL is hard-coded to
# page=1 and `i` is never used, so `page` is overwritten with the same document
# on every iteration.
for(i in 1:10) {
page <- read_html(paste("https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps",sep=""))
}
# html_table() is applied to the whole document; according to the surrounding
# text this yields an empty list() for this page.
StatsTable <- page %>%
html_table(fill=TRUE)
head(StatsTable)
这导致打印出 list() 而不是表格。如何修改这个 for 循环,使 read_html 和 html_table 能够检测到网站上的数据,从而创建包含球员统计数据的数据框?
第一页我也试过这样:
# Asker's second attempt, first page only.
# NOTE(review): `sep=""` is passed to read_html() itself (there is no paste()
# call here), so it presumably has no effect - confirm.
first <- read_html("https://www.futhead.com/22/players/?page=1&level=gold_nif&bin_platform=ps",sep="")
first
# Select every node carrying the class "padding-0" and extract its text.
tab <- first %>%
html_nodes(".padding-0") %>%
html_text()
tab
### Removes every space, then replaces each \n with a single space
tab <- gsub(" ", "", tab)
tab <- gsub("\n", " ", tab)
tab
这样我就得到了第一页的所有数据,但是所有的信息都被放进了字符串里。是否可以从这些字符串中提取姓名和统计数据,再把它们放入数据框中?这要怎么实现?
我更新了代码,因此您可以立即将前十个子页面抓取到一个数据框中。请注意,抓取代码来自@Otto_Kässi 回答,所以所有功劳都归于他!!!
library(rvest)
library(stringr)
library(tidyverse)

# URLs of the first ten result pages.
pages <- str_c(
  "https://www.futhead.com/22/players/?page=", 1:10,
  "&level=gold_nif&bin_platform=ps"
)

# Column names of the six per-player stats, in the order they appear on a row.
stat_names <- c("pac", "sho", "pas", "dri", "def", "phy")

# Scrape one result page and return a tibble with one row per player
# (columns: player, overall, pac, sho, pas, dri, def, phy; all character).
scrape_page <- function(page_url) {
  # Download and parse the page once; both extractions reuse the document.
  page <- read_html(page_url)

  player_names <- page %>%
    html_nodes("[class='list-group list-group-table player-group-table']") %>%
    html_nodes("[class='player-info']") %>%
    html_nodes("[class='player-image']") %>%
    html_attr("alt")

  stat_values <- page %>%
    html_nodes("[class='player-right text-center hidden-xs']") %>%
    html_nodes("[class='value']") %>%
    html_text()

  # Every player must contribute exactly one value per stat; otherwise
  # matrix() would recycle or misalign the values without failing.
  if (length(stat_values) != length(player_names) * length(stat_names)) {
    stop("Unexpected number of stat values on ", page_url, call. = FALSE)
  }

  # Naming the matrix columns up front avoids tibble's name-repair warning.
  player_stats <- matrix(
    stat_values,
    ncol = length(stat_names),
    byrow = TRUE,
    dimnames = list(NULL, stat_names)
  )

  bind_cols(
    tibble(
      player = player_names,
      # The last two characters of the alt text are taken as the overall rating.
      overall = str_sub(player_names, -2L, -1L)
    ),
    as_tibble(player_stats)
  )
}

# Build one tibble per page and bind them once, instead of growing a data
# frame with rbind() inside a loop.
df <- pages %>%
  lapply(scrape_page) %>%
  bind_rows() %>%
  mutate(
    # Remove the rating digits embedded in the name and the space they leave.
    player = str_trim(str_replace_all(player, "[:digit:]", "")),
    # Convert every rating column, including `phy`.
    across(overall:phy, as.numeric)
  )
如果您一次性运行整段代码,它应该可以正常工作!
我不认为你可以使用 html_table 完成你想要的。您尝试抓取的页面上的 table 不是 html table 元素。
您会注意到,看起来像表格的部分实际上是 <ul class="list-group list-group-table player-group-table">。因此,您需要改用 html_nodes() 调用来获取您想要的信息,即:
# Player names are read from the `alt` attribute of each player image inside
# the player list.
player_names <- page %>%
  html_nodes("[class='list-group list-group-table player-group-table']") %>%
  html_nodes("[class='player-info']") %>%
  html_nodes("[class='player-image']") %>%
  html_attr("alt")
和
# Collect the text of every stat value and arrange it as one row per player
# with six columns. Relies on `player_names` from the previous step for the
# number of rows.
player_stats <- page %>%
  html_nodes("[class='player-right text-center hidden-xs']") %>%
  html_nodes("[class='value']") %>%
  html_text() %>%
  # TRUE rather than T: T is an ordinary variable and can be reassigned.
  matrix(nrow = length(player_names), ncol = 6, byrow = TRUE)
获取球员位置的一种方法是使用 gsub(),从 class 为 player-club-league-name 的节点中提取 <strong> 和 </strong> 之间的字符串。
# Extract each player's position: the text between <strong> and </strong>
# inside the player-club-league-name node.
positions <- page %>%
  html_nodes("[class='list-group list-group-table player-group-table']") %>%
  html_nodes("[class='player-club-league-name']") %>%
  # Work on the nodes' HTML source so the <strong> tags are visible to gsub().
  as.character() %>%
  # "\\1" is the backreference to the captured group; a single-backslash "\1"
  # is the control character \001 in an R string and would discard the match.
  gsub(".*<strong>(.+)</strong>.*", "\\1", x = .)
最后把所有的东西都做成data.frame:
library(tidyverse)

# Name the six stat columns of the scraped matrix.
stats_tbl <- setNames(
  as_tibble(player_stats),
  c('pac', 'sho', 'pas', 'dri', 'def', 'phy')
)

# One row per player: the scraped name plus its last two characters, which
# are used as the overall score.
names_tbl <- tibble(
  player = player_names,
  overall = str_sub(player_names, -2L, -1L)
)

# Bind everything together, then drop the intermediate objects.
players <- bind_cols(names_tbl, stats_tbl)
rm(player_names, player_stats, names_tbl, stats_tbl)
结果:
> players
# A tibble: 48 x 8
player overall pac sho pas dri def phy
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Lionel Messi 93 93 85 92 91 95 34 65
2 Robert Lewandowski 92 92 78 92 79 86 44 82
3 C. Ronaldo dos Santos Aveiro 91 91 87 93 82 88 34 75
4 Kevin De Bruyne 91 91 76 86 93 88 64 78
5 Neymar da Silva Santos Jr. 91 91 91 83 86 94 37 63
6 Kylian Mbappé 91 91 97 88 80 92 36 77
7 Harry Kane 90 90 70 91 83 83 47 83
8 N'Golo Kanté 90 90 78 66 75 82 87 83
9 Mohamed Salah 89 89 90 87 81 90 45 75
10 Karim Benzema 89 89 76 86 81 87 39 77
# … with 38 more rows