RSelenium scraping table
I want to extract the data from the "Completed Games" table located here: https://www.chess.com/member/magnuscarlsen.
The code below gives me a list of size 0. The Selenium side seems to be working: a Firefox browser opens on my desktop and navigates to the page. Any help would be greatly appreciated. I'm at my wits' end!
rD <- rsDriver(browser="firefox", port=4442L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("https://www.chess.com/member/magnuscarlsen")
Sys.sleep(5) # give the page time to fully load
html <- remDr$getPageSource()[[1]]
html <- read_html(html)
signal <- html %>%
html_nodes("table.table-component table-hover archived-games-table")
Here is an approach that solves your problem easily: the page itself contains only one table, so rvest can pull it out directly. Note that I use pipes because I prefer them; you can of course do without them.
library(RSelenium)
library(rvest)
rD <- rsDriver(browser="firefox", port=4443L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("https://www.chess.com/member/magnuscarlsen")
Sys.sleep(5) # give the page time to fully load
html <- remDr$getPageSource()[[1]]
html <- read_html(html)
##required table
html %>% html_table() %>% .[[1]]
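As an aside, the original selector returns nothing because "table.table-component table-hover archived-games-table" is parsed as a descendant selector: it looks for elements named table-hover and archived-games-table nested inside the table. To match several classes on one element, chain the class names with dots. A minimal sketch, assuming the table still carries the three class names from the question:
## same table, selected by its classes rather than by position
## (class names taken from the question; verify they still match the live page)
html %>%
  html_nodes("table.table-component.table-hover.archived-games-table") %>%
  html_table() %>%
  .[[1]]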
If you don't mind going without the accuracy figures (I don't think the basis for their calculation has been published), have a look at the public APIs from Chess.com. You do get all of the move information.
In particular, via the implementation in the BigChess package. I have adapted its examples below.
All games:
library(rjson)
library(bigchess)
user <- "magnuscarlsen"
json_file <- paste0("https://api.chess.com/pub/player/", user,"/games/archives")
json_data <- fromJSON(paste(readLines(json_file), collapse = ""))
result <- data.frame()
for(i in json_data$archives)
result <- rbind(result, read.pgn(paste0(i, "/pgn")))
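Each entry in json_data$archives is the URL of one monthly archive, so if you do not need the full history you can restrict the loop to the most recent months. A small sketch under that assumption (keeping three months is an arbitrary choice for illustration):
## only the three most recent monthly archives (adjust as needed)
recent <- tail(unlist(json_data$archives), 3)
result <- do.call(rbind, lapply(recent, function(u) read.pgn(paste0(u, "/pgn"))))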
A single month:
library(bigchess)
df <- read.pgn("https://api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn")
print(df[df$Date == '2020.12.11', ])
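read.pgn() returns one row per game with the usual PGN headers (White, Black, Result, Date and so on) plus the move text, so a quick look at the structure shows what there is to work with; the exact set of columns can vary with the bigchess version and the read.pgn() options:
str(df)           # columns that read.pgn() returned
table(df$Result)  # quick tally of results for the month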
Adding your accuracy figures as requested. Most of the information on that page is in fact available through the API:
library(bigchess)
#> Warning: package 'bigchess' was built under R version 4.0.3
library(purrr)
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(stringr)
try_again <- function(link) { # simple retry wrapper, after https://blog.r-hub.io/2020/04/07/retry-wheel/
  maxtry <- 5
  try <- 1
  resp <- read_json(link)
  while (try <= maxtry && is.null(resp$data)) {
    resp <- read_json(link) # re-request the same link until it returns data
    try <- try + 1
    Sys.sleep(try * .25)
  }
  return(resp)
}
url <- "https://api.chess.com/pub/player/magnuscarlsen/games/2020/12"
result <- data.frame()
result <- read.pgn(paste0(url, "/pgn"))
#> Warning in readLines(con): incomplete final line found on 'https://
#> api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn'
#> 2021-02-15 20:29:04, successfully imported 47 games
#> 2021-02-15 20:29:04, N moves computed
#> 2021-02-15 20:29:04, extract moves done
#> 2021-02-15 20:29:04, stat moves computed
result <- filter(result, result$Date == "2020.12.11")
data <- read_json(url)
mask <- map(data$games, ~ !is.na(str_match(.x$pgn, 'UTCDate\\s"2020\\.12\\.11')[, 1])) %>% unlist()
games <- data$games[mask]
games <- paste0("https://www.chess.com/callback/analysis/game/live/", map(games, ~ str_match(.x$url, "\\d+")[, 1]), "/all")
df <- map_df(games, ~ {
json_data <- try_again(.x)
tryCatch(
data.frame(
Url = .x,
WhiteAccuracy = json_data$data$analysis$CAPS$white$all,
BlackAccuracy = json_data$data$analysis$CAPS$black$all,
stringsAsFactors = FALSE
),
error = function(e) {
data.frame(
Url = .x,
WhiteAccuracy = NA_integer_,
BlackAccuracy = NA_integer_,
stringsAsFactors = FALSE
)
}
)
})
final <- cbind(result, df)
#> Error in .cbind.ts(list(...), .makeNamesTs(...), dframe = FALSE, union = TRUE): non-time series not of the correct length
Created on 2021-02-15 by the reprex package (v0.3.0)
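A note on the final error: dplyr is not attached in this session, so the filter() call above is stats::filter(), which returns a time-series object, and cbind() then dispatches to the ts method and fails. Even with that fixed, result and df only line up if both contain exactly the same games in the same order, so it is safer to subset with base R and join the two tables on the game id. The following is only a sketch of that idea: it assumes the Chess.com PGN carries a Link tag with the game URL and that read.pgn() keeps it via add.tags; adjust the regular expressions if the URL formats differ.
## subset with base R (avoids stats::filter) and join on the numeric game id
result <- read.pgn(paste0(url, "/pgn"), add.tags = "Link")
result <- result[result$Date == "2020.12.11", ]
result$GameId <- str_match(result$Link, "(\\d+)$")[, 2]  # id from the PGN's Link tag (assumed present)
df$GameId <- str_match(df$Url, "live/(\\d+)/all")[, 2]   # id from the callback URL built above
final <- merge(result, df, by = "GameId", all.x = TRUE)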