从 stats.nba.com 抓取表格 - 多种方法无效
Scraping tables from stats.nba.com - multiple approaches not working
我们正在尝试将 table 从这里 - https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season - 抓取到 R 中。这是我们目前所做的尝试:
# get request from API found in network tab - this doesn't work, the request hangs
# NOTE(review): no custom headers are sent with this call; presumably the
# endpoint stalls on requests that do not look like they originate from the
# nba.com site - confirm.
httr::GET(url = 'https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Advanced&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2020-21&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=')
# rvest returns empty nodeset when grabbing tables on page
# NOTE(review): presumably the table is built client-side by JavaScript, so the
# static HTML that read_html() downloads contains no <table> node - confirm.
'https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season' %>%
read_html() %>%
html_nodes('table')
是否可以使用 R 从该网页中抓取主要 table?
编辑:
headers = c(
`authority` = 'www.nba.com',
`cache-control` = 'max-age=0',
`sec-ch-ua` = '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
`sec-ch-ua-mobile` = '?0',
`sec-ch-ua-platform` = '"macOS"',
`upgrade-insecure-requests` = '1',
`user-agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
`accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
`sec-fetch-site` = 'same-origin',
`sec-fetch-mode` = 'navigate',
`sec-fetch-user` = '?1',
`sec-fetch-dest` = 'document',
`accept-language` = 'en-US,en;q=0.9',
`cookie` = 'usprivacy=1YNN; AMCVS_248F210755B762187F000101%40AdobeOrg=1; s_ecid=MCMID%7C39761269548384710744541812242089157146; countryCode=US; s_cc=true; ug=61647d1f0252400a3f87470014d69025; nlhidescores=false; _pbjs_userid_consent_data=3524755945110770; qoscid=524912006.1633975588; qossid=1633975588; client_type=html5; client_version=4.4.0; ugs=1; OptanonAlertBoxClosed=2021-10-12T23:20:24.183Z; at_check=true; _parsely_visitor={%22id%22:%22pid=0cb0a9a5854f45ea8a6d48f74f03e800%22%2C%22session_count%22:1%2C%22last_session_ts%22:1634155541257}; ab.storage.deviceId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%2228d2f640-2ad0-b8e9-b78c-016ba5a85671%22%2C%22c%22%3A1634155541318%2C%22l%22%3A1634155541318%7D; OptanonControl=ccc=US&csc=&cic=0&otvers=6.24.0&pctm=2021-10-12T23%3A20%3A24.183Z®=ccpa&ustcs=1YNN&vers=3.1.5; aam_uuid=39724801183369993254542124123886279717; s_ips=796; mbox=session#70d31bd3ea124acc80cb089a5594528e#1634158760|PC#70d31bd3ea124acc80cb089a5594528e.34_0#1697401700; ab.storage.sessionId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%228dcfd2a2-4419-87f9-7e1c-22cf76830e7e%22%2C%22e%22%3A1634158700129%2C%22c%22%3A1634155541315%2C%22l%22%3A1634156900129%7D; s_tp=2924; s_ppv=nba%253Ateams%253Amain%2C27%2C27%2C796%2C1%2C3; ak_bmsc=2C1E9B2928FD1C90ECFF4A5887776269~000000000000000000000000000000~YAAQrL4cuDCzpVJ8AQAAytvzew1NuriisqR0MtOqexD1CqvqIJKuuhJda9NNGXOBCOjAdMEXnQjL10fYxWYj9HLm2DJdQLQIjLSqvl3faGyPbxWARg6dKwmf4NK/+RENdJTZfsKGTbwUMxTtPRSoR7TmMc3UWE4tAdft14nRiSPZwp/DJjK9NUhLtpTDjCa65HELyeJ7O4M4d98rAu5R7YYZOEVRjz5VRQEGaFBc5u2OlaUpcyFDqUM+j+jII/6xmqgwVRUhX8t8oNmdeiYpfEALo1yewznqZcfOO18htGp4sF3SLPG8bBFvLeGwW118Mu1rVkyeO4PEvC7UFZUc+a7tGNSjGyGe0WSC/0iSjTC+/ikP2BPwMosXe7DxWk/a0vuFtUlw7jArB/YQuYHH61uu8E97UTA=; 
AMCV_248F210755B762187F000101%40AdobeOrg=359503849%7CMCMID%7C39761269548384710744541812242089157146%7CMCAAMLH-1634771953%7C7%7CMCAAMB-1634771953%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1634174353s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C5.0.1%7CMCIDTS%7C18914; s_gpv_pageModal=nba%3Astats%3Ateams%3Aadvanced; s_sq=%5B%5BB%5D%5D; akavpau_allowednbamain=1634169266~id=4fd4cabce5336e66bef275d5dd409a10; bm_sv=467DB2784E3DE76FAA9F4CD21DD7DE3C~8bPs2wRiWvWAD8K8MYos9duNZqYto/EQc8HFibswczdPYqofRTJZOTE4Xy1RsB9fJag8YMdv3OOHkVFDGoh7aG8x4Y8eZepOfBGMFtPmQF0Vgg0XNix35HHU2sk9RKCEQujy2BRS4m269Y6fIapqEQ==; OptanonConsent=isIABGlobal=false&datestamp=Wed+Oct+13+2021+19%3A44%3A27+GMT-0400+(Eastern+Daylight+Time)&version=6.24.0&hosts=&consentId=e8a9be54-a345-44df-90e1-eaaf56d98079&interactionCount=2&landingPath=NotLandingPage&groups=BG30%3A1%2Cven%3A1%2Cpad%3A1%2Cpap%3A1%2Ccad%3A1%2Cmap%3A1%2Cdsa%3A1%2CNBAad%3A1%2Creq%3A1%2Csec%3A1%2Cgld%3A1%2Cpcp%3A1%2Cmcp%3A1%2Cmra%3A1%2Ctdc%3A1%2Ccos%3A1%2Cdid%3A1%2Csid%3A1%2Cpdd%3A1%2Cpcd%3A1%2CNBAmt%3A1&AwaitingReconsent=false&geolocation=US%3B'
)
# Query-string values matching the ones in the page URL.
params <- list(
  sort = "W",
  dir = "-1",
  Season = "2020-21",
  SeasonType = "Regular Season"
)

# Request the stats page itself, sending the `headers` vector defined earlier.
res <- httr::GET(
  url = "https://www.nba.com/stats/teams/advanced/",
  httr::add_headers(.headers = headers),
  query = params
)
上述代码会返回一个 res 变量,但我们现在正苦于无法从 res 中提取内容。
如评论中所述,许多 headers 和参数都不需要,但这有效:
library(data.table)
library(magrittr)
# Request headers sent to the stats endpoint, as a named character vector.
# The conditional header `If-Modified-Since` is intentionally not included:
# replaying a fixed timestamp copied from a browser session can let the server
# answer "304 Not Modified" with an empty body, leaving nothing to parse.
headers <- c(
  "Connection" = "keep-alive",
  "Accept" = "application/json, text/plain, */*",
  "x-nba-stats-token" = "true",
  "DNT" = "1",
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  "x-nba-stats-origin" = "stats",
  "Sec-GPC" = "1",
  "Origin" = "https://www.nba.com",
  "Sec-Fetch-Site" = "same-site",
  "Sec-Fetch-Mode" = "cors",
  "Sec-Fetch-Dest" = "empty",
  "Referer" = "https://www.nba.com/",
  "Accept-Language" = "en-US,en;q=0.9"
)
# Query parameters for the leaguedashteamstats request, in the order they are
# sent. Every value is a string; "" leaves that parameter empty.
params <- list(
  Conference = "",
  DateFrom = "",
  DateTo = "",
  Division = "",
  GameScope = "",
  GameSegment = "",
  LastNGames = "0",
  LeagueID = "00",
  Location = "",
  MeasureType = "Advanced",
  Month = "0",
  OpponentTeamID = "0",
  Outcome = "",
  PORound = "0",
  PaceAdjust = "N",
  PerMode = "PerGame",
  Period = "0",
  PlayerExperience = "",
  PlayerPosition = "",
  PlusMinus = "N",
  Rank = "N",
  Season = "2020-21",
  SeasonSegment = "",
  SeasonType = "Regular Season",
  ShotClockRange = "",
  StarterBench = "",
  TeamID = "0",
  TwoWay = "0",
  VsConference = "",
  VsDivision = ""
)
# Fetch the team stats JSON. The timeout makes a stalled connection fail with
# an error instead of hanging indefinitely, and stop_for_status() turns an HTTP
# error response into an R error before any parsing is attempted.
res <- httr::GET(
  url = "https://stats.nba.com/stats/leaguedashteamstats",
  httr::add_headers(.headers = headers),
  query = params,
  httr::timeout(30)
)
httr::stop_for_status(res)

# Take the first entry under `resultSets`: its `headers` element supplies the
# column names and its `rowSet` element supplies the rows.
data <- httr::content(res)[["resultSets"]][[1]]
column_names <- as.character(data$headers)
dt <- rbindlist(data$rowSet) %>% setnames(column_names)
给出:
head(dt, 2)
TEAM_ID TEAM_NAME GP W L W_PCT MIN E_OFF_RATING OFF_RATING E_DEF_RATING DEF_RATING E_NET_RATING NET_RATING AST_PCT
1: 1610612737 Atlanta Hawks 72 41 31 0.569 3481 113 114.3 110.6 112.1 2.5 2.2 0.591
2: 1610612738 Boston Celtics 72 36 36 0.5 3476 111 113.1 110 111.8 0.9 1.2 0.566
AST_TO AST_RATIO OREB_PCT DREB_PCT REB_PCT TM_TOV_PCT EFG_PCT TS_PCT E_PACE PACE PACE_PER40 POSS PIE GP_RANK W_RANK L_RANK
1: 1.82 17.6 0.284 0.742 0.516 0.133 0.539 0.581 99.9 98.68 82.23 7160 0.511 1 11 11
2: 1.67 17.1 0.289 0.737 0.51 0.141 0.543 0.574 100.7 98.94 82.45 7172 0.501 1 16 16
W_PCT_RANK MIN_RANK OFF_RATING_RANK DEF_RATING_RANK NET_RATING_RANK AST_PCT_RANK AST_TO_RANK AST_RATIO_RANK OREB_PCT_RANK
1: 11 11 9 18 11 18 14 20 6
2: 16 16 10 13 13 27 22 26 3
DREB_PCT_RANK REB_PCT_RANK TM_TOV_PCT_RANK EFG_PCT_RANK TS_PCT_RANK PACE_RANK PIE_RANK CFID CFPARAMS
1: 9 7 10 16 10 22 10 10 Atlanta Hawks
2: 13 10 18 12 16 20 17 10 Boston Celtics
一个 RSelenium 解决方案:
library(RSelenium)
library(dplyr)
library(rvest)

# Start a Selenium-driven Chrome session and open the stats page in it.
driver <- rsDriver(browser = "chrome")
remDr <- driver[["client"]]
remDr$navigate("https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season")

# select element
table <- remDr$findElement(
  using = "xpath",
  value = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table"
)

# Parse the page source obtained through the browser session and convert the
# HTML tables it contains; html_table() returns them as a list.
df1 <- table$getPageSource()[[1]] %>%
  read_html() %>%
  html_table()
[[1]]
# A tibble: 30 x 39
`` TEAM GP W L MIN OffRtg DefRtg NetRtg `AST%` `AST/TO` ASTRatio `OREB%` `DREB%` `REB%` `TOV%` `eFG%` `TS%` PACE PIE POSS `GP RANK`
<int> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <lgl>
1 1 Utah Jazz 72 52 20 3471 116. 108. 9 57.3 1.66 17.3 28.4 75.7 52.9 14.2 56.3 59.7 99.4 54.2 7,193 NA
2 2 Phoenix ~ 72 51 21 3496 116. 110. 5.9 62.2 2.15 19.6 24.8 74.1 50 12.6 56.4 59.7 98 53.4 7,137 NA
3 3 Philadel~ 72 49 23 3486 112. 107 5.5 57.2 1.64 17.2 27.7 73.7 51.1 14.3 54.1 57.9 100. 53.6 7,272 NA
4 4 Brooklyn~ 72 48 24 3481 117. 113. 4.2 62.1 1.98 19.3 25.2 72.6 50.3 13.4 57.5 61 100. 53.2 7,280 NA
5 5 Denver N~ 72 47 25 3496 116. 112. 4.8 62.1 1.99 19.3 29.2 75.1 52.2 13.6 55.7 58.8 97.7 52.5 7,123 NA
6 5 LA Clipp~ 72 47 25 3456 117. 111. 6.1 58.4 1.85 18.1 27 75.4 51.8 13.5 56.4 59.9 97.6 53 7,036 NA
7 7 Milwauke~ 72 46 26 3466 116. 111. 5.8 56.9 1.84 18 26.9 75.5 51.9 13.4 56.6 59.3 103. 53.3 7,423 NA
8 8 Dallas M~ 72 42 30 3461 115. 112. 2.3 55.7 1.9 17.2 25.3 73.4 49.6 12.3 55 58.2 97.9 51 7,062 NA
9 8 Los Ange~ 72 42 30 3491 110. 107. 2.9 60.7 1.62 18 26.9 74.8 51.1 15.2 53.6 56.9 98.8 51.7 7,184 NA
或使用XML
# Namespace-qualified so the call works without a prior library(XML).
XML::readHTMLTable(table$getPageSource()[[1]])
我们正在尝试将 table 从这里 - https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season - 抓取到 R 中。这是我们目前所做的尝试:
# get request from API found in network tab - this doesn't work, the request hangs
# NOTE(review): no custom headers are sent with this call; presumably the
# endpoint stalls on requests that do not look like they originate from the
# nba.com site - confirm.
httr::GET(url = 'https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Advanced&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2020-21&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=')
# rvest returns empty nodeset when grabbing tables on page
# NOTE(review): presumably the table is built client-side by JavaScript, so the
# static HTML that read_html() downloads contains no <table> node - confirm.
'https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season' %>%
read_html() %>%
html_nodes('table')
是否可以使用 R 从该网页中抓取主要 table?
编辑:
headers = c(
`authority` = 'www.nba.com',
`cache-control` = 'max-age=0',
`sec-ch-ua` = '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
`sec-ch-ua-mobile` = '?0',
`sec-ch-ua-platform` = '"macOS"',
`upgrade-insecure-requests` = '1',
`user-agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
`accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
`sec-fetch-site` = 'same-origin',
`sec-fetch-mode` = 'navigate',
`sec-fetch-user` = '?1',
`sec-fetch-dest` = 'document',
`accept-language` = 'en-US,en;q=0.9',
`cookie` = 'usprivacy=1YNN; AMCVS_248F210755B762187F000101%40AdobeOrg=1; s_ecid=MCMID%7C39761269548384710744541812242089157146; countryCode=US; s_cc=true; ug=61647d1f0252400a3f87470014d69025; nlhidescores=false; _pbjs_userid_consent_data=3524755945110770; qoscid=524912006.1633975588; qossid=1633975588; client_type=html5; client_version=4.4.0; ugs=1; OptanonAlertBoxClosed=2021-10-12T23:20:24.183Z; at_check=true; _parsely_visitor={%22id%22:%22pid=0cb0a9a5854f45ea8a6d48f74f03e800%22%2C%22session_count%22:1%2C%22last_session_ts%22:1634155541257}; ab.storage.deviceId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%2228d2f640-2ad0-b8e9-b78c-016ba5a85671%22%2C%22c%22%3A1634155541318%2C%22l%22%3A1634155541318%7D; OptanonControl=ccc=US&csc=&cic=0&otvers=6.24.0&pctm=2021-10-12T23%3A20%3A24.183Z®=ccpa&ustcs=1YNN&vers=3.1.5; aam_uuid=39724801183369993254542124123886279717; s_ips=796; mbox=session#70d31bd3ea124acc80cb089a5594528e#1634158760|PC#70d31bd3ea124acc80cb089a5594528e.34_0#1697401700; ab.storage.sessionId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%228dcfd2a2-4419-87f9-7e1c-22cf76830e7e%22%2C%22e%22%3A1634158700129%2C%22c%22%3A1634155541315%2C%22l%22%3A1634156900129%7D; s_tp=2924; s_ppv=nba%253Ateams%253Amain%2C27%2C27%2C796%2C1%2C3; ak_bmsc=2C1E9B2928FD1C90ECFF4A5887776269~000000000000000000000000000000~YAAQrL4cuDCzpVJ8AQAAytvzew1NuriisqR0MtOqexD1CqvqIJKuuhJda9NNGXOBCOjAdMEXnQjL10fYxWYj9HLm2DJdQLQIjLSqvl3faGyPbxWARg6dKwmf4NK/+RENdJTZfsKGTbwUMxTtPRSoR7TmMc3UWE4tAdft14nRiSPZwp/DJjK9NUhLtpTDjCa65HELyeJ7O4M4d98rAu5R7YYZOEVRjz5VRQEGaFBc5u2OlaUpcyFDqUM+j+jII/6xmqgwVRUhX8t8oNmdeiYpfEALo1yewznqZcfOO18htGp4sF3SLPG8bBFvLeGwW118Mu1rVkyeO4PEvC7UFZUc+a7tGNSjGyGe0WSC/0iSjTC+/ikP2BPwMosXe7DxWk/a0vuFtUlw7jArB/YQuYHH61uu8E97UTA=; 
AMCV_248F210755B762187F000101%40AdobeOrg=359503849%7CMCMID%7C39761269548384710744541812242089157146%7CMCAAMLH-1634771953%7C7%7CMCAAMB-1634771953%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1634174353s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C5.0.1%7CMCIDTS%7C18914; s_gpv_pageModal=nba%3Astats%3Ateams%3Aadvanced; s_sq=%5B%5BB%5D%5D; akavpau_allowednbamain=1634169266~id=4fd4cabce5336e66bef275d5dd409a10; bm_sv=467DB2784E3DE76FAA9F4CD21DD7DE3C~8bPs2wRiWvWAD8K8MYos9duNZqYto/EQc8HFibswczdPYqofRTJZOTE4Xy1RsB9fJag8YMdv3OOHkVFDGoh7aG8x4Y8eZepOfBGMFtPmQF0Vgg0XNix35HHU2sk9RKCEQujy2BRS4m269Y6fIapqEQ==; OptanonConsent=isIABGlobal=false&datestamp=Wed+Oct+13+2021+19%3A44%3A27+GMT-0400+(Eastern+Daylight+Time)&version=6.24.0&hosts=&consentId=e8a9be54-a345-44df-90e1-eaaf56d98079&interactionCount=2&landingPath=NotLandingPage&groups=BG30%3A1%2Cven%3A1%2Cpad%3A1%2Cpap%3A1%2Ccad%3A1%2Cmap%3A1%2Cdsa%3A1%2CNBAad%3A1%2Creq%3A1%2Csec%3A1%2Cgld%3A1%2Cpcp%3A1%2Cmcp%3A1%2Cmra%3A1%2Ctdc%3A1%2Ccos%3A1%2Cdid%3A1%2Csid%3A1%2Cpdd%3A1%2Cpcd%3A1%2CNBAmt%3A1&AwaitingReconsent=false&geolocation=US%3B'
)
# Query-string values matching the ones in the page URL.
params <- list(
  sort = "W",
  dir = "-1",
  Season = "2020-21",
  SeasonType = "Regular Season"
)

# Request the stats page itself, sending the `headers` vector defined earlier.
res <- httr::GET(
  url = "https://www.nba.com/stats/teams/advanced/",
  httr::add_headers(.headers = headers),
  query = params
)
上述代码会返回一个 res 变量,但我们现在正苦于无法从 res 中提取内容。
如评论中所述,许多 headers 和参数都不需要,但这有效:
library(data.table)
library(magrittr)
# Request headers sent to the stats endpoint, as a named character vector.
# The conditional header `If-Modified-Since` is intentionally not included:
# replaying a fixed timestamp copied from a browser session can let the server
# answer "304 Not Modified" with an empty body, leaving nothing to parse.
headers <- c(
  "Connection" = "keep-alive",
  "Accept" = "application/json, text/plain, */*",
  "x-nba-stats-token" = "true",
  "DNT" = "1",
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  "x-nba-stats-origin" = "stats",
  "Sec-GPC" = "1",
  "Origin" = "https://www.nba.com",
  "Sec-Fetch-Site" = "same-site",
  "Sec-Fetch-Mode" = "cors",
  "Sec-Fetch-Dest" = "empty",
  "Referer" = "https://www.nba.com/",
  "Accept-Language" = "en-US,en;q=0.9"
)
# Query parameters for the leaguedashteamstats request, in the order they are
# sent. Every value is a string; "" leaves that parameter empty.
params <- list(
  Conference = "",
  DateFrom = "",
  DateTo = "",
  Division = "",
  GameScope = "",
  GameSegment = "",
  LastNGames = "0",
  LeagueID = "00",
  Location = "",
  MeasureType = "Advanced",
  Month = "0",
  OpponentTeamID = "0",
  Outcome = "",
  PORound = "0",
  PaceAdjust = "N",
  PerMode = "PerGame",
  Period = "0",
  PlayerExperience = "",
  PlayerPosition = "",
  PlusMinus = "N",
  Rank = "N",
  Season = "2020-21",
  SeasonSegment = "",
  SeasonType = "Regular Season",
  ShotClockRange = "",
  StarterBench = "",
  TeamID = "0",
  TwoWay = "0",
  VsConference = "",
  VsDivision = ""
)
# Fetch the team stats JSON. The timeout makes a stalled connection fail with
# an error instead of hanging indefinitely, and stop_for_status() turns an HTTP
# error response into an R error before any parsing is attempted.
res <- httr::GET(
  url = "https://stats.nba.com/stats/leaguedashteamstats",
  httr::add_headers(.headers = headers),
  query = params,
  httr::timeout(30)
)
httr::stop_for_status(res)

# Take the first entry under `resultSets`: its `headers` element supplies the
# column names and its `rowSet` element supplies the rows.
data <- httr::content(res)[["resultSets"]][[1]]
column_names <- as.character(data$headers)
dt <- rbindlist(data$rowSet) %>% setnames(column_names)
给出:
head(dt, 2)
TEAM_ID TEAM_NAME GP W L W_PCT MIN E_OFF_RATING OFF_RATING E_DEF_RATING DEF_RATING E_NET_RATING NET_RATING AST_PCT
1: 1610612737 Atlanta Hawks 72 41 31 0.569 3481 113 114.3 110.6 112.1 2.5 2.2 0.591
2: 1610612738 Boston Celtics 72 36 36 0.5 3476 111 113.1 110 111.8 0.9 1.2 0.566
AST_TO AST_RATIO OREB_PCT DREB_PCT REB_PCT TM_TOV_PCT EFG_PCT TS_PCT E_PACE PACE PACE_PER40 POSS PIE GP_RANK W_RANK L_RANK
1: 1.82 17.6 0.284 0.742 0.516 0.133 0.539 0.581 99.9 98.68 82.23 7160 0.511 1 11 11
2: 1.67 17.1 0.289 0.737 0.51 0.141 0.543 0.574 100.7 98.94 82.45 7172 0.501 1 16 16
W_PCT_RANK MIN_RANK OFF_RATING_RANK DEF_RATING_RANK NET_RATING_RANK AST_PCT_RANK AST_TO_RANK AST_RATIO_RANK OREB_PCT_RANK
1: 11 11 9 18 11 18 14 20 6
2: 16 16 10 13 13 27 22 26 3
DREB_PCT_RANK REB_PCT_RANK TM_TOV_PCT_RANK EFG_PCT_RANK TS_PCT_RANK PACE_RANK PIE_RANK CFID CFPARAMS
1: 9 7 10 16 10 22 10 10 Atlanta Hawks
2: 13 10 18 12 16 20 17 10 Boston Celtics
一个 RSelenium 解决方案:
library(RSelenium)
library(dplyr)
library(rvest)

# Start a Selenium-driven Chrome session and open the stats page in it.
driver <- rsDriver(browser = "chrome")
remDr <- driver[["client"]]
remDr$navigate("https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season")

# select element
table <- remDr$findElement(
  using = "xpath",
  value = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table"
)

# Parse the page source obtained through the browser session and convert the
# HTML tables it contains; html_table() returns them as a list.
df1 <- table$getPageSource()[[1]] %>%
  read_html() %>%
  html_table()
[[1]]
# A tibble: 30 x 39
`` TEAM GP W L MIN OffRtg DefRtg NetRtg `AST%` `AST/TO` ASTRatio `OREB%` `DREB%` `REB%` `TOV%` `eFG%` `TS%` PACE PIE POSS `GP RANK`
<int> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <lgl>
1 1 Utah Jazz 72 52 20 3471 116. 108. 9 57.3 1.66 17.3 28.4 75.7 52.9 14.2 56.3 59.7 99.4 54.2 7,193 NA
2 2 Phoenix ~ 72 51 21 3496 116. 110. 5.9 62.2 2.15 19.6 24.8 74.1 50 12.6 56.4 59.7 98 53.4 7,137 NA
3 3 Philadel~ 72 49 23 3486 112. 107 5.5 57.2 1.64 17.2 27.7 73.7 51.1 14.3 54.1 57.9 100. 53.6 7,272 NA
4 4 Brooklyn~ 72 48 24 3481 117. 113. 4.2 62.1 1.98 19.3 25.2 72.6 50.3 13.4 57.5 61 100. 53.2 7,280 NA
5 5 Denver N~ 72 47 25 3496 116. 112. 4.8 62.1 1.99 19.3 29.2 75.1 52.2 13.6 55.7 58.8 97.7 52.5 7,123 NA
6 5 LA Clipp~ 72 47 25 3456 117. 111. 6.1 58.4 1.85 18.1 27 75.4 51.8 13.5 56.4 59.9 97.6 53 7,036 NA
7 7 Milwauke~ 72 46 26 3466 116. 111. 5.8 56.9 1.84 18 26.9 75.5 51.9 13.4 56.6 59.3 103. 53.3 7,423 NA
8 8 Dallas M~ 72 42 30 3461 115. 112. 2.3 55.7 1.9 17.2 25.3 73.4 49.6 12.3 55 58.2 97.9 51 7,062 NA
9 8 Los Ange~ 72 42 30 3491 110. 107. 2.9 60.7 1.62 18 26.9 74.8 51.1 15.2 53.6 56.9 98.8 51.7 7,184 NA
或使用XML
# Namespace-qualified so the call works without a prior library(XML).
XML::readHTMLTable(table$getPageSource()[[1]])