从 stats.nba.com 抓取表格 - 多种方法无效

Scraping tables from stats.nba.com - multiple approaches not working

我们正在尝试将 table 从这里 - https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season - 抓取到 R 中。这是我们目前所做的尝试:

# Attempt 1: plain GET against the API endpoint found in the browser's
# network tab. This does not work: the request hangs.
httr::GET("https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Advanced&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2020-21&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=")
# Attempt 2: download the page with rvest and select its <table> nodes.
# This returns an empty nodeset.
html_nodes(
  read_html("https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season"),
  "table"
)

是否可以使用 R 从该网页中抓取主要 table?

编辑:

# Browser-like request headers for fetching the stats page.
#
# NOTE: the `cookie` header captured from a browser session has been removed.
# Its string literal was split across two source lines, which put a raw line
# break inside an HTTP header value, and it hard-coded session-specific
# identifiers. If a cookie turns out to be required, supply a fresh one at run
# time (for example with httr::set_cookies()) instead of embedding it here.
headers <- c(
  `authority` = 'www.nba.com',
  `cache-control` = 'max-age=0',
  `sec-ch-ua` = '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
  `sec-ch-ua-mobile` = '?0',
  `sec-ch-ua-platform` = '"macOS"',
  `upgrade-insecure-requests` = '1',
  `user-agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
  `accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  `sec-fetch-site` = 'same-origin',
  `sec-fetch-mode` = 'navigate',
  `sec-fetch-user` = '?1',
  `sec-fetch-dest` = 'document',
  `accept-language` = 'en-US,en;q=0.9'
)

# Query-string parameters appended to the page URL by httr.
params <- list(
  `sort` = 'W',
  `dir` = '-1',
  `Season` = '2020-21',
  `SeasonType` = 'Regular Season'
)

# Fetch the page; `res` is an httr response object.
res <- httr::GET(
  url = 'https://www.nba.com/stats/teams/advanced/',
  httr::add_headers(.headers = headers),
  query = params
)

上面的代码会返回一个 res 变量,但我们目前不知道如何从 res 中提取内容。

如评论中所述,其中许多 headers 和参数并非必需,但以下代码可以正常工作:

library(data.table)
library(magrittr)

# Request headers sent to the stats API endpoint.
#
# The `If-Modified-Since` header copied from the browser was removed: with a
# fixed, stale timestamp it makes the request conditional, so the server may
# answer "304 Not Modified" with an empty body, leaving nothing to parse.
headers <- c(
  `Connection` = 'keep-alive',
  `Accept` = 'application/json, text/plain, */*',
  `x-nba-stats-token` = 'true',
  `DNT` = '1',
  `User-Agent` = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
  `x-nba-stats-origin` = 'stats',
  `Sec-GPC` = '1',
  `Origin` = 'https://www.nba.com',
  `Sec-Fetch-Site` = 'same-site',
  `Sec-Fetch-Mode` = 'cors',
  `Sec-Fetch-Dest` = 'empty',
  `Referer` = 'https://www.nba.com/',
  `Accept-Language` = 'en-US,en;q=0.9'
)

# Query parameters for the leaguedashteamstats endpoint. Every value is passed
# as a string; empty strings are sent as empty query values.
params <- list(
  Conference = "",
  DateFrom = "",
  DateTo = "",
  Division = "",
  GameScope = "",
  GameSegment = "",
  LastNGames = "0",
  LeagueID = "00",
  Location = "",
  MeasureType = "Advanced",
  Month = "0",
  OpponentTeamID = "0",
  Outcome = "",
  PORound = "0",
  PaceAdjust = "N",
  PerMode = "PerGame",
  Period = "0",
  PlayerExperience = "",
  PlayerPosition = "",
  PlusMinus = "N",
  Rank = "N",
  Season = "2020-21",
  SeasonSegment = "",
  SeasonType = "Regular Season",
  ShotClockRange = "",
  StarterBench = "",
  TeamID = "0",
  TwoWay = "0",
  VsConference = "",
  VsDivision = ""
)

# Call the stats API with the headers and query parameters defined above.
# The timeout turns a hanging request into an error instead of blocking
# indefinitely.
res <- httr::GET(
  url = "https://stats.nba.com/stats/leaguedashteamstats",
  httr::add_headers(.headers = headers),
  query = params,
  httr::timeout(30)
)
# Fail early with the HTTP status rather than with an obscure subsetting error
# further down when the response carries no usable body.
httr::stop_for_status(res)

# The parsed response holds its tables under `resultSets`; take the first one.
data <- httr::content(res)[["resultSets"]][[1]]
# Column names come from the result set's `headers` element.
column_names <- as.character(data[["headers"]])
# Each element of `rowSet` is one row; bind them and apply the column names.
dt <- setnames(rbindlist(data[["rowSet"]]), column_names)

给出:

head(dt, 2)
      TEAM_ID      TEAM_NAME GP  W  L W_PCT  MIN E_OFF_RATING OFF_RATING E_DEF_RATING DEF_RATING E_NET_RATING NET_RATING AST_PCT
1: 1610612737  Atlanta Hawks 72 41 31 0.569 3481          113      114.3        110.6      112.1          2.5        2.2   0.591
2: 1610612738 Boston Celtics 72 36 36   0.5 3476          111      113.1          110      111.8          0.9        1.2   0.566
   AST_TO AST_RATIO OREB_PCT DREB_PCT REB_PCT TM_TOV_PCT EFG_PCT TS_PCT E_PACE  PACE PACE_PER40 POSS   PIE GP_RANK W_RANK L_RANK
1:   1.82      17.6    0.284    0.742   0.516      0.133   0.539  0.581   99.9 98.68      82.23 7160 0.511       1     11     11
2:   1.67      17.1    0.289    0.737    0.51      0.141   0.543  0.574  100.7 98.94      82.45 7172 0.501       1     16     16
   W_PCT_RANK MIN_RANK OFF_RATING_RANK DEF_RATING_RANK NET_RATING_RANK AST_PCT_RANK AST_TO_RANK AST_RATIO_RANK OREB_PCT_RANK
1:         11       11               9              18              11           18          14             20             6
2:         16       16              10              13              13           27          22             26             3
   DREB_PCT_RANK REB_PCT_RANK TM_TOV_PCT_RANK EFG_PCT_RANK TS_PCT_RANK PACE_RANK PIE_RANK CFID       CFPARAMS
1:             9            7              10           16          10        22       10   10  Atlanta Hawks
2:            13           10              18           12          16        20       17   10 Boston Celtics

一个 RSelenium 解决方案:

library(RSelenium)
library(dplyr) # was `library(dply)`, which is not a package and fails to load
library(rvest)

# Start a Selenium server with a Chrome browser and keep the client handle.
driver <- rsDriver(browser = "chrome")
remDr <- driver[["client"]]

# Open the stats page in the browser session.
remDr$navigate("https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season")
# Locate the stats table element by its absolute XPath.
table <- remDr$findElement(
  using = "xpath",
  value = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table"
)

# Take the page source from the browser session, parse it, and convert the
# HTML tables it contains into a list of data frames.
df1 <- table$getPageSource()[[1]] %>%
  read_html() %>%
  html_table()

[[1]]
# A tibble: 30 x 39
      `` TEAM         GP     W     L   MIN OffRtg DefRtg NetRtg `AST%` `AST/TO` ASTRatio `OREB%` `DREB%` `REB%` `TOV%` `eFG%` `TS%`  PACE   PIE POSS  `GP RANK`
   <int> <chr>     <int> <int> <int> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>    <dbl>    <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <chr> <lgl>    
 1     1 Utah Jazz    72    52    20  3471   116.   108.    9     57.3     1.66     17.3    28.4    75.7   52.9   14.2   56.3  59.7  99.4  54.2 7,193 NA       
 2     2 Phoenix ~    72    51    21  3496   116.   110.    5.9   62.2     2.15     19.6    24.8    74.1   50     12.6   56.4  59.7  98    53.4 7,137 NA       
 3     3 Philadel~    72    49    23  3486   112.   107     5.5   57.2     1.64     17.2    27.7    73.7   51.1   14.3   54.1  57.9 100.   53.6 7,272 NA       
 4     4 Brooklyn~    72    48    24  3481   117.   113.    4.2   62.1     1.98     19.3    25.2    72.6   50.3   13.4   57.5  61   100.   53.2 7,280 NA       
 5     5 Denver N~    72    47    25  3496   116.   112.    4.8   62.1     1.99     19.3    29.2    75.1   52.2   13.6   55.7  58.8  97.7  52.5 7,123 NA       
 6     5 LA Clipp~    72    47    25  3456   117.   111.    6.1   58.4     1.85     18.1    27      75.4   51.8   13.5   56.4  59.9  97.6  53   7,036 NA       
 7     7 Milwauke~    72    46    26  3466   116.   111.    5.8   56.9     1.84     18      26.9    75.5   51.9   13.4   56.6  59.3 103.   53.3 7,423 NA       
 8     8 Dallas M~    72    42    30  3461   115.   112.    2.3   55.7     1.9      17.2    25.3    73.4   49.6   12.3   55    58.2  97.9  51   7,062 NA       
 9     8 Los Ange~    72    42    30  3491   110.   107.    2.9   60.7     1.62     18      26.9    74.8   51.1   15.2   53.6  56.9  98.8  51.7 7,184 NA

或者使用 XML 包:

# Alternative: parse the tables in the page source with the XML package.
# The call is namespace-qualified because the XML package is never attached
# in this script.
XML::readHTMLTable(table$getPageSource()[[1]])