使用 R (rvest) 抓取 PHP 仪表板

Scraping PHP dashboard with R (rvest)

我正在尝试从官方网站上抓取孟加拉国 COVID-19 数据(检测次数、阳性检测次数、阳性率):http://103.247.238.92/webportal/pages/covid19.php

该网站包含 3 个用于获取数据的下拉菜单:选择分区(Division)、选择地区(District)、选择数据的时间范围。

到目前为止我已经尝试了以下方法:

# Address of the dashboard page to scrape.
url <- "http://103.247.238.92/webportal/pages/covid19.php"
# Fetch and parse the page; the result is an xml_document object.
webpage <- read_html(url)

webpage 具有以下内容:

List of 2
 $ node:<externalptr> 
 $ doc :<externalptr> 
 - attr(*, "class")= chr [1:2] "xml_document" "xml_node"

由于这没有帮助,我又参照另一个示例尝试了以下方法:

# Download the page and read the response body as a single string.
a <- GET(url)
a <- content(a, as = "text")
# Strip a JSONP wrapper of the form angular.callbacks._2( ... );
# The parentheses must be escaped with a double backslash: "\(" is not a
# valid escape sequence inside an R string literal.
a <- gsub("^angular.callbacks._2\\(", "", a)
a <- gsub("\\);$", "", a)
# Parse what is left as JSON. This fails when the response is an HTML page
# rather than a JSONP payload.
df <- fromJSON(a, simplifyDataFrame = TRUE)

以上代码返回如下错误:

Error: lexical error: invalid char in json text.
                                       <!DOCTYPE html> <!-- This is a 
                     (right here) ------^

所以我甚至不知道该如何读取这些数据——但在查看网页源代码后,我知道数据就在那里(见 Safari 网页检查器截图)。

关于如何读取这些数据有什么建议吗?

此外,如果有人能告诉我如何选择不同的下拉菜单项,我将不胜感激。最终目标是收集每个分区(division)下每个地区(district)最近 12 个月的数据。

tl;dr

该页面会发出额外的请求来获取这些信息。这些额外请求依赖于两类 id 的组合:一类是从 Division 下拉列表中每个 option 元素的 value 属性提取的 id,另一类是从 District 下拉列表中每个 option 元素的 value 属性提取的 id。

您可以发出初始请求以获取所有 Division 下拉列表 ID:

# Read the <option> elements of the #division dropdown into a data frame;
# nth-child(n+2) skips the first (placeholder) option.
divisions <- options_df("#division option:nth-child(n+2)", "division") 

nth-child(n+2)用于排除初始'select'选项。

这会返回一个数据框,其中包含各个 divisionID 及其对应的分区友好名称。

然后可以使用这些 ID 来检索关联的 districtIDs(在第一个下拉列表中做出选择后,在第二个下拉列表中可用的选项):

# Look up the districts of every division and stack the results row-wise,
# tagging each row with the divisionID it was fetched for.
districts <- map_dfr(
  divisions$divisionID,
  function(division_id) {
    districts_from_updated_session(division_id, "district") %>%
      mutate(divisionID = division_id)
  }
)

这会返回一个数据框,将每个 divisionID 映射到其所有关联的 districtID 以及地区的友好名称。

由于两个数据框中都包含 divisionID,我可以对它们做内连接(inner join):

# Inner-join the two tables on divisionID so that every district row also
# carries the columns of the division it belongs to.
div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)

到目前为止,我一直使用会话(session)对象来复用 TCP 连接以提高效率。不幸的是,我在文档中找不到如何更新一个已打开的会话,使其能够发送带有动态请求正文参数的新请求。因此,我转而利用 furrr 的 future_map 系列函数,尝试通过并行处理来获得一些效率提升:

# Fetch the COVID payload for every (divisionID, districtID) row in parallel.
# future_map2() walks the two columns in lockstep, so each call receives the
# division and district of a single row; passing districtID through `...` of
# future_map() would hand the whole column to every call instead.
df <- div_district %>%
  mutate(json = furrr::future_map2(divisionID, districtID, .f = get_covid_data))

为了通过 get_covid_data() 获得最终的 covid 数据,我利用了服务器的一个可能有些奇怪的行为:我可以发出 GET 请求,在请求正文中传递 divisionID 和 districtID,然后用正则表达式提取出 jQuery DataTable 脚本的一部分,把该字符串清理成合法的 JSON 字符串,再将其解析为 JSON 对象,存入最终数据框的 json 列中。

json 列内部


R:

library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#> 
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#> 
#>     flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3

## to clean out everything before a run
# rm(list = ls(all = TRUE))
# invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE)) #  @mmfrgmpds

#' Extract value/text pairs from dropdown <option> nodes.
#'
#' @param css_selector CSS selector matching the <option> nodes to read.
#' @param level Column-name stem; e.g. "division" yields the columns
#'   `divisionID` (the option's value attribute) and `division` (its text).
#' @param page Parsed page or session to search. Defaults to the global
#'   `session`, so existing two-argument calls keep working unchanged.
#' @return A data frame with one row per matched option.
options_df <- function(css_selector, level, page = session) {
  nodes <- html_nodes(page, css_selector)
  map_df(
    nodes,
    ~ c(html_attr(., "value"), html_text(.)) %>%
      set_names(paste0(level, "ID"), level)
  )
}

#' Fetch the districts that belong to one division.
#'
#' Navigates the global `session` to the district AJAX endpoint for
#' `division_id` and reads the returned #district options.
#'
#' @param division_id Value of the division <option> to look up.
#' @param level Column-name stem passed on to `options_df()`.
#' @return A data frame of district ids and names for that division.
districts_from_updated_session <- function(division_id, level) {
  district_page <- jump_to(
    session,
    paste0(
      "http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id=",
      division_id
    )
  )
  # Pass the freshly fetched page explicitly: options_df() resolves `session`
  # in the global environment, so a local reassignment here would be ignored
  # and the original landing page would be parsed instead.
  options_df("#district option:nth-child(n+2)", level, page = district_page)
}

#' Download the latest-12-month COVID figures for one division/district pair.
#'
#' Requests the dashboard page, extracts the array literal passed to
#' `DataTable(...)` in the returned script, rewrites it into valid JSON and
#' parses it.
#'
#' @param divisionID Value of the division <option>.
#' @param districtID Value of the district <option>.
#' @return The parsed JSON as a nested list.
get_covid_data <- function(divisionID, districtID) {
  headers <- c(
    "user-agent" = "Mozilla/5.0",
    "if-modified-since" = "Wed, 08 Jul 2020 00:00:00 GMT" # to mitigate for caching
  )

  data <- list(
    "division" = divisionID,
    "district" = districtID,
    "period" = "LAST_12_MONTH",
    "Submit" = "Search"
  )

  r <- httr::GET(
    url = "http://103.247.238.92/webportal/pages/covid19.php",
    httr::add_headers(.headers = headers),
    body = data
  )

  # Backslashes are doubled because "\(" and "\s" are not valid escape
  # sequences inside an R string literal.
  table_literal <- stringr::str_match(
    httr::content(r, "text"),
    "DataTable\\((\\[[\\s\\S]+\\])\\)"
  )[1, 2]

  # Fail with a clear message instead of an opaque JSON parser error when the
  # page does not contain the expected script.
  if (is.na(table_literal)) {
    stop(
      "No DataTable(...) payload found for division ", divisionID,
      ", district ", districtID,
      call. = FALSE
    )
  }

  # Clean up the extracted string so that it can be parsed as valid JSON:
  # quote the bare `role` key, switch to double quotes, drop trailing commas.
  table_literal %>%
    gsub("role", '"role"', .) %>%
    gsub("'", '"', .) %>%
    gsub(",\\s+\\]", "]", .) %>%
    stringr::str_squish() %>%
    jsonlite::parse_json()
}


# Entry page of the dashboard and the headers sent with the initial request.
url <- "http://103.247.238.92/webportal/pages/covid19.php"
headers <- c("User-Agent" = "Mozilla/4.0", "Referer" = "http://103.247.238.92/webportal/pages/covid19.php")

session <- html_session(url, httr::add_headers(.headers = headers)) # for tcp re-use

# nth-child(n+2) excludes the initial 'select' option.
divisions <- options_df("#division option:nth-child(n+2)", "division")

# One lookup per division; rows are tagged with the division they belong to.
districts <- map_dfr(
  divisions$divisionID,
  function(division_id) {
    districts_from_updated_session(division_id, "district") %>%
      mutate(divisionID = division_id)
  }
)

div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)

# Leave one core free, but never ask for fewer than one worker.
no_cores <- max(1, future::availableCores() - 1)

future::plan(future::multisession, workers = no_cores)

# future_map2() pairs divisionID and districtID row by row; passing
# districtID through `...` of future_map() would hand the whole column to
# every call.
df <- div_district %>%
  mutate(json = future_map2(divisionID, districtID, .f = get_covid_data))

由 reprex 包 (v0.3.0) 于 2021-03-04 创建

Py

import requests, re, ast
from bs4 import BeautifulSoup as bs

def options_dict(soup, css_selector):
    """Map option text to option value for every node matching css_selector.

    Nodes whose ``value`` attribute is missing or empty (for example a
    placeholder entry) are skipped.

    :param soup: parsed document exposing ``select(css_selector)``.
    :param css_selector: CSS selector for the ``<option>`` nodes.
    :return: dict of ``{option text: option value}``.
    """
    # .get() tolerates an <option> with no value attribute, where indexing
    # with ['value'] would raise KeyError.
    return {
        node.text: node.get('value')
        for node in soup.select(css_selector)
        if node.get('value')
    }

def covid_numbers(text):
    """Extract the DataTable array literal from page text and evaluate it.

    Relies on the module-level compiled pattern ``p`` to locate the literal.

    :param text: response body containing a ``DataTable([...])`` call.
    :return: the array as nested Python lists/dicts.
    """
    raw_literal = p.findall(text)[0]
    # Quote the bare `role` key so the literal becomes valid Python syntax.
    quoted = raw_literal.replace("role", "'role'")
    # Remove each newline together with the indentation that follows it.
    flattened = re.sub(r'\n\s+', '', quoted)
    return ast.literal_eval(flattened)

url = 'http://103.247.238.92/webportal/pages/covid19.php'
regions = {}   # division name -> {district name: district id}
result = {}    # division name -> {district name: parsed covid data}
# Captures the array literal passed to DataTable(...) in the page script.
p = re.compile(r'DataTable\((\[[\s\S]+\])\)')

with requests.Session() as s:
    s.headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'http://103.247.238.92/webportal/pages/covid19.php'}
    soup = bs(s.get(url).content, 'lxml')
    divisions = options_dict(soup, '#division option')

    # Collect the districts offered for each division.
    for division_name, division_id in divisions.items():
        r = s.get(f'http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id={division_id}')
        soup = bs(r.content, 'lxml')
        regions[division_name] = options_dict(soup, '#district option')

    s.headers = {'User-Agent': 'Mozilla/5.0', 'if-modified-since': 'Wed, 08 Jul 2020 22:27:07 GMT'}

    for division_name, division_id in divisions.items():
        result[division_name] = {}
        # Iterate only over this division's districts and send the ids:
        # regions.items() yields (division name, district dict) pairs, which
        # are not valid division/district values for the request.
        for district_name, district_id in regions[division_name].items():
            data = {'division': division_id, 'district': district_id, 'period': 'LAST_12_MONTH', 'Submit': 'Search'}
            r = s.get('http://103.247.238.92/webportal/pages/covid19.php', data=data)
            result[division_name][district_name] = covid_numbers(r.text)