Web Scraping Census Data

I'm trying to scrape data from the first table in the Educational Attainment section of the Statistical Atlas site, which is built on census data. Essentially, I want to scrape the percentages from that table and add them to a data frame that has zip codes down the left-hand side and separate columns for HS, no HS, and higher degree. I'm trying to do this for every zip code in New York City.

Here is the code I've come up with so far. Can you help me improve it so that I can loop through all of the zip codes and end up with a data frame that has a column for each education category for every NYC zip code?

Here is the link to the Statistical Atlas: https://statisticalatlas.com/place/New-York/New-York/Overview

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the NYC zip codes to scrape from the target spreadsheet
file_name = 'C:/Users/Nicholas_G/Desktop/Google Drive/Work/Free Lance/Political Targeting/Census Data.xlsx'
sheet_name = 'NYC Zip Target'
Census_Data = pd.read_excel(file_name, sheet_name=sheet_name)

zip_list = list(Census_Data['RESIDENTIAL_ZIP'])

url = "https://statisticalatlas.com/place/New-York/New-York/Overview"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
l = []

# Request each zip code's Educational Attainment page and locate the figure SVG
for a in zip_list:
    r = requests.get(f"https://statisticalatlas.com/zip/{a}/Educational-Attainment")
    s = BeautifulSoup(r.text, 'lxml')
    data = s.find('svg', {'viewBox': '0 0 400 79'})
    value = data.find('svg', {'fill': '#000'})
    l.append(value)
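
A minimal sketch of where this could go next, assuming the percentage labels are the text of the nested <svg fill="#000"> nodes inside that figure (an assumption, untested against the live page); it reuses the imports and zip_list above and stores the label text per zip code rather than the element itself:

# Hypothetical reworking of the loop above: collect the text of every nested
# <svg fill="#000"> node inside the figure's outer SVG, assuming that is where
# the percentage labels are rendered.
rows = []
for a in zip_list:
    r = requests.get(f"https://statisticalatlas.com/zip/{a}/Educational-Attainment")
    s = BeautifulSoup(r.text, 'lxml')
    data = s.find('svg', {'viewBox': '0 0 400 79'})
    if data is None:
        continue  # skip zips whose page doesn't have this figure
    labels = [t.get_text(strip=True) for t in data.find_all('svg', {'fill': '#000'})]
    rows.append([a] + labels)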

I'm not too familiar with multiprocessing, otherwise I would have gone that route, but here is my version using Session:

import requests
import pandas as pd
from bs4 import BeautifulSoup

urlMain = 'https://statisticalatlas.com/place/New-York/New-York/Overview'
urlAttainment = 'https://statisticalatlas.com/zip/{}/Educational-Attainment'

def getPercentages(session, url):
    # The three overall percentages (higher degree, HS, no HS) sit in <title>
    # elements inside the educational-attainment figure's SVG bars
    res = session.get(url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.content, "lxml")
        percentages = soup.select('[id="figure/educational-attainment"] rect title')
        percentages = [percentages[0].text, percentages[2].text, percentages[4].text]
        return percentages
    else:
        print(res.status_code, url)
        return []

def getCodes(session, url):
    # Collect every zip code linked from the overview page's info table
    res = session.get(url)
    soup = BeautifulSoup(res.content, "lxml")
    codes = [code.text for code in soup.select('.info-table-contents-div a[href*=zip]')]
    return codes

results = []

with requests.Session() as s:
    zipcodes = getCodes(s, urlMain)

    for zipcode in zipcodes:
        try:
            row = getPercentages(s, urlAttainment.format(zipcode))
            row.insert(0, zipcode)
            results.append(row)
        except IndexError as ex:
            print(ex, urlAttainment.format(zipcode))

df = pd.DataFrame(results, columns=['zipcode', 'HD', 'HS', 'NoHS'])
print(df)
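
On the multiprocessing point: since the work here is I/O-bound, plain threads are usually enough. Below is a rough sketch of a threaded variant using concurrent.futures that reuses the two helpers defined above; the worker count and the choice to pass the plain requests API instead of a shared Session are my own assumptions, not anything required by the site:

from concurrent.futures import ThreadPoolExecutor

def fetch_row(zipcode):
    # Fetch one zip code's percentages; passes the requests module itself in
    # place of a Session so no Session object is shared across threads.
    try:
        row = getPercentages(requests, urlAttainment.format(zipcode))
        row.insert(0, zipcode)
        return row
    except IndexError as ex:
        print(ex, urlAttainment.format(zipcode))
        return None

with requests.Session() as s:
    zipcodes = getCodes(s, urlMain)

with ThreadPoolExecutor(max_workers=8) as pool:
    # Keep only complete rows (zipcode plus three percentages)
    rows = [r for r in pool.map(fetch_row, zipcodes) if r and len(r) == 4]

df = pd.DataFrame(rows, columns=['zipcode', 'HD', 'HS', 'NoHS'])
print(df)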