I am trying to scrape an aspx website but am unable to get beyond page 2

I am trying to scrape an aspx site: https://www.aae.org/patients/find.aspx. For testing purposes, please use 33133 as the zip code and 100 as the radius.

Initially I collected the profile links by iterating over the search pages. I successfully got the first 20 links on the first page, but I cannot get beyond page 1; the page source says 'We're Sorry, the Page or File You Were Looking for Cannot be Found'.

Please see my code below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv

profile_links = []

def result_checker(tree):
    no_results = tree.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(no_results):
        print(str(no_results).replace("['", "").replace(".']", "") + " for other zipcodes")
        time.sleep(10)
        sys.exit()

def Get_data(zipcode, radius):
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
                'Host':'www.aae.org',
                'Origin':'https://www.aae.org',
                'Referer':'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
        ('EktronClientManager',EktronClientManager),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))

    encodedFields = urllib.parse.urlencode(formData)
    f1 = myopener.open(url, encodedFields)
    source = f1.read()
    with open('sample.txt', 'w') as target:  # dump the response for debugging
        target.write(source.decode('utf-8'))
    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" in each and "AddressID" in each:
            print(each)
            profile_links.append("https://www.aae.org/patients/" + str(each))

    j = 2
    soup2 = BeautifulSoup(source,'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$'+str(j)
        print (pages,'\n---------------')
        formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                    ('__EVENTARGUMENT',pages),
                    ('__VIEWSTATE',viewstate),
                    ('__EVENTVALIDATION',eventvalidation),
                    ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))

        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        with open('sample.txt', 'w') as target:  # overwrite the debug dump each page
            target.write(source2.decode('utf-8'))
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" in each1 and "AddressID" in each1:
                print(each1)
                profile_links.append("https://www.aae.org/patients/" + str(each1))
        soup3 = BeautifulSoup(source2,'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j+=1

if __name__ == "__main__":
    #Get_data('38132', 5)
    Get_data('33133', 100)

Yes Greg Sadetsky, you were absolutely right about the cookies: I needed to create a session and then send every POST request with the required data parameters.
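
To see why the cookies matter, this is the behavior the plain urllib opener was missing. A minimal sketch, independent of the site-specific form fields:

import requests

# A Session keeps any cookies the server sets on the first response,
# so follow-up requests are recognized as coming from the same visitor.
s = requests.Session()
s.get('https://www.aae.org/patients/find.aspx')  # server sets its session cookie here
print(s.cookies)  # the cookie jar is now populated
# every later s.get()/s.post() sends these cookies back automatically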

With the help of the requests library, I was able to create a session that stores the cookies and reuses them across requests:

import requests
from bs4 import BeautifulSoup
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    s = requests.Session()
    r = s.get(url)
    #print (r.text.encode('utf-8'))
    soup = BeautifulSoup(r.content,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']
    params = {'EktronClientManager':EktronClientManager,
              '__VIEWSTATE':viewstate,
              '__EVENTVALIDATION':eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius':radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode':zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind':'SEARCH'}
    r2 = s.post(url,data=params)
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" in each and "AddressID" in each:
            print(each)
            All_links.append("https://www.aae.org/patients/" + str(each))
    #print (r2.content)
    soup1 = BeautifulSoup(r2.content,'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
    j = 2
    while j < 7:
        page = 'Page$'+str(j)
        print (page)
        params1 = {'__EVENTTARGET':'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT':page,
                   'EktronClientManager':EktronClientManager,
                   '__VIEWSTATE':viewstate,
                   '__EVENTVALIDATION':eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search'}
        r3 = s.post(url,data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" in each1 and "AddressID" in each1:
                print(each1)
                All_links.append("https://www.aae.org/patients/" + str(each1))
        soup2 = BeautifulSoup(r3.content,'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j+=1

Get_data('33133', 100)
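
Since the three hidden WebForms fields have to be re-read from every response and echoed back in the next POST, the repeated BeautifulSoup lookups could be factored into a small helper. A sketch, assuming all three hidden inputs are present on every results page:

from bs4 import BeautifulSoup

def get_hidden_fields(content):
    # Pull the WebForms state fields out of a response body; they must be
    # sent back with the next POST or the server rejects the postback.
    soup = BeautifulSoup(content, 'lxml')
    return {name: soup.select_one('#' + name)['value']
            for name in ('__VIEWSTATE', '__EVENTVALIDATION', 'EktronClientManager')}

Each pagination request then becomes params1.update(get_hidden_fields(r3.content)) instead of three separate select calls.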