Web Scraper issue: can only parse strings

I recently wrote a 2-level scraper for the Hong Kong election platform site, and it works well: it lets me retrieve candidate information one district at a time. Here is the code:

from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls(self, response):
        raw_tree = etree.HTML(response)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        platform_urls = self.extract_info_urls(response)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()

That said, since I want to improve my skills, I tried to turn it into a 3-level scraper that grabs all politicians' platforms across the 18 districts in one run:

from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls_district(self, response):
        raw_tree = etree.HTML(response)
        district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
        scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
        # pdf_url = "../eng/intro_to_can/A.html"
        district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
        return district_urls

    def extract_info_urls_platform(self, district_urls):
        raw_tree = etree.HTML(district_urls)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        district_urls = self.extract_info_urls_district(response)
        platform_urls = self.extract_info_urls_platform(district_urls)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()

But it failed, and I'd like to know what I did wrong.

Full traceback:

Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
    runner.run()
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
    platform_urls = self.extract_info_urls_platform(district_urls)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
    raw_tree = etree.HTML(district_urls)
  File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings

Thanks for your help and time - looking forward to learning from this amazing community!

You are trying to hand content straight to the lxml parser without ever sending a request for it. I've also made some changes to your xpaths, though that wasn't strictly necessary, and I used generators for efficiency. Make sure you add your save_information method back into the script, because I had to kick it out to see what was going on.
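
In miniature, this is what the traceback is complaining about: etree.HTML() parses a single string (or bytes) of HTML, but your extract_info_urls_district returns a list of URLs, and run() feeds that list straight into the parser. A quick sketch to illustrate (the HTML snippet and URL here are just stand-ins):

from lxml import etree

etree.HTML('<html><body><p>ok</p></body></html>')  # fine: one document as a string
etree.HTML(['https://example.com/A.html'])         # ValueError: can only parse strings

The second call also exposes the deeper issue: a URL is not the page itself, so even a single URL string would just be parsed as (broken) HTML text. You have to request each district page and parse the response, which is what the reworked script below does: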

import csv
import time
import random
import requests
from lxml import etree
from urllib.parse import urljoin

class hongkongelection(object):

    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        r.raise_for_status()  # fail loudly on HTTP errors instead of returning ''
        return r.text

    def extract_info_urls_district(self, url):
        # Level 1: fetch the overview page, yield one absolute URL per district
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info_urls_platform(self, url):
        # Level 2: fetch a district page, yield each candidate's platform URL
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info(self, url):
        # Level 3: fetch one platform page and pull out the fields we care about
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        xpaths = {
            'namelist': '//*[@id="main"]/p[2]/span[2]/text()',
            'partylist': '//*[@id="main"]/p[5]/span[2]/text()',
            'message_list': '//*[@id="main"]/p[8]/span/text()',
        }
        return {key: [x.replace("\r\n", "") for x in raw_tree.xpath(xp)]
                for key, xp in xpaths.items()}

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                print(raw_json)
            time.sleep(random.randint(3, 8))  # polite random pause between districts


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
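
To persist the rows instead of just printing them, your original save_information can be dropped back in and called from run(). A minimal sketch of how the two pieces fit together - note the newline='' argument is my addition, to stop the csv module from writing blank rows on Windows:

    def save_information(self, raw_json):
        # newline='' keeps csv from emitting blank rows on Windows
        with open('platform.csv', 'a+', encoding='UTF-8', newline='') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:  # empty file: write the header row first
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                self.save_information(raw_json)
            time.sleep(random.randint(3, 8))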