Web Scraper issue: can only parse strings
I recently wrote a 2-level scraper for the Hong Kong election platform site, and it works well. The code lets me retrieve information on a per-district basis. Here it is:
from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls(self, response):
        raw_tree = etree.HTML(response)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        platform_urls = self.extract_info_urls(response)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
That said, since I want to improve my skills, I tried to build a 3-level scraper that grabs all politicians' platforms in the 18 districts in one go:
class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls_district(self, response):
        raw_tree = etree.HTML(response)
        district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
        scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
        # pdf_url = "../eng/intro_to_can/A.html"
        district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
        return district_urls

    def extract_info_urls_platform(self, district_urls):
        raw_tree = etree.HTML(district_urls)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: list[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        district_urls = self.extract_info_urls_district(response)
        platform_urls = self.extract_info_urls_platform(district_urls)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
But it failed, and I would like to know what I did wrong.
Full traceback:
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
    runner.run()
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
    platform_urls = self.extract_info_urls_platform(district_urls)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
    raw_tree = etree.HTML(district_urls)
  File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings
Thanks for your help and time, and looking forward to learning from this amazing community!
You are trying to feed content to the lxml parser without ever requesting it: in run, the whole list of district URLs is passed to extract_info_urls_platform, which hands that list straight to etree.HTML, and lxml can only parse strings (or bytes) of markup. Each district page has to be fetched first and its response text parsed. In the reworked script below, every extraction method therefore takes a URL, requests it, and parses the response. I have also made some changes to your XPaths, though that was not strictly necessary, and I used generators for efficiency. Make sure to add your save_information method back into the script, as I had to kick it out to see what was going on. First, a minimal repro of the exception itself:
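from lxml import etree

etree.HTML('<p>hello</p>')        # OK: a string of markup parses fine
etree.HTML(['a.html', 'b.html'])  # ValueError: can only parse strings

And here is the reworked script: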
import csv
import time
import random
import requests
from lxml import etree
from typing import List
from urllib.parse import urljoin


class hongkongelection(object):
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        r.raise_for_status()
        return r.text

    def extract_info_urls_district(self, url):
        # level 1: the index page; yields an absolute URL for each district
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info_urls_platform(self, url):
        # level 2: a district page; yields an absolute URL for each candidate's platform
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info(self, url):
        # level 3: a platform page, parsed into a dict
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                print(raw_json)
                time.sleep(random.randint(3, 8))


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
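If you would rather keep your original three-method structure, the smallest fix is the same idea applied inside run: fetch each district page before handing it to extract_info_urls_platform. A sketch, assuming the rest of your class stays exactly as you posted it:

    def run(self):
        response = self.send_request(self.url)
        district_urls = self.extract_info_urls_district(response)
        for district_url in district_urls:
            # fetch the district page first; extract_info_urls_platform
            # needs the page's HTML text, not a list of URLs
            district_response = self.send_request(district_url)
            platform_urls = self.extract_info_urls_platform(district_response)
            for url in platform_urls:
                info_response = self.send_request(url)
                raw_json = self.extract_info(info_response)
                raw_json['platform_url'] = url
                self.save_information(raw_json)

One unrelated tip while you are in there: the csv docs recommend opening the output file with newline='' (i.e. open('platform.csv', 'a+', encoding='UTF-8', newline='')), otherwise DictWriter can produce blank rows on Windows.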