Python Playwright's async does not process all of the scraped pages
Scraping and parsing JavaScript pages with Playwright.
There are about 100 URLs, but the process ends before all of them have been handled.
What could be the cause of this?
The code works as it currently is.
Is the for statement in the wrong place?
I would appreciate it if you could tell me whether I am using async incorrectly.
I changed it to the current code.
It is run in Scrapy with the following command:

scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)

        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass

            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
I see .get() in some places, and it takes only the first item from a list - i.e. it gets the first therapist out of a list of ~250 therapists. Maybe this is why you get fewer results.
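A minimal, self-contained sketch of the difference, using Scrapy's own Selector:

from scrapy.selector import Selector

html = '<ul><li><a href="/t/1">A</a></li><li><a href="/t/2">B</a></li></ul>'
sel = Selector(text=html)

print(sel.xpath('//a/@href').get())     # '/t/1'  - only the first match
print(sel.xpath('//a/@href').getall())  # ['/t/1', '/t/2'] - every match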
I found that therapistlist.php?id=... uses JavaScript to read all the data as JSON from therapistlist.php?id=...&more (with &more at the end) and then renders the page. This way I read the therapist list as JSON data without Playwright, so I get the results much faster: about 800 therapists in about 1 minute.
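You can confirm this endpoint behaviour outside of Scrapy, e.g. with requests. A quick sketch - the id value here is a hypothetical placeholder, use one taken from a real /therapistlist.php link on the site:

import requests

# hypothetical id - substitute a value scraped from a real
# //div[@class="viewMore"]/a/@href link
url = 'https://men-esthe.jp/therapistlist.php?id=123&more'

data = requests.get(url).json()  # the '&more' variant returns JSON
print(type(data), len(data))     # expect a list of therapist records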
If you write the data as CSV, you may run into another problem. In a CSV all items have to have the same columns - if Scrapy sees {'therapist_name': ...} with a column therapist_name that the shop data did not have, then it skips it, and you can end up with a file that contains only the shops, without the therapists. I added the field therapist_name to the shop data, and now the CSV saves the therapists as well. (An alternative using FEED_EXPORT_FIELDS is sketched after the spider code below.)
import scrapy
from time import sleep
from scrapy.selector import Selector


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)

        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))

        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)

        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))

        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)

        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))

        for url in urls.getall():
            print('[parse_shop] url:', url)
            # '&more' makes the server return the full therapist list as JSON
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            # join all text nodes (the equivalent of BeautifulSoup's .text)
            'report': ''.join(response.css('div.abbr.uTxt ::text').getall()),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)

        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)
        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
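As an alternative to padding every item with empty fields, Scrapy's standard FEED_EXPORT_FIELDS setting lets you declare the full column list once, so the CSV exporter keeps every key no matter which item it sees first. A sketch, with the field names taken from the items yielded above:

class KutiSpider(scrapy.Spider):
    name = 'kuti'

    # tell the CSV exporter up front which columns to write,
    # so items with different key sets are not dropped
    custom_settings = {
        'FEED_EXPORT_FIELDS': [
            'shop_name', 'shop_url', 'area',
            'report-therapi-name', 'report', 'therapist_name',
        ],
    }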