Getting the text corresponding to each tag
I am trying to scrape some data from the left-side column of a web page. The aim is to use scrapy_playwright to click all of the show more buttons and grab the header of each list that a show more button belongs to. However, when I run my scraper it repeats the same header, Make, for every list. I need the unique header for each group of lists.
Here is my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine
class ConfusedItem(scrapy.Item):
clicks = Field(output_processor = TakeFirst())
category = Field(output_processor = TakeFirst())
class ConfusedSpider(scrapy.Spider):
name = 'confused'
allowed_domains = ['x']
start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
custom_settings = {
'User_Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'DOWNLOAD_DELAY':0.5
}
def start_requests(self):
for url in self.start_urls:
for i in range(0, 11):
yield scrapy.Request(
url = url,
callback = self.parse,
meta= dict(
playwright = True,
playwright_include_page = True,
playwright_page_coroutines = [
PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
PageCoroutine("wait_for_timeout", 5000),
]
),
)
def parse(self, response):
container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
test= response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
for items in container:
for values in test:
loader = ItemLoader(ConfusedItem(), selector = items)
loader.add_xpath('clicks', './/@onclick')
loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'json_data.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ConfusedSpider)
process.start()
Output:
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Expected output:
{'category': 'SELLER TYPE',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
There are two problems with your code. First, your xpath selectors are incorrect. Second, you are not actually using scrapy-playwright, so the clicks never happen. Looping over a changing item index is also wrong: once you click a show more element it is removed from the DOM, so the next element moves up to the first index. In addition, to enable scrapy-playwright you need at least the following extra settings:
'DOWNLOAD_HANDLERS': {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
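For reference, the scrapy-playwright README registers the download handler for both URL schemes. The https entry is the one that matters for this start URL, but the fuller variant looks like this:
'DOWNLOAD_HANDLERS': {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",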
I have corrected both problems in the code below. You will still want to add some error handling, and to find a better way of determining how many clicks the code should perform (see the sketch at the end of this answer).
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine
class ConfusedItem(scrapy.Item):
clicks = Field(output_processor=TakeFirst())
category = Field(output_processor=TakeFirst())
class ConfusedSpider(scrapy.Spider):
name = 'confused'
allowed_domains = ['x']
start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',  # settings keys are case-sensitive: USER_AGENT, not User_Agent
'DOWNLOAD_DELAY': 0.5
}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse,
meta=dict(
playwright=True,
playwright_page_coroutines=[
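# Each click removes the clicked "show more" toggle from the DOM,
# so clicking the same selector repeatedly walks through all of them.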
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
]
),
)
def parse(self, response):
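# Each child div of #face_links is one filter group: an h2 header
# followed by a ul of clickable options.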
for category in response.xpath("//div[@id='face_links']/div"):
name = category.xpath("./h2/text()").get()
for item in category.xpath("./ul/li"):
loader = ItemLoader(ConfusedItem(), selector=item)
loader.add_xpath('clicks', './@onclick')
loader.add_value("category", name)
yield loader.load_item()
process = CrawlerProcess(
settings={
'FEED_URI': 'json_data.jl',
'FEED_FORMAT': 'jsonlines',
'DOWNLOAD_HANDLERS': {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 60000,
"PLAYWRIGHT_BROWSER_TYPE": "webkit"
}
)
process.crawl(ConfusedSpider)
process.start()
With these changes the sample output shows the correct category for each group of lists, matching the expected output above.
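Finally, on the question of how many clicks to perform: one option (a sketch only, not tested against this site) is to request the page with playwright_include_page=True and drive the clicking from an async callback, looping until no show more toggles remain. Note that newer scrapy-playwright releases rename PageCoroutine to PageMethod; the 500 ms wait below is an illustrative guess.
async def parse(self, response):
    # requires meta={"playwright": True, "playwright_include_page": True}
    page = response.meta["playwright_page"]
    try:
        toggles = page.locator("//div[@class='toggle-bottom-filter']")
        # each click removes a toggle from the DOM, so loop until none are left
        while await toggles.count() > 0:
            await toggles.first.click()
            await page.wait_for_timeout(500)  # give the list time to expand
        html = await page.content()
    finally:
        await page.close()
    # parse the fully expanded HTML with the same xpaths as above
    for category in scrapy.Selector(text=html).xpath("//div[@id='face_links']/div"):
        ...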