How to scrape data via scrapy python correctly from a dynamically(?) created table
I am currently trying to scrape company overviews from alibaba.com.
To get e.g. the company name I did:
response.xpath("//a[@class='company-name company-name-lite-vb']/text()").extract()
which works fine.
When going to Company Overview > Company Profile and trying to scrape information from the table with:
response.xpath("//div/div[@class='content-value']").extract()
I get an empty array.
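One quick way to confirm the table is rendered dynamically is to check whether the value ever appears in the raw HTML Scrapy receives. A diagnostic sketch, run inside a Scrapy shell session (scrapy shell <product url>):

# If the label never occurs in the raw response body, the table is filled in
# by JavaScript and a plain scrapy.Request will never see it.
"Year Established" in response.text
# Likewise, this returning None against the raw response points to
# client-side rendering rather than (only) a wrong selector.
response.xpath("//div/div[@class='content-value']").get()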
resources/search_results_searchpage.yml:
products:
    css: 'div[data-content="productItem"]'
    multiple: true
    type: Text
    children:
        link:
            css: a.elements-title-normal
            type: Link
crawler.py:
import scrapy
import csv
# from scrapy_selenium import SeleniumRequest  # only needed when using selenium
import os
from selectorlib import Extractor


class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(
        os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage)
            # yield SeleniumRequest(url=parsed_url, callback=self.crawl_mainpage)

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }
Does anyone know what I can do to get Year of Est. populated?
I tried using scrapy_selenium, and configured it correctly as far as I can tell, since I suspect the element is generated dynamically, but still no luck; possibly I am also using it wrong.
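For reference, a minimal scrapy_selenium setup looks roughly like the sketch below (assumptions: Chrome, a chromedriver binary on disk, and scrapy-selenium's documented settings; the path and wait time are placeholders):

# settings.py (sketch)
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/path/to/chromedriver'  # placeholder path
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}

# in the spider, replace scrapy.Request with:
from scrapy_selenium import SeleniumRequest
yield SeleniumRequest(url=parsed_url, callback=self.crawl_mainpage, wait_time=10)

Note that even with this in place, a plain SeleniumRequest only loads the page; it does not click the Company Profile tab by itself.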
The crawler is invoked with:
scrapy crawl alibaba_crawler -o out.csv -t csv
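On recent Scrapy versions the -t flag is deprecated; the output format is inferred from the file extension, and -O overwrites instead of appending:

scrapy crawl alibaba_crawler -O out.csv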
Your xpath selector is incorrect. Try this:
'Year of Est.': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
I also noticed some errors in your code; for example, the line below will throw an error. You may want to recheck how you are extracting links from the search page.
data = self.link_extractor.extract(response.text, base_url=response.url)
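One failure mode here: if the CSS selector in the YAML no longer matches the live search page, selectorlib maps the key to None and the for loop raises a TypeError. A defensive variant (a sketch, assuming selectorlib's behavior of returning None for unmatched keys):

data = self.link_extractor.extract(response.text, base_url=response.url)
for product in (data.get('products') or []):  # skip cleanly when nothing matched
    parsed_url = product['link']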
Edit:
The year of establishment is only loaded after the Company Profile tab is clicked, so you have to simulate that click using selenium or scrapy-playwright. A simple implementation using scrapy-playwright is shown below.
import scrapy
from scrapy.crawler import CrawlerProcess
import os
from selectorlib import Extractor
from scrapy_playwright.page import PageCoroutine


class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(
        os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage, meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "click": PageCoroutine("click", selector="//span[@title='Company Profile']"),
                },
            })

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    process.crawl(Spider)
    process.start()
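Note that newer scrapy-playwright releases renamed this API: PageCoroutine became PageMethod and the meta key became playwright_page_methods. If the PageCoroutine import fails, the equivalent spelling (verify against your installed version) is:

from scrapy_playwright.page import PageMethod

meta = {
    "playwright": True,
    "playwright_page_methods": [
        PageMethod("click", selector="//span[@title='Company Profile']"),
    ],
}

You also need pip install scrapy-playwright followed by playwright install to download the browser binaries.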
A sample run of the crawler using python crawler.py shows the year 2010 in the output.