尝试使用 scrapy 抓取 table
Trying to scrape the table using scrapy
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Crawl an Amazon search-results page and, for every product link
    found, yield the raw HTML of its product-details table."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # Every product title on the results page is an <h2><a href=...>.
        # getall() is the modern replacement for the deprecated extract().
        books = response.xpath("//h2/a/@href").getall()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        # get() replaces the deprecated extract_first(); it returns None
        # when the table is absent instead of raising.
        table = response.xpath(
            "//table[@id='productDetails_detailBullets_sections1']"
        ).get()
        yield {'t': table}
我正在尝试抓取 table 但我不知道如何从 table 中提取 text
试图抓取 product information
这是我从中提取 table 的 link:
https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
它只适用于这种 table,就像您在问题中遇到的那样:
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Crawl an Amazon search-results page and scrape each product's
    details table into a flat {header: value} dict, yielded per product."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # getall() is the modern replacement for the deprecated extract().
        books = response.xpath("//h2/a/@href").getall()
        for book in books:
            url = response.urljoin(book)
            # just for the example
            url = 'https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1'
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//div[@id="prodDetails"]//tr')
        table = {}
        for row in rows:
            # default='' keeps .strip() safe when a row has no <th>.
            key = row.xpath('.//th//text()').get(default='').strip()
            # this will work for most of the rows (except "Customer Reviews" and "Best Sellers Rank"):
            # value = row.xpath('.//td//text()').get(default='').strip()
            # this will work for all the rows
            value = row.xpath('.//td/text() | .//td//span/text()').getall()
            value = ''.join(value).strip()
            table[key] = value
        yield table
这只是一个例子。您需要检查您可以获得的不同类型的 table 并相应地调整您的代码。
我们正在做的是逐行检查 table 并从中提取文本,然后将其添加到字典中并最终生成它。
要获取文本,我们使用 /text()。搜索 xpath cheat sheet,它会帮助你。
要抓取 table,您可以遍历 table header 和 table 数据并将它们分配给键和值,然后生成完整的字典。请参阅下面的示例
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Crawl an Amazon search-results page, then walk every row of each
    product's details table, pairing <th> headers with <td> values, and
    yield the completed dict per product."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # getall() is the modern replacement for the deprecated extract().
        books = response.xpath("//h2/a/@href").getall()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        details = {}
        for product_detail in response.xpath("//*[contains(@id,'productDetails')]//table/tr"):
            # default='' guards rows that lack a <th> or <td>: .get()
            # would otherwise return None and .lower()/.replace() below
            # would raise AttributeError.
            key = product_detail.xpath("normalize-space(./th/text())").get(default='')
            value = product_detail.xpath("normalize-space(./td/text())").get(default='').replace("\u200e", "")
            if "best sellers rank" in key.lower():
                # Rank spans several nested elements; gather all text.
                det_list = product_detail.xpath("./td/descendant::*/text()").getall()
                value = "".join(i.strip() for i in det_list)
            if "customer reviews" in key.lower():
                # Review widget text lives in nested <span> elements.
                det_list = product_detail.xpath("./td/descendant::span/text()").getall()
                value = " ".join(i.strip() for i in det_list)
            details[key] = value
        yield details
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Follow every product link on an Amazon search-results page and
    yield the raw HTML of that product's details table."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # getall() supersedes the deprecated extract().
        for book in response.xpath("//h2/a/@href").getall():
            yield Request(response.urljoin(book), callback=self.parse_book)

    def parse_book(self, response):
        # get() supersedes the deprecated extract_first(); yields None
        # for 't' when the table is not present on the page.
        table = response.xpath(
            "//table[@id='productDetails_detailBullets_sections1']"
        ).get()
        yield {'t': table}
我正在尝试抓取 table 但我不知道如何从 table 中提取 text
试图抓取 product information
这是我从中提取 table 的 link:
https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
它只适用于这种 table,就像您在问题中遇到的那样:
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Scrape each product's details table row by row into a flat
    {header: value} dict and yield one dict per product page."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # getall() supersedes the deprecated extract().
        books = response.xpath("//h2/a/@href").getall()
        for book in books:
            url = response.urljoin(book)
            # just for the example
            url = 'https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1'
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//div[@id="prodDetails"]//tr')
        table = {}
        for row in rows:
            # default='' keeps .strip() safe when a row has no <th>.
            key = row.xpath('.//th//text()').get(default='').strip()
            # this will work for most of the rows (except "Customer Reviews" and "Best Sellers Rank"):
            # value = row.xpath('.//td//text()').get(default='').strip()
            # this will work for all the rows
            value = row.xpath('.//td/text() | .//td//span/text()').getall()
            value = ''.join(value).strip()
            table[key] = value
        yield table
这只是一个例子。您需要检查您可以获得的不同类型的 table 并相应地调整您的代码。
我们正在做的是逐行检查 table 并从中提取文本,然后将其添加到字典中并最终生成它。
要获取文本,我们使用 /text()。搜索 xpath cheat sheet,它会帮助你。
要抓取 table,您可以遍历 table header 和 table 数据并将它们分配给键和值,然后生成完整的字典。请参阅下面的示例
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Iterate the product-details table of each product found on an
    Amazon search page, mapping <th> headers to <td> values, and yield
    the resulting dict."""

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        # getall() supersedes the deprecated extract().
        for book in response.xpath("//h2/a/@href").getall():
            yield Request(response.urljoin(book), callback=self.parse_book)

    def parse_book(self, response):
        details = {}
        for product_detail in response.xpath("//*[contains(@id,'productDetails')]//table/tr"):
            # default='' guards rows lacking a <th>/<td>: .get() would
            # otherwise return None and .lower()/.replace() would raise.
            key = product_detail.xpath("normalize-space(./th/text())").get(default='')
            value = product_detail.xpath("normalize-space(./td/text())").get(default='').replace("\u200e", "")
            if "best sellers rank" in key.lower():
                # Rank text is spread over nested elements; join it all.
                det_list = product_detail.xpath("./td/descendant::*/text()").getall()
                value = "".join(i.strip() for i in det_list)
            if "customer reviews" in key.lower():
                # Review widget text lives in nested <span> elements.
                det_list = product_detail.xpath("./td/descendant::span/text()").getall()
                value = " ".join(i.strip() for i in det_list)
            details[key] = value
        yield details