Trying to scrape the table using scrapy

from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        table = response.xpath("//table[@id='productDetails_detailBullets_sections1']").extract_first()
        yield {
            't': table
        }

I am trying to scrape the table, but I don't know how to extract the text from it. I'm trying to scrape the product information. This is the link of the page I'm extracting the table from: https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1

This will only work for this kind of table, like the one you encountered in your question:

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            url = response.urljoin(book)
            # just for the example
            url='https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1'
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//div[@id="prodDetails"]//tr')
        table = {}
        for row in rows:
            key = row.xpath('.//th//text()').get(default='').strip()
            # this will work for most of the rows (except "Customer Reviews" and "Best Sellers Rank"):
            # value = row.xpath('.//td//text()').get(default='').strip()
            
            # this will work for all the rows
            value = row.xpath('.//td/text() | .//td//span/text()').getall()
            value = ''.join(value).strip()
            table[key] = value

        yield table

This is just an example. You will need to check the different kinds of tables you can get and adjust your code accordingly.
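As a rough sketch of what that adjustment could look like, parse_book could fall back between the two layouts seen so far (the first selector is the one used above, the second is the one from your question); treat the selector list as an assumption to verify against the pages you actually receive:

    def parse_book(self, response):
        # Hypothetical fallback between the two table layouts seen so far;
        # add more selectors here as you discover other layouts.
        rows = (response.xpath('//div[@id="prodDetails"]//tr')
                or response.xpath('//table[@id="productDetails_detailBullets_sections1"]//tr'))
        table = {}
        for row in rows:
            key = row.xpath('.//th//text()').get(default='').strip()
            value = ''.join(row.xpath('.//td/text() | .//td//span/text()').getall()).strip()
            if key:
                table[key] = value
        yield table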

What we are doing is going over the table row by row, extracting the text from each row, adding it to a dictionary, and finally yielding that dictionary.
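If it helps to see that logic in isolation, here is a small self-contained example with a made-up HTML snippet (no Amazon request involved), using the same row/th/td pattern:

from scrapy.selector import Selector

# Made-up table, just to demonstrate the row-by-row extraction into a dict.
html = """
<table>
  <tr><th>Item Weight</th><td>2.2 pounds</td></tr>
  <tr><th>Department</th><td><span>Unisex-adult</span></td></tr>
</table>
"""

sel = Selector(text=html)
table = {}
for row in sel.xpath('//tr'):
    key = row.xpath('.//th//text()').get(default='').strip()
    value = ''.join(row.xpath('.//td/text() | .//td//span/text()').getall()).strip()
    table[key] = value

print(table)  # {'Item Weight': '2.2 pounds', 'Department': 'Unisex-adult'}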

To get the text we are using /text(). Search for an XPath cheat sheet; it will help you.
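For example, /text() only returns the text placed directly inside a node, while //text() also includes the text of its descendants (which is why rows like "Customer Reviews" need the span texts). A quick illustration with made-up markup:

from scrapy.selector import Selector

# Made-up markup, just to show the difference between /text() and //text().
sel = Selector(text='<div>4.5 <span>out of 5 stars</span></div>')

print(sel.xpath('//div/text()').getall())   # ['4.5 ']
print(sel.xpath('//div//text()').getall())  # ['4.5 ', 'out of 5 stars']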

To scrape the table, you can iterate over the table headers and table data, assign them to keys and values, and then yield the complete dictionary. See the example below:

from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        details = {}
        for product_detail in response.xpath("//*[contains(@id,'productDetails')]//table/tr"):
            key = product_detail.xpath("normalize-space(./th/text())").get()
            value = product_detail.xpath("normalize-space(./td/text())").get().replace("\u200e", "")
            if "best sellers rank" in key.lower():
                det_list = product_detail.xpath("./td/descendant::*/text()").getall()
                value = "".join([i.strip() for i in det_list])
            if "customer reviews" in key.lower():
                det_list = product_detail.xpath("./td/descendant::span/text()").getall()
                value = " ".join([i.strip() for i in det_list])
            details[key] = value
        yield details
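Both approaches yield one plain dict per product, so you can export the results with Scrapy's normal feed export, e.g. scrapy crawl book -o details.json (the output file name here is just an example).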