从 table 抓取特定文本
Scrape specific text from table
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow product links from an Amazon search page and scrape Brand/ASIN.

    ``parse`` schedules one request per product link found on the search
    results page; ``parse_book`` extracts the two values from the product
    page and yields them as a single item.
    """

    name = 'book'
    start_urls = ['https://www.amazon.sg/s?k=Measuring+Tools+%26+Scales&i=home&crid=1011S67HHJSEW&sprefix=measuring+tools+%26+scales%2Chome%2C408&ref=nb_sb_noss']

    # XPath template: find the <th> header cell whose text contains the given
    # label and take the text of the <td> sibling that follows it.
    _DETAIL_XPATH = (
        "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' "
        "and contains(text(), '{label}')]/following-sibling::td/text()"
    )

    def parse(self, response):
        """Yield a request for every product link (``//h2/a/@href``)."""
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            # Links on the results page may be relative; make them absolute.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one ``{'Brand': ..., 'Asin': ...}`` dict for the product page.

        The XPath expressions are document-wide (they start with ``//``), so
        looping over the table rows would only repeat the same lookup; each
        field is therefore queried exactly once. A field that is not present
        on the page is reported as ``None`` instead of raising.
        """
        yield {
            'Brand': self._detail(response, 'Brand'),
            'Asin': self._detail(response, 'ASIN'),
        }

    def _detail(self, response, label):
        """Return the cleaned cell text for *label*, or ``None`` if missing."""
        value = response.xpath(self._DETAIL_XPATH.format(label=label)).get()
        if value is None:
            # .get() returns None when nothing matches; avoid calling str
            # methods on it.
            return None
        # Drop the left-to-right mark found in the cell text, then trim
        # surrounding whitespace.
        return value.replace('\u200e', "").strip()
我只想从 table 中抓取 Brand 和 ASIN。
我要抓取的文本位于 product information 部分,示例页面 link:https://www.amazon.sg/Etekcity-Accurate-Measuring-Packages-Stainless/dp/B08BPB9T1N/ref=sr_1_1?crid=1011S67HHJSEW&keywords=Measuring%2BTools%2B%26%2BScales&qid=1643125635&s=home&sprefix=measuring%2Btools%2B%26%2Bscales%2Chome%2C408&sr=1-1&th=1
如果您只需要 Brand 和 ASIN,则无需遍历整个 table。您可以使用 xpath 直接 select 这些属性。一种方法是使用 following-sibling 轴。
# Each expression finds the <th> header cell whose text contains the label
# and selects the text of the <td> sibling that follows it.
brand_query = "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' and contains(text(), 'Brand')]/following-sibling::td/text()"
asin_query = "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' and contains(text(), 'ASIN')]/following-sibling::td/text()"

# .get() returns the first match.
brand = response.xpath(brand_query).get()
asin = response.xpath(asin_query).get()
您可能需要使用 str().strip() 稍微清理一下生成的文本。这个 xpath 的含义就是:“找到 class 正确且文本包含 'Brand' 或 'ASIN' 的 th 标签,然后取它后面同级的 td 标签并获取其中的文本。”
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow product links from an Amazon search page and scrape Brand/ASIN.

    ``parse`` schedules one request per product link found on the search
    results page; ``parse_book`` extracts the two values from the product
    page and yields them as a single item.
    """

    name = 'book'
    start_urls = ['https://www.amazon.sg/s?k=Measuring+Tools+%26+Scales&i=home&crid=1011S67HHJSEW&sprefix=measuring+tools+%26+scales%2Chome%2C408&ref=nb_sb_noss']

    # XPath template: find the <th> header cell whose text contains the given
    # label and take the text of the <td> sibling that follows it.
    _DETAIL_XPATH = (
        "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' "
        "and contains(text(), '{label}')]/following-sibling::td/text()"
    )

    def parse(self, response):
        """Yield a request for every product link (``//h2/a/@href``)."""
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            # Links on the results page may be relative; make them absolute.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one ``{'Brand': ..., 'Asin': ...}`` dict for the product page.

        The XPath expressions are document-wide (they start with ``//``), so
        looping over the table rows would only repeat the same lookup; each
        field is therefore queried exactly once. A field that is not present
        on the page is reported as ``None`` instead of raising.
        """
        yield {
            'Brand': self._detail(response, 'Brand'),
            'Asin': self._detail(response, 'ASIN'),
        }

    def _detail(self, response, label):
        """Return the cleaned cell text for *label*, or ``None`` if missing."""
        value = response.xpath(self._DETAIL_XPATH.format(label=label)).get()
        if value is None:
            # .get() returns None when nothing matches; avoid calling str
            # methods on it.
            return None
        # Drop the left-to-right mark found in the cell text, then trim
        # surrounding whitespace.
        return value.replace('\u200e', "").strip()
我只想从 table 中抓取 Brand 和 ASIN。
我要抓取的文本位于 product information 部分,示例页面 link:https://www.amazon.sg/Etekcity-Accurate-Measuring-Packages-Stainless/dp/B08BPB9T1N/ref=sr_1_1?crid=1011S67HHJSEW&keywords=Measuring%2BTools%2B%26%2BScales&qid=1643125635&s=home&sprefix=measuring%2Btools%2B%26%2Bscales%2Chome%2C408&sr=1-1&th=1
如果您只需要 Brand 和 ASIN,则无需遍历整个 table。您可以使用 xpath 直接 select 这些属性。一种方法是使用 following-sibling 轴。
# Each expression finds the <th> header cell whose text contains the label
# and selects the text of the <td> sibling that follows it.
brand_query = "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' and contains(text(), 'Brand')]/following-sibling::td/text()"
asin_query = "//th[@class='a-color-secondary a-size-base prodDetSectionEntry' and contains(text(), 'ASIN')]/following-sibling::td/text()"

# .get() returns the first match.
brand = response.xpath(brand_query).get()
asin = response.xpath(asin_query).get()
您可能需要使用 str().strip() 稍微清理一下生成的文本。这个 xpath 的含义就是:“找到 class 正确且文本包含 'Brand' 或 'ASIN' 的 th 标签,然后取它后面同级的 td 标签并获取其中的文本。”