抓取 HTML 表格时,部分字段返回空结果
Scraping an HTML table returns empty results for some fields
import scrapy
from scrapy.http import Request
class PushpaSpider(scrapy.Spider):
    """Crawl the exhibitor index page and emit one dict per followed page."""

    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']

    def parse(self, response):
        """Schedule a request for every exhibitor link on the index page.

        Links are the ``href`` attributes of anchors directly inside a
        ``div`` whose ``class`` attribute equals ``exbox-name``; each one is
        followed with ``parse_book`` as the callback.
        """
        hrefs = response.xpath("//div[@class='exbox-name']/a/@href")
        for href in hrefs:
            yield response.follow(href.get(), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict mapping each row's first-cell text to its second-cell text.

        Rows are taken from the table whose ``class`` attribute equals
        ``expo-table general-color``. A label that occurs more than once
        keeps the value of its last row.
        """
        details = {}
        for tr in response.xpath('//table[@class="expo-table general-color"]//tr'):
            label = tr.xpath('.//td[1]//text()').get(default='').strip()
            # Only text nodes that are direct children of the second cell are
            # selected here; text nested inside child elements is not picked up.
            fragments = tr.xpath('.//td[2]/text() ').getall()
            details[label] = ''.join(fragments).strip()
        yield details
我正在尝试抓取这个表格,但结果中 Telefono、Fax、E-mail、Membro di、Social 等字段的信息都是空的。请看下面的输出:
{'Indirizzo': 'Dr.-Auner-Str. 21a', 'Città': 'Raaba / Graz', 'Nazionalità': 'Austria', 'Sito web': '', 'Stand': 'Pad. 5 B22 C27', 'Telefono': '', 'Fax': '', 'E-mail': '', 'Social': ''}
页面链接是 http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh
电话、传真等字段的值位于 `<a>` 标签中,因此您需要调整 XPath 选择器来处理这种情况。
请参阅下面的示例:
import scrapy
class PushpaSpider(scrapy.Spider):
    """Crawl the exhibitor index page and emit one dict per followed page."""

    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']
    # Per-spider settings override: send a desktop Chrome user-agent string.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }

    def parse(self, response):
        """Schedule a request for every exhibitor link on the index page.

        Links are the ``href`` attributes of anchors directly inside a
        ``div`` whose ``class`` attribute equals ``exbox-name``; each one is
        followed with ``parse_book`` as the callback.
        """
        hrefs = response.xpath("//div[@class='exbox-name']/a/@href")
        for href in hrefs:
            yield response.follow(href.get(), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict mapping each row's first-cell text to its second-cell text.

        Rows are the ``tr`` elements directly under the table whose ``class``
        attribute equals ``expo-table general-color``. For the value, the
        second cell's own text nodes are used; when they are empty after
        stripping, the text of the cell's direct ``<a>`` children is used
        instead. A label that occurs more than once keeps the value of its
        last row.
        """
        details = {}
        for tr in response.xpath('//table[@class="expo-table general-color"]/tr'):
            label = tr.xpath('./td[1]//text()').get(default='').strip()
            text = ''
            # Queries are tried in order and the first non-empty result wins,
            # so the <a> lookup only runs when the cell has no direct text.
            for query in ('./td[2]/text() ', './td[2]/a/text() '):
                text = ''.join(tr.xpath(query).getall()).strip()
                if text:
                    break
            details[label] = text
        yield details
import scrapy
from scrapy.http import Request
class PushpaSpider(scrapy.Spider):
    """Crawl the exhibitor index page and emit one dict per followed page."""

    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']

    def parse(self, response):
        """Schedule a request for every exhibitor link on the index page.

        Links are the ``href`` attributes of anchors directly inside a
        ``div`` whose ``class`` attribute equals ``exbox-name``; each one is
        followed with ``parse_book`` as the callback.
        """
        hrefs = response.xpath("//div[@class='exbox-name']/a/@href")
        for href in hrefs:
            yield response.follow(href.get(), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict mapping each row's first-cell text to its second-cell text.

        Rows are taken from the table whose ``class`` attribute equals
        ``expo-table general-color``. A label that occurs more than once
        keeps the value of its last row.
        """
        details = {}
        for tr in response.xpath('//table[@class="expo-table general-color"]//tr'):
            label = tr.xpath('.//td[1]//text()').get(default='').strip()
            # Only text nodes that are direct children of the second cell are
            # selected here; text nested inside child elements is not picked up.
            fragments = tr.xpath('.//td[2]/text() ').getall()
            details[label] = ''.join(fragments).strip()
        yield details
我正在尝试抓取这个表格,但结果中 Telefono、Fax、E-mail、Membro di、Social 等字段的信息都是空的。请看下面的输出:
{'Indirizzo': 'Dr.-Auner-Str. 21a', 'Città': 'Raaba / Graz', 'Nazionalità': 'Austria', 'Sito web': '', 'Stand': 'Pad. 5 B22 C27', 'Telefono': '', 'Fax': '', 'E-mail': '', 'Social': ''}
页面链接是 http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh
电话、传真等字段的值位于 `<a>` 标签中,因此您需要调整 XPath 选择器来处理这种情况。
请参阅下面的示例:
import scrapy
class PushpaSpider(scrapy.Spider):
    """Crawl the exhibitor index page and emit one dict per followed page."""

    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']
    # Per-spider settings override: send a desktop Chrome user-agent string.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }

    def parse(self, response):
        """Schedule a request for every exhibitor link on the index page.

        Links are the ``href`` attributes of anchors directly inside a
        ``div`` whose ``class`` attribute equals ``exbox-name``; each one is
        followed with ``parse_book`` as the callback.
        """
        hrefs = response.xpath("//div[@class='exbox-name']/a/@href")
        for href in hrefs:
            yield response.follow(href.get(), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict mapping each row's first-cell text to its second-cell text.

        Rows are the ``tr`` elements directly under the table whose ``class``
        attribute equals ``expo-table general-color``. For the value, the
        second cell's own text nodes are used; when they are empty after
        stripping, the text of the cell's direct ``<a>`` children is used
        instead. A label that occurs more than once keeps the value of its
        last row.
        """
        details = {}
        for tr in response.xpath('//table[@class="expo-table general-color"]/tr'):
            label = tr.xpath('./td[1]//text()').get(default='').strip()
            text = ''
            # Queries are tried in order and the first non-empty result wins,
            # so the <a> lookup only runs when the cell has no direct text.
            for query in ('./td[2]/text() ', './td[2]/a/text() '):
                text = ''.join(tr.xpath(query).getall()).strip()
                if text:
                    break
            details[label] = text
        yield details