使用 scrapy 找不到 Xpath
Xpath not found using scrapy
我想提取 email 和 phone,但找不到它们的 XPath,目前只能获取到 website 的 XPath。
这是我要提取数据的页面链接:https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Crawl the member listing and yield each member's external website link."""

    name = 'pushpa'
    # The query string must read "&section=461". The pasted URL had "§ion=461",
    # i.e. "&sect" decoded as the HTML entity for the section sign, so the
    # `section` parameter was never sent.
    start_urls = [
        'https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance'
        '?state=All&section=461&class=All&lang=All'
    ]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Request every detail page linked from the listing's title column."""
        member_links = response.xpath(
            "//td[@class='views-field views-field-title']//@href"
        ).getall()
        for href in member_links:
            # hrefs may be relative, so resolve them against the listing URL.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the external website link found on one member detail page."""
        website = response.xpath(
            "//a[@class='field__item link link--external']//@href"
        ).get()
        # .get() returns None when nothing matches, so 'website' may be None.
        yield {'website': website}
现在可以正常工作了:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Crawl the member listing and yield website, phone and email per member."""

    name = 'pushpa'
    # The query string must read "&section=461". The pasted URL had "§ion=461",
    # i.e. "&sect" decoded as the HTML entity for the section sign, so the
    # `section` parameter was never sent.
    start_urls = [
        'https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance'
        '?state=All&section=461&class=All&lang=All'
    ]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Request every detail page linked from the listing's title column."""
        member_links = response.xpath(
            "//td[@class='views-field views-field-title']//@href"
        ).getall()
        for href in member_links:
            # hrefs may be relative, so resolve them against the listing URL.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website link, phone and email of one member detail page."""
        website = response.xpath(
            "//a[@class='field__item link link--external']//@href"
        ).get()
        # normalize-space() takes the string value of the first matched node
        # and trims/collapses its whitespace. The text()[2] positions depend
        # on the page's current markup -- re-check them if the layout changes.
        phone = response.xpath(
            'normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])'
        ).get()
        email = response.xpath(
            'normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])'
        ).get()
        yield {
            'website': website,
            'phone': phone,
            'email': email,
        }
我想提取 email 和 phone,但找不到它们的 XPath,目前只能获取到 website 的 XPath。
这是我要提取数据的页面链接:https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Walk the member directory listing and emit each member's website URL."""

    name = 'pushpa'
    # "§ion=461" in the pasted URL was a mangled "&section=461" ("&sect" had
    # been turned into the section-sign character); restored here so the
    # `section` query parameter is actually transmitted.
    start_urls = [
        'https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance'
        '?state=All&section=461&class=All&lang=All'
    ]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Schedule one request per link found in the title cells of the listing."""
        title_cell_hrefs = "//td[@class='views-field views-field-title']//@href"
        for href in response.xpath(title_cell_hrefs).getall():
            # urljoin resolves the href against the URL of the current response.
            detail_url = response.urljoin(href)
            yield Request(detail_url, callback=self.parse_book)

    def parse_book(self, response):
        """Emit a single item holding the detail page's external website link."""
        external_link = "//a[@class='field__item link link--external']//@href"
        # 'website' is None when the page has no matching external link.
        yield {'website': response.xpath(external_link).get()}
现在可以正常工作了:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Walk the member directory listing and emit website, phone and email."""

    name = 'pushpa'
    # "§ion=461" in the pasted URL was a mangled "&section=461" ("&sect" had
    # been turned into the section-sign character); restored here so the
    # `section` query parameter is actually transmitted.
    start_urls = [
        'https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance'
        '?state=All&section=461&class=All&lang=All'
    ]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Schedule one request per link found in the title cells of the listing."""
        title_cell_hrefs = "//td[@class='views-field views-field-title']//@href"
        for href in response.xpath(title_cell_hrefs).getall():
            # urljoin resolves the href against the URL of the current response.
            detail_url = response.urljoin(href)
            yield Request(detail_url, callback=self.parse_book)

    def parse_book(self, response):
        """Emit one item with the detail page's website link, phone and email."""
        # Each key maps to the XPath evaluated for it. The normalize-space()
        # expressions return a whitespace-trimmed string; their text()[2]
        # positions are tied to the page's current markup.
        field_queries = {
            'website': "//a[@class='field__item link link--external']//@href",
            'phone': 'normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])',
            'email': 'normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])',
        }
        yield {
            field: response.xpath(query).get()
            for field, query in field_queries.items()
        }