从电子邮件中提取数据
Extract data from email
我正在从网页中提取 email 数据,但得到的输出带有 HTML 标签(如下所示),而我只想得到 email 地址本身。
这是 link https://www.wlw.at/de/firma/hacobau-hallen-und-containersysteme-gmbh-1373570
实际得到的输出是这样的:
'email': '<span data-v-3743af0a data-v-605c4f02>sales@mm-holz.com</span>'
这是我的代码
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Spider that walks a wlw.at search page and visits each company link.

    For every company page it emits a dict with the company's website URL
    and the contents of the e-mail element.
    """

    name = 'pushpa'
    start_urls = ['https://www.wlw.at/de/suche?q=hallenbau']

    def parse(self, response):
        """Schedule one request per company link found on the search page."""
        link_query = "//div[@class='company-title-link-wrap']/a/@href"
        for href in response.xpath(link_query).extract():
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website and e-mail fields scraped from a company page."""
        website_hits = response.xpath("//a[@id='location-and-contact__website']//@href")
        # There is no text() step here, so .get() serialises the whole <span>
        # element (tags and attributes included) rather than just its text.
        email_hits = response.xpath("//a[@id='location-and-contact__email']//span")
        item = {
            'website': website_hits.get(),
            'email': email_hits.get(),
        }
        yield item
只需在选择器的 XPath 表达式末尾添加 text() 即可:这样返回的是 <span> 内的文本,而不是整个元素的 HTML。
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Spider that walks a wlw.at search page and visits each company link.

    For every company page it emits a dict with the company's website URL
    and its e-mail address as plain text.
    """

    name = 'pushpa'
    start_urls = ['https://www.wlw.at/de/suche?q=hallenbau']

    def parse(self, response):
        """Schedule one request per company link found on the search page."""
        # .getall() is the current counterpart of .get(), used below; the
        # legacy .extract() alias is avoided so the class uses one API style.
        hrefs = response.xpath("//div[@class='company-title-link-wrap']/a/@href").getall()
        for href in hrefs:
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website URL and the plain-text e-mail of a company page.

        Either field is ``None`` when the corresponding element is absent.
        """
        website = response.xpath("//a[@id='location-and-contact__website']//@href").get()
        # The trailing text() step selects the text node inside the <span>
        # instead of the serialised element, so no HTML markup is returned.
        mail = response.xpath("//a[@id='location-and-contact__email']//span//text()").get()
        yield {
            'website': website,
            # Text nodes may carry surrounding whitespace from the template.
            'email': mail.strip() if mail is not None else None,
        }
我正在从网页中提取 email 数据,但得到的输出带有 HTML 标签(如下所示),而我只想得到 email 地址本身。
这是 link https://www.wlw.at/de/firma/hacobau-hallen-und-containersysteme-gmbh-1373570
实际得到的输出是这样的:
'email': '<span data-v-3743af0a data-v-605c4f02>sales@mm-holz.com</span>'
这是我的代码
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Spider that walks a wlw.at search page and visits each company link.

    For every company page it emits a dict with the company's website URL
    and the contents of the e-mail element.
    """

    name = 'pushpa'
    start_urls = ['https://www.wlw.at/de/suche?q=hallenbau']

    def parse(self, response):
        """Schedule one request per company link found on the search page."""
        link_query = "//div[@class='company-title-link-wrap']/a/@href"
        for href in response.xpath(link_query).extract():
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website and e-mail fields scraped from a company page."""
        website_hits = response.xpath("//a[@id='location-and-contact__website']//@href")
        # There is no text() step here, so .get() serialises the whole <span>
        # element (tags and attributes included) rather than just its text.
        email_hits = response.xpath("//a[@id='location-and-contact__email']//span")
        item = {
            'website': website_hits.get(),
            'email': email_hits.get(),
        }
        yield item
只需在选择器的 XPath 表达式末尾添加 text() 即可:这样返回的是 <span> 内的文本,而不是整个元素的 HTML。
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Spider that walks a wlw.at search page and visits each company link.

    For every company page it emits a dict with the company's website URL
    and its e-mail address as plain text.
    """

    name = 'pushpa'
    start_urls = ['https://www.wlw.at/de/suche?q=hallenbau']

    def parse(self, response):
        """Schedule one request per company link found on the search page."""
        # .getall() is the current counterpart of .get(), used below; the
        # legacy .extract() alias is avoided so the class uses one API style.
        hrefs = response.xpath("//div[@class='company-title-link-wrap']/a/@href").getall()
        for href in hrefs:
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website URL and the plain-text e-mail of a company page.

        Either field is ``None`` when the corresponding element is absent.
        """
        website = response.xpath("//a[@id='location-and-contact__website']//@href").get()
        # The trailing text() step selects the text node inside the <span>
        # instead of the serialised element, so no HTML markup is returned.
        mail = response.xpath("//a[@id='location-and-contact__email']//span//text()").get()
        yield {
            'website': website,
            # Text nodes may carry surrounding whitespace from the template.
            'email': mail.strip() if mail is not None else None,
        }