在 Python 中使用 Scrapy 运行以下代码时没有抓取到任何数据
Not getting any data scraped when running the following code using Scrapy on Python
这是我用来从 tripadvisor 抓取电子邮件地址和餐馆名称的蜘蛛
import scrapy
class RestaurantSpider(scrapy.Spider):
    """Spider that walks a TripAdvisor restaurant listing and visits each entry.

    Listing cells and restaurant-name anchors are located by fragments of
    their CSS class names. Each detail page produces one item with the keys
    ``Link``, ``Text`` and ``Email``.

    NOTE(review): if the served HTML contains no element whose class includes
    these fragments, the loop in ``parse`` never runs and no item is produced.
    """

    name = 'tripadvisorbot'
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g188633-The_Hague_South_Holland_Province.html#EATERY_OVERVIEW_BOX'
    ]

    def parse(self, response):
        """Request every restaurant found on a listing page, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A ``scrapy.Request`` per listing cell (handled by
            ``parse_listing``), followed by a request for the next listing
            page when a pagination link is present.
        """
        name_anchor = './/a[contains(@class,"__restaurantName--")]'
        for cell in response.xpath('//div[contains(@class,"__cellContainer--")]'):
            href = cell.xpath(name_anchor + '/@href').get()
            title = cell.xpath(name_anchor + '/text()').get()
            detail_url = response.urljoin(href)
            # The absolute URL and the name travel with the request so the
            # detail callback can emit them without re-extracting.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_listing,
                meta={'link': detail_url, 'text': title}
            )

        next_href = response.xpath(
            '//*[contains(@class,"pagination")]/*[contains(@class,"next")]/@href'
        ).get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_listing(self, response):
        """Build the item for one restaurant detail page.

        Args:
            response: the detail page; ``response.meta`` must carry the
                ``link`` and ``text`` values set in ``parse``.

        Yields:
            A dict with ``Link`` and ``Text`` taken from the request meta and
            ``Email`` set to the first ``href`` containing ``mailto:`` (or
            ``None`` when the page has no such anchor).
        """
        carried = response.meta
        item = {'Link': carried['link'], 'Text': carried['text']}
        item['Email'] = response.xpath('//a[contains(@href, "mailto:")]/@href').get()
        yield item
我在 Anaconda 提示符下执行以下命令来运行上面的 Spider,并将结果保存为 JSON 文件:
scrapy crawl tripadvisorbot -O tripadvisor.json
没有数据被抓取,创建了一个 json 文件,但它是空的。
我不确定问题出在哪里,我对网络抓取和一般的 Python 编码还很陌生。非常感谢所有帮助
谢谢
在我的电脑上,页面的 HTML 中并没有包含 __cellContainer-- 和 __restaurantName-- 的 class。
页面使用随机字符作为 class 名称。
不过每个条目都是 <div data-test-target="restaurants-list"> 的直接子 div,我就用它来获取所有条目。
接着我取每个条目中的第一个 <a>(它包含的是图片而不是 name),在列表页上不再获取 text 和 complete_url,而是直接调用 response.follow(link)。
进入包含详细信息的页面后,我用 response.url 得到 complete_url,用 h1 得到 text。
您可以将所有代码放在一个文件中,直接用 python script.py 运行,而无需创建项目。
import scrapy
class RestaurantSpider(scrapy.Spider):
    """Spider that collects restaurant URL, name and e-mail link from TripAdvisor.

    Listing entries are the direct ``div`` children of the element marked
    ``data-test-target="restaurants-list"``, so the spider does not depend on
    CSS class names for finding restaurants. Name and URL are read on the
    detail page itself rather than carried over from the listing page.
    """

    name = 'tripadvisorbot'
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g188633-The_Hague_South_Holland_Province.html#EATERY_OVERVIEW_BOX'
    ]

    def parse(self, response):
        """Follow the first link of every listing entry, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A request per entry that has a link (handled by
            ``parse_listing``), followed by a request for the next listing
            page when a pagination link is present.
        """
        entries = response.xpath('//div[@data-test-target="restaurants-list"]/div')
        for entry in entries:
            # First anchor in the entry; may be None for entries without links.
            href = entry.xpath('.//a/@href').get()
            print('link:', href)
            if not href:
                continue
            yield response.follow(href, callback=self.parse_listing)

        pager_href = response.xpath(
            '//*[contains(@class,"pagination")]/*[contains(@class,"next")]/@href'
        ).get()
        if pager_href:
            # No callback given: Scrapy falls back to the spider's parse().
            yield response.follow(pager_href)

    def parse_listing(self, response):
        """Build the item for one restaurant detail page.

        Args:
            response: the downloaded detail page.

        Yields:
            A dict with ``Link`` (the page URL), ``Text`` (text of the ``h1``
            carrying a ``data-test-target`` attribute) and ``Email`` (the
            first ``href`` containing ``mailto:``, unmodified, or ``None``).
        """
        page_url = response.url
        print('url:', page_url)
        heading = response.xpath('//h1[@data-test-target]/text()').get()
        mailto = response.xpath('//a[contains(@href, "mailto:")]/@href').get()
        yield {'Link': page_url, 'Text': heading, 'Email': mailto}
# --- run without project and save data in `output.json` ---
from scrapy.crawler import CrawlerProcess

# Settings normally kept in a project's settings.py: a browser-like user agent
# and a JSON feed export written to `output.json`.
_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.json': {'format': 'json'}},  # new in 2.1
}

c = CrawlerProcess(settings=_settings)
c.crawl(RestaurantSpider)
# Starts the crawl; this call blocks until the spider has finished.
c.start()
部分结果:
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4766834-Reviews-Bab_mansour-The_Hague_South_Holland_Province.html", "Text": "Bab mansour", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d3935897-Reviews-Milos-The_Hague_South_Holland_Province.html", "Text": "Milos", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d10902380-Reviews-Nefeli_deli-The_Hague_South_Holland_Province.html", "Text": "Nefeli deli", "Email": "mailto:info@foodloversnl.com?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d8500914-Reviews-Waterkant-The_Hague_South_Holland_Province.html", "Text": "Waterkant", "Email": "mailto:alles@dewaterkant.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4481254-Reviews-Salero_Minang-The_Hague_South_Holland_Province.html", "Text": "Salero Minang", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d6451334-Reviews-Du_Passage-The_Hague_South_Holland_Province.html", "Text": "Du Passage", "Email": "mailto:info@dupassage.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4451714-Reviews-Lee_s_Garden-The_Hague_South_Holland_Province.html", "Text": "Lee's Garden", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d2181693-Reviews-Warunee-The_Hague_South_Holland_Province.html", "Text": "Warunee", "Email": "mailto:info@warunee.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d8064876-Reviews-Sallo_s-The_Hague_South_Holland_Province.html", "Text": "Sallo's", "Email": "mailto:info@sallos.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d16841532-Reviews-Saravanaa_Bhavan_Den_Haag-The_Hague_South_Holland_Province.html", "Text": "Saravanaa Bhavan Den Haag", "Email": "mailto:hsbamsterdam@saravanabhavan.com?subject=?"},
这是我用来从 tripadvisor 抓取电子邮件地址和餐馆名称的蜘蛛
import scrapy
class RestaurantSpider(scrapy.Spider):
    """Spider that walks a TripAdvisor restaurant listing and visits each entry.

    Listing cells and restaurant-name anchors are located by fragments of
    their CSS class names. Each detail page produces one item with the keys
    ``Link``, ``Text`` and ``Email``.

    NOTE(review): if the served HTML contains no element whose class includes
    these fragments, the loop in ``parse`` never runs and no item is produced.
    """

    name = 'tripadvisorbot'
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g188633-The_Hague_South_Holland_Province.html#EATERY_OVERVIEW_BOX'
    ]

    def parse(self, response):
        """Request every restaurant found on a listing page, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A ``scrapy.Request`` per listing cell (handled by
            ``parse_listing``), followed by a request for the next listing
            page when a pagination link is present.
        """
        name_anchor = './/a[contains(@class,"__restaurantName--")]'
        for cell in response.xpath('//div[contains(@class,"__cellContainer--")]'):
            href = cell.xpath(name_anchor + '/@href').get()
            title = cell.xpath(name_anchor + '/text()').get()
            detail_url = response.urljoin(href)
            # The absolute URL and the name travel with the request so the
            # detail callback can emit them without re-extracting.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_listing,
                meta={'link': detail_url, 'text': title}
            )

        next_href = response.xpath(
            '//*[contains(@class,"pagination")]/*[contains(@class,"next")]/@href'
        ).get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_listing(self, response):
        """Build the item for one restaurant detail page.

        Args:
            response: the detail page; ``response.meta`` must carry the
                ``link`` and ``text`` values set in ``parse``.

        Yields:
            A dict with ``Link`` and ``Text`` taken from the request meta and
            ``Email`` set to the first ``href`` containing ``mailto:`` (or
            ``None`` when the page has no such anchor).
        """
        carried = response.meta
        item = {'Link': carried['link'], 'Text': carried['text']}
        item['Email'] = response.xpath('//a[contains(@href, "mailto:")]/@href').get()
        yield item
我在 Anaconda 提示符下执行以下命令来运行上面的 Spider,并将结果保存为 JSON 文件:
scrapy crawl tripadvisorbot -O tripadvisor.json
没有数据被抓取,创建了一个 json 文件,但它是空的。
我不确定问题出在哪里,我对网络抓取和一般的 Python 编码还很陌生。非常感谢所有帮助
谢谢
在我的电脑上,页面的 HTML 中并没有包含 __cellContainer-- 和 __restaurantName-- 的 class。
页面使用随机字符作为 class 名称。
不过每个条目都是 <div data-test-target="restaurants-list"> 的直接子 div,我就用它来获取所有条目。
接着我取每个条目中的第一个 <a>(它包含的是图片而不是 name),在列表页上不再获取 text 和 complete_url,而是直接调用 response.follow(link)。
进入包含详细信息的页面后,我用 response.url 得到 complete_url,用 h1 得到 text。
您可以将所有代码放在一个文件中,直接用 python script.py 运行,而无需创建项目。
import scrapy
class RestaurantSpider(scrapy.Spider):
    """Spider that collects restaurant URL, name and e-mail link from TripAdvisor.

    Listing entries are the direct ``div`` children of the element marked
    ``data-test-target="restaurants-list"``, so the spider does not depend on
    CSS class names for finding restaurants. Name and URL are read on the
    detail page itself rather than carried over from the listing page.
    """

    name = 'tripadvisorbot'
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g188633-The_Hague_South_Holland_Province.html#EATERY_OVERVIEW_BOX'
    ]

    def parse(self, response):
        """Follow the first link of every listing entry, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            A request per entry that has a link (handled by
            ``parse_listing``), followed by a request for the next listing
            page when a pagination link is present.
        """
        entries = response.xpath('//div[@data-test-target="restaurants-list"]/div')
        for entry in entries:
            # First anchor in the entry; may be None for entries without links.
            href = entry.xpath('.//a/@href').get()
            print('link:', href)
            if not href:
                continue
            yield response.follow(href, callback=self.parse_listing)

        pager_href = response.xpath(
            '//*[contains(@class,"pagination")]/*[contains(@class,"next")]/@href'
        ).get()
        if pager_href:
            # No callback given: Scrapy falls back to the spider's parse().
            yield response.follow(pager_href)

    def parse_listing(self, response):
        """Build the item for one restaurant detail page.

        Args:
            response: the downloaded detail page.

        Yields:
            A dict with ``Link`` (the page URL), ``Text`` (text of the ``h1``
            carrying a ``data-test-target`` attribute) and ``Email`` (the
            first ``href`` containing ``mailto:``, unmodified, or ``None``).
        """
        page_url = response.url
        print('url:', page_url)
        heading = response.xpath('//h1[@data-test-target]/text()').get()
        mailto = response.xpath('//a[contains(@href, "mailto:")]/@href').get()
        yield {'Link': page_url, 'Text': heading, 'Email': mailto}
# --- run without project and save data in `output.json` ---
from scrapy.crawler import CrawlerProcess

# Settings normally kept in a project's settings.py: a browser-like user agent
# and a JSON feed export written to `output.json`.
_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.json': {'format': 'json'}},  # new in 2.1
}

c = CrawlerProcess(settings=_settings)
c.crawl(RestaurantSpider)
# Starts the crawl; this call blocks until the spider has finished.
c.start()
部分结果:
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4766834-Reviews-Bab_mansour-The_Hague_South_Holland_Province.html", "Text": "Bab mansour", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d3935897-Reviews-Milos-The_Hague_South_Holland_Province.html", "Text": "Milos", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d10902380-Reviews-Nefeli_deli-The_Hague_South_Holland_Province.html", "Text": "Nefeli deli", "Email": "mailto:info@foodloversnl.com?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d8500914-Reviews-Waterkant-The_Hague_South_Holland_Province.html", "Text": "Waterkant", "Email": "mailto:alles@dewaterkant.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4481254-Reviews-Salero_Minang-The_Hague_South_Holland_Province.html", "Text": "Salero Minang", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d6451334-Reviews-Du_Passage-The_Hague_South_Holland_Province.html", "Text": "Du Passage", "Email": "mailto:info@dupassage.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d4451714-Reviews-Lee_s_Garden-The_Hague_South_Holland_Province.html", "Text": "Lee's Garden", "Email": null},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d2181693-Reviews-Warunee-The_Hague_South_Holland_Province.html", "Text": "Warunee", "Email": "mailto:info@warunee.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d8064876-Reviews-Sallo_s-The_Hague_South_Holland_Province.html", "Text": "Sallo's", "Email": "mailto:info@sallos.nl?subject=?"},
{"Link": "https://www.tripadvisor.com/Restaurant_Review-g188633-d16841532-Reviews-Saravanaa_Bhavan_Den_Haag-The_Hague_South_Holland_Province.html", "Text": "Saravanaa Bhavan Den Haag", "Email": "mailto:hsbamsterdam@saravanabhavan.com?subject=?"},