Scraping all urls in a website using scrapy not retrieving complete urls associated with that domain
I am trying to scrape all the urls in websites such as https://www.laphil.com/, https://madisonsymphony.org/ and https://www.californiasymphony.org/, to name a few. I get a lot of urls scraped, but not the complete set of urls associated with the domain. I am not sure why it is not scraping all the urls.
Code
items.py
import scrapy


class ScraperItem(scrapy.Item):
    # The source URL
    url_from = scrapy.Field()
    # The destination URL
    url_to = scrapy.Field()
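Note that ScraperItem defines url_from and url_to fields, but the spider below never imports or yields it. If you do want source/destination pairs in the output, a callback could fill the item roughly like this (a minimal sketch under that assumption, not part of the original code):

    # Hypothetical callback showing how ScraperItem could be populated.
    def parse_item(self, response):
        for href in response.css('a::attr(href)').getall():
            item = ScraperItem()
            item['url_from'] = response.url          # page the link was found on
            item['url_to'] = response.urljoin(href)  # absolute link target
            yield item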
my_crawler.py
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner


def process_links(links):
    for link in links:
        link.url = url_query_cleaner(link.url)
        yield link


class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['laphil.com']
    start_urls = ['https://www.laphil.com/']

    rules = (
        Rule(
            LinkExtractor(
                deny=[
                ],
            ),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url
        }
        next_page = response.css('a::attr(href)').extract()
        yield {"sub_url": [response.urljoin(ind_url) for ind_url in next_page]}
Execution
scrapy crawl symphony --logfile laph.log -o laph.jl -t jsonlines
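Depending on the Scrapy version, the -t flag may be reported as deprecated; the same output can be configured through the FEEDS setting instead (a sketch, assuming Scrapy >= 2.1), after which a plain scrapy crawl symphony --logfile laph.log is enough:

# settings.py (or custom_settings): equivalent of "-o laph.jl -t jsonlines"
FEEDS = {
    'laph.jl': {'format': 'jsonlines'},
}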
If we take one url from the crawled list, for example https://www.laphil.com/events/series/210, there are several links leading out of that page, such as https://www.laphil.com/events/performances/1356/2022-01-08/prokofiev-and-mtt and https://www.laphil.com/events/performances/1337/2021-11-06/reich-adams-and-rachmaninoff, which are not fetched. How can I scrape all of these urls and avoid duplicate urls in the resulting list?
spider.py:
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner


def process_links(links):
    for link in links:
        link.url = url_query_cleaner(link.url)
        yield link


class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['laphil.com']
    start_urls = ['https://www.laphil.com']
    base_url = 'https://www.laphil.com'

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'tempbuffer.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,

        # These settings are a must:
        # Duplicates pipeline
        'ITEM_PIPELINES': {'tempbuffer.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }

    rules = (
        Rule(
            LinkExtractor(allow_domains='laphil.com'),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url
        }

        # see if you really need this loop (since you're parsing all the urls
        # in the domain anyway, and you'll need to filter all those duplicates):
        all_urls = response.css('a::attr(href)').getall()

        # in order to convert from relative to absolute urls in the pipeline:
        self.base_url = response.url

        for url in all_urls:
            yield {
                'url': url
            }
pipelines.py:
from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    def __init__(self):
        self.scraped_urls = set()

    def process_item(self, item, spider):
        url = item['url'].strip()

        # if it's a relative url then convert it to an absolute url
        if 'http' not in url:
            url = spider.base_url + url
            item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: "{url}"')
        else:
            self.scraped_urls.add(url)
            return item
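One caveat worth noting: spider.base_url + url only produces a valid absolute url when the relative href starts with a slash, and the 'http' substring check does not cover protocol-relative (//...) or mailto: links. A slightly more defensive variant of the same pipeline (a sketch, not the answer's original code) could resolve urls with urllib.parse.urljoin instead:

from urllib.parse import urljoin

from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    """Same dedup idea, but resolves relative urls with urljoin (sketch)."""

    def __init__(self):
        self.scraped_urls = set()

    def process_item(self, item, spider):
        # urljoin handles 'path', '/path' and already-absolute urls correctly
        url = urljoin(spider.base_url, item['url'].strip())
        item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: "{url}"')
        self.scraped_urls.add(url)
        return item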
- I did not include the middlewares.py file, since it is not strictly required and it has a lot of lines (a rough sketch of what a user-agent rotating downloader middleware could look like is given after this list).
- In your code, instead of tempbuffer it should be your project's name (you need to replace it in the custom_settings entry for the pipeline).
- I added the domain to the rule as the only allowed domain, so you won't scrape anything you don't want.
- I verified that the duplicates pipeline works and that it actually creates a csv file with one column.
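For completeness, since the actual middlewares.py was not shared, a minimal user-agent rotating downloader middleware might look roughly like this (a hypothetical sketch; the user-agent strings are placeholders, and the class name only has to match the one referenced in DOWNLOADER_MIDDLEWARES):

import random


class RotateUserAgentMiddleware:
    """Hypothetical minimal version: pick a random User-Agent for every request."""

    # assumption: any list of real browser user-agent strings will do here
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0',
    ]

    def process_request(self, request, spider):
        # overwrite the User-Agent header before the request is downloaded
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None  # let the request continue through the middleware chain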