scrapy spider won't start due to TypeError

I'm trying to piece together a scrapy spider for a German second-hand goods website, using code I've already deployed successfully in other projects. This time, however, I'm running into a TypeError and I can't figure out why.

Compared to this question (), it seems the spider is being fed a non-string URL. However, when I check the individual blocks of code responsible for generating the URLs to be scraped, they all appear to be producing strings.

To describe the spider's general functionality and make it easier to read:

  1. A URL generator is responsible for providing the start URL (the first page of search results).
  2. The parse_search_pages function is responsible for pulling the list of post URLs from that page.
  3. It checks the DataFrame to see whether a post was already scraped in the past. If not, it scrapes it.
  4. The parse_listing function is called on the individual post. It uses the x_paths variable to extract all the data. Afterwards it continues to the next page using the CrawlSpider rules.

It has been two years since I last used this code, and I know a lot of the functionality may have changed. So I hope you can help me point out what I'm doing wrong?

Cheers, R.

///

Code

import pandas as pd
import scrapy
from datetime import date
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# whitevan scraper - Ebay Kleinanzeigen "Elektronik" category scraper
# 1. URL filters out "Gesuche", "Gewerblich" & sets sorting to "Günstigste zuerst"
# to-do: scrapes only listings marked "Zu verschenken"
# to-do: make sure reserviert and removed ads are also removed from the CSV

TODAY = date.today().strftime("%d/%m/%Y")

df = pd.read_csv(
    r'C:\Users\stefa\Documents\VSCodeProjects\scrapers\whitevan\data\whitevan.csv', delimiter=';')
pd.set_option('display.max_columns', None)

# pick city & category to scrape
city_pick = "berlin"  # berlin, munich, hannover
category_pick = "electronics"  # electronics

PRE = "https://www.",
DOMAIN = "ebay-kleinanzeigen.de",

def url_generator(city, category):
    # Function generates an eBay-Kleinanzeigen URL from chosen city & category
    # To-do: make sorting & filtering a function variable

    URL_LIBRARY = {
        "sorting": ["sortierung:preis", "sortierung:zeit"],
        "seller": ["anbieter:privat", "anbieter:gewerblich"],
        "listing": ["angebote", "gesuche"],
        "cities": {
            "berlin": ["berlin", "l3331"],
            "munich": ["muenchen", "l6411"],
            "hannover": ["hannover", "l3155"]
        },
        "categories": {
            "electronics": ["s-multimedia-elektronik", "c161"]
        }
    }

    return "/{category}/{city}/{sorting}/{seller}/{listing}/{code}{city_code}".format(
        category=URL_LIBRARY["categories"][category][0],
        city=URL_LIBRARY["cities"][city][0],
        sorting=URL_LIBRARY["sorting"][0],
        seller=URL_LIBRARY["seller"][0],
        listing=URL_LIBRARY["listing"][0],
        code=URL_LIBRARY["categories"][category][1],
        city_code=URL_LIBRARY["cities"][city][1]
    )


# tested with scrapy shell
x_paths = {
    'header': '//h1[@class="boxedarticle--title"]/text()',
    'description': '//p[@class="text-force-linebreak "]/text()',
    'location': '//span[@id="viewad-locality"]/text()',
    'listing_date': '//div[@id="viewad-extra-info"]/div/span/text()',
    'url': '//head/link[@rel="canonical"]/@href',
    'type': '//li[contains(text(),"Art")]/span/text()',
    'subgroup': '//li[contains(text(),"Gerät & Zubehör")]/span/text()',
    'condition': '//li[contains(text(),"Zustand")]/span/text()',
    'shipping': '//li[contains(text(),"Versand")]/span/text()',
    'user': '//span[@class="text-body-regular-strong text-force-linebreak"]/a/text()',
    'phone_no': '//span[@id="viewad-contact-phone"]/text()',
    'satisfaction': '//span[@class="userbadges-vip userbadges-profile-rating"]/span/text()',
    'friendliness': '//span[@class="userbadges-vip userbadges-profile-friendliness"]/span/text()',
    'reliability': '//span[@class="userbadges-vip userbadges-profile-reliability"]/span/text()',
    'user_id': '//a[@id="poster-other-ads-link"]/@href',
    'posts_online': '//a[@id="poster-other-ads-link"]/text()'
}


class Whitevan(CrawlSpider):
    name = 'whitevan'
    allowed_domains = [DOMAIN]
    search_url = url_generator(city_pick, category_pick)
    start_urls = [f"https://www.ebay-kleinanzeigen.de{search_url}"]
    rules = [
        Rule(
            LinkExtractor(
                restrict_xpaths='//a[@class="pagination-next"]'
            ),
            callback='parse_search_pages',
            follow=True
        )
    ]

    def parse_search_pages(self, response):
        #creates a list of each post's respective URLs to be scraped
        url_list = response.xpath(
            '//li[@class="ad-listitem lazyload-item   "]/article/div/a/@href').getall()
        
        #adds the top level URL to the url so it can be compared to the URLs in the dataframe
        for item in url_list:
            full_url = f"https://www.ebay-kleinanzeigen.de{item}"

            #checks if URL exists in dataframe (thus can be skipped)
            if not df['url'].str.contains(full_url).any():
                #yields the function responsible for scraping the individual post
                yield scrapy.Request(full_url, callback=self.parse_listing)

    def parse_listing(self, response):
        temp_dict = {'date_scraped': TODAY}

        #goes through the dictionary of xpaths, checks the response & adds it to a temp_dict.
        #yields the temp_dict to be added to a CSV.
        for key in x_paths.keys():
            if response.xpath(x_paths[key]):
                temp_dict[key] = response.xpath(x_paths[key]).extract_first()
            else:
                temp_dict[key] = None

        yield temp_dict

    parse_start_url = parse_search_pages

Terminal output:

PS C:\Users\stefa\Documents\VSCodeProjects\scrapers\whitevan> conda activate C:\ProgramData\Anaconda3\envs\whitevan
PS C:\Users\stefa\Documents\VSCodeProjects\scrapers\whitevan> & C:/ProgramData/Anaconda3/envs/whitevan/python.exe c:/Users/stefa/Documents/VSCodeProjects/scrapers/whitevan/whitevan/main.py
2022-02-26 12:43:03 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: whitevan)
2022-02-26 12:43:03 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 21.7.0, Python 3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1m  14 Dec 2021), cryptography 36.0.0, Platform Windows-10-10.0.19044-SP0
2022-02-26 12:43:03 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-02-26 12:43:03 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'whitevan',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 1,
 'NEWSPIDER_MODULE': 'whitevan.spiders',
 'SPIDER_MODULES': ['whitevan.spiders']}
2022-02-26 12:43:03 [scrapy.extensions.telnet] INFO: Telnet Password: e670bb7369bd25dd
2022-02-26 12:43:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-02-26 12:43:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',   
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',     
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',   
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-02-26 12:43:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-02-26 12:43:03 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2022-02-26 12:43:03 [scrapy.core.engine] INFO: Spider opened
2022-02-26 12:43:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-02-26 12:43:03 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method OffsiteMiddleware.spider_opened of <scrapy.spidermiddlewares.offsite.OffsiteMiddleware object at 0x00000197491DF880>>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\utils\defer.py", line 157, in maybeDeferred_coro
    result = f(*args, **kw)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 76, in spider_opened
    self.host_regex = self.get_host_regex(spider)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 62, in get_host_regex
    elif url_pattern.match(domain):
TypeError: expected string or bytes-like object
2022-02-26 12:43:03 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2022-02-26 12:43:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ebay-kleinanzeigen.de/s-multimedia-elektronik/berlin/sortierung:preis/anbieter:privat/angebote/c161l3331> (referer: None)
2022-02-26 12:43:04 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.ebay-kleinanzeigen.de/s-multimedia-elektronik/berlin/sortierung:preis/anbieter:privat/angebote/c161l3331> (referer: None)
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
    yield next(it)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 31, in process_spider_output
    if x.dont_filter or self.should_follow(x, spider):
  File "C:\ProgramData\Anaconda3\envs\whitevan\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 46, in should_follow
    regex = self.host_regex
AttributeError: 'OffsiteMiddleware' object has no attribute 'host_regex'
2022-02-26 12:43:04 [scrapy.core.engine] INFO: Closing spider (finished)
2022-02-26 12:43:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 307,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 24282,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 1.146168,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 2, 26, 11, 43, 4, 745511),
 'httpcompression/response_bytes': 180025,
 'httpcompression/response_count': 1,
 'log_count/DEBUG': 1,
 'log_count/ERROR': 2,
 'log_count/INFO': 10,
 'request_depth_max': 1,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/AttributeError': 1,
 'start_time': datetime.datetime(2022, 2, 26, 11, 43, 3, 599343)}
2022-02-26 12:43:04 [scrapy.core.engine] INFO: Spider closed (finished)

So the answer was simple :) always triple-check your code! There were still a couple of trailing commas where they shouldn't have been. They turned DOMAIN into a one-element tuple, so my allowed_domains list ended up containing a tuple instead of a string.

Incorrect

PRE = "https://www.",
DOMAIN = "ebay-kleinanzeigen.de",

Fixed

PRE = "https://www."
DOMAIN = "ebay-kleinanzeigen.de"