Scrapy crawler not yielding any data

I'm running into a strange problem here: the crawler runs without any errors, yet it doesn't yield any data.

Here is the first part of the code:


import scrapy
import json
from scrapy.crawler import CrawlerProcess


# zillow scraper class
class ZillowScraper(scrapy.Spider):
    # scraper/spider name
    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"

    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }

    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def __init__(self):
        self.zpid = []

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )

Here is the parse_links callback, where I get the data from the JSON, take each id from the JSON, and append it to a class-variable list so I can later compare the URL ids against the list ids:

    def parse_links(self, response):
        results_selector = response.css(
            'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
        ).get()
        clean_json = (
            results_selector.replace(
                '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
                "",
            )
            .replace("</script>", "")
            .replace("-->", "")
        )
        parsed_data = json.loads(clean_json)
        data = parsed_data["cat1"]["searchResults"]["listResults"]
        for zid in data:
            self.zpid.append(zid)

        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                headers=self.headers,
                callback=self.parse_detail,
            )
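
As a quick illustration of what that cleaning step does, here is a miniature stand-in for the real script tag (the JSON payload is made up):

raw = ('<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore">'
       '<!--{"cat1":{"searchResults":{"listResults":[]}}}--></script>')
# strip the opening tag with its comment marker, then the closing pieces
clean = (
    raw.replace('<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--', "")
    .replace("</script>", "")
    .replace("-->", "")
)
print(clean)  # -> {"cat1":{"searchResults":{"listResults":[]}}}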

Here is the final callback, parse_detail, where I get the data from the JSON. First I do some URL parsing to pull the id out of the URL so it can be compared with the self.zpid list; then I loop over the self.zpid list, check whether listing_id (the URL id) equals the list id, and build the keys dynamically from the id to get the detail data (there is a small worked example of this URL parsing right after the code):

    def parse_detail(self, response):
        item = {}
        listing_url = response.url.split("/")
        parse_id = [u for u in listing_url if u]
        listing_id = parse_id[4][:8]

        for zid in self.zpid:
            if zid == listing_id:
                print(zid)

                api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
                clean_json = api_endpoint.replace(
                    '<script id="hdpApolloPreloadedData" type="application/json">', ""
                ).replace("</script>", "")
                parsed_data = json.loads(clean_json)
                sub_data = json.loads(parsed_data["apiCache"])

                item["date"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["datePostedString"]
                item["home_status"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["hdpTypeDimension"]
                item["home_type"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["homeType"]
                item["sqft"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid": {zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["livingArea"]
                item["street_address"] = sub_data[
                    f'VariantQuery{{"zpid":{zid},"altId":null}}'
                ]["property"]["streetAddress"]
                item["city"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["city"]
                item["state"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["state"]
                item["zipcode"] = sub_data[
                    f'VariantQuery{{"zpid":{zid},"altId":null}}'
                ]["property"]["zipcode"]
                item["price"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["price"]
                item["zestimate"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["zestimate"]
                item["parcel_number"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["resoFacts"]["parcelNumber"]

        yield item


# main driver
if __name__ == "__main__":
    # run scraper
    process = CrawlerProcess()
    process.crawl(ZillowScraper)
    process.start()
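
For reference, this is what the id extraction in parse_detail does on a hypothetical detail URL (the address is taken from the expected output below; the zpid digits are made up):

url = "https://www.zillow.com/homedetails/659-Erskine-Dr-Pacific-Palisades-CA-90272/20533207_zpid/"
parts = [u for u in url.split("/") if u]
# parts: ['https:', 'www.zillow.com', 'homedetails',
#         '659-Erskine-Dr-Pacific-Palisades-CA-90272', '20533207_zpid']
listing_id = parts[4][:8]
print(listing_id)  # -> 20533207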

Now the crawler runs, hits the URLs, gets 200 responses and everything, but yields no data. What am I doing wrong here?

I've tried running the crawler without the id comparison, and it raised a meaningful KeyError; but other than that, the crawler just runs, hits the URLs, gets 200 responses, and yields an empty dict. I've also tried response.follow instead of scrapy.Request, but there was no output, only {} empty dictionaries.

I was expecting:

{'date': 2022-03-11, 'home_status': 'For sale', 'home_type': 'Residential', 'sqft': '2,249', 'street_address': '659 Erskine Dr', 'city': 'Pacific Palisades', 'state': 'CA', 'zipcode': '90272', 'price': ',995,000', 'zestimate': ',356,900', 'parcel_number': 4413016022}

and this is the log I got instead:

2022-03-24 01:04:17 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 01:04:17 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 54014,
 'downloader/request_count': 41,
 'downloader/request_method_count/GET': 41,
 'downloader/response_bytes': 9157579,
 'downloader/response_count': 41,
 'downloader/response_status_count/200': 41,
 'elapsed_time_seconds': 15.943654,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 3, 23, 20, 4, 17, 44889),
 'httpcompression/response_bytes': 49582733,
 'httpcompression/response_count': 41,
 'item_scraped_count': 40,
 'log_count/DEBUG': 90,
 'log_count/INFO': 10,
 'memusage/max': 54341632,
 'memusage/startup': 54341632,
 'request_depth_max': 1,
 'response_received_count': 41,
 'scheduler/dequeued': 41,
 'scheduler/dequeued/memory': 41,
 'scheduler/enqueued': 41,
 'scheduler/enqueued/memory': 41,
 'start_time': datetime.datetime(2022, 3, 23, 20, 4, 1, 101235)}
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Spider closed (finished)

You have the same problem in many places.

First:

if zid == listing_id:

listing_id is a single id value, but zid is an entire dictionary.

You have to use ["id"] to get the id out of the dictionary:

if zid["id"] == listing_id:

Later, the same problem appears in every key that contains "zpid":{zid}; you need "zpid":{zid["id"]} instead.

In one of the keys you also have an extra space, "zpid": {zid}, which you have to remove, otherwise the key will not match anything in apiCache.
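
To make that concrete, here is a minimal sketch of how the two cache keys can be built once the id is pulled out of the dictionary (the id value here is made up):

zid = {"id": "20533207"}  # in the real spider this comes from listResults
id_ = zid["id"]

# note the lack of a space after "zpid": the string must match the apiCache key exactly
key_1 = f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{id_},"contactFormRenderParameter":{{"zpid":{id_},"platform":"desktop","isDoubleScroll":true}}}}'
key_2 = f'VariantQuery{{"zpid":{id_},"altId":null}}'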


EDIT:

Another small problem: you yield item outside the if, but sometimes if zid["id"] == listing_id: never matches, so the spider yields an empty dict and you get empty rows in the file. You should yield inside the if.
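
A tiny standalone illustration of the difference, in plain Python rather than Scrapy (made-up data):

def matching_items(listings, wanted_id):
    for zid in listings:
        if zid["id"] == wanted_id:
            yield {"id": zid["id"]}  # built and yielded only when a match is found

print(list(matching_items([{"id": "1"}, {"id": "2"}], "2")))  # -> [{'id': '2'}]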

BTW:

Frankly, I don't like the idea of the list self.zpid, because every lookup has to search through all the values in the list. The code might also run with many workers, each holding its own separate self.zpid, in which case an element may not be found in the list at all. The standard method is to send the value along to the next function with

Request(..., meta={"data": zid})

and the callback retrieves it as

zid = response.meta["data"]

But recent versions of Scrapy can send it as an argument to the callback instead:

Request(..., cb_kwargs={"data": zid})

and the callback receives it as a parameter:

def parse_detail(self, response, data):
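
Put together, a minimal sketch of the cb_kwargs variant for this spider (Scrapy 1.7+; the JSON parsing is omitted, so treat it as a skeleton rather than a drop-in replacement):

import scrapy


class ZillowKwargsSketch(scrapy.Spider):
    name = "zillow_kwargs_sketch"

    def parse_links(self, response):
        data = []  # stands in for parsed_data["cat1"]["searchResults"]["listResults"]
        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                callback=self.parse_detail,
                cb_kwargs={"data": listing},  # forwarded to the callback
            )

    def parse_detail(self, response, data):
        # "data" arrives as a normal keyword argument, no response.meta lookup needed
        yield {"zpid": data["id"], "url": response.url}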

Full working code with the other changes:

import scrapy
import json


class ZillowScraper(scrapy.Spider):

    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"

    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }

    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )

    def parse_links(self, response):
        print('[parse_links] url:', response.url)
        
        results_selector = response.css(
            'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
        ).get()
        
        clean_json = (
            results_selector.replace(
                '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
                "",
            )
            .replace("</script>", "")
            .replace("-->", "")
        )
        
        parsed_data = json.loads(clean_json)
        data = parsed_data["cat1"]["searchResults"]["listResults"]

        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                headers=self.headers,
                callback=self.parse_detail,
                meta={'data': listing}
            )

    def parse_detail(self, response):
        print('[parse_detail] url:', response.url)

        listing_url = response.url.split("/")
        parse_id = [u for u in listing_url if u]
        
        listing_id = parse_id[4][:8]
        zid = response.meta['data']        

        #print('listing_id:', listing_id)
        #print("zid['id']:", zid['id'])
        
        if zid['id'] == listing_id:

            api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
            
            clean_json = api_endpoint.replace(
                '<script id="hdpApolloPreloadedData" type="application/json">', ""
            ).replace("</script>", "")
            
            parsed_data = json.loads(clean_json)
            sub_data = json.loads(parsed_data["apiCache"])
            
            id_ = zid['id']
            
            key_1 = f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{id_},"contactFormRenderParameter":{{"zpid":{id_},"platform":"desktop","isDoubleScroll":true}}}}' 
            key_2 = f'VariantQuery{{"zpid":{id_},"altId":null}}'

            properties_1 = sub_data[key_1]["property"]
            properties_2 = sub_data[key_2]["property"]

            item = {}
            
            item["date"]        = properties_1["datePostedString"]
            item["home_status"] = properties_1["hdpTypeDimension"]
            item["home_type"]   = properties_1["homeType"]
            item["sqft"]        = properties_1["livingArea"]
            
            item["street_address"] = properties_2["streetAddress"]
            item["city"]           = properties_2["city"]
            item["state"]          = properties_2["state"]
            item["zipcode"]        = properties_2["zipcode"]
            item["price"]          = properties_2["price"]
            
            item["zestimate"]     = properties_1["zestimate"]
            item["parcel_number"] = properties_1["resoFacts"]["parcelNumber"]

            yield item
        
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ZillowScraper)
c.start()
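
Save it as e.g. zillow.py and run it with plain python zillow.py; the FEEDS setting (which replaces the deprecated FEED_FORMAT/FEED_URI pair from the commented-out custom_settings) writes the items to output.csv.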