Scrapy crawler not yielding any Data
I'm running into a strange issue here: the crawler runs without any errors, but it also doesn't yield any data.
Here is the starting code for the first page:
import scrapy
import json
from scrapy.crawler import CrawlerProcess

# zillow scraper class
class ZillowScraper(scrapy.Spider):
# scraper/spider name
name = "zillow"
# custom_settings = {
# "FEED_FORMAT": "csv",
# "FEED_URI": "zillow_data.csv",
# }
# base URL
base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"
# custom headers
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
}
# string query parameters
params = {
"searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
}
def __init__(self):
self.zpid = []
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_links
)
Here is the parse links callback, where I get the data from the embedded JSON, grab the id from it, and append it to a class-level list so I can later compare it against the id taken from the listing URL:
def parse_links(self, response):
results_selector = response.css(
'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
).get()
clean_json = (
results_selector.replace(
'<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
"",
)
.replace("</script>", "")
.replace("-->", "")
)
parsed_data = json.loads(clean_json)
data = parsed_data["cat1"]["searchResults"]["listResults"]
for zid in data:
self.zpid.append(zid)
for listing in data:
yield scrapy.Request(
url=listing["detailUrl"],
headers=self.headers,
callback=self.parse_detail,
)
And here is the final callback, parse detail, where I get the data from the JSON. First I do some URL parsing to pull the id out of the URL so I can compare it with the self.zpid list; then I loop over the self.zpid list, check whether listing_id (the id from the URL) equals the id in the list, and fetch the detail data through keys generated dynamically from that id:
def parse_detail(self, response):
item = {}
listing_url = response.url.split("/")
parse_id = [u for u in listing_url if u]
listing_id = parse_id[4][:8]
for zid in self.zpid:
if zid == listing_id:
print(zid)
api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
clean_json = api_endpoint.replace(
'<script id="hdpApolloPreloadedData" type="application/json">', ""
).replace("</script>", "")
parsed_data = json.loads(clean_json)
sub_data = json.loads(parsed_data["apiCache"])
item["date"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["datePostedString"]
item["home_status"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["hdpTypeDimension"]
item["home_type"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["homeType"]
item["sqft"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid": {zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["livingArea"]
item["street_address"] = sub_data[
f'VariantQuery{{"zpid":{zid},"altId":null}}'
]["property"]["streetAddress"]
item["city"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
"property"
]["city"]
item["state"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
"property"
]["state"]
item["zipcode"] = sub_data[
f'VariantQuery{{"zpid":{zid},"altId":null}}'
]["property"]["zipcode"]
item["price"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
"property"
]["price"]
item["zestimate"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["zestimate"]
item["parcel_number"] = sub_data[
f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
]["property"]["resoFacts"]["parcelNumber"]
yield item
# main driver
if __name__ == "__main__":
# run scraper
process = CrawlerProcess()
process.crawl(ZillowScraper)
process.start()
Now the crawler is running, hitting the urls, getting 200 responses and all, but it is not yielding any data. What am I doing wrong here?
I have tried running the crawler without the id comparison and it throws a meaningful KeyError, but other than that the crawler just runs, hits the urls, gets 200 responses and yields empty dictionaries. I have also tried
response.follow
instead of initiating a
scrapy.Request
but got no output, only empty {} dictionaries.
I was expecting:
{'date': 2022-03-11, 'home_status': 'For sale', 'home_type': 'Residential', 'sqft': '2,249', 'street_address': '659 Erskine Dr', 'city': 'Pacific Palisades', 'state': 'CA', 'zipcode': '90272', 'price': ',995,000', 'zestimate': ',356,900', 'parcel_number': 4413016022}
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 01:04:17 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 54014,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 9157579,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 15.943654,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 3, 23, 20, 4, 17, 44889),
'httpcompression/response_bytes': 49582733,
'httpcompression/response_count': 41,
'item_scraped_count': 40,
'log_count/DEBUG': 90,
'log_count/INFO': 10,
'memusage/max': 54341632,
'memusage/startup': 54341632,
'request_depth_max': 1,
'response_received_count': 41,
'scheduler/dequeued': 41,
'scheduler/dequeued/memory': 41,
'scheduler/enqueued': 41,
'scheduler/enqueued/memory': 41,
'start_time': datetime.datetime(2022, 3, 23, 20, 4, 1, 101235)}
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Spider closed (finished)
You have the same problem in several places.
First, in
if zid == listing_id:
listing_id is just the id, but zid is a whole dictionary. You have to use ["id"] to get the id out of the dictionary:
if zid["id"] == listing_id:
Later the same problem appears in all the keys that contain "zpid":{zid} - there you need "zpid":{zid["id"]} instead.
In one of the keys you also have an extra space, "zpid": {zid}, which you have to remove.
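Concretely, with that fix the apiCache keys are built from zid["id"], roughly like this (a minimal sketch; the key strings follow the ones used in the question, and render_key / variant_key are just illustrative names):
# sketch: build the apiCache keys from zid["id"] rather than the whole dict,
# and without the extra space after "zpid":
zpid = zid["id"]
render_key = (
    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zpid},'
    f'"contactFormRenderParameter":{{"zpid":{zpid},'
    f'"platform":"desktop","isDoubleScroll":true}}}}'
)
variant_key = f'VariantQuery{{"zpid":{zpid},"altId":null}}'
item["sqft"] = sub_data[render_key]["property"]["livingArea"]
item["price"] = sub_data[variant_key]["property"]["price"]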
EDIT:
Another small problem - you yield item outside the if, but sometimes a page doesn't match
if zid["id"] == listing_id:
and then it writes empty rows to the output file. You should yield inside the if.
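So the end of parse_detail should look roughly like this (a sketch only; the field assignments stay as in the question):
# sketch: yield only when the ids match, so unmatched pages don't emit empty items
for zid in self.zpid:
    if zid["id"] == listing_id:
        item = {}
        # ... fill the item fields from sub_data as before ...
        yield item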
BTW:
Frankly, I don't like the idea of the self.zpid list, because every value has to be searched for in that list. The code might also run with many workers, each with its own separate self.zpid, and then an element may not be found in the list at all. The standard approach is to send the value on to the next callback with
Request(..., meta={"data": zid})
and the callback retrieves it as
zid = response.meta["data"]
Newer Scrapy versions can also pass it as an argument to the callback with
Request(..., cb_kwargs={"data": zid})
and the callback receives it as a parameter in
def parse_detail(self, response, data):
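A minimal sketch of that cb_kwargs variant (the full code below uses meta instead; everything else stays as in the question):
# sketch: pass the listing dict to the detail callback via cb_kwargs
def parse_links(self, response):
    # ... extract `data` from the embedded JSON as before ...
    for listing in data:
        yield scrapy.Request(
            url=listing["detailUrl"],
            headers=self.headers,
            callback=self.parse_detail,
            cb_kwargs={"data": listing},  # supported in newer Scrapy versions
        )

def parse_detail(self, response, data):
    # "data" is the listing dict passed above; compare its id with the url id
    listing_id = [u for u in response.url.split("/") if u][4][:8]
    if data["id"] == listing_id:
        ...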
Full working code with the other changes:
import scrapy
import json
class ZillowScraper(scrapy.Spider):
name = "zillow"
# custom_settings = {
# "FEED_FORMAT": "csv",
# "FEED_URI": "zillow_data.csv",
# }
# base URL
base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"
# custom headers
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
}
# string query parameters
params = {
"searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
}
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_links
)
def parse_links(self, response):
print('[parse_links] url:', response.url)
results_selector = response.css(
'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
).get()
clean_json = (
results_selector.replace(
'<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
"",
)
.replace("</script>", "")
.replace("-->", "")
)
parsed_data = json.loads(clean_json)
data = parsed_data["cat1"]["searchResults"]["listResults"]
for listing in data:
yield scrapy.Request(
url=listing["detailUrl"],
headers=self.headers,
callback=self.parse_detail,
meta={'data': listing}
)
def parse_detail(self, response):
print('[parse_detail] url:', response.url)
listing_url = response.url.split("/")
parse_id = [u for u in listing_url if u]
listing_id = parse_id[4][:8]
zid = response.meta['data']
#print('listing_id:', listing_id)
#print("zid['id']:", zid['id'])
if zid['id'] == listing_id:
api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
clean_json = api_endpoint.replace(
'<script id="hdpApolloPreloadedData" type="application/json">', ""
).replace("</script>", "")
parsed_data = json.loads(clean_json)
sub_data = json.loads(parsed_data["apiCache"])
id_ = zid['id']
key_1 = f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{id_},"contactFormRenderParameter":{{"zpid":{id_},"platform":"desktop","isDoubleScroll":true}}}}'
key_2 = f'VariantQuery{{"zpid":{id_},"altId":null}}'
properties_1 = sub_data[key_1]["property"]
properties_2 = sub_data[key_2]["property"]
item = {}
item["date"] = properties_1["datePostedString"]
item["home_status"] = properties_1["hdpTypeDimension"]
item["home_type"] = properties_1["homeType"]
item["sqft"] = properties_1["livingArea"]
item["street_address"] = properties_2["streetAddress"]
item["city"] = properties_2["city"]
item["state"] = properties_2["state"]
item["zipcode"] = properties_2["zipcode"]
item["price"] = properties_2["price"]
item["zestimate"] = properties_1["zestimate"]
item["parcel_number"] = properties_1["resoFacts"]["parcelNumber"]
yield item
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(ZillowScraper)
c.start()