Scrapy: parse the data from multiple pages (pagination) and combine the yield output into a single array
What I want to do is scrape multiple pages and yield the results in a single array.
What I have tried so far:
import scrapy


class RealtorSpider(scrapy.Spider):
    name = "realtor"
    allowed_domains = ["realtor.com"]
    start_urls = ["http://realtor.com/"]

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "If-None-Match": '"d9b9d-uhdwucnqmaT5gbxbobPzbm+uEgs"',
        "Cache-Control": "max-age=0",
        "TE": "trailers",
    }

    def start_requests(self):
        url = "https://www.realtor.com/realestateandhomes-search/Seattle_WA/show-newest-listings"
        for page in range(1, 4):
            next_page = url + "/pg-" + str(page)
            yield scrapy.Request(
                url=next_page, headers=self.headers, callback=self.parse, priority=1
            )

    def parse(self, response):
        # extract data
        for card in response.css("ul.property-list"):
            item = {"price": card.css("span[data-label=pc-price]::text").getall()}
            yield item
This gives me three separate lists of prices:
['0,000', '8,000', '0,000', ......, '9,000', ',975,000', ',099,000']
['0,000', '4,000', '5,000', ......, '5,000', '9,500', ',199,000']
[',095,000', '5,000', '0,000', ........, '0,000', '5,000', '9,950']
What I'm looking for is to get a single list like this:
0,000 - 1
8,000 - 2
0,000 - 3
9,000 - 4
.
.
.
5,000 - 143
9,950 - 144
I'm not sure exactly how you produced the example lists, but I assume you called one of the functions in RealtorSpider directly and got three lists back, one per page. Since these functions use yield to return values, you may also need to call list() on their output to get an actual list rather than a generator.
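To illustrate the generator point, here is a quick standalone example (unrelated to Scrapy): a function that uses yield returns a generator object, and list() exhausts it into a concrete list.

def gen():
    yield 1
    yield 2

g = gen()
print(g)        # <generator object gen at 0x...>
print(list(g))  # [1, 2]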
I suggest you edit your realtor.py file as follows:
import scrapy
import json


class RealtorSpider(scrapy.Spider):
    name = "realtor"
    allowed_domains = ["realtor.com"]
    start_urls = ["http://realtor.com/"]
    prices = []  # accumulates the per-page price lists across responses

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "If-None-Match": '"d9b9d-uhdwucnqmaT5gbxbobPzbm+uEgs"',
        "Cache-Control": "max-age=0",
        "TE": "trailers",
    }

    def start_requests(self):
        url = "https://www.realtor.com/realestateandhomes-search/Seattle_WA/show-newest-listings"
        for page in range(1, 4):
            next_page = url + "/pg-" + str(page)
            yield scrapy.Request(
                url=next_page, headers=self.headers, callback=self.parse, priority=1
            )

    def parse(self, response):
        # extract data and remember each page's prices on the spider
        for card in response.css("ul.property-list"):
            item = {"price": card.css("span[data-label=pc-price]::text").getall()}
            self.prices.append(item["price"])
            yield item
        # flatten the list of per-page lists and rewrite data.json;
        # the write after the last processed page contains all prices
        data = [x for y in self.prices for x in y]
        with open("data.json", "w") as f:
            f.write(json.dumps(data))
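As a side note, rather than rewriting data.json at the end of every parse call, you could also write the file once when the crawl finishes. A minimal sketch using Scrapy's closed hook, assuming the same prices attribute as above:

    def closed(self, reason):
        # Scrapy calls this once when the spider closes; flatten the
        # collected per-page lists and write the combined result one time
        data = [price for page in self.prices for price in page]
        with open("data.json", "w") as f:
            json.dump(data, f)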
If you edit the file this way, then after running scrapy crawl realtor in the shell, a file named data.json is generated containing exactly the combined list you want. You can then read it back with:
import json

with open("data.json") as f:
    data = json.load(f)
data
Output:
['5,000',
'9,950',
'0,000',
',150,000',
',100,000',
'0,000',
'5,000',
'7,000',
'9,800',
'0,000',
'5,000',
'0,000',
'9,950',
'0,000',
'5,000',
'5,000',
'0,000',
'5,000',
'9,000',
',000,000',
',325,000',
'4,900',
'9,950',
'8,000',
',150,000',
'9,999',
'9,000',
',050,000',
'0,000',
',975,000',
',300,000',
',350,000',
'0,000',
',349,000',
',175,000',
',049,000',
',500,000',
'9,000',
'9,000',
'4,950',
',099,000',
'9,000',
'9,000',
',095,000',
'0,000',
'5,000',
'0,000',
'5,000',
'0,000',
'5,000',
'5,000',
'5,000',
'9,950',
'5,000',
'9,000',
'5,000',
',495,000',
'5,000',
'5,000',
'9,950',
',150,000',
'5,000',
'8,900',
'5,000',
'5,000',
'0,000',
'9,000',
'5,000',
',000,000',
'5,000',
'5,000',
'9,000',
',095,000',
',175,000',
'5,000',
',300,000',
'0,000',
'9,000',
',249,900',
',650,000',
',500,000',
'9,950',
'5,000',
'0,000',
'0,000',
'9,000',
'4,000',
'0,000',
'5,000',
'5,000',
'0,000',
'0,000',
'9,000',
'5,000',
'0,000',
'9,950',
'5,000',
'5,000',
'0,000',
'9,450',
',088,000',
'5,000',
'5,000',
'0,000',
'5,000',
'9,000',
'9,950',
'8,000',
'0,000',
'8,000',
',585,000',
',150,000',
',045,000',
'0,000',
'0,000',
',950,000',
'9,000',
',975,000',
',179,500',
',100,000',
'9,000',
',750,000',
'5,000',
'9,950',
'9,500',
',199,000']
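And if you want the numbered "price - index" format shown in the question, you can simply enumerate the loaded list:

for i, price in enumerate(data, start=1):
    print(f"{price} - {i}")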