使用 BeautifulSoup 从 Zillow.com 抓取数据
Scraping data from Zillow.com using BeautifulSoup
按照这个教程(tutorial),我尝试从 zillow.com 提取基本的房产信息。更具体地说,我想提取网站上显示的房产卡片中的相关信息。
即使第一页上有多个房产卡片,下面的代码也只能提取其中 3 处房产的信息。有人能解释一下为什么代码会跳过其余的房产吗?
import requests
import ast
from bs4 import BeautifulSoup
url = 'https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-106.43826441618356%2C%22east%22%3A-103.36483912321481%2C%22south%22%3A38.903882034738686%2C%22north%22%3A40.52008627183672%7D%2C%22mapZoom%22%3A9%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22hoa%22%3A%7B%22max%22%3A200%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22basf%22%3A%7B%22value%22%3Atrue%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schb%22%3A%7B%22min%22%3A%227%22%7D%7D%2C%22isListVisible%22%3Atrue%7D'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cookie': 'zguid=23|%24ca6368b9-7b92-4d51-ab67-c2be89065efd; _ga=GA1.2.1460486079.1621047110; _pxvid=7fa13d96-b528-11eb-9860-0242ac120012; _gcl_au=1.1.2025797213.1621047113; __gads=ID=66253ab863481044:T=1621047113:S=ALNI_MZr3mehwm2Wjo7NOrmalVtEcJSXag; __pdst=50987f626deb4767a53b5d8ca2ea406a; _fbp=fb.1.1621047115574.1019382068; _pin_unauth=dWlkPU5EVm1PRGRpTVRBdE5UTTFaUzAwWlRBNExUZzJZall0TWpZMU1HWTBNV0ppWlRkbA; G_ENABLED_IDPS=google; userid=X|3|231a9d744e104379%7C3%7CiEt8bkUx9hWaFeyCeAwN9tHl_T0d0Cq-kynGuEvNYr4%3D; loginmemento=1|c2274ba4a4ad76bbe89263d30695c182e9177b9c40a2691f3054987d66a944be; zjs_user_id=%22X1-ZU158jhpb2klds9_4wzn7%22; zgcus_lbut=; zgcus_aeut=189997416; zgcus_ludi=b44a961b-c7ef-11eb-a48f-96824e7eff50-18999; optimizelyEndUserId=oeu1623111792776r0.8778663892923859; _cs_c=1; WRUIDAWS=3326630244368428; visitor_id701843=248614376; visitor_id701843-hash=4be116fbd77089f953bfb6eaf5996ef92662a6ef7d237d3c49f154ffaf4eaa9295c64fb254b106bdff234e183c94498c01af2aab; __stripe_mid=80125db1-17d1-4fc5-ae37-86b12a68709cf3da6d; g_state={"i_p":1627697570928,"i_l":4}; zjs_anonymous_id=%22ca6368b9-7b92-4d51-ab67-c2be89065efd%22; _gac_UA-21174015-56=1.1626042638.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; _gcl_aw=GCL.1626042640.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; zgsession=1|1edd82e6-372a-4546-bc8b-c2bbadfd29b4; DoubleClickSession=true; fbc=fb.1.1626412984774.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _fbc=fb.1.1626413249162.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _csrf=lV2BBFim7Vy2gFTn--PUt0VA; _gaexp=GAX1.2.w27igyYtRQaAa8XQM3MjDw.18837.2!VDVoDKTnRcyv8f4FAcJ8PA.18915.2!Khnq27RoQmSe5DEusmh5xA.18913.3; _gid=GA1.2.705011419.1630004829; FSsampler=707279376; __CT_Data=gpv=26&ckp=tld&dm=zillow.com&apv_82_www33=26&cpv_82_www33=26&rpv_82_www33=13; 
OptanonConsent=isIABGlobal=false&datestamp=Fri+Aug+27+2021+12%3A39%3A52+GMT-0600+(Mountain+Daylight+Time)&version=5.11.0&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C4%3A1&AwaitingReconsent=false; _cs_id=41cbdc9c-bb0b-aad9-9521-b1328a65ff77.1623111795.22.1630089665.1630089591.1.1657275795752; utag_main=v_id:01796deff9e3001a59964343177e03079002907100838$_sn:41$_se:2$_ss:0$_st:1630255637884$dc_visit:38$ses_id:1630253822479%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_event:2%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:7b8796ca-44dd-45c9-97d9-bcb642d04cd1%3Bexp-session; JSESSIONID=6CB8C410E0FE216644E8C3A0D0851618; ZILLOW_SID=1|AAAAAVVbFRIBVVsVEklf443J474nftKzJe5PKLD80sujgHvySB7tGcqZunX3BDDH9VwceMqGMTPC54%2F0q4CH%2BfmwsC6P; KruxPixel=true; _derived_epik=dj0yJnU9ai1PSUp1eHZ2Y3J3d0c2NVU1N3BBOFlHbnRBOGFzT0smbj1vLWRISDFwdUNoblN5MjQ4cTVyN213Jm09MSZ0PUFBQUFBR0VzRjRVJnJtPTEmcnQ9QUFBQUFHRXNGNFU; KruxAddition=true; search=6|1632872450375%7Crect%3D40.241821806991595%252C-103.77545313688668%252C39.18758562803622%252C-106.02765040251168%26disp%3Dmap%26mdm%3Dauto%26type%3Dhouse%252Cmultifamily%252Ctownhouse%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%0911093%09%09%09%09%09%09; _uetsid=d5e0465006a011ecbe3bd1a0f1c47d01; _uetvid=987e1c70c40a11ebaed8859af36f82fb; _px3=ba45c3df5d5d63d4d9780a102253cd60b21ab52b04778344e332e05474011c21:oCvapPXE6jD0rCXhSf4UjtEC2U956148EDyiWwRFOF8z5vwK63/hC8OWsk09O61g1spnZw64iXApZu1wOmKpyA==:1000:68UzJ5+ar5XwNm61bm41bhSHp8Zp1PfQQlL/5tcqdUIJ3RmA106//vvYGewCCwmln6acqbDAVKgqfB8Th05yX0Cw0TBW7dhfNdeNRjp9bxeLvKqZ56yuW+aVoYYp/zj6MNKv9c16vKlP771xSdCgUTvZ0CDmh7Ng55sHugOHt/jj+2Zmp2WLnuYR4rf7SEndqWBbAyQhhG4BKeyrZyEMpA==; AWSALB=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ; AWSALBCORS=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ',
'referer': 'https://www.google.com/',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?1',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'
}
params = {
'searchQueryState': '{"mapBounds":{"west":-106.02765040251168,"east":-103.77545313688668,"south":39.18758562803622,"north":40.241821806991595},"isMapVisible":true,"filterState":{"sort":{"value":"globalrelevanceex"},"ah":{"value":true},"con":{"value":false},"apco":{"value":false},"land":{"value":false},"apa":{"value":false},"manu":{"value":false},"basf":{"value":true},"hoa":{"max":200},"sch":{"value":true},"schb":{"min":"7"},"schc":{"value":false},"schr":{"value":false},"schu":{"value":false}},"isListVisible":true,"mapZoom":9,"customRegionId":"fcac4612c1X1-CR9xde3hldsvpa_v24ah","pagination":{}}'
}
class ZillowScraper:
    """Fetch a Zillow search page and extract the JSON-LD data of its cards.

    The scraper downloads ``url`` with the given ``headers`` and ``params``,
    locates the ``<ul>`` element holding the result cards and decodes the
    ``application/ld+json`` script embedded in each card.
    """

    def __init__(self, url, headers, params, timeout=30):
        """Store the request settings.

        Args:
            url: Address of the search page to download.
            headers: HTTP headers sent with the request.
            params: Query-string parameters sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.
        """
        self.headers = headers
        self.url = url
        self.params = params
        self.timeout = timeout

    def fetch(self):
        """Download the search page and return the ``requests`` response."""
        return requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.timeout,
        )

    def get_cards_info(self, deck_text):
        """Decode the JSON-LD script of every card inside ``deck_text``.

        Args:
            deck_text: The parsed element whose ``contents`` are the cards,
                or ``None`` when the card list was not found on the page.

        Returns:
            A list with one decoded object per card that carries an
            ``application/ld+json`` script. Cards without such a script are
            skipped, so the list can be shorter than the number of cards.

        Raises:
            json.JSONDecodeError: If a card's script is not valid JSON.
        """
        # Imported locally so the block does not depend on a file-level import.
        import json

        if deck_text is None:
            return []

        cards = []
        for card in deck_text.contents:
            # Text nodes between the cards are strings; only tags can be searched.
            if isinstance(card, str):
                continue
            script = card.find('script', {'type': 'application/ld+json'})
            if script is None or not script.contents:
                continue
            # The payload is JSON, not a Python literal: json.loads understands
            # true/false/null, which ast.literal_eval rejects.
            card_info = json.loads(str(script.contents[0]))
            print(card_info)
            cards.append(card_info)
        return cards

    def parse(self, response_text):
        """Parse the page HTML and return the decoded data of its cards.

        Args:
            response_text: HTML source of the search page.

        Returns:
            The list produced by ``get_cards_info``; empty when the card list
            element is not present in the page.
        """
        content = BeautifulSoup(response_text, features="html.parser")
        deck_text = content.find(
            'ul',
            {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'},
        )
        return self.get_cards_info(deck_text)

    def run(self):
        """Fetch the page, parse it and return the extracted card data."""
        response = self.fetch()
        return self.parse(response.text)
if __name__ == "__main__":
    # Script entry point: build the scraper from the module-level request
    # settings (url, headers, params) and run it once.
    scraper = ZillowScraper(url=url, headers=headers, params=params)
    scraper.run()
输出
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '11615 River Run Cir, Henderson, CO 80640', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '2,001'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '11615 River Run Cir', 'addressLocality': 'Henderson', 'addressRegion': 'CO', 'postalCode': '80640'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.908753, 'longitude': -104.851576}, 'url': 'https://www.zillow.com/homedetails/11615-River-Run-Cir-Henderson-CO-80640/49457209_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '5089 Enid Way, Denver, CO 80239', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,852'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '5089 Enid Way', 'addressLocality': 'Denver', 'addressRegion': 'CO', 'postalCode': '80239'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.784449, 'longitude': -104.815903}, 'url': 'https://www.zillow.com/homedetails/5089-Enid-Way-Denver-CO-80239/13271929_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '6088 S Pierson Ct, Littleton, CO 80127', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,810'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '6088 S Pierson Ct', 'addressLocality': 'Littleton', 'addressRegion': 'CO', 'postalCode': '80127'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.605764, 'longitude': -105.123466}, 'url': 'https://www.zillow.com/homedetails/6088-S-Pierson-Ct-Littleton-CO-80127/13818492_zpid/'}
结果存储在页面内 <script> 标签的变量中。要解析它们,可以使用下面的示例:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState={%22pagination%22%3A{}%2C%22mapBounds%22%3A{%22west%22%3A-106.97384791227731%2C%22east%22%3A-102.82925562712106%2C%22south%22%3A39.18758562803622%2C%22north%22%3A40.241821806991595}%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A{%22hoa%22%3A{%22max%22%3A200}%2C%22con%22%3A{%22value%22%3Afalse}%2C%22apa%22%3A{%22value%22%3Afalse}%2C%22sch%22%3A{%22value%22%3Atrue}%2C%22ah%22%3A{%22value%22%3Atrue}%2C%22sort%22%3A{%22value%22%3A%22globalrelevanceex%22}%2C%22land%22%3A{%22value%22%3Afalse}%2C%22schu%22%3A{%22value%22%3Afalse}%2C%22manu%22%3A{%22value%22%3Afalse}%2C%22schr%22%3A{%22value%22%3Afalse}%2C%22apco%22%3A{%22value%22%3Afalse}%2C%22basf%22%3A{%22value%22%3Atrue}%2C%22schc%22%3A{%22value%22%3Afalse}%2C%22schb%22%3A{%22min%22%3A%227%22}}%2C%22isListVisible%22%3Atrue}"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}
# Download the search page (with a timeout so a stalled connection cannot
# hang the script) and parse the HTML.
soup = BeautifulSoup(
    requests.get(url, headers=headers, timeout=30).content, "html.parser"
)

# The search results are embedded as JSON inside a <script> tag.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None or not script_tag.contents:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "script[data-zrr-shared-data-key] not found in the downloaded page"
    )

# Strip the leading/trailing '<', '!', '-' and '>' characters around the JSON
# text before decoding it.
data = json.loads(script_tag.contents[0].strip("!<>-"))

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# Print one aligned row per listing: status, address, price.
for result in data["cat1"]["searchResults"]["listResults"]:
    try:
        price = result["price"]
    except KeyError:
        # Some listings keep the price nested under their first unit.
        price = result["units"][0]["price"]
    print(
        "{:<15} {:<50} {:<15}".format(
            result["statusText"], result["address"], price
        )
    )
打印:
House for sale 6092 S Marshall Dr, Littleton, CO 80123 0,000
House for sale 3050 S Roslyn St, Denver, CO 80231 4,900
House for sale 15538 Greenstone Cir, Parker, CO 80134 0,000
House for sale 7141 Fenton Cir, Arvada, CO 80003 9,500
House for sale 7823 S Logan Dr, Littleton, CO 80122 5,000
House for sale 1825 Clermont St, Denver, CO 80220 9,900
House for sale 408 S Locust St, Denver, CO 80224 0,000
House for sale 8660 De Soto St, Denver, CO 80229 0,000
House for sale 1811 S Humboldt St, Denver, CO 80210 5,000
House for sale 7329 E Easter Ave, Centennial, CO 80112 9,900
House for sale 13638 W Montana Pl, Lakewood, CO 80228 0,000
House for sale 8296 E Hinsdale Dr, Centennial, CO 80112 9,900
House for sale 10325 Ravenswood Ln, Highlands Ranch, CO 80130 0,000
House for sale 2833 E 90th Pl, Denver, CO 80229 5,000
House for sale 5756 W 8th Ave, Lakewood, CO 80214 0,000
House for sale 6088 S Pierson Ct, Littleton, CO 80127 9,000
House for sale 2829 S Lowell Blvd, Denver, CO 80236 5,000
House for sale 604 Eldridge St, Golden, CO 80401 0,000
House for sale 7171 McIntyre Ct, Arvada, CO 80007 0,000
House for sale 1301 S Blackhawk Way, Aurora, CO 80012 0,000
House for sale 215 S Julian St, Denver, CO 80219 0,000
House for sale 7095 E 67th Ave, Commerce City, CO 80022 0,000
House for sale 8248 S Yukon St, Littleton, CO 80128 5,000
House for sale 2846 S Macon Ct, Aurora, CO 80014 0,000
House for sale 9340 Burgundy Cir, Littleton, CO 80126 9,000
House for sale 2072 S Cathay Way, Aurora, CO 80013 0,000
House for sale 1317 W 85th Ave, Federal Heights, CO 80260 5,000
House for sale 6701 Eagle Shadow Ave, Brighton, CO 80602 ,145,000
House for sale 2900 Webster St, Wheat Ridge, CO 80033 0,000
House for sale 3943 S Allison Ct, Lakewood, CO 80235 9,950
House for sale 511 E Irwin Ave, Littleton, CO 80122 4,500
House for sale 4700 E Montana Pl, Denver, CO 80222 0,000
House for sale 2344 S Gray Dr, Lakewood, CO 80227 5,000
House for sale 5546 E 130th Dr, Thornton, CO 80241 0,000
House for sale 2270 S Joyce St, Lakewood, CO 80228 ,340,000
House for sale 12171 W Dakota Dr, Lakewood, CO 80228 0,000
House for sale 6641 Miller St, Arvada, CO 80004 5,000
House for sale 3220 W Nevada Pl, Denver, CO 80219 0,000
House for sale 8630 W 64th Pl, Arvada, CO 80004 7,000
House for sale 5890 Wood Sorrel Dr, Littleton, CO 80123 5,000
如果上面的代码报错,可以试试下面这段代码:
# Download the search page (with a timeout so a stalled connection cannot
# hang the script) and parse the HTML.
response = requests.get(ZILLOW_URL, headers=headers, timeout=30).content
soup = BeautifulSoup(response, 'html.parser')

# The search results are embedded as JSON inside a <script> tag.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None or not script_tag.contents:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "script[data-zrr-shared-data-key] not found in the downloaded page"
    )

# Strip the leading/trailing '<', '!', '-' and '>' characters around the JSON
# text before decoding it.
data = json.loads(script_tag.contents[0].strip("!<>-"))

all_data = data['cat1']['searchResults']['listResults']
for listing in all_data:
    # Some items have the 'price' key nested inside the 'units' key, while
    # others have it directly on the item. An empty 'units' list also falls
    # back to the item-level price.
    try:
        price = listing['units'][0]['price']
    except (KeyError, IndexError):
        price = listing['price']
    address = listing['address']
    link = listing['detailUrl']
    # Sometimes the link is relative (no site prefix); prepend the Zillow
    # origin in that case. A prefix test is used so that a relative path
    # merely containing "http" is still completed.
    if link.startswith('http'):
        link_to_buy = link
    else:
        link_to_buy = f"https://www.zillow.com{link}"
    print(price)
    print(address)
    print(link_to_buy)
    print("\n")
按照这个教程(tutorial),我尝试从 zillow.com 提取基本的房产信息。更具体地说,我想提取网站上显示的房产卡片中的相关信息。
即使第一页上有多个房产卡片,下面的代码也只能提取其中 3 处房产的信息。有人能解释一下为什么代码会跳过其余的房产吗?
import requests
import ast
from bs4 import BeautifulSoup
url = 'https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-106.43826441618356%2C%22east%22%3A-103.36483912321481%2C%22south%22%3A38.903882034738686%2C%22north%22%3A40.52008627183672%7D%2C%22mapZoom%22%3A9%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22hoa%22%3A%7B%22max%22%3A200%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22basf%22%3A%7B%22value%22%3Atrue%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schb%22%3A%7B%22min%22%3A%227%22%7D%7D%2C%22isListVisible%22%3Atrue%7D'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cookie': 'zguid=23|%24ca6368b9-7b92-4d51-ab67-c2be89065efd; _ga=GA1.2.1460486079.1621047110; _pxvid=7fa13d96-b528-11eb-9860-0242ac120012; _gcl_au=1.1.2025797213.1621047113; __gads=ID=66253ab863481044:T=1621047113:S=ALNI_MZr3mehwm2Wjo7NOrmalVtEcJSXag; __pdst=50987f626deb4767a53b5d8ca2ea406a; _fbp=fb.1.1621047115574.1019382068; _pin_unauth=dWlkPU5EVm1PRGRpTVRBdE5UTTFaUzAwWlRBNExUZzJZall0TWpZMU1HWTBNV0ppWlRkbA; G_ENABLED_IDPS=google; userid=X|3|231a9d744e104379%7C3%7CiEt8bkUx9hWaFeyCeAwN9tHl_T0d0Cq-kynGuEvNYr4%3D; loginmemento=1|c2274ba4a4ad76bbe89263d30695c182e9177b9c40a2691f3054987d66a944be; zjs_user_id=%22X1-ZU158jhpb2klds9_4wzn7%22; zgcus_lbut=; zgcus_aeut=189997416; zgcus_ludi=b44a961b-c7ef-11eb-a48f-96824e7eff50-18999; optimizelyEndUserId=oeu1623111792776r0.8778663892923859; _cs_c=1; WRUIDAWS=3326630244368428; visitor_id701843=248614376; visitor_id701843-hash=4be116fbd77089f953bfb6eaf5996ef92662a6ef7d237d3c49f154ffaf4eaa9295c64fb254b106bdff234e183c94498c01af2aab; __stripe_mid=80125db1-17d1-4fc5-ae37-86b12a68709cf3da6d; g_state={"i_p":1627697570928,"i_l":4}; zjs_anonymous_id=%22ca6368b9-7b92-4d51-ab67-c2be89065efd%22; _gac_UA-21174015-56=1.1626042638.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; _gcl_aw=GCL.1626042640.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; zgsession=1|1edd82e6-372a-4546-bc8b-c2bbadfd29b4; DoubleClickSession=true; fbc=fb.1.1626412984774.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _fbc=fb.1.1626413249162.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _csrf=lV2BBFim7Vy2gFTn--PUt0VA; _gaexp=GAX1.2.w27igyYtRQaAa8XQM3MjDw.18837.2!VDVoDKTnRcyv8f4FAcJ8PA.18915.2!Khnq27RoQmSe5DEusmh5xA.18913.3; _gid=GA1.2.705011419.1630004829; FSsampler=707279376; __CT_Data=gpv=26&ckp=tld&dm=zillow.com&apv_82_www33=26&cpv_82_www33=26&rpv_82_www33=13; 
OptanonConsent=isIABGlobal=false&datestamp=Fri+Aug+27+2021+12%3A39%3A52+GMT-0600+(Mountain+Daylight+Time)&version=5.11.0&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C4%3A1&AwaitingReconsent=false; _cs_id=41cbdc9c-bb0b-aad9-9521-b1328a65ff77.1623111795.22.1630089665.1630089591.1.1657275795752; utag_main=v_id:01796deff9e3001a59964343177e03079002907100838$_sn:41$_se:2$_ss:0$_st:1630255637884$dc_visit:38$ses_id:1630253822479%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_event:2%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:7b8796ca-44dd-45c9-97d9-bcb642d04cd1%3Bexp-session; JSESSIONID=6CB8C410E0FE216644E8C3A0D0851618; ZILLOW_SID=1|AAAAAVVbFRIBVVsVEklf443J474nftKzJe5PKLD80sujgHvySB7tGcqZunX3BDDH9VwceMqGMTPC54%2F0q4CH%2BfmwsC6P; KruxPixel=true; _derived_epik=dj0yJnU9ai1PSUp1eHZ2Y3J3d0c2NVU1N3BBOFlHbnRBOGFzT0smbj1vLWRISDFwdUNoblN5MjQ4cTVyN213Jm09MSZ0PUFBQUFBR0VzRjRVJnJtPTEmcnQ9QUFBQUFHRXNGNFU; KruxAddition=true; search=6|1632872450375%7Crect%3D40.241821806991595%252C-103.77545313688668%252C39.18758562803622%252C-106.02765040251168%26disp%3Dmap%26mdm%3Dauto%26type%3Dhouse%252Cmultifamily%252Ctownhouse%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%0911093%09%09%09%09%09%09; _uetsid=d5e0465006a011ecbe3bd1a0f1c47d01; _uetvid=987e1c70c40a11ebaed8859af36f82fb; _px3=ba45c3df5d5d63d4d9780a102253cd60b21ab52b04778344e332e05474011c21:oCvapPXE6jD0rCXhSf4UjtEC2U956148EDyiWwRFOF8z5vwK63/hC8OWsk09O61g1spnZw64iXApZu1wOmKpyA==:1000:68UzJ5+ar5XwNm61bm41bhSHp8Zp1PfQQlL/5tcqdUIJ3RmA106//vvYGewCCwmln6acqbDAVKgqfB8Th05yX0Cw0TBW7dhfNdeNRjp9bxeLvKqZ56yuW+aVoYYp/zj6MNKv9c16vKlP771xSdCgUTvZ0CDmh7Ng55sHugOHt/jj+2Zmp2WLnuYR4rf7SEndqWBbAyQhhG4BKeyrZyEMpA==; AWSALB=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ; AWSALBCORS=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ',
'referer': 'https://www.google.com/',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?1',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'
}
params = {
'searchQueryState': '{"mapBounds":{"west":-106.02765040251168,"east":-103.77545313688668,"south":39.18758562803622,"north":40.241821806991595},"isMapVisible":true,"filterState":{"sort":{"value":"globalrelevanceex"},"ah":{"value":true},"con":{"value":false},"apco":{"value":false},"land":{"value":false},"apa":{"value":false},"manu":{"value":false},"basf":{"value":true},"hoa":{"max":200},"sch":{"value":true},"schb":{"min":"7"},"schc":{"value":false},"schr":{"value":false},"schu":{"value":false}},"isListVisible":true,"mapZoom":9,"customRegionId":"fcac4612c1X1-CR9xde3hldsvpa_v24ah","pagination":{}}'
}
class ZillowScraper:
    """Fetch a Zillow search page and extract the JSON-LD data of its cards.

    The scraper downloads ``url`` with the given ``headers`` and ``params``,
    locates the ``<ul>`` element holding the result cards and decodes the
    ``application/ld+json`` script embedded in each card.
    """

    def __init__(self, url, headers, params, timeout=30):
        """Store the request settings.

        Args:
            url: Address of the search page to download.
            headers: HTTP headers sent with the request.
            params: Query-string parameters sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.
        """
        self.headers = headers
        self.url = url
        self.params = params
        self.timeout = timeout

    def fetch(self):
        """Download the search page and return the ``requests`` response."""
        return requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.timeout,
        )

    def get_cards_info(self, deck_text):
        """Decode the JSON-LD script of every card inside ``deck_text``.

        Args:
            deck_text: The parsed element whose ``contents`` are the cards,
                or ``None`` when the card list was not found on the page.

        Returns:
            A list with one decoded object per card that carries an
            ``application/ld+json`` script. Cards without such a script are
            skipped, so the list can be shorter than the number of cards.

        Raises:
            json.JSONDecodeError: If a card's script is not valid JSON.
        """
        # Imported locally so the block does not depend on a file-level import.
        import json

        if deck_text is None:
            return []

        cards = []
        for card in deck_text.contents:
            # Text nodes between the cards are strings; only tags can be searched.
            if isinstance(card, str):
                continue
            script = card.find('script', {'type': 'application/ld+json'})
            if script is None or not script.contents:
                continue
            # The payload is JSON, not a Python literal: json.loads understands
            # true/false/null, which ast.literal_eval rejects.
            card_info = json.loads(str(script.contents[0]))
            print(card_info)
            cards.append(card_info)
        return cards

    def parse(self, response_text):
        """Parse the page HTML and return the decoded data of its cards.

        Args:
            response_text: HTML source of the search page.

        Returns:
            The list produced by ``get_cards_info``; empty when the card list
            element is not present in the page.
        """
        content = BeautifulSoup(response_text, features="html.parser")
        deck_text = content.find(
            'ul',
            {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'},
        )
        return self.get_cards_info(deck_text)

    def run(self):
        """Fetch the page, parse it and return the extracted card data."""
        response = self.fetch()
        return self.parse(response.text)
if __name__ == "__main__":
    # Script entry point: build the scraper from the module-level request
    # settings (url, headers, params) and run it once.
    scraper = ZillowScraper(url=url, headers=headers, params=params)
    scraper.run()
输出
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '11615 River Run Cir, Henderson, CO 80640', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '2,001'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '11615 River Run Cir', 'addressLocality': 'Henderson', 'addressRegion': 'CO', 'postalCode': '80640'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.908753, 'longitude': -104.851576}, 'url': 'https://www.zillow.com/homedetails/11615-River-Run-Cir-Henderson-CO-80640/49457209_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '5089 Enid Way, Denver, CO 80239', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,852'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '5089 Enid Way', 'addressLocality': 'Denver', 'addressRegion': 'CO', 'postalCode': '80239'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.784449, 'longitude': -104.815903}, 'url': 'https://www.zillow.com/homedetails/5089-Enid-Way-Denver-CO-80239/13271929_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '6088 S Pierson Ct, Littleton, CO 80127', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,810'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '6088 S Pierson Ct', 'addressLocality': 'Littleton', 'addressRegion': 'CO', 'postalCode': '80127'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.605764, 'longitude': -105.123466}, 'url': 'https://www.zillow.com/homedetails/6088-S-Pierson-Ct-Littleton-CO-80127/13818492_zpid/'}
结果存储在页面内 <script> 标签的变量中。要解析它们,可以使用下面的示例:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState={%22pagination%22%3A{}%2C%22mapBounds%22%3A{%22west%22%3A-106.97384791227731%2C%22east%22%3A-102.82925562712106%2C%22south%22%3A39.18758562803622%2C%22north%22%3A40.241821806991595}%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A{%22hoa%22%3A{%22max%22%3A200}%2C%22con%22%3A{%22value%22%3Afalse}%2C%22apa%22%3A{%22value%22%3Afalse}%2C%22sch%22%3A{%22value%22%3Atrue}%2C%22ah%22%3A{%22value%22%3Atrue}%2C%22sort%22%3A{%22value%22%3A%22globalrelevanceex%22}%2C%22land%22%3A{%22value%22%3Afalse}%2C%22schu%22%3A{%22value%22%3Afalse}%2C%22manu%22%3A{%22value%22%3Afalse}%2C%22schr%22%3A{%22value%22%3Afalse}%2C%22apco%22%3A{%22value%22%3Afalse}%2C%22basf%22%3A{%22value%22%3Atrue}%2C%22schc%22%3A{%22value%22%3Afalse}%2C%22schb%22%3A{%22min%22%3A%227%22}}%2C%22isListVisible%22%3Atrue}"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}
# Download the search page (with a timeout so a stalled connection cannot
# hang the script) and parse the HTML.
soup = BeautifulSoup(
    requests.get(url, headers=headers, timeout=30).content, "html.parser"
)

# The search results are embedded as JSON inside a <script> tag.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None or not script_tag.contents:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "script[data-zrr-shared-data-key] not found in the downloaded page"
    )

# Strip the leading/trailing '<', '!', '-' and '>' characters around the JSON
# text before decoding it.
data = json.loads(script_tag.contents[0].strip("!<>-"))

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# Print one aligned row per listing: status, address, price.
for result in data["cat1"]["searchResults"]["listResults"]:
    try:
        price = result["price"]
    except KeyError:
        # Some listings keep the price nested under their first unit.
        price = result["units"][0]["price"]
    print(
        "{:<15} {:<50} {:<15}".format(
            result["statusText"], result["address"], price
        )
    )
打印:
House for sale 6092 S Marshall Dr, Littleton, CO 80123 0,000
House for sale 3050 S Roslyn St, Denver, CO 80231 4,900
House for sale 15538 Greenstone Cir, Parker, CO 80134 0,000
House for sale 7141 Fenton Cir, Arvada, CO 80003 9,500
House for sale 7823 S Logan Dr, Littleton, CO 80122 5,000
House for sale 1825 Clermont St, Denver, CO 80220 9,900
House for sale 408 S Locust St, Denver, CO 80224 0,000
House for sale 8660 De Soto St, Denver, CO 80229 0,000
House for sale 1811 S Humboldt St, Denver, CO 80210 5,000
House for sale 7329 E Easter Ave, Centennial, CO 80112 9,900
House for sale 13638 W Montana Pl, Lakewood, CO 80228 0,000
House for sale 8296 E Hinsdale Dr, Centennial, CO 80112 9,900
House for sale 10325 Ravenswood Ln, Highlands Ranch, CO 80130 0,000
House for sale 2833 E 90th Pl, Denver, CO 80229 5,000
House for sale 5756 W 8th Ave, Lakewood, CO 80214 0,000
House for sale 6088 S Pierson Ct, Littleton, CO 80127 9,000
House for sale 2829 S Lowell Blvd, Denver, CO 80236 5,000
House for sale 604 Eldridge St, Golden, CO 80401 0,000
House for sale 7171 McIntyre Ct, Arvada, CO 80007 0,000
House for sale 1301 S Blackhawk Way, Aurora, CO 80012 0,000
House for sale 215 S Julian St, Denver, CO 80219 0,000
House for sale 7095 E 67th Ave, Commerce City, CO 80022 0,000
House for sale 8248 S Yukon St, Littleton, CO 80128 5,000
House for sale 2846 S Macon Ct, Aurora, CO 80014 0,000
House for sale 9340 Burgundy Cir, Littleton, CO 80126 9,000
House for sale 2072 S Cathay Way, Aurora, CO 80013 0,000
House for sale 1317 W 85th Ave, Federal Heights, CO 80260 5,000
House for sale 6701 Eagle Shadow Ave, Brighton, CO 80602 ,145,000
House for sale 2900 Webster St, Wheat Ridge, CO 80033 0,000
House for sale 3943 S Allison Ct, Lakewood, CO 80235 9,950
House for sale 511 E Irwin Ave, Littleton, CO 80122 4,500
House for sale 4700 E Montana Pl, Denver, CO 80222 0,000
House for sale 2344 S Gray Dr, Lakewood, CO 80227 5,000
House for sale 5546 E 130th Dr, Thornton, CO 80241 0,000
House for sale 2270 S Joyce St, Lakewood, CO 80228 ,340,000
House for sale 12171 W Dakota Dr, Lakewood, CO 80228 0,000
House for sale 6641 Miller St, Arvada, CO 80004 5,000
House for sale 3220 W Nevada Pl, Denver, CO 80219 0,000
House for sale 8630 W 64th Pl, Arvada, CO 80004 7,000
House for sale 5890 Wood Sorrel Dr, Littleton, CO 80123 5,000
如果上面的代码报错,可以试试下面这段代码:
# Download the search page (with a timeout so a stalled connection cannot
# hang the script) and parse the HTML.
response = requests.get(ZILLOW_URL, headers=headers, timeout=30).content
soup = BeautifulSoup(response, 'html.parser')

# The search results are embedded as JSON inside a <script> tag.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None or not script_tag.contents:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "script[data-zrr-shared-data-key] not found in the downloaded page"
    )

# Strip the leading/trailing '<', '!', '-' and '>' characters around the JSON
# text before decoding it.
data = json.loads(script_tag.contents[0].strip("!<>-"))

all_data = data['cat1']['searchResults']['listResults']
for listing in all_data:
    # Some items have the 'price' key nested inside the 'units' key, while
    # others have it directly on the item. An empty 'units' list also falls
    # back to the item-level price.
    try:
        price = listing['units'][0]['price']
    except (KeyError, IndexError):
        price = listing['price']
    address = listing['address']
    link = listing['detailUrl']
    # Sometimes the link is relative (no site prefix); prepend the Zillow
    # origin in that case. A prefix test is used so that a relative path
    # merely containing "http" is still completed.
    if link.startswith('http'):
        link_to_buy = link
    else:
        link_to_buy = f"https://www.zillow.com{link}"
    print(price)
    print(address)
    print(link_to_buy)
    print("\n")