使用 Beautifulsoup python 进行网页抓取 - 无法抓取所有结果
Web Scraping with Beautifulsoup python - cannot scrape all results
我试图从该站点抓取数据
我使用了下面的代码
\\
import pandas as pd
from requests import get as gt
from bs4 import BeautifulSoup as bs
def nobroker(url):
    """Scrape house listings from a NoBroker search-results page.

    Note: the page renders most listings client-side via JavaScript, so a
    plain GET only sees the few server-rendered cards; use the site's JSON
    API to retrieve the full result set.

    Parameters
    ----------
    url : str
        URL of the NoBroker sale-listings page.

    Returns
    -------
    pandas.DataFrame
        Columns: "Name of House", "Location", "Area", "EMI", "Price".
    """
    house_title = []
    location = []
    area = []
    emi = []
    price = []
    p = gt(url)
    # Pass an explicit parser: omitting it raises GuessedAtParserWarning
    # and can silently pick different parsers on different machines.
    b = bs(p.content, "html.parser")
    for i in b.find_all('span', class_="overflow-hidden overflow-ellipsis whitespace-nowrap max-w-80pe po:max-w-full"):
        house_title.append(i.text)
    for i in b.find_all('div', class_="mt-0.5p overflow-hidden overflow-ellipsis whitespace-nowrap max-w-70 text-gray-light leading-4 po:mb-0 po:max-w-95"):
        location.append(i.text)
    # The same card <div> carries area, EMI and price delimited by rupee
    # signs — split each card once instead of re-scanning the soup three
    # times with an identical selector.
    for i in b.find_all('div', class_="p-1.5p flex border-b border-b-solid border-cardbordercolor tp:py-1p tp:px-1.5p tp:border-b-0"):
        parts = i.text.split("₹")
        area.append(parts[1].replace("sqftBuiltup", ""))
        emi.append(parts[2].replace("/MonthEstimated EMI", ""))
        price.append(parts[3])
    df = pd.DataFrame({"Name of House": house_title, "Location": location, "Area": area, "EMI": emi, "Price": price})
    return df
\\
但它只得到 4 个结果数据。但在实际网站上有 100 多个结果。为什么我不能从该站点抓取所有数据？
您看到的数据是通过 JavaScript 从外部源加载的。您可以使用他们的 API 来获取数据。例如:
import json
import requests
import pandas as pd

# NoBroker serves listings from this JSON endpoint; page through it until
# an empty "data" array signals the end of the results.
api_url = "https://www.nobroker.in/api/v1/multi/property/sale/filter"
query = {
    "pageNo": 0,
    "searchParam": "W3sibGF0IjoxMy4wMDExNzc0LCJsb24iOjgwLjI1NjQ5NTcsInBsYWNlSWQiOiJDaElKZ1JiRUZlMW5Vam9SZzU0a2VwYk9hV1UiLCJwbGFjZU5hbWUiOiJBZHlhciJ9XQ==",
    "radius": "2.0",
    "city": ["chennai", "chennai"],
    "locality": "Adyar",
}

all_data = []
page = 0
while True:
    print("Getting page. {}".format(page))
    query["pageNo"] = page
    payload = requests.get(api_url, params=query).json()
    listings = payload["data"]
    if not listings:
        break
    # uncomment this to print all data:
    # print(json.dumps(payload, indent=4))
    all_data.extend(
        {
            "house_title": entry["propertyTitle"],
            "location": entry["locality"],
            "emi": entry["defaultEmi"],
            "price": entry["price"],
        }
        for entry in listings
    )
    page += 1

df = pd.DataFrame(all_data)
print(df)
df.to_csv("data.csv", index=False)
打印:
house_title location emi price
0 2 BHK Flat For Sale In Vasanth Apartments In Adyar Adyar 57,314/Month 10000000
1 2 BHK Flat For Sale In Rams In Adyar Adyar 63,045/Month 11000000
2 3 BHK In Independent House For Sale In Besant Nagar Besant Nagar 1.43 Lacs/Month 25000000
...
并保存 data.csv
(来自 LibreOffice 的屏幕截图):
我试图从该站点抓取数据
我使用了下面的代码
\\
import pandas as pd
from requests import get as gt
from bs4 import BeautifulSoup as bs
def nobroker(url):
    """Scrape house listings from a NoBroker search-results page.

    Note: the page renders most listings client-side via JavaScript, so a
    plain GET only sees the few server-rendered cards; use the site's JSON
    API to retrieve the full result set.

    Parameters
    ----------
    url : str
        URL of the NoBroker sale-listings page.

    Returns
    -------
    pandas.DataFrame
        Columns: "Name of House", "Location", "Area", "EMI", "Price".
    """
    house_title = []
    location = []
    area = []
    emi = []
    price = []
    p = gt(url)
    # Pass an explicit parser: omitting it raises GuessedAtParserWarning
    # and can silently pick different parsers on different machines.
    b = bs(p.content, "html.parser")
    for i in b.find_all('span', class_="overflow-hidden overflow-ellipsis whitespace-nowrap max-w-80pe po:max-w-full"):
        house_title.append(i.text)
    for i in b.find_all('div', class_="mt-0.5p overflow-hidden overflow-ellipsis whitespace-nowrap max-w-70 text-gray-light leading-4 po:mb-0 po:max-w-95"):
        location.append(i.text)
    # The same card <div> carries area, EMI and price delimited by rupee
    # signs — split each card once instead of re-scanning the soup three
    # times with an identical selector.
    for i in b.find_all('div', class_="p-1.5p flex border-b border-b-solid border-cardbordercolor tp:py-1p tp:px-1.5p tp:border-b-0"):
        parts = i.text.split("₹")
        area.append(parts[1].replace("sqftBuiltup", ""))
        emi.append(parts[2].replace("/MonthEstimated EMI", ""))
        price.append(parts[3])
    df = pd.DataFrame({"Name of House": house_title, "Location": location, "Area": area, "EMI": emi, "Price": price})
    return df
\\
但它只得到 4 个结果数据。但在实际网站上有 100 多个结果。为什么我不能从该站点抓取所有数据？
您看到的数据是通过 JavaScript 从外部源加载的。您可以使用他们的 API 来获取数据。例如:
import json
import requests
import pandas as pd

# NoBroker serves listings from this JSON endpoint; page through it until
# an empty "data" array signals the end of the results.
api_url = "https://www.nobroker.in/api/v1/multi/property/sale/filter"
query = {
    "pageNo": 0,
    "searchParam": "W3sibGF0IjoxMy4wMDExNzc0LCJsb24iOjgwLjI1NjQ5NTcsInBsYWNlSWQiOiJDaElKZ1JiRUZlMW5Vam9SZzU0a2VwYk9hV1UiLCJwbGFjZU5hbWUiOiJBZHlhciJ9XQ==",
    "radius": "2.0",
    "city": ["chennai", "chennai"],
    "locality": "Adyar",
}

all_data = []
page = 0
while True:
    print("Getting page. {}".format(page))
    query["pageNo"] = page
    payload = requests.get(api_url, params=query).json()
    listings = payload["data"]
    if not listings:
        break
    # uncomment this to print all data:
    # print(json.dumps(payload, indent=4))
    all_data.extend(
        {
            "house_title": entry["propertyTitle"],
            "location": entry["locality"],
            "emi": entry["defaultEmi"],
            "price": entry["price"],
        }
        for entry in listings
    )
    page += 1

df = pd.DataFrame(all_data)
print(df)
df.to_csv("data.csv", index=False)
打印:
house_title location emi price
0 2 BHK Flat For Sale In Vasanth Apartments In Adyar Adyar 57,314/Month 10000000
1 2 BHK Flat For Sale In Rams In Adyar Adyar 63,045/Month 11000000
2 3 BHK In Independent House For Sale In Besant Nagar Besant Nagar 1.43 Lacs/Month 25000000
...
并保存 data.csv
(来自 LibreOffice 的屏幕截图):