使用 Python 的 BeautifulSoup 找不到 HTML 元素
Cannot find HTML elements with BeautifulSoup Python
我在 https://towardsdatascience.com/ 网站上发现了一个非常好的网页抓取代码,我正在尝试实现以供自己使用。
https://ingatlan.com/lista/elado+lakas+ii-ker?page=1 这是一个匈牙利房地产网站。首先,我只想获取房产的价格,但运行我的代码后没有得到任何结果,找到的项目数为 0。
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the price <div> of every listing on ingatlan.com, one results page
# at a time, and collect them into a pandas DataFrame.
pagesToGet = 1
upperframe = []  # price tags accumulated across all pages

for page_number in range(1, pagesToGet + 1):
    print('processing page :', page_number)
    url = 'https://ingatlan.com/lista/elado+lakas+ii-ker?page=' + str(page_number)
    print(url)

    try:
        # NOTE(review): no request headers (e.g. User-Agent) are sent here.
        response = requests.get(url)
    except Exception:
        # Report the failing link and move on to the next page.
        error_type, _, error_info = sys.exc_info()
        print('ERROR FOR LINK:', url)
        print(error_type, 'Line:', error_info.tb_lineno)
        continue

    time.sleep(2)  # pause between consecutive requests
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): the class string ends with a space -- confirm that it
    # really matches the page markup.
    links = soup.find_all('div', attrs={'class': 'listing js-listing '})
    print(len(links))

    # The CSV is reopened in "w" mode for every page, and only the header
    # row is ever written to it.
    f = open("NEWS.csv", "w", encoding='utf-8')
    f.write("Price\n")

    # find() yields the matching Tag (or None) for each listing, not its text.
    upperframe.extend(
        listing.find("div", attrs={'class': 'price'}) for listing in links
    )

f.close()
data = pd.DataFrame(upperframe, columns=['Price'])
data.head()
我可能是哪里弄错了?这段代码在某些网站上有效,例如 Myprotein,但在另一些网站上无效。
这里只提取了你问到的价格。
如果请求中不带 User-Agent 请求头,服务器会返回 403 Forbidden 错误。
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the first results page of ingatlan.com and collect the listing
# prices into a pandas DataFrame.
start_url = "https://ingatlan.com/lista/elado+lakas+ii-ker?page=1"

# The site answers 403 Forbidden when no User-Agent header is sent.
# The timeout keeps the script from hanging forever on a stalled connection.
page_data = requests.get(
    start_url, headers={'User-Agent': 'XYZ/3.0'}, timeout=30
)
# Fail loudly on an HTTP error instead of silently parsing an error page
# and ending up with an empty DataFrame.
page_data.raise_for_status()
soup = BeautifulSoup(page_data.content, "html.parser")

prices = []
for content in soup.find_all("div", class_="resultspage__content"):
    for listing in content.find_all("div", class_="listing js-listing"):
        for container in listing.find_all(
            "div", class_="price__container js-has-sqm-price-info-tooltip"
        ):
            price_tag = container.find("div", class_="price")
            # Skip containers without a price <div> instead of crashing on
            # None.text.
            if price_tag is not None:
                prices.append(price_tag.text.strip())

data = pd.DataFrame(prices, columns=["price"])
print(data)
pandas DataFrame 的输出:
price
0 31.5 M Ft
1 77.9 M Ft
2 62 M Ft
3 129.5 M Ft
4 125 M Ft
5 95.9 M Ft
6 46.9 M Ft
7 45.9 M Ft
8 59.9 M Ft
9 109 M Ft
10 48 M Ft
11 87 M Ft
我在 https://towardsdatascience.com/ 网站上发现了一个非常好的网页抓取代码,我正在尝试实现以供自己使用。
https://ingatlan.com/lista/elado+lakas+ii-ker?page=1 这是一个匈牙利房地产网站。首先,我只想获取房产的价格,但运行我的代码后没有得到任何结果,找到的项目数为 0。
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the price <div> of every listing on ingatlan.com, one results page
# at a time, and collect them into a pandas DataFrame.
pagesToGet = 1
upperframe = []  # price tags accumulated across all pages

for page_number in range(1, pagesToGet + 1):
    print('processing page :', page_number)
    url = 'https://ingatlan.com/lista/elado+lakas+ii-ker?page=' + str(page_number)
    print(url)

    try:
        # NOTE(review): no request headers (e.g. User-Agent) are sent here.
        response = requests.get(url)
    except Exception:
        # Report the failing link and move on to the next page.
        error_type, _, error_info = sys.exc_info()
        print('ERROR FOR LINK:', url)
        print(error_type, 'Line:', error_info.tb_lineno)
        continue

    time.sleep(2)  # pause between consecutive requests
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): the class string ends with a space -- confirm that it
    # really matches the page markup.
    links = soup.find_all('div', attrs={'class': 'listing js-listing '})
    print(len(links))

    # The CSV is reopened in "w" mode for every page, and only the header
    # row is ever written to it.
    f = open("NEWS.csv", "w", encoding='utf-8')
    f.write("Price\n")

    # find() yields the matching Tag (or None) for each listing, not its text.
    upperframe.extend(
        listing.find("div", attrs={'class': 'price'}) for listing in links
    )

f.close()
data = pd.DataFrame(upperframe, columns=['Price'])
data.head()
我可能是哪里弄错了?这段代码在某些网站上有效,例如 Myprotein,但在另一些网站上无效。
这里只提取了你问到的价格。
如果请求中不带 User-Agent 请求头,服务器会返回 403 Forbidden 错误。
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the first results page of ingatlan.com and collect the listing
# prices into a pandas DataFrame.
start_url = "https://ingatlan.com/lista/elado+lakas+ii-ker?page=1"

# The site answers 403 Forbidden when no User-Agent header is sent.
# The timeout keeps the script from hanging forever on a stalled connection.
page_data = requests.get(
    start_url, headers={'User-Agent': 'XYZ/3.0'}, timeout=30
)
# Fail loudly on an HTTP error instead of silently parsing an error page
# and ending up with an empty DataFrame.
page_data.raise_for_status()
soup = BeautifulSoup(page_data.content, "html.parser")

prices = []
for content in soup.find_all("div", class_="resultspage__content"):
    for listing in content.find_all("div", class_="listing js-listing"):
        for container in listing.find_all(
            "div", class_="price__container js-has-sqm-price-info-tooltip"
        ):
            price_tag = container.find("div", class_="price")
            # Skip containers without a price <div> instead of crashing on
            # None.text.
            if price_tag is not None:
                prices.append(price_tag.text.strip())

data = pd.DataFrame(prices, columns=["price"])
print(data)
pandas DataFrame 的输出:
price
0 31.5 M Ft
1 77.9 M Ft
2 62 M Ft
3 129.5 M Ft
4 125 M Ft
5 95.9 M Ft
6 46.9 M Ft
7 45.9 M Ft
8 59.9 M Ft
9 109 M Ft
10 48 M Ft
11 87 M Ft