在 Python 中从 Indeed 抓取员工评分
Scrape the employee ratings from Indeed in Python
我是网络抓取的新手,需要从 Indeed 抓取员工评分和评论,但我的代码无法正常运行。您能告诉我我的代码哪里有问题吗?非常感谢您的帮助。
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Scrape the paginated EY review listing on Indeed into a DataFrame `df`
# with the columns review_title / review / author / rating.

# The request headers are identical for every page, so build them once.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

# Collect one dict per review and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every call.
rows = []
for i in range(0, 140, 20):
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id": 'cmp-container'})
    if results is None:
        # find() returns None when the container is absent; skip this page
        # instead of crashing on None.find_all.
        continue
    # NOTE(review): if these class names do not match the page markup,
    # `elems` is empty and `df` ends up with only its header row.
    elems = results.find_all(class_='cmp-Review-container')
    for elem in elems:
        title = elem.find(attrs={'class': 'cmp-Review-title'})
        review = elem.find('div', {'class': 'cmp-Review-text'})
        author = elem.find(attrs={'class': 'cmp-Review-author'})
        rating = elem.find(attrs={'class': 'cmp-ReviewRating-text'})
        # Any of the lookups may return None; store None for that field
        # rather than raising AttributeError on `.text`.
        rows.append({
            'review_title': title.text if title is not None else None,
            'review': review.text if review is not None else None,
            'author': author.text if author is not None else None,
            'rating': rating.text if rating is not None else None,
        })

# Passing `columns` keeps the four headers even when no review was found.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])
返回的结果只有表头(header),没有任何数据。
采纳 Parikh 的建议后,已经可以抓取到员工评价,但结果中没有显示员工状态(前任员工还是现任员工)。如何改进我的代码以获取员工状态?
# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd
# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021
# Meta(Facebook),
# Walks the paginated review listing and collects one
# [title, author, review, rating] row per review block, then builds `df_meta`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
# `start` advances in steps of 20 — presumably the page size; confirm.
for i in range(0, 460, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing, so one
        # incomplete review does not abort the whole run.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, review, rating])

df_meta = pd.DataFrame(data=lst, columns=['title', 'author', 'review', 'rating'])
# Bare expression: only has a visible effect in an interactive session.
df_meta
输出结果如下,我也想得到员工状态。非常感谢您的帮助。
再次感谢您的帮助和时间。我的最后一个问题是:我尝试抓取评论中的优点(pros)和缺点(cons),但结果只返回 NA。我该如何修改?
import numpy as np
# Scrape the paginated Airbnb review listing on Indeed and collect one
# [title, author, status, pros, cons, review, rating] row per review in `lst`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
for i in range(0, 240, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            # First "-"-separated piece of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        # find() returns a single Tag (or None), not a list. Indexing the
        # result with [0] always raised, and the bare `except` turned that
        # into NaN for every review. Call get_text() on the Tag directly.
        # NOTE(review): the class names below are taken from the original
        # script — confirm them against the live page markup.
        try:
            pros = data.find('div', class_='cmp-review-pro-text').get_text(strip=True)
        except AttributeError:
            pros = np.nan
        try:
            cons = data.find('div', class_='cmp-review-con-text').get_text(strip=True)
        except AttributeError:
            cons = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, status, pros, cons, review, rating])
请先打印并查看 `main_data`,了解其中包含哪些标签和数据;针对要提取的每一项数据,我还添加了 `try`/`except` 块,以便在某个标签缺失时用 NaN 填充。
import numpy as np
# Scrape the paginated EY review listing on Indeed and collect one
# [title, author, status, review, rating] row per review in `lst`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
for i in range(0, 140, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    # Search the freshly parsed page. The previous `results.find_all(...)`
    # used a name this script never assigns: a NameError when run on its
    # own, or a stale page re-read if `results` lingered from earlier code.
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            # First "-"-separated piece of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, status, review, rating])
现在使用 `lst` 作为 DataFrame 的数据:
import pandas as pd
# Turn the collected review rows into a table, one column per scraped field.
df = pd.DataFrame(
    lst,
    columns=['title', 'author', 'status', 'review', 'rating'],
)
# Bare expression: only has a visible effect in an interactive session.
df
输出:
            title                          author                           status                                             review rating
0  good exerccise  Provincia di Milano, Lombardia  Senior Manager(Former Employee)  working here can be challenging but helps buil...    3.0
我是网络抓取的新手,需要从 Indeed 抓取员工评分和评论,但我的代码无法正常运行。您能告诉我我的代码哪里有问题吗?非常感谢您的帮助。
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Scrape the paginated EY review listing on Indeed into a DataFrame `df`
# with the columns review_title / review / author / rating.

# The request headers are identical for every page, so build them once.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

# Collect one dict per review and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every call.
rows = []
for i in range(0, 140, 20):
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id": 'cmp-container'})
    if results is None:
        # find() returns None when the container is absent; skip this page
        # instead of crashing on None.find_all.
        continue
    # NOTE(review): if these class names do not match the page markup,
    # `elems` is empty and `df` ends up with only its header row.
    elems = results.find_all(class_='cmp-Review-container')
    for elem in elems:
        title = elem.find(attrs={'class': 'cmp-Review-title'})
        review = elem.find('div', {'class': 'cmp-Review-text'})
        author = elem.find(attrs={'class': 'cmp-Review-author'})
        rating = elem.find(attrs={'class': 'cmp-ReviewRating-text'})
        # Any of the lookups may return None; store None for that field
        # rather than raising AttributeError on `.text`.
        rows.append({
            'review_title': title.text if title is not None else None,
            'review': review.text if review is not None else None,
            'author': author.text if author is not None else None,
            'rating': rating.text if rating is not None else None,
        })

# Passing `columns` keeps the four headers even when no review was found.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])
返回的结果只有表头(header),没有任何数据。
采纳 Parikh 的建议后,已经可以抓取到员工评价,但结果中没有显示员工状态(前任员工还是现任员工)。如何改进我的代码以获取员工状态?
# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd
# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021
# Meta(Facebook),
# Walks the paginated review listing and collects one
# [title, author, review, rating] row per review block, then builds `df_meta`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
# `start` advances in steps of 20 — presumably the page size; confirm.
for i in range(0, 460, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing, so one
        # incomplete review does not abort the whole run.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, review, rating])

df_meta = pd.DataFrame(data=lst, columns=['title', 'author', 'review', 'rating'])
# Bare expression: only has a visible effect in an interactive session.
df_meta
输出结果如下,我也想得到员工状态。非常感谢您的帮助。
再次感谢您的帮助和时间。我的最后一个问题是:我尝试抓取评论中的优点(pros)和缺点(cons),但结果只返回 NA。我该如何修改?
import numpy as np
# Scrape the paginated Airbnb review listing on Indeed and collect one
# [title, author, status, pros, cons, review, rating] row per review in `lst`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
for i in range(0, 240, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            # First "-"-separated piece of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        # find() returns a single Tag (or None), not a list. Indexing the
        # result with [0] always raised, and the bare `except` turned that
        # into NaN for every review. Call get_text() on the Tag directly.
        # NOTE(review): the class names below are taken from the original
        # script — confirm them against the live page markup.
        try:
            pros = data.find('div', class_='cmp-review-pro-text').get_text(strip=True)
        except AttributeError:
            pros = np.nan
        try:
            cons = data.find('div', class_='cmp-review-con-text').get_text(strip=True)
        except AttributeError:
            cons = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, status, pros, cons, review, rating])
请先打印并查看 `main_data`,了解其中包含哪些标签和数据;针对要提取的每一项数据,我还添加了 `try`/`except` 块,以便在某个标签缺失时用 NaN 填充。
import numpy as np
# Scrape the paginated EY review listing on Indeed and collect one
# [title, author, status, review, rating] row per review in `lst`.

# The request headers never change, so build them once outside the loop.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    )
}

lst = []
for i in range(0, 140, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, headers=header, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    # Search the freshly parsed page. The previous `results.find_all(...)`
    # used a name this script never assigns: a NameError when run on its
    # own, or a stale page re-read if `results` lingered from earlier code.
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field falls back to NaN when its tag is missing.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError
            # covers a line that contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            # First "-"-separated piece of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First space-separated token of the button's aria-label.
            # TypeError: no <button> found (None[...]); KeyError: the
            # button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, status, review, rating])
现在使用 `lst` 作为 DataFrame 的数据:
import pandas as pd
# Turn the collected review rows into a table, one column per scraped field.
df = pd.DataFrame(
    lst,
    columns=['title', 'author', 'status', 'review', 'rating'],
)
# Bare expression: only has a visible effect in an interactive session.
df
输出:
            title                          author                           status                                             review rating
0  good exerccise  Provincia di Milano, Lombardia  Senior Manager(Former Employee)  working here can be challenging but helps buil...    3.0