在 Python 中从 Indeed 抓取员工评分

Scrape the employee ratings from Indeed in Python

我是网络抓取的新手,我需要从 Indeed 抓取员工评分和评论,但我的代码无法正常运行。你能告诉我我的代码有什么问题吗?非常感谢您的帮助。

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Scrape the EY (Italy) review pages on Indeed: start=0, 20, ..., 120.

# Browser-like User-Agent sent with every request; it never changes, so it is
# built once instead of on every loop iteration.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and appending row by row copies the whole frame each time.
rows = []

for i in range(0, 140, 20):
    url = (f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}')
    page = requests.get(url, headers = header)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id" : 'cmp-container'})
    if results is None:
        # find() returns None when nothing matches; skip this page instead of
        # crashing on None.find_all(...).
        continue
    elems = results.find_all(class_='cmp-Review-container')
    for elem in elems:
        title = elem.find(attrs = {'class':'cmp-Review-title'})
        review = elem.find('div', {'class': 'cmp-Review-text'})
        author = elem.find(attrs = {'class':'cmp-Review-author'})
        rating = elem.find(attrs = {'class':'cmp-ReviewRating-text'})
        # A field that is missing from a review is stored as None rather than
        # raising AttributeError on `None.text`.
        rows.append({
            'review_title': title.text if title is not None else None,
            'review': review.text if review is not None else None,
            'author': author.text if author is not None else None,
            'rating': rating.text if rating is not None else None,
        })

# Passing `columns` keeps the four headers even when no review was found.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])

返回的结果只有表头(header),没有任何数据。

采纳 Parikh 的建议后,代码可以返回员工评价了,但没有显示员工状态(前任员工还是现任员工)。我该如何改进代码来获取员工状态?

# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd

# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021

# Meta(Facebook),
# Browser-like User-Agent sent with every request; built once because it does
# not depend on the page offset.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# One [title, author, review, rating] row per review block found.
lst=[]
for i in range(0, 460, 20):
    print(i)  # progress indicator: the `start` offset being fetched
    url = (f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}')
    page = requests.get(url, headers = header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div",attrs={"data-tn-section":"reviews"})
    for data in main_data:
        # Each field is extracted independently so that one missing element
        # only blanks that field (NaN) instead of dropping the whole review.
        try:
            title=data.find("h2").get_text(strip=True)
        except AttributeError:
            title=np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError covers
            # an author line that contains no "-" at all.
            author=data.find("span",attrs={"itemprop":"author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author=np.nan
        try:
            review=data.find("span",attrs={"itemprop":"reviewBody"}).get_text(strip=True)
        except AttributeError:
            review=np.nan
        try:
            # First word of the button's aria-label. A missing <button> makes
            # the subscript fail with TypeError (None[...]); a button without
            # the attribute raises KeyError.
            rating=data.find("div",attrs={"itemprop":"reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, KeyError, TypeError):
            rating=np.nan
        lst.append([title,author,review,rating])

df_meta=pd.DataFrame(data=lst,columns=['title','author','review','rating'])
df_meta

输出结果如下,我也想得到员工状态。非常感谢您的帮助。

再次感谢您的帮助和时间。我的最后一个问题是:我试图抓取评论中的优点和缺点(pros and cons),但结果只返回 NA。我该如何修改代码?

import numpy as np
# Scrape the Airbnb review pages on Indeed: start=0, 20, ..., 220.

# Browser-like User-Agent sent with every request; built once because it does
# not depend on the page offset.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# One [title, author, status, pros, cons, review, rating] row per review block.
lst=[]
for i in range(0, 240, 20):
    print(i)  # progress indicator: the `start` offset being fetched
    url = (f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}')
    page = requests.get(url, headers = header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div",attrs={"data-tn-section":"reviews"})
    for data in main_data:
        # Each field is extracted independently so that one missing element
        # only blanks that field (NaN) instead of dropping the whole review.
        try:
            title=data.find("h2").get_text(strip=True)
        except AttributeError:
            title=np.nan

        try:
            # Second "-"-separated piece of the author line. IndexError covers
            # an author line that contains no "-" at all.
            author=data.find("span",attrs={"itemprop":"author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author=np.nan

        try:
            # First "-"-separated piece of the same author line.
            status=data.find("span",attrs={"itemprop":"author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status=np.nan

        try:
            review=data.find("span",attrs={"itemprop":"reviewBody"}).get_text(strip=True)
        except AttributeError:
            review=np.nan

        # find() returns a single Tag (or None), not a list. Subscripting a Tag
        # is an HTML-attribute lookup, so the former `[0]` always raised and
        # the bare `except` silently turned every pros/cons value into NaN.
        # NOTE(review): the class names below are kept from the original code;
        # confirm they match the current page markup.
        try:
            pros=data.find('div',class_='cmp-review-pro-text').get_text(strip=True)
        except AttributeError:
            pros=np.nan
        try:
            cons=data.find('div',class_='cmp-review-con-text').get_text(strip=True)
        except AttributeError:
            cons=np.nan

        try:
            # First word of the button's aria-label. A missing <button> makes
            # the subscript fail with TypeError (None[...]); a button without
            # the attribute raises KeyError.
            rating=data.find("div",attrs={"itemprop":"reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, KeyError, TypeError):
            rating=np.nan

        lst.append([title,author,status,pros,cons,review,rating])

请先打印 main_data 并查看其内容,了解其中有哪些标签和数据;针对需要提取的各个字段,我还添加了 try-except 来处理缺失的情况。

import numpy as np
# Scrape the EY (Italy) review pages on Indeed: start=0, 20, ..., 120.

# Browser-like User-Agent sent with every request; built once because it does
# not depend on the page offset.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# One [title, author, status, review, rating] row per review block found.
lst=[]
for i in range(0, 140, 20):
    print(i)  # progress indicator: the `start` offset being fetched
    url = (f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}')
    page = requests.get(url, headers = header)
    soup = BeautifulSoup(page.content, 'lxml')
    # Search the page that was just parsed. The original queried `results`,
    # a name this snippet never assigns.
    main_data=soup.find_all("div",attrs={"data-tn-section":"reviews" })
    for data in main_data:
        # Each field is extracted independently so that one missing element
        # only blanks that field (NaN) instead of dropping the whole review.
        try:
            title=data.find("h2").get_text(strip=True)
        except AttributeError:
            title=np.nan
        try:
            # Second "-"-separated piece of the author line. IndexError covers
            # an author line that contains no "-" at all.
            author=data.find("span",attrs={"itemprop":"author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author=np.nan
        try:
            # First "-"-separated piece of the same author line.
            status=data.find("span",attrs={"itemprop":"author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status=np.nan

        try:
            review=data.find("span",attrs={"itemprop":"reviewBody"}).get_text(strip=True)
        except AttributeError:
            review=np.nan
        try:
            # First word of the button's aria-label. A missing <button> makes
            # the subscript fail with TypeError (None[...]); a button without
            # the attribute raises KeyError.
            rating=data.find("div",attrs={"itemprop":"reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, KeyError, TypeError):
            rating=np.nan
        lst.append([title,author,status,review,rating])

现在使用 lst 作为 DataFrame 中的数据

import pandas as pd
# Turn the scraped rows into a table; each inner list of `lst` supplies the
# values for the five columns in this order.
df = pd.DataFrame(
    lst,
    columns=[
        'title',
        'author',
        'status',
        'review',
        'rating',
    ],
)
# Bare expression: shows the frame when run in a notebook / REPL.
df

输出:

              title            author              status    review rating
0   good exerccise  Provincia di Milano, Lombardia  Senior Manager(Former Employee) working here can be challenging but helps buil...   3.0