用 Selenium 抓取 Indeed 上的完整职位帖子

Scraping full Indeed job posts with Selenium

我正在尝试让这段 Python 爬虫代码正常工作,但一直没有成功;我还是个初学者,一点帮助都会很有用。代码能够运行,但会中途崩溃,并且只把一个职位导出到我的 CSV 文件中;我觉得这是随机发生的,而且没有给出任何错误信息。请有更多经验的人给我一些提示,提前致谢。

from selenium import webdriver
import pandas as pd 
from bs4 import BeautifulSoup

# Scrape Indeed search results with Selenium and export them to a CSV file.
# NOTE: this is the script as posted in the question; the indentation of the
# second half is left untouched (see the note further down).

# `options` is created here but is not passed to webdriver.Firefox() below.
options = webdriver.FirefoxOptions()
driver = webdriver.Firefox()
driver.maximize_window()


# Accumulator for the scraped rows, one column per extracted field.
df = pd.DataFrame(columns=["Title","Location","Company","Salary","Sponsored","Description"])

# Requests the results page 25 times with start=0, 1, ..., 24.
for i in range(25):
    driver.get('https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='+str(i))
    # `jobs` is never read anywhere in this script.
    jobs = []
    # Element lookups from here on wait up to 20 seconds before failing.
    driver.implicitly_wait(20)
    

    # Each element with class "result" is treated as one job card.
    for job in driver.find_elements_by_class_name('result'):

        # Parse the card's inner HTML so the fields can be read with BeautifulSoup.
        soup = BeautifulSoup(job.get_attribute('innerHTML'),'html.parser')
        
        # Every field below falls back to a placeholder if ANY exception is
        # raised during the lookup (bare `except`), so failures are silent.
        try:
            title = soup.find("a",class_="jobtitle").text.replace("\n","").strip()
            
        except:
            title = 'None'

        try:
            location = soup.find(class_="location").text
        except:
            location = 'None'

        try:
            company = soup.find(class_="company").text.replace("\n","").strip()
        except:
            company = 'None'

        try:
            salary = soup.find(class_="salary").text.replace("\n","").strip()
        except:
            salary = 'None'

        try:
            # The text read here is only an existence check; it is overwritten
            # on the next line.
            sponsored = soup.find(class_="sponsoredGray").text
            sponsored = "Sponsored"
        except:
            sponsored = "Organic"
                
        
# NOTE: everything from here down to the df.append(...) call sits at column 0,
# i.e. OUTSIDE both `for` loops. It therefore runs exactly once, after all
# pages have been visited, using whatever `job`, `title`, `location`, ... were
# left behind by the last loop iteration. That is why the CSV ends up with a
# single row.
sum_div = job.find_element_by_class_name('summary')

# Click the summary; if the click raises, click the first
# 'popover-x-button-close' element and retry the click once.
try:    
              sum_div.click()
except:
             close_button = driver.find_elements_by_class_name('popover-x-button-close')[0]
             close_button.click()
             sum_div.click()            
# Lowers the implicit wait to 2 seconds for the lookups that follow.
driver.implicitly_wait(2)
try:            
    job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
    print(job_desc)
except:
    job_desc = 'None'   

# Appends one row built from the variables above (reached only once, see NOTE).
df = df.append({'Title':title,'Location':location,"Company":company,"Salary":salary,
                        "Sponsored":sponsored,"Description":job_desc},ignore_index=True)


# Write the collected rows to CSV without the index column.
df.to_csv(r"C:\Users\Desktop\Python\Newtest.csv",index=False)

这似乎是一个简单的缩进问题:您的部分代码在 for 循环之外运行。因此,点击职位、读取描述以及向 DataFrame 追加一行的操作,只会在循环全部结束后对最后一个职位执行一次。

from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Search-results URL; the paging offset is appended to `start=` per request.
SEARCH_URL = 'https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='
# Column order of the exported CSV.
COLUMNS = ["Title", "Location", "Company", "Salary", "Sponsored", "Description"]

options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copied the whole frame on every call and no longer
# exists in pandas 2.x.
rows = []

try:
    # Offsets 0, 10, 20, 30, 40.
    for i in range(0, 50, 10):
        driver.get(SEARCH_URL + str(i))
        # Wait up to 20 s for the result cards of a freshly loaded page.
        driver.implicitly_wait(20)

        for job in driver.find_elements_by_class_name('result'):
            soup = BeautifulSoup(job.get_attribute('innerHTML'), 'html.parser')

            # A missing element makes the `.text` access raise AttributeError;
            # only that case is mapped to the 'None' placeholder, so unrelated
            # errors (and Ctrl-C) are no longer silently swallowed.
            try:
                title = soup.find("a", class_="jobtitle").text.replace("\n", "").strip()
            except AttributeError:
                title = 'None'

            try:
                location = soup.find(class_="location").text
            except AttributeError:
                location = 'None'

            try:
                company = soup.find(class_="company").text.replace("\n", "").strip()
            except AttributeError:
                company = 'None'

            try:
                salary = soup.find(class_="salary").text.replace("\n", "").strip()
            except AttributeError:
                salary = 'None'

            # Only the presence of the marker element matters, not its text.
            if soup.find(class_="sponsoredGray") is not None:
                sponsored = "Sponsored"
            else:
                sponsored = "Organic"

            sum_div = job.find_element_by_class_name('summary')

            try:
                sum_div.click()
            except WebDriverException:
                # The click failed; close a popover if one is present (the
                # list may be empty, so do not index it blindly) and retry.
                close_buttons = driver.find_elements_by_class_name('popover-x-button-close')
                if close_buttons:
                    close_buttons[0].click()
                sum_div.click()

            # Shorter wait for the description lookup; this setting stays in
            # effect until the next page load resets it to 20 s.
            driver.implicitly_wait(2)
            try:
                job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
            except WebDriverException:
                job_desc = 'None'
            else:
                print(job_desc)

            rows.append({'Title': title, 'Location': location, "Company": company,
                         "Salary": salary, "Sponsored": sponsored,
                         "Description": job_desc})
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()

df = pd.DataFrame(rows, columns=COLUMNS)
df.to_csv("test.csv", index=False)

我使用的是 Chrome 而不是 Firefox,但我认为问题不在这里。我只是把你的代码正确地缩进了。

此外,使用不指定异常类型的裸 except 也不是一个好主意。参见:Why is "except: pass" a bad programming practice?