用 Selenium 抓取完整 post Indeed
Scraping full post Indeed with Selenium
我正在尝试使 python 爬虫代码工作,但我做不到,一点帮助会很有用,我还是个初学者。代码运行正常,但它崩溃并将单个作业导出到我的 csv,我认为它是随机的并且不提供任何 error.Please,有更多经验的人可以帮助我一些 tips.Thanks提前。
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
options = webdriver.FirefoxOptions()
driver = webdriver.Firefox()
driver.maximize_window()
df = pd.DataFrame(columns=["Title","Location","Company","Salary","Sponsored","Description"])
for i in range(25):
driver.get('https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='+str(i))
jobs = []
driver.implicitly_wait(20)
for job in driver.find_elements_by_class_name('result'):
soup = BeautifulSoup(job.get_attribute('innerHTML'),'html.parser')
try:
title = soup.find("a",class_="jobtitle").text.replace("\n","").strip()
except:
title = 'None'
try:
location = soup.find(class_="location").text
except:
location = 'None'
try:
company = soup.find(class_="company").text.replace("\n","").strip()
except:
company = 'None'
try:
salary = soup.find(class_="salary").text.replace("\n","").strip()
except:
salary = 'None'
try:
sponsored = soup.find(class_="sponsoredGray").text
sponsored = "Sponsored"
except:
sponsored = "Organic"
sum_div = job.find_element_by_class_name('summary')
try:
sum_div.click()
except:
close_button = driver.find_elements_by_class_name('popover-x-button-close')[0]
close_button.click()
sum_div.click()
driver.implicitly_wait(2)
try:
job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
print(job_desc)
except:
job_desc = 'None'
df = df.append({'Title':title,'Location':location,"Company":company,"Salary":salary,
"Sponsored":sponsored,"Description":job_desc},ignore_index=True)
df.to_csv(r"C:\Users\Desktop\Python\Newtest.csv",index=False)
这似乎是一个简单的缩进问题。
您的部分代码 运行 在 for 循环之外。
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
df = pd.DataFrame(columns=["Title","Location","Company","Salary","Sponsored","Description"])
for i in range(0,50,10):
driver.get('https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='+str(i))
jobs = []
driver.implicitly_wait(20)
for job in driver.find_elements_by_class_name('result'):
soup = BeautifulSoup(job.get_attribute('innerHTML'),'html.parser')
try:
title = soup.find("a",class_="jobtitle").text.replace("\n","").strip()
except:
title = 'None'
try:
location = soup.find(class_="location").text
except:
location = 'None'
try:
company = soup.find(class_="company").text.replace("\n","").strip()
except:
company = 'None'
try:
salary = soup.find(class_="salary").text.replace("\n","").strip()
except:
salary = 'None'
try:
sponsored = soup.find(class_="sponsoredGray").text
sponsored = "Sponsored"
except:
sponsored = "Organic"
sum_div = job.find_element_by_class_name('summary')
try:
sum_div.click()
except:
close_button = driver.find_elements_by_class_name('popover-x-button-close')[0]
close_button.click()
sum_div.click()
driver.implicitly_wait(2)
try:
job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
print(job_desc)
except:
job_desc = 'None'
df = df.append({'Title':title,'Location':location,"Company":company,"Salary":salary,
"Sponsored":sponsored,"Description":job_desc},ignore_index=True)
df.to_csv("test.csv",index=False)
我使用 Chrome 而不是 Firefox,但我认为问题不存在。我刚刚正确地缩进了你的代码。
此外,在没有异常错误的情况下放置 except 也不是一个好主意。
Why is "except: pass" a bad programming practice?
我正在尝试使 python 爬虫代码工作,但我做不到,一点帮助会很有用,我还是个初学者。代码运行正常,但它崩溃并将单个作业导出到我的 csv,我认为它是随机的并且不提供任何 error.Please,有更多经验的人可以帮助我一些 tips.Thanks提前。
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
options = webdriver.FirefoxOptions()
driver = webdriver.Firefox()
driver.maximize_window()
df = pd.DataFrame(columns=["Title","Location","Company","Salary","Sponsored","Description"])
for i in range(25):
driver.get('https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='+str(i))
jobs = []
driver.implicitly_wait(20)
for job in driver.find_elements_by_class_name('result'):
soup = BeautifulSoup(job.get_attribute('innerHTML'),'html.parser')
try:
title = soup.find("a",class_="jobtitle").text.replace("\n","").strip()
except:
title = 'None'
try:
location = soup.find(class_="location").text
except:
location = 'None'
try:
company = soup.find(class_="company").text.replace("\n","").strip()
except:
company = 'None'
try:
salary = soup.find(class_="salary").text.replace("\n","").strip()
except:
salary = 'None'
try:
sponsored = soup.find(class_="sponsoredGray").text
sponsored = "Sponsored"
except:
sponsored = "Organic"
sum_div = job.find_element_by_class_name('summary')
try:
sum_div.click()
except:
close_button = driver.find_elements_by_class_name('popover-x-button-close')[0]
close_button.click()
sum_div.click()
driver.implicitly_wait(2)
try:
job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
print(job_desc)
except:
job_desc = 'None'
df = df.append({'Title':title,'Location':location,"Company":company,"Salary":salary,
"Sponsored":sponsored,"Description":job_desc},ignore_index=True)
df.to_csv(r"C:\Users\Desktop\Python\Newtest.csv",index=False)
这似乎是一个简单的缩进问题。 您的部分代码 运行 在 for 循环之外。
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
df = pd.DataFrame(columns=["Title","Location","Company","Salary","Sponsored","Description"])
for i in range(0,50,10):
driver.get('https://www.indeed.co.in/jobs?q=artificial%20intelligence&l=India&start='+str(i))
jobs = []
driver.implicitly_wait(20)
for job in driver.find_elements_by_class_name('result'):
soup = BeautifulSoup(job.get_attribute('innerHTML'),'html.parser')
try:
title = soup.find("a",class_="jobtitle").text.replace("\n","").strip()
except:
title = 'None'
try:
location = soup.find(class_="location").text
except:
location = 'None'
try:
company = soup.find(class_="company").text.replace("\n","").strip()
except:
company = 'None'
try:
salary = soup.find(class_="salary").text.replace("\n","").strip()
except:
salary = 'None'
try:
sponsored = soup.find(class_="sponsoredGray").text
sponsored = "Sponsored"
except:
sponsored = "Organic"
sum_div = job.find_element_by_class_name('summary')
try:
sum_div.click()
except:
close_button = driver.find_elements_by_class_name('popover-x-button-close')[0]
close_button.click()
sum_div.click()
driver.implicitly_wait(2)
try:
job_desc = driver.find_element_by_css_selector('div#vjs-desc').text
print(job_desc)
except:
job_desc = 'None'
df = df.append({'Title':title,'Location':location,"Company":company,"Salary":salary,
"Sponsored":sponsored,"Description":job_desc},ignore_index=True)
df.to_csv("test.csv",index=False)
我使用 Chrome 而不是 Firefox,但我认为问题不存在。我刚刚正确地缩进了你的代码。
此外,在没有异常错误的情况下放置 except 也不是一个好主意。 Why is "except: pass" a bad programming practice?