如何使用 python 网络抓取从职位发布中提取多个职位 url
How to extract multiple job URLs from a job post using Python web scraping
到目前为止,我已经创建了以下代码来提取所有信息,但我不知道如何提取 link。我试过使用 for 循环,但我得到了不同的 links。我真的希望有人能给我指出正确的方向。
def extract(page, tag):
    # Build the search-results URL for the given page number and query tag.
    # NOTE(review): the literal spaces in "page ={page} &q ={tag}" make the
    # query string malformed — the site will not receive the intended
    # page/q parameters.
    url = f"https://www.jobindex.dk/jobsoegning?page ={page} &q ={tag}"
    # NOTE(review): `headers` is not defined anywhere in this scope, and
    # requests.get's second positional argument is `params`, not `headers` —
    # it should be requests.get(url, headers=headers).
    r = requests.get(url, headers)
    # Decode the response explicitly as UTF-8 and parse it with bs4.
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    return soup
def transform(soup):
    # Collect one dict per search-result card into the module-level joblist.
    divs = soup.find_all("div", class_="jobsearch-result")
    for item in divs:
        # First <b> is the job title, second is the company name.
        title = item.find_all("b")[0].text.strip()
        company = item.find_all("b")[1].text.strip()
        published_date = item.find("time").text.strip()
        # Second <p> is the summary, first <p> is the location.
        summary = item.find_all("p")[1].text.strip()
        job_location = item.find_all("p")[0].text.strip()
        # NOTE(review): find_all("href") searches for <href> *elements*, which
        # do not exist — `href` is an attribute of <a> tags — so this is
        # always an empty list. This is the bug the question asks about.
        job_url = item.find_all("href")
        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)
    return
您可以将 attribute*=value（contains）CSS 选择器结合使用，通过子字符串来定位 data-click 属性。再在该选择器后添加 :has(> b)，要求具有匹配 data-click 属性的元素必须有一个直接子 b 标签，从而把匹配限制为带有粗体职位标题的那些链接：
[data-click*="u="]:has(> b)
import requests
from bs4 import BeautifulSoup
def extract(page, tag):
    """Fetch one page of jobindex.dk search results and return parsed soup.

    page: 1-based result-page number inserted into the query string.
    tag:  search keyword, e.g. "python".
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://www.jobindex.dk/jobsoegning?page={page}&q={tag}"
    # BUG FIX: requests.get's second positional parameter is `params`, not
    # `headers`, so the original call never sent the User-Agent header.
    # A timeout keeps the scraper from hanging forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    return soup
def transform(soup):
    """Extract job postings from a parsed results page into `joblist`.

    Appends one dict per <div class="jobsearch-result"> card to the
    module-level `joblist`. Returns None.
    """
    divs = soup.find_all("div", class_="jobsearch-result")
    for item in divs:
        # Hoist the repeated queries: first <b> is the title, second the
        # company; first <p> is the location, second the summary.
        bold = item.find_all("b")
        paragraphs = item.find_all("p")
        title = bold[0].text.strip()
        company = bold[1].text.strip()
        published_date = item.find("time").text.strip()
        summary = paragraphs[1].text.strip()
        job_location = paragraphs[0].text.strip()
        # The posting link is the anchor whose data-click attribute carries a
        # "u=" redirect parameter and whose direct child is the bold title.
        link = item.select_one('[data-click*="u="]:has(> b)')
        # BUG FIX: select_one returns None when no anchor matches, and
        # None['href'] raises TypeError — guard instead of crashing.
        job_url = link['href'] if link is not None else None
        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)
    return
# Accumulator that transform() appends scraped postings to.
joblist = []

# Scrape page 1 of the "python" search and print everything collected.
page_soup = extract(1, "python")
transform(page_soup)
print(joblist)
到目前为止,我已经创建了以下代码来提取所有信息,但我不知道如何提取 link。我试过使用 for 循环,但我得到了不同的 links。我真的希望有人能给我指出正确的方向。
def extract(page, tag):
    # Build the search-results URL for the given page number and query tag.
    # NOTE(review): the literal spaces in "page ={page} &q ={tag}" make the
    # query string malformed — the site will not receive the intended
    # page/q parameters.
    url = f"https://www.jobindex.dk/jobsoegning?page ={page} &q ={tag}"
    # NOTE(review): `headers` is not defined anywhere in this scope, and
    # requests.get's second positional argument is `params`, not `headers` —
    # it should be requests.get(url, headers=headers).
    r = requests.get(url, headers)
    # Decode the response explicitly as UTF-8 and parse it with bs4.
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    return soup
def transform(soup):
    # Collect one dict per search-result card into the module-level joblist.
    divs = soup.find_all("div", class_="jobsearch-result")
    for item in divs:
        # First <b> is the job title, second is the company name.
        title = item.find_all("b")[0].text.strip()
        company = item.find_all("b")[1].text.strip()
        published_date = item.find("time").text.strip()
        # Second <p> is the summary, first <p> is the location.
        summary = item.find_all("p")[1].text.strip()
        job_location = item.find_all("p")[0].text.strip()
        # NOTE(review): find_all("href") searches for <href> *elements*, which
        # do not exist — `href` is an attribute of <a> tags — so this is
        # always an empty list. This is the bug the question asks about.
        job_url = item.find_all("href")
        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)
    return
您可以将 attribute*=value（contains）CSS 选择器结合使用，通过子字符串来定位 data-click 属性。再在该选择器后添加 :has(> b)，要求具有匹配 data-click 属性的元素必须有一个直接子 b 标签，从而把匹配限制为带有粗体职位标题的那些链接：
[data-click*="u="]:has(> b)
import requests
from bs4 import BeautifulSoup
def extract(page, tag):
    """Fetch one page of jobindex.dk search results and return parsed soup.

    page: 1-based result-page number inserted into the query string.
    tag:  search keyword, e.g. "python".
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://www.jobindex.dk/jobsoegning?page={page}&q={tag}"
    # BUG FIX: requests.get's second positional parameter is `params`, not
    # `headers`, so the original call never sent the User-Agent header.
    # A timeout keeps the scraper from hanging forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    return soup
def transform(soup):
    """Extract job postings from a parsed results page into `joblist`.

    Appends one dict per <div class="jobsearch-result"> card to the
    module-level `joblist`. Returns None.
    """
    divs = soup.find_all("div", class_="jobsearch-result")
    for item in divs:
        # Hoist the repeated queries: first <b> is the title, second the
        # company; first <p> is the location, second the summary.
        bold = item.find_all("b")
        paragraphs = item.find_all("p")
        title = bold[0].text.strip()
        company = bold[1].text.strip()
        published_date = item.find("time").text.strip()
        summary = paragraphs[1].text.strip()
        job_location = paragraphs[0].text.strip()
        # The posting link is the anchor whose data-click attribute carries a
        # "u=" redirect parameter and whose direct child is the bold title.
        link = item.select_one('[data-click*="u="]:has(> b)')
        # BUG FIX: select_one returns None when no anchor matches, and
        # None['href'] raises TypeError — guard instead of crashing.
        job_url = link['href'] if link is not None else None
        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)
    return
# Accumulator that transform() appends scraped postings to.
joblist = []

# Scrape page 1 of the "python" search and print everything collected.
page_soup = extract(1, "python")
transform(page_soup)
print(joblist)