为什么我在 glints.com 中抓取时无法发布工作
Why i can't get the job posted while scraping in glints.com
我想获取 glints.com 中职位详情页内的“职位发布时间”元素,但奇怪的是抓取不到,即使类名是相同的,我也不知道为什么无法拿到它。但是,在职位列表项外层我却可以得到它。
也许有人对此有所了解。谢谢。
这是代码
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: `get_link` was defined *inside* the loop, re-creating the function on
# every iteration — hoisted out so it is defined once. None-checks are made
# consistent (`is not None`). Behavior is otherwise unchanged.
def get_link(tag):
    """Return the absolute URL of the job-detail page linked from a job card."""
    link = tag.find('div', class_=re.compile('JobCardsc__JobCardWrapper-sc-1f9hdu8-1 elhnMG')).find('a', class_=re.compile('CompactOpportunityCardsc__CardAnchorWrapper-sc-1y4v110-18 fKmNmg job-search-results_job-card_link')).get('href')
    return 'https://glints.com' + link

# `soup` is the already-parsed search-results page (built elsewhere in the script).
tags = soup.find_all('div', class_=re.compile(r'JobCardsc__JobcardContainer-sc-1f9hdu8-0 RBKNv CompactOpportunityCardsc__CompactJobCardWrapper-sc-1y4v110-0 cqpitP compact_job_card'))
for tag in tags:
    # Fetch and parse the individual job-detail page for this card.
    link_res = requests.get(get_link(tag))
    link_soup = BeautifulSoup(link_res.text, 'lxml')
    container = link_soup.find('main', class_=re.compile('Opportunitysc__Main-sc-1gsvee3-3 cOQpRQ'))

    job_name_tag = container.find('div', class_=re.compile('TopFoldsc__JobOverViewContainer-kklg8i-2 hCbpbU'))
    if job_name_tag is not None:
        job_name = job_name_tag.find('h1', class_=re.compile('TopFoldsc__JobOverViewTitle-kklg8i-3 gaBsxq')).text.strip()
    else:
        job_name = 'None'

    # "job posted" element inside the job-detail page.
    # NOTE(review): these hashed class suffixes (e.g. `cyqTAm`) are generated
    # per build and may differ between pages — presumably why the lookup fails.
    tag_posted = container.find('div', class_=re.compile('TopFoldsc__JobOverViewInfoContainer-kklg8i-8 cyqTAm')).find('div', class_=re.compile('TopFoldsc__JobOverViewTime-kklg8i-10 iDOZMJ'))
    job_posted = tag_posted.find('span', class_=re.compile('TopFoldsc__UpdatedAt-kklg8i-12 fSjRSR'))
    if job_posted is not None:
        job_posted = job_posted.text
    else:
        job_posted = 'None'

    # "job posted" element on the job card in the search results (outside the item).
    tag_post = tag.find('div', class_=re.compile('CompactOpportunityCardsc__OpportunityMeta-sc-1y4v110-14 eSReDz'))
    job_post = tag_post.find('span', class_=re.compile('CompactOpportunityCardsc__UpdatedAtMessage-sc-1y4v110-17 hrxARV'))
    if job_post is not None:
        job_post = job_post.text
    else:
        job_post = 'None'

    data = [job_name, job_posted, job_post]
    print(data)
您正在搜索的类名可能会随着每次请求而变化(带哈希后缀的部分是构建时生成的)。我建议您在编写选择器时尽可能简化要匹配的类名,只保留稳定的前缀。例如:
from bs4 import BeautifulSoup
import requests
import csv
import re
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: `get_link` hoisted out of the loop (it was re-defined every iteration),
# and the `job_posted` check made `is not None` for consistency with the other
# branches. Class regexes are trimmed to their stable prefixes on purpose —
# the hashed suffixes change between builds.
req = requests.get("https://glints.com/opportunities/jobs/explore?keyword=software&country=SG&searchCity=127215&locationName=Singapore")
soup = BeautifulSoup(req.content, 'lxml')


def get_link(tag):
    """Return the absolute URL of the job-detail page linked from a job card."""
    link = tag.find('div', class_=re.compile('JobCardsc__JobCardWrapper')).find('a', class_=re.compile('CompactOpportunityCardsc__CardAnchorWrapper')).get('href')
    return 'https://glints.com' + link


tags = soup.find_all('div', class_=re.compile(r'JobCardsc__JobcardContainer'))
for tag in tags:
    # Fetch and parse the individual job-detail page for this card.
    link_res = requests.get(get_link(tag))
    link_soup = BeautifulSoup(link_res.text, 'lxml')
    container = link_soup.find('main', class_=re.compile('Opportunitysc__Main'))

    job_name_tag = container.find('div', class_=re.compile('JobOverViewContainer'))
    if job_name_tag is not None:
        job_name = job_name_tag.find('h1', class_=re.compile('JobOverViewTitle')).text.strip()
    else:
        job_name = 'None'

    # "job posted" element inside the job-detail page.
    tag_posted = container.find('div', class_=re.compile('JobOverViewInfoContainer')).find('div', class_=re.compile('JobOverView'))
    job_posted = tag_posted.find('span', class_=re.compile('TopFoldsc'))
    if job_posted is not None:
        job_posted = job_posted.text
    else:
        job_posted = 'None'

    # "job posted" element on the job card in the search results (outside the item).
    tag_post = tag.find('div', class_=re.compile('CompactOpportunityCardsc'))
    job_post = tag_post.find('span', class_=re.compile('CompactOpportunityCardsc'))
    if job_post is not None:
        job_post = job_post.text
    else:
        job_post = 'None'

    data = [job_name, job_posted, job_post]
    print(data)
此搜索给出的结果开始于:
['Software Architect', 'None', 'PSA International Pte Ltd']
['Software Engineer (Logistics)', 'SGD10,000 - 14,000/month', 'Confidential Company']
['Software Developer', 'SGD3,500 - 6,000/month', 'ANOTECH Singapore']
['Software Engineer', 'SGD2,000 - 3,500/month', 'Presidium Instruments Pte Ltd']
信息(和上市日期)可以从页面中包含的 JSON 中获得。
我强烈建议研究 data
以了解所有数据的排列方式。条目具有引用其他条目的 ID,因此它不是一个简单的列表:
from bs4 import BeautifulSoup
import requests
import csv
import re
import json
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: removed the unused `tags = soup.find_all(...)` line — this version
# reads everything from the embedded JSON, so no HTML cards (and no per-job
# requests) are needed.
req = requests.get("https://glints.com/opportunities/jobs/explore?keyword=software&country=SG&searchCity=127215&locationName=Singapore")
soup = BeautifulSoup(req.content, 'lxml')

# The page embeds its full state as JSON in the Next.js __NEXT_DATA__ script
# tag; Apollo normalizes entities into a flat cache keyed by ID, with
# "__ref" pointers between entries — it is not a simple list.
data = json.loads(soup.find('script', id="__NEXT_DATA__").string)
apollo_cache = data['props']['apolloCache']

for key, value in apollo_cache['ROOT_QUERY'].items():
    if key.startswith('searchJobs'):
        for job_ref in value['jobsInPage']:
            job = apollo_cache[job_ref['__ref']]  # dereference the job entity
            if job['salaries']:
                # Salaries are themselves refs into the cache.
                s = apollo_cache[job['salaries'][0]['__ref']]
                salary = f"{s['CurrencyCode']}{s['minAmount']} - {s['maxAmount']} / {s['salaryMode']}"
            else:
                salary = 'Not given'
            company = apollo_cache[job['company']['__ref']]
            print([job['title'], job['createdAt'], company['name'], salary])
这将使您的输出开始:
['Software Architect', '2022-02-03T09:05:35.552Z', 'PSA International Pte Ltd', 'Not given']
['Software Engineer (Logistics)', '2022-03-03T07:43:21.037Z', '', 'SGD10000 - 14000 / MONTH']
['Software Developer', '2022-04-26T02:31:03.930Z', 'ANOTECH Singapore', 'SGD3500 - 6000 / MONTH']
['Software Engineer', '2022-03-24T02:34:52.322Z', 'Presidium Instruments Pte Ltd', 'SGD2000 - 3500 / MONTH']
['Software Engineer ', '2022-04-04T08:07:58.950Z', 'Tradex Systems Pte Ltd', 'SGD4500 - 6500 / MONTH']
我想获取 glints.com 中职位详情页内的“职位发布时间”元素,但奇怪的是抓取不到,即使类名是相同的,我也不知道为什么无法拿到它。但是,在职位列表项外层我却可以得到它。
也许有人对此有所了解。谢谢。
这是代码
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: `get_link` was defined *inside* the loop, re-creating the function on
# every iteration — hoisted out so it is defined once. None-checks are made
# consistent (`is not None`). Behavior is otherwise unchanged.
def get_link(tag):
    """Return the absolute URL of the job-detail page linked from a job card."""
    link = tag.find('div', class_=re.compile('JobCardsc__JobCardWrapper-sc-1f9hdu8-1 elhnMG')).find('a', class_=re.compile('CompactOpportunityCardsc__CardAnchorWrapper-sc-1y4v110-18 fKmNmg job-search-results_job-card_link')).get('href')
    return 'https://glints.com' + link

# `soup` is the already-parsed search-results page (built elsewhere in the script).
tags = soup.find_all('div', class_=re.compile(r'JobCardsc__JobcardContainer-sc-1f9hdu8-0 RBKNv CompactOpportunityCardsc__CompactJobCardWrapper-sc-1y4v110-0 cqpitP compact_job_card'))
for tag in tags:
    # Fetch and parse the individual job-detail page for this card.
    link_res = requests.get(get_link(tag))
    link_soup = BeautifulSoup(link_res.text, 'lxml')
    container = link_soup.find('main', class_=re.compile('Opportunitysc__Main-sc-1gsvee3-3 cOQpRQ'))

    job_name_tag = container.find('div', class_=re.compile('TopFoldsc__JobOverViewContainer-kklg8i-2 hCbpbU'))
    if job_name_tag is not None:
        job_name = job_name_tag.find('h1', class_=re.compile('TopFoldsc__JobOverViewTitle-kklg8i-3 gaBsxq')).text.strip()
    else:
        job_name = 'None'

    # "job posted" element inside the job-detail page.
    # NOTE(review): these hashed class suffixes (e.g. `cyqTAm`) are generated
    # per build and may differ between pages — presumably why the lookup fails.
    tag_posted = container.find('div', class_=re.compile('TopFoldsc__JobOverViewInfoContainer-kklg8i-8 cyqTAm')).find('div', class_=re.compile('TopFoldsc__JobOverViewTime-kklg8i-10 iDOZMJ'))
    job_posted = tag_posted.find('span', class_=re.compile('TopFoldsc__UpdatedAt-kklg8i-12 fSjRSR'))
    if job_posted is not None:
        job_posted = job_posted.text
    else:
        job_posted = 'None'

    # "job posted" element on the job card in the search results (outside the item).
    tag_post = tag.find('div', class_=re.compile('CompactOpportunityCardsc__OpportunityMeta-sc-1y4v110-14 eSReDz'))
    job_post = tag_post.find('span', class_=re.compile('CompactOpportunityCardsc__UpdatedAtMessage-sc-1y4v110-17 hrxARV'))
    if job_post is not None:
        job_post = job_post.text
    else:
        job_post = 'None'

    data = [job_name, job_posted, job_post]
    print(data)
您正在搜索的类名可能会随着每次请求而变化(带哈希后缀的部分是构建时生成的)。我建议您在编写选择器时尽可能简化要匹配的类名,只保留稳定的前缀。例如:
from bs4 import BeautifulSoup
import requests
import csv
import re
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: `get_link` hoisted out of the loop (it was re-defined every iteration),
# and the `job_posted` check made `is not None` for consistency with the other
# branches. Class regexes are trimmed to their stable prefixes on purpose —
# the hashed suffixes change between builds.
req = requests.get("https://glints.com/opportunities/jobs/explore?keyword=software&country=SG&searchCity=127215&locationName=Singapore")
soup = BeautifulSoup(req.content, 'lxml')


def get_link(tag):
    """Return the absolute URL of the job-detail page linked from a job card."""
    link = tag.find('div', class_=re.compile('JobCardsc__JobCardWrapper')).find('a', class_=re.compile('CompactOpportunityCardsc__CardAnchorWrapper')).get('href')
    return 'https://glints.com' + link


tags = soup.find_all('div', class_=re.compile(r'JobCardsc__JobcardContainer'))
for tag in tags:
    # Fetch and parse the individual job-detail page for this card.
    link_res = requests.get(get_link(tag))
    link_soup = BeautifulSoup(link_res.text, 'lxml')
    container = link_soup.find('main', class_=re.compile('Opportunitysc__Main'))

    job_name_tag = container.find('div', class_=re.compile('JobOverViewContainer'))
    if job_name_tag is not None:
        job_name = job_name_tag.find('h1', class_=re.compile('JobOverViewTitle')).text.strip()
    else:
        job_name = 'None'

    # "job posted" element inside the job-detail page.
    tag_posted = container.find('div', class_=re.compile('JobOverViewInfoContainer')).find('div', class_=re.compile('JobOverView'))
    job_posted = tag_posted.find('span', class_=re.compile('TopFoldsc'))
    if job_posted is not None:
        job_posted = job_posted.text
    else:
        job_posted = 'None'

    # "job posted" element on the job card in the search results (outside the item).
    tag_post = tag.find('div', class_=re.compile('CompactOpportunityCardsc'))
    job_post = tag_post.find('span', class_=re.compile('CompactOpportunityCardsc'))
    if job_post is not None:
        job_post = job_post.text
    else:
        job_post = 'None'

    data = [job_name, job_posted, job_post]
    print(data)
此搜索给出的结果开始于:
['Software Architect', 'None', 'PSA International Pte Ltd']
['Software Engineer (Logistics)', 'SGD10,000 - 14,000/month', 'Confidential Company']
['Software Developer', 'SGD3,500 - 6,000/month', 'ANOTECH Singapore']
['Software Engineer', 'SGD2,000 - 3,500/month', 'Presidium Instruments Pte Ltd']
信息(和上市日期)可以从页面中包含的 JSON 中获得。
我强烈建议研究 data
以了解所有数据的排列方式。条目具有引用其他条目的 ID,因此它不是一个简单的列表:
from bs4 import BeautifulSoup
import requests
import csv
import re
import json
# NOTE(review): the original paste lost all indentation; restored here.
# Fix: removed the unused `tags = soup.find_all(...)` line — this version
# reads everything from the embedded JSON, so no HTML cards (and no per-job
# requests) are needed.
req = requests.get("https://glints.com/opportunities/jobs/explore?keyword=software&country=SG&searchCity=127215&locationName=Singapore")
soup = BeautifulSoup(req.content, 'lxml')

# The page embeds its full state as JSON in the Next.js __NEXT_DATA__ script
# tag; Apollo normalizes entities into a flat cache keyed by ID, with
# "__ref" pointers between entries — it is not a simple list.
data = json.loads(soup.find('script', id="__NEXT_DATA__").string)
apollo_cache = data['props']['apolloCache']

for key, value in apollo_cache['ROOT_QUERY'].items():
    if key.startswith('searchJobs'):
        for job_ref in value['jobsInPage']:
            job = apollo_cache[job_ref['__ref']]  # dereference the job entity
            if job['salaries']:
                # Salaries are themselves refs into the cache.
                s = apollo_cache[job['salaries'][0]['__ref']]
                salary = f"{s['CurrencyCode']}{s['minAmount']} - {s['maxAmount']} / {s['salaryMode']}"
            else:
                salary = 'Not given'
            company = apollo_cache[job['company']['__ref']]
            print([job['title'], job['createdAt'], company['name'], salary])
这将使您的输出开始:
['Software Architect', '2022-02-03T09:05:35.552Z', 'PSA International Pte Ltd', 'Not given']
['Software Engineer (Logistics)', '2022-03-03T07:43:21.037Z', '', 'SGD10000 - 14000 / MONTH']
['Software Developer', '2022-04-26T02:31:03.930Z', 'ANOTECH Singapore', 'SGD3500 - 6000 / MONTH']
['Software Engineer', '2022-03-24T02:34:52.322Z', 'Presidium Instruments Pte Ltd', 'SGD2000 - 3500 / MONTH']
['Software Engineer ', '2022-04-04T08:07:58.950Z', 'Tradex Systems Pte Ltd', 'SGD4500 - 6500 / MONTH']