网页抓取返回一个全为 None 的列表
web scraping returns list of None
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
# Scrape the Wuzzuf search results for "python", visit every job page to look
# for a salary element, and export everything that was collected to a CSV file.

# One list per CSV column; entries at the same index describe the same job.
job_title = []
company_name = []
location_name = []
job_skill = []
links = []
salary = []

result = requests.get("https://wuzzuf.net/search/jobs/?q=python%5C&a=hpb")
source = result.content
soup = BeautifulSoup(source, "lxml")

# find_all returns the matching tags in document order; the loop below pairs
# the four result lists up by position.
job_titles = soup.find_all("h2", {"class": "css-m604qf"})
company_names = soup.find_all("a", {"class": "css-17s97q8"})
location_names = soup.find_all("span", {"class": "css-5wys0k"})
job_skills = soup.find_all("div", {"class": "css-y4udm8"})

for i, title_tag in enumerate(job_titles):
    job_title.append(title_tag.text)
    # The href inside the title is site-relative, so prefix the host.
    links.append("https://wuzzuf.net" + title_tag.find("a").attrs["href"])
    company_name.append(company_names[i].text)
    location_name.append(location_names[i].text)
    job_skill.append(job_skills[i].text)

for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")
    salaries = soup.find("span", {"class": "css-4xky9y"})
    # find() returns None when the span is absent from the served HTML.
    # When it is present, keep its text rather than the Tag object so the
    # CSV receives a plain string instead of raw markup.
    salary.append(salaries.text if salaries is not None else None)

file_list = [job_title, company_name, location_name, job_skill, links, salary]
# zip_longest pads shorter columns with None so no row is dropped.
exported = zip_longest(*file_list)

# newline="" is required by the csv module to avoid blank rows on platforms
# that translate line endings; utf-8 keeps non-ASCII job text writable.
with open("/Users/Rich/Desktop/JobTutorial.csv", "w", newline="", encoding="utf-8") as myfile:
    writer = csv.writer(myfile)
    writer.writerow(["Job titles", "Company names", "Location names", "Job skills", "Links", "Salary"])
    writer.writerows(exported)

print(salary)
问题是获取 salary 的代码什么都没有返回:当我把结果追加到名为 salary 的列表并打印时,打印出来的是一个全为 None 的列表...
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
请大家帮帮我,感谢你们的帮助。
工资数据是动态生成的。如果你查看职位页面的源代码(在 Chrome 中按 Ctrl+U),可以看到这些数据并不在 HTML 元素中,而是位于 <script> 标签内的 Wuzzuf.initialStoreState 对象中。
现在你需要解析这段 JSON 数据以获取职位详情。你可以使用正则表达式把它提取出来。
下面是一段可运行的代码,用于解析该列表中单个职位的字典 -
# Fetch a single job page and load the JSON object that the page assigns to
# Wuzzuf.initialStoreState inside a <script> tag.
import json
import re

import requests

# NOTE(review): the original snippet referenced an undefined `headers`;
# a browser-like User-Agent is assumed here — adjust as needed.
headers = {"User-Agent": "Mozilla/5.0"}

link = "https://wuzzuf.net/jobs/p/jITGU1cOLq2S-Senior-Python-Developer-SURE-International-Technology-Cairo-Egypt"
result = requests.get(link, headers=headers)

# Capture everything between "= " and the last ";" on the assignment line.
# "." does not match newlines, so the capture stays on that one line.
raw_data = re.compile(r'Wuzzuf\.initialStoreState = (.*);').search(result.text)
if raw_data is None:
    # Fail with a clear message instead of AttributeError on None.group().
    raise ValueError("Wuzzuf.initialStoreState was not found in the page source")

job_details_dict = json.loads(raw_data.group(1).strip())
# Bare expression: displays the dict when run in a notebook/REPL cell.
job_details_dict
示例输出 -
{'badges': {'landingPage': {'loading': False,
'providers': None,
'timestamp': None}},
'browsingPage': {'sets': {}},
'coaches': {'coachesContactUs': {}, 'coachesPartner': {}},
.................
现在你只需要从这个字典中解析你想要的数据(例如,薪水)
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
# Scrape the Wuzzuf search results for "python", visit every job page to look
# for a salary element, and export everything that was collected to a CSV file.

# One list per CSV column; entries at the same index describe the same job.
job_title = []
company_name = []
location_name = []
job_skill = []
links = []
salary = []

result = requests.get("https://wuzzuf.net/search/jobs/?q=python%5C&a=hpb")
source = result.content
soup = BeautifulSoup(source, "lxml")

# find_all returns the matching tags in document order; the loop below pairs
# the four result lists up by position.
job_titles = soup.find_all("h2", {"class": "css-m604qf"})
company_names = soup.find_all("a", {"class": "css-17s97q8"})
location_names = soup.find_all("span", {"class": "css-5wys0k"})
job_skills = soup.find_all("div", {"class": "css-y4udm8"})

for i, title_tag in enumerate(job_titles):
    job_title.append(title_tag.text)
    # The href inside the title is site-relative, so prefix the host.
    links.append("https://wuzzuf.net" + title_tag.find("a").attrs["href"])
    company_name.append(company_names[i].text)
    location_name.append(location_names[i].text)
    job_skill.append(job_skills[i].text)

for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")
    salaries = soup.find("span", {"class": "css-4xky9y"})
    # find() returns None when the span is absent from the served HTML.
    # When it is present, keep its text rather than the Tag object so the
    # CSV receives a plain string instead of raw markup.
    salary.append(salaries.text if salaries is not None else None)

file_list = [job_title, company_name, location_name, job_skill, links, salary]
# zip_longest pads shorter columns with None so no row is dropped.
exported = zip_longest(*file_list)

# newline="" is required by the csv module to avoid blank rows on platforms
# that translate line endings; utf-8 keeps non-ASCII job text writable.
with open("/Users/Rich/Desktop/JobTutorial.csv", "w", newline="", encoding="utf-8") as myfile:
    writer = csv.writer(myfile)
    writer.writerow(["Job titles", "Company names", "Location names", "Job skills", "Links", "Salary"])
    writer.writerows(exported)

print(salary)
问题是获取 salary 的代码什么都没有返回:当我把结果追加到名为 salary 的列表并打印时,打印出来的是一个全为 None 的列表...
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
请大家帮帮我,感谢你们的帮助。
工资数据是动态生成的。如果你查看职位页面的源代码(在 Chrome 中按 Ctrl+U),可以看到这些数据并不在 HTML 元素中,而是位于 <script> 标签内的 Wuzzuf.initialStoreState 对象中。
现在你需要解析这段 JSON 数据以获取职位详情。你可以使用正则表达式把它提取出来。
下面是一段可运行的代码,用于解析该列表中单个职位的字典 -
# Fetch a single job page and load the JSON object that the page assigns to
# Wuzzuf.initialStoreState inside a <script> tag.
import json
import re

import requests

# NOTE(review): the original snippet referenced an undefined `headers`;
# a browser-like User-Agent is assumed here — adjust as needed.
headers = {"User-Agent": "Mozilla/5.0"}

link = "https://wuzzuf.net/jobs/p/jITGU1cOLq2S-Senior-Python-Developer-SURE-International-Technology-Cairo-Egypt"
result = requests.get(link, headers=headers)

# Capture everything between "= " and the last ";" on the assignment line.
# "." does not match newlines, so the capture stays on that one line.
raw_data = re.compile(r'Wuzzuf\.initialStoreState = (.*);').search(result.text)
if raw_data is None:
    # Fail with a clear message instead of AttributeError on None.group().
    raise ValueError("Wuzzuf.initialStoreState was not found in the page source")

job_details_dict = json.loads(raw_data.group(1).strip())
# Bare expression: displays the dict when run in a notebook/REPL cell.
job_details_dict
示例输出 -
{'badges': {'landingPage': {'loading': False,
'providers': None,
'timestamp': None}},
'browsingPage': {'sets': {}},
'coaches': {'coachesContactUs': {}, 'coachesPartner': {}},
.................
现在你只需要从这个字典中解析你想要的数据(例如,薪水)