有没有一种方法可以使用 python 中的编码从网页中获取完美(正确格式)的值?
Is there a way to get the perfect (proper format) values from web page using encoding in python?
问题:网页上的内容,如 AMD Ryzen™ 9 5950X,当我将其放入 csv 文件时,它显示为:AMD Ryzen™ 9 5950X,90°C 显示为 90°C。我必须编写一个函数来从网页中获取准确(正确)的值。我试图捕获这些并将它们存储在 .txt 文件中。它显示为完全正常的值。 excel/csv 是否造成了这个问题?我该如何解决这个问题?
config.ini
[configData]
baseurl = https://www.amd.com/en/processors/ryzen
sub_url = https://www.amd.com
all_keys = Model,Platform,Product Family,Product Line,# of CPU Cores,# of Threads, Max. Boost Clock, Base Clock, Total L2 Cache, Total L3 Cache,Default TDP, Processor Technology for CPU Cores, Unlocked for Overclocking, CPU Socket,Thermal Solution (PIB), Max. Operating Temperature (Tjmax), Launch Date, *OS Support
Main.py 文件代码
import logging
from data_extraction import data_extraction
from data_processing import data_processing
from configparser import ConfigParser
class Main:
def __init__(self):
self.config = ConfigParser()
self.config.read('config.ini')
logging.basicConfig(filename='logfile.log', level=logging.DEBUG,
format='%(asctime)s:%(lineno)d:%(name)s:%(levelname)s:%(message)s')
self.baseurl = self.config['configData']['baseurl']
self.sub_url = self.config['configData']['sub_url']
self.all_keys = self.config['configData']['all_keys']
# print(all_keys)
self.all_keys = [key.strip() for key in self.all_keys.split(',')]
def __Processing__(self):
de = data_extraction(self.baseurl)
dp = data_processing()
self.model_links = de.get_links_in_list(self.sub_url)
logging.debug(self.model_links)
each_link_data = de.data_extraction(self.baseurl, self.all_keys, self.model_links)
logging.info('data extraction is called from main')
all_link_data = dp.data_processing(each_link_data)
# calling function write_to_csv
dp.write_to_csv(all_link_data,self.all_keys)
logging.info('data copied to csv')
dp.rectify_csv(str(all_link_data))
Main().__Processing__()
DataExtraction.py
import logging
import requests
from bs4 import BeautifulSoup
class data_extraction:
def __init__(self, baseurl):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
"Upgrade-Insecure-Requests": "1", "DNT": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate"}
r = requests.get(baseurl, headers=self.headers)
status = r.status_code
self.soup = BeautifulSoup(r.content, 'html.parser')
self.model_links = []
# function to get the model links in one list from soup object(1st page extraction)
def get_links_in_list(self, sub_url):
for model_list in self.soup.find_all('td', headers='view-name-table-column'):
# model_list = model_list.a.text - to get the model names
model_list = model_list.a.get('href')
# print(model_list)
model_list = sub_url + model_list
# print(model_list)
one_link = model_list.split(" ")[0]
self.model_links.append(one_link)
return self.model_links
# function to get data for each link from the website(2nd page extraction)
def data_extraction(self, baseurl, all_keys, model_links):
r = requests.get(baseurl, headers=self.headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')
each_link_data = []
try:
for link in model_links:
r = requests.get(link, headers=self.headers)
soup = BeautifulSoup(r.content, 'html.parser')
specification = {}
for key in all_keys:
spec = soup.select_one(
f'.field__label:-soup-contains("{key}") + .field__item, .field__label:-soup-contains("{key}") + .field__items .field__item')
# print(spec)
if spec is None:
specification[key] = ''
if key == 'Model':
specification[key] = [i.text for i in soup.select_one('.page-title')]
specification[key] = specification[key][0:1:1]
# print(specification[key])
else:
if key == '*OS Support':
specification[key] = [i.text for i in spec.parent.select('.field__item')]
else:
specification[key] = spec.text
specification['link'] = link
each_link_data.append(specification)
except Exception as e:
print('Error occurred')
logging.info('data not extracted')
return each_link_data
# print(each_link_data)
数据processing.py
import pandas as pd
class data_processing:
# function for data processing : converting the each link object into dataframe
def data_processing(self, each_link_data):
all_link_data = []
for each_linkdata_obj in each_link_data:
# make the nested dictionary to normal dict
norm_dict = dict()
for key in each_linkdata_obj:
if isinstance(each_linkdata_obj[key], list):
norm_dict[key] = ','.join(each_linkdata_obj[key])
else:
norm_dict[key] = each_linkdata_obj[key]
all_link_data.append(norm_dict)
return all_link_data
# function to write dataframe data into csv
def write_to_csv(self, all_link_data, all_keys):
all_link_df = pd.DataFrame.from_dict(all_link_data)
all_link_df2 = all_link_df.drop_duplicates()
all_link_df3 = all_link_df2.reset_index()
# print(all_link_df3)
all_keys = all_keys + ['link']
all_link_df4 = all_link_df3[all_keys]
# print(all_link_df4)
all_link_df4.to_csv('final_data.csv', index=False)
# all_link_df4.to_csv('final_data.xlsx',index=False)
# function to remove unwanted characters from the csv file.
def rectify_csv(self, all_link_df4):
all_link_data_str = str(all_link_df4)
with open('readme.txt', 'w') as f:
f.writelines(all_link_data_str)
readme.txt:
'link': 'https://www.amd.com/en/products/cpu/amd-ryzen-9-5950x',
'Platform': 'Boxed Processor',
'Product Family': 'AMD Ryzen™ Processors',
'Product Line': 'AMD Ryzen™ 9 Desktop Processors',
'# of CPU Cores': '16',
'# of Threads': '32',
'Max. Boost Clock': 'Up to 4.9GHz',
'Base Clock': '3.4GHz',
'Total L2 Cache': '8MB',
'Total L3 Cache': '64MB',
'Default TDP': '105W',
'Processor Technology for CPU Cores': 'TSMC 7nm FinFET',
'Unlocked for Overclocking': 'Yes',
'CPU Socket': 'AM4',
'Thermal Solution (PIB)': 'Not included',
'Max. Operating Temperature (Tjmax)': '90°C',
'Launch Date': '11/5/2020',
'*OS Support': 'Windows 10 - 64-Bit Edition,RHEL x86 64-Bit,Ubuntu x86 64-Bit,*Operating System (OS) support will vary by manufacturer.'},
final_data.csv
AMD Ryzenâ„¢ 9 5950X Desktop Processors
AMD Ryzenâ„¢ Processors
AMD Ryzenâ„¢ 9 Desktop Processors
90°C
Microsoft 软件因假定文件内容以本机 Windows 字符编码进行编码而臭名昭著。如果您的文件是用类似 UTF-8 的合理格式编写的,则很可能会被误解。
让 Microsoft 应用程序如 Excel 正确识别 UTF-8 的方法是在文件前面放置一个 byte order mark。如果用encoding='utf_8_sig'
打开文件,Python可以自动添加这个。
问题:网页上的内容,如 AMD Ryzen™ 9 5950X,当我将其放入 csv 文件时,它显示为:AMD Ryzen™ 9 5950X,90°C 显示为 90°C。我必须编写一个函数来从网页中获取准确(正确)的值。我试图捕获这些并将它们存储在 .txt 文件中。它显示为完全正常的值。 excel/csv 是否造成了这个问题?我该如何解决这个问题?
config.ini
[configData]
baseurl = https://www.amd.com/en/processors/ryzen
sub_url = https://www.amd.com
all_keys = Model,Platform,Product Family,Product Line,# of CPU Cores,# of Threads, Max. Boost Clock, Base Clock, Total L2 Cache, Total L3 Cache,Default TDP, Processor Technology for CPU Cores, Unlocked for Overclocking, CPU Socket,Thermal Solution (PIB), Max. Operating Temperature (Tjmax), Launch Date, *OS Support
Main.py 文件代码
import logging
from data_extraction import data_extraction
from data_processing import data_processing
from configparser import ConfigParser
class Main:
def __init__(self):
self.config = ConfigParser()
self.config.read('config.ini')
logging.basicConfig(filename='logfile.log', level=logging.DEBUG,
format='%(asctime)s:%(lineno)d:%(name)s:%(levelname)s:%(message)s')
self.baseurl = self.config['configData']['baseurl']
self.sub_url = self.config['configData']['sub_url']
self.all_keys = self.config['configData']['all_keys']
# print(all_keys)
self.all_keys = [key.strip() for key in self.all_keys.split(',')]
def __Processing__(self):
de = data_extraction(self.baseurl)
dp = data_processing()
self.model_links = de.get_links_in_list(self.sub_url)
logging.debug(self.model_links)
each_link_data = de.data_extraction(self.baseurl, self.all_keys, self.model_links)
logging.info('data extraction is called from main')
all_link_data = dp.data_processing(each_link_data)
# calling function write_to_csv
dp.write_to_csv(all_link_data,self.all_keys)
logging.info('data copied to csv')
dp.rectify_csv(str(all_link_data))
Main().__Processing__()
DataExtraction.py
import logging
import requests
from bs4 import BeautifulSoup
class data_extraction:
def __init__(self, baseurl):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
"Upgrade-Insecure-Requests": "1", "DNT": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate"}
r = requests.get(baseurl, headers=self.headers)
status = r.status_code
self.soup = BeautifulSoup(r.content, 'html.parser')
self.model_links = []
# function to get the model links in one list from soup object(1st page extraction)
def get_links_in_list(self, sub_url):
for model_list in self.soup.find_all('td', headers='view-name-table-column'):
# model_list = model_list.a.text - to get the model names
model_list = model_list.a.get('href')
# print(model_list)
model_list = sub_url + model_list
# print(model_list)
one_link = model_list.split(" ")[0]
self.model_links.append(one_link)
return self.model_links
# function to get data for each link from the website(2nd page extraction)
def data_extraction(self, baseurl, all_keys, model_links):
r = requests.get(baseurl, headers=self.headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')
each_link_data = []
try:
for link in model_links:
r = requests.get(link, headers=self.headers)
soup = BeautifulSoup(r.content, 'html.parser')
specification = {}
for key in all_keys:
spec = soup.select_one(
f'.field__label:-soup-contains("{key}") + .field__item, .field__label:-soup-contains("{key}") + .field__items .field__item')
# print(spec)
if spec is None:
specification[key] = ''
if key == 'Model':
specification[key] = [i.text for i in soup.select_one('.page-title')]
specification[key] = specification[key][0:1:1]
# print(specification[key])
else:
if key == '*OS Support':
specification[key] = [i.text for i in spec.parent.select('.field__item')]
else:
specification[key] = spec.text
specification['link'] = link
each_link_data.append(specification)
except Exception as e:
print('Error occurred')
logging.info('data not extracted')
return each_link_data
# print(each_link_data)
数据processing.py
import pandas as pd
class data_processing:
# function for data processing : converting the each link object into dataframe
def data_processing(self, each_link_data):
all_link_data = []
for each_linkdata_obj in each_link_data:
# make the nested dictionary to normal dict
norm_dict = dict()
for key in each_linkdata_obj:
if isinstance(each_linkdata_obj[key], list):
norm_dict[key] = ','.join(each_linkdata_obj[key])
else:
norm_dict[key] = each_linkdata_obj[key]
all_link_data.append(norm_dict)
return all_link_data
# function to write dataframe data into csv
def write_to_csv(self, all_link_data, all_keys):
all_link_df = pd.DataFrame.from_dict(all_link_data)
all_link_df2 = all_link_df.drop_duplicates()
all_link_df3 = all_link_df2.reset_index()
# print(all_link_df3)
all_keys = all_keys + ['link']
all_link_df4 = all_link_df3[all_keys]
# print(all_link_df4)
all_link_df4.to_csv('final_data.csv', index=False)
# all_link_df4.to_csv('final_data.xlsx',index=False)
# function to remove unwanted characters from the csv file.
def rectify_csv(self, all_link_df4):
all_link_data_str = str(all_link_df4)
with open('readme.txt', 'w') as f:
f.writelines(all_link_data_str)
readme.txt:
'link': 'https://www.amd.com/en/products/cpu/amd-ryzen-9-5950x',
'Platform': 'Boxed Processor',
'Product Family': 'AMD Ryzen™ Processors',
'Product Line': 'AMD Ryzen™ 9 Desktop Processors',
'# of CPU Cores': '16',
'# of Threads': '32',
'Max. Boost Clock': 'Up to 4.9GHz',
'Base Clock': '3.4GHz',
'Total L2 Cache': '8MB',
'Total L3 Cache': '64MB',
'Default TDP': '105W',
'Processor Technology for CPU Cores': 'TSMC 7nm FinFET',
'Unlocked for Overclocking': 'Yes',
'CPU Socket': 'AM4',
'Thermal Solution (PIB)': 'Not included',
'Max. Operating Temperature (Tjmax)': '90°C',
'Launch Date': '11/5/2020',
'*OS Support': 'Windows 10 - 64-Bit Edition,RHEL x86 64-Bit,Ubuntu x86 64-Bit,*Operating System (OS) support will vary by manufacturer.'},
final_data.csv
AMD Ryzenâ„¢ 9 5950X Desktop Processors
AMD Ryzenâ„¢ Processors
AMD Ryzenâ„¢ 9 Desktop Processors
90°C
Microsoft 软件因假定文件内容以本机 Windows 字符编码进行编码而臭名昭著。如果您的文件是用类似 UTF-8 的合理格式编写的,则很可能会被误解。
让 Microsoft 应用程序如 Excel 正确识别 UTF-8 的方法是在文件前面放置一个 byte order mark。如果用encoding='utf_8_sig'
打开文件,Python可以自动添加这个。