Python CSV 操作脚本中的 UnicodeEncodeError
UnicodeEncodeError in Python CSV manipulation script
我有一个脚本之前可以运行,但现在由于 UnicodeEncodeError 而停止。
我正在使用 Python 3.4.3。
完整的错误信息如下:
Traceback (most recent call last):
File "R:/A/APIDevelopment/ScivalPubsExternal/Combine/ScivalPubsExt.py", line 58, in <module>
outputFD.writerow(row)
File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x8a' in position 413: character maps to <undefined>
我该如何解决这个错误?
Python脚本如下:
import pdb
import csv,sys,os
import glob
import os
import codecs
os.chdir('R:/A/APIDevelopment/ScivalPubsExternal/Combine')
joinedFileOut='ScivalUpdate'
csvSourceDir="R:/A/APIDevelopment/ScivalPubsExternal/Combine/AustralianUniversities"
# create dictionary from Codes file (Institution names and codes)
codes = csv.reader(open('Codes.csv'))
#rows of the file are stored as lists/arrays
InstitutionCodesDict = {}
InstitutionYearsDict = {}
for row in codes:
#keys: instnames, #values: instcodes
InstitutionCodesDict[row[0]] = row[1]
#define year dictionary with empty values field
InstitutionYearsDict[row[0]] = []
#to create a file descriptor for the output file; 'wt' means text mode (same as 'w', just as 'rt' is the same as 'r')
with open(joinedFileOut,'wt') as csvWriteFD:
#write the file (it is still empty here)
outputFD=csv.writer(csvWriteFD,delimiter=',')
#with closes the file at the end, if exception occurs then before that
# open each scival file, create file descriptor (encoding needed) and then read it and print the name of the file
if not glob.glob(csvSourceDir+"/*.csv"):
print("CSV source files not found")
sys.exit()
for scivalFile in glob.glob(csvSourceDir+"/*.csv"):
#with open(scivalFile,"rt", encoding="utf8") as csvInFD:
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
fileFD = csv.reader(csvInFD)
print(scivalFile)
#create condition for loop
printon=False
#reads all rows in file and creates lists/arrays of each row
for row in fileFD:
if len(row)>1:
#the next printon part is skipped when looping through the rows above the data because it is not set to true
if printon:
#inserts instcode and inst sequentially to each row where there is data and after the header row
row.insert(0, InstitutionCode)
row.insert(0, Institution)
if row[10].strip() == "-":
row[10] = " "
else:
p = row[10].zfill(8)
q = p[0:4] + '-' + p[4:]
row[10] = q
#writes output file
outputFD.writerow(row)
else:
if "Publications at" in row[1]:
#get institution name from cell B1
Institution=row[1].replace('Publications at the ', "").replace('Publications at ',"")
print(Institution)
#lookup institution code from dictionary
InstitutionCode=InstitutionCodesDict[Institution]
#printon gets set to TRUE after the header column
if "Title" in row[0]: printon=True
if "Publication years" in row[0]:
#get the year to print it later to see which years were pulled
year=row[1]
#add year to institution in dictionary
if not year in InstitutionYearsDict[Institution]:
InstitutionYearsDict[Institution].append(year)
# Write a report showing the institution name followed by the years for
# which we have that institution's data.
with open("Instyears.txt","w") as instReportFD:
for inst in (InstitutionYearsDict):
instReportFD.write(inst)
for yr in InstitutionYearsDict[inst]:
instReportFD.write(" "+yr)
instReportFD.write("\n")
该错误是由于尝试使用系统默认的 cp1252 编码写入包含 U+008A 字符的字符串引起的:cp1252 中没有与该字符对应的字节。修复起来很简单,只需为输出文件显式声明 latin1(即 iso-8859-1)编码,因为 latin1 会把 U+0000 至 U+00FF 的每个码位原样写成同值的单个字节,不做任何转换:
with open(joinedFileOut,'wt', encoding='latin1') as csvWriteFD:
但这只会掩盖真正的问题:这个 0x8a 字符是从哪里来的?我的建议是捕获异常,并把出错的那一行数据打印出来:
try:
outputFD.writerow(row)
except UnicodeEncodeError:
# print row, the name of the file being processed and the line number
这很可能是因为某个输入文件并不是 iso-8859-1 编码,而更可能是 utf8 编码……
确保对源文件和目标文件使用正确编码。您在三个位置打开文件:
codes = csv.reader(open('Codes.csv'))
: : :
with open(joinedFileOut,'wt') as csvWriteFD:
outputFD=csv.writer(csvWriteFD,delimiter=',')
: : :
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
fileFD = csv.reader(csvInFD)
这应该类似于:
# Use the correct encoding. If you made this file on
# Windows it is likely Windows-1252 (also known as cp1252):
with open('Codes.csv', encoding='cp1252') as f:
codes = csv.reader(f)
: : :
# The output encoding can be anything you want. UTF-8
# supports all Unicode characters. Windows apps tend to like
# the files to start with a UTF-8 BOM if the file is UTF-8,
# so 'utf-8-sig' is an option.
with open(joinedFileOut,'w', encoding='utf-8-sig') as csvWriteFD:
outputFD=csv.writer(csvWriteFD)
: : :
# This file is probably the cause of your problem and is not ISO-8859-1.
# Maybe UTF-8 instead? 'utf-8-sig' will safely handle and remove a UTF-8 BOM
# if present.
with open(scivalFile,'r', encoding='utf-8-sig') as csvInFD:
fileFD = csv.reader(csvInFD)
我有一个脚本之前可以运行,但现在由于 UnicodeEncodeError 而停止。
我正在使用 Python 3.4.3。
完整的错误信息如下:
Traceback (most recent call last):
File "R:/A/APIDevelopment/ScivalPubsExternal/Combine/ScivalPubsExt.py", line 58, in <module>
outputFD.writerow(row)
File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x8a' in position 413: character maps to <undefined>
我该如何解决这个错误?
Python脚本如下:
import pdb
import csv,sys,os
import glob
import os
import codecs
os.chdir('R:/A/APIDevelopment/ScivalPubsExternal/Combine')
joinedFileOut='ScivalUpdate'
csvSourceDir="R:/A/APIDevelopment/ScivalPubsExternal/Combine/AustralianUniversities"
# create dictionary from Codes file (Institution names and codes)
codes = csv.reader(open('Codes.csv'))
#rows of the file are stored as lists/arrays
InstitutionCodesDict = {}
InstitutionYearsDict = {}
for row in codes:
#keys: instnames, #values: instcodes
InstitutionCodesDict[row[0]] = row[1]
#define year dictionary with empty values field
InstitutionYearsDict[row[0]] = []
#to create a file descriptor for the output file; 'wt' means text mode (same as 'w', just as 'rt' is the same as 'r')
with open(joinedFileOut,'wt') as csvWriteFD:
#write the file (it is still empty here)
outputFD=csv.writer(csvWriteFD,delimiter=',')
#with closes the file at the end, if exception occurs then before that
# open each scival file, create file descriptor (encoding needed) and then read it and print the name of the file
if not glob.glob(csvSourceDir+"/*.csv"):
print("CSV source files not found")
sys.exit()
for scivalFile in glob.glob(csvSourceDir+"/*.csv"):
#with open(scivalFile,"rt", encoding="utf8") as csvInFD:
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
fileFD = csv.reader(csvInFD)
print(scivalFile)
#create condition for loop
printon=False
#reads all rows in file and creates lists/arrays of each row
for row in fileFD:
if len(row)>1:
#the next printon part is skipped when looping through the rows above the data because it is not set to true
if printon:
#inserts instcode and inst sequentially to each row where there is data and after the header row
row.insert(0, InstitutionCode)
row.insert(0, Institution)
if row[10].strip() == "-":
row[10] = " "
else:
p = row[10].zfill(8)
q = p[0:4] + '-' + p[4:]
row[10] = q
#writes output file
outputFD.writerow(row)
else:
if "Publications at" in row[1]:
#get institution name from cell B1
Institution=row[1].replace('Publications at the ', "").replace('Publications at ',"")
print(Institution)
#lookup institution code from dictionary
InstitutionCode=InstitutionCodesDict[Institution]
#printon gets set to TRUE after the header column
if "Title" in row[0]: printon=True
if "Publication years" in row[0]:
#get the year to print it later to see which years were pulled
year=row[1]
#add year to institution in dictionary
if not year in InstitutionYearsDict[Institution]:
InstitutionYearsDict[Institution].append(year)
# Write a report showing the institution name followed by the years for
# which we have that institution's data.
with open("Instyears.txt","w") as instReportFD:
for inst in (InstitutionYearsDict):
instReportFD.write(inst)
for yr in InstitutionYearsDict[inst]:
instReportFD.write(" "+yr)
instReportFD.write("\n")
该错误是由于尝试使用系统默认的 cp1252 编码写入包含 U+008A 字符的字符串引起的:cp1252 中没有与该字符对应的字节。修复起来很简单,只需为输出文件显式声明 latin1(即 iso-8859-1)编码,因为 latin1 会把 U+0000 至 U+00FF 的每个码位原样写成同值的单个字节,不做任何转换:
with open(joinedFileOut,'wt', encoding='latin1') as csvWriteFD:
但这只会掩盖真正的问题:这个 0x8a 字符是从哪里来的?我的建议是捕获异常,并把出错的那一行数据打印出来:
try:
outputFD.writerow(row)
except UnicodeEncodeError:
# print row, the name of the file being processed and the line number
这很可能是因为某个输入文件并不是 iso-8859-1 编码,而更可能是 utf8 编码……
确保对源文件和目标文件使用正确编码。您在三个位置打开文件:
codes = csv.reader(open('Codes.csv'))
: : :
with open(joinedFileOut,'wt') as csvWriteFD:
outputFD=csv.writer(csvWriteFD,delimiter=',')
: : :
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
fileFD = csv.reader(csvInFD)
这应该类似于:
# Use the correct encoding. If you made this file on
# Windows it is likely Windows-1252 (also known as cp1252):
with open('Codes.csv', encoding='cp1252') as f:
codes = csv.reader(f)
: : :
# The output encoding can be anything you want. UTF-8
# supports all Unicode characters. Windows apps tend to like
# the files to start with a UTF-8 BOM if the file is UTF-8,
# so 'utf-8-sig' is an option.
with open(joinedFileOut,'w', encoding='utf-8-sig') as csvWriteFD:
outputFD=csv.writer(csvWriteFD)
: : :
# This file is probably the cause of your problem and is not ISO-8859-1.
# Maybe UTF-8 instead? 'utf-8-sig' will safely handle and remove a UTF-8 BOM
# if present.
with open(scivalFile,'r', encoding='utf-8-sig') as csvInFD:
fileFD = csv.reader(csvInFD)