Python3 linux 中的文件名编码问题
Filename encoding issue in Python3 linux
我正在使用下面的代码片段读取一个文本文件,并根据其中的数据从云端下载文件。这段代码在 Windows 和 Mac 上完全正常,但当我通过 ssh 登录 Linux 实例并运行该脚本时,脚本失败了。
后来我改用下面的命令来运行脚本,起初似乎可以正常工作,但运行一段时间后仍然失败。
命令:
PYTHONIOENCODING=utf-8 python3 filename.py 文本文件名
def file_block(fp, number_of_blocks, block):
    '''
    A generator that splits a file into blocks and iterates
    over the lines of one of the blocks.

    The file is divided into ``number_of_blocks`` ranges of nearly equal
    size, measured in the offsets reported by ``fp.tell()``. A line belongs
    to the block in which its first character lies, so iterating over every
    block index in turn yields each line of the file exactly once.

    Args:
        fp: A seekable file object providing ``seek``, ``tell`` and
            ``readline``.
        number_of_blocks: Total number of blocks; must be positive.
        block: Zero-based index of the block to iterate; must satisfy
            ``0 <= block < number_of_blocks``.

    Yields:
        The lines of the requested block, exactly as returned by
        ``fp.readline()`` (line terminators included).

    Raises:
        AssertionError: If the arguments are out of range. Because this is
            a generator, the check runs when it is first advanced.

    NOTE(review): for text-mode files the ``io`` documentation describes
    ``tell()`` values as opaque, and a computed offset may fall inside a
    multi-byte character. Binary-mode input is the predictable case --
    confirm the callers' files are compatible.
    '''
    # Validate the divisor first so the range check below is meaningful.
    assert 0 < number_of_blocks
    assert 0 <= block < number_of_blocks

    fp.seek(0, 2)
    file_size = fp.tell()

    # Floor division keeps the boundaries integral: seek() on most file
    # objects rejects floats, and a fractional ``end`` would let a line that
    # starts exactly on a boundary be yielded by two neighbouring blocks.
    ini = file_size * block // number_of_blocks
    end = file_size * (1 + block) // number_of_blocks

    if ini <= 0:
        fp.seek(0)
    else:
        # Step back one position and discard up to the next line break, so
        # iteration starts at the first line beginning at or after ``ini``.
        fp.seek(ini - 1)
        fp.readline()

    while fp.tell() < end:
        yield fp.readline()
def download_files(conn, container_name, number_of_chunks, chunk_number, file_name):
    '''
    Download the objects listed in one chunk of a text file.

    ``file_name`` is read as UTF-8 with one object name per line. The lines
    of chunk ``chunk_number`` (as selected by ``file_block``) are fetched
    with ``conn.get_object`` and written to a local path equal to the object
    name, creating parent directories as needed. Names whose final path
    component is empty only have their directory created.

    Failures for a single object are appended to
    ``log/log_<process name>.txt`` and processing continues. A failure while
    iterating the list itself is appended to
    ``process_failure_log/log_<process name>.txt`` and ends the chunk.

    Creating files whose names contain non-ASCII characters needs a
    filesystem encoding that can represent them (check
    ``sys.getfilesystemencoding()``); under an ASCII locale ``open`` raises
    ``UnicodeEncodeError``, which is recorded as a per-object failure.

    Args:
        conn: Object exposing ``get_object(container, name)`` whose result
            carries the object body at index 1 (presumably a storage client
            connection -- verify against the caller).
        container_name: Container passed through to ``conn.get_object``.
        number_of_chunks: Total number of chunks the list is split into.
        chunk_number: Zero-based index of the chunk handled by this call.
        file_name: Path of the UTF-8 text file listing the object names.

    Raises:
        OSError: If ``file_name`` cannot be opened.
    '''
    # Local import: the block needs it and must not rely on a file-level
    # import that may not exist.
    import traceback

    process_name = current_process().name
    counter = 0

    # The context manager guarantees the list file is closed on every path.
    with open(file_name, encoding='utf-8') as fp:
        try:
            for line in file_block(fp, number_of_chunks, chunk_number):
                counter = counter + 1
                clean_object_name = line.rstrip('\n\r ')
                try:
                    directory = os.path.dirname(clean_object_name)
                    # Skip names without a directory part (makedirs('')
                    # raises); exist_ok avoids a race between workers.
                    if directory:
                        os.makedirs(directory, exist_ok=True)
                    if os.path.basename(clean_object_name) != '':
                        obj_tuple = conn.get_object(container_name, clean_object_name)
                        with open(clean_object_name, 'wb') as f:
                            f.write(obj_tuple[1])
                        # The name is printed as bytes so that an ASCII-only
                        # stdout cannot raise after a successful download.
                        print("Successfull ", process_name, " ", counter, " ", clean_object_name.encode('utf-8'), "\n")
                except Exception:
                    # Capture the traceback text before any further I/O.
                    failure = traceback.format_exc()
                    os.makedirs("log", exist_ok=True)
                    with open("log/" + "log_" + process_name + ".txt", 'a', encoding='utf-8') as f:
                        try:
                            print("Failed counter ", counter, " ", clean_object_name.encode('utf-8'))
                            f.write("missing " + clean_object_name + "\n")
                            f.write("traceback " + failure + "\n")
                        except Exception:
                            # Best effort: at least record which line failed.
                            f.write("missing " + str(counter) + "\n")
        except Exception:
            failure = traceback.format_exc()
            os.makedirs("process_failure_log", exist_ok=True)
            with open("process_failure_log/" + "log_" + process_name + ".txt", 'a', encoding='utf-8') as f:
                try:
                    f.write("process failed while reading the file at counter " + str(counter) + "\n")
                    f.write(failure + "\n")
                except Exception:
                    f.write("missing " + str(counter) + "\n")
包含以下数据的文本文件:
user_photos/images/282/onehundred/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/original/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/preview/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/thumbnail/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/twohundred/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/283/onehundred/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/original/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/preview/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/thumbnail/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/twohundred/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/284/onehundred/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/original/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/preview/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/thumbnail/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/twohundred/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
使用上述命令后,我能够读写文件,但脚本在以下几点失败:
with open(clean_object_name, 'wb') as f:
f.write(obj_tuple[1])
回溯:
'ascii' codec can't encode character in position 55-56: ordinal not in
range(128).
我知道这是由特殊字符(非 ASCII 字符)引起的。我可以使用 decode 方法,但我不想用无法识别的字符替换文件名。
我很困惑:如果是编码导致脚本出问题,那么脚本应该在一开始就失败。但脚本在读取和写入其他文件时工作正常,仅在创建名称中含有特殊字符的文件时失败。请提出建议,我已经为此浪费了整整两天的工作时间。
代码在 Windows 和 Mac 中完美运行。
这是 linux 中的环境变量问题。
我已经通过运行下面的python方法验证
import sys
print(sys.getfilesystemencoding())
返回值:ascii
我在 linux 终端中进行了以下更改。
$ sudo vim /etc/environment
并将 LANG、LC_MESSAGES 和 LC_ALL 设置为:
LANG="en_US.UTF-8"
LC_MESSAGES="C"
LC_ALL="en_US.UTF-8"
然后重启,并运行 locale 命令。
此更改后,上述方法返回值 'utf-8'
并且运行良好。
我正在使用下面的代码片段读取一个文本文件,并根据其中的数据从云端下载文件。这段代码在 Windows 和 Mac 上完全正常,但当我通过 ssh 登录 Linux 实例并运行该脚本时,脚本失败了。
后来我改用下面的命令来运行脚本,起初似乎可以正常工作,但运行一段时间后仍然失败。
命令: PYTHONIOENCODING=utf-8 python3 filename.py 文本文件名
def file_block(fp, number_of_blocks, block):
    '''
    A generator that splits a file into blocks and iterates
    over the lines of one of the blocks.

    The file is divided into ``number_of_blocks`` ranges of nearly equal
    size, measured in the offsets reported by ``fp.tell()``. A line belongs
    to the block in which its first character lies, so iterating over every
    block index in turn yields each line of the file exactly once.

    Args:
        fp: A seekable file object providing ``seek``, ``tell`` and
            ``readline``.
        number_of_blocks: Total number of blocks; must be positive.
        block: Zero-based index of the block to iterate; must satisfy
            ``0 <= block < number_of_blocks``.

    Yields:
        The lines of the requested block, exactly as returned by
        ``fp.readline()`` (line terminators included).

    Raises:
        AssertionError: If the arguments are out of range. Because this is
            a generator, the check runs when it is first advanced.

    NOTE(review): for text-mode files the ``io`` documentation describes
    ``tell()`` values as opaque, and a computed offset may fall inside a
    multi-byte character. Binary-mode input is the predictable case --
    confirm the callers' files are compatible.
    '''
    # Validate the divisor first so the range check below is meaningful.
    assert 0 < number_of_blocks
    assert 0 <= block < number_of_blocks

    fp.seek(0, 2)
    file_size = fp.tell()

    # Floor division keeps the boundaries integral: seek() on most file
    # objects rejects floats, and a fractional ``end`` would let a line that
    # starts exactly on a boundary be yielded by two neighbouring blocks.
    ini = file_size * block // number_of_blocks
    end = file_size * (1 + block) // number_of_blocks

    if ini <= 0:
        fp.seek(0)
    else:
        # Step back one position and discard up to the next line break, so
        # iteration starts at the first line beginning at or after ``ini``.
        fp.seek(ini - 1)
        fp.readline()

    while fp.tell() < end:
        yield fp.readline()
def download_files(conn, container_name, number_of_chunks, chunk_number, file_name):
    '''
    Download the objects listed in one chunk of a text file.

    ``file_name`` is read as UTF-8 with one object name per line. The lines
    of chunk ``chunk_number`` (as selected by ``file_block``) are fetched
    with ``conn.get_object`` and written to a local path equal to the object
    name, creating parent directories as needed. Names whose final path
    component is empty only have their directory created.

    Failures for a single object are appended to
    ``log/log_<process name>.txt`` and processing continues. A failure while
    iterating the list itself is appended to
    ``process_failure_log/log_<process name>.txt`` and ends the chunk.

    Creating files whose names contain non-ASCII characters needs a
    filesystem encoding that can represent them (check
    ``sys.getfilesystemencoding()``); under an ASCII locale ``open`` raises
    ``UnicodeEncodeError``, which is recorded as a per-object failure.

    Args:
        conn: Object exposing ``get_object(container, name)`` whose result
            carries the object body at index 1 (presumably a storage client
            connection -- verify against the caller).
        container_name: Container passed through to ``conn.get_object``.
        number_of_chunks: Total number of chunks the list is split into.
        chunk_number: Zero-based index of the chunk handled by this call.
        file_name: Path of the UTF-8 text file listing the object names.

    Raises:
        OSError: If ``file_name`` cannot be opened.
    '''
    # Local import: the block needs it and must not rely on a file-level
    # import that may not exist.
    import traceback

    process_name = current_process().name
    counter = 0

    # The context manager guarantees the list file is closed on every path.
    with open(file_name, encoding='utf-8') as fp:
        try:
            for line in file_block(fp, number_of_chunks, chunk_number):
                counter = counter + 1
                clean_object_name = line.rstrip('\n\r ')
                try:
                    directory = os.path.dirname(clean_object_name)
                    # Skip names without a directory part (makedirs('')
                    # raises); exist_ok avoids a race between workers.
                    if directory:
                        os.makedirs(directory, exist_ok=True)
                    if os.path.basename(clean_object_name) != '':
                        obj_tuple = conn.get_object(container_name, clean_object_name)
                        with open(clean_object_name, 'wb') as f:
                            f.write(obj_tuple[1])
                        # The name is printed as bytes so that an ASCII-only
                        # stdout cannot raise after a successful download.
                        print("Successfull ", process_name, " ", counter, " ", clean_object_name.encode('utf-8'), "\n")
                except Exception:
                    # Capture the traceback text before any further I/O.
                    failure = traceback.format_exc()
                    os.makedirs("log", exist_ok=True)
                    with open("log/" + "log_" + process_name + ".txt", 'a', encoding='utf-8') as f:
                        try:
                            print("Failed counter ", counter, " ", clean_object_name.encode('utf-8'))
                            f.write("missing " + clean_object_name + "\n")
                            f.write("traceback " + failure + "\n")
                        except Exception:
                            # Best effort: at least record which line failed.
                            f.write("missing " + str(counter) + "\n")
        except Exception:
            failure = traceback.format_exc()
            os.makedirs("process_failure_log", exist_ok=True)
            with open("process_failure_log/" + "log_" + process_name + ".txt", 'a', encoding='utf-8') as f:
                try:
                    f.write("process failed while reading the file at counter " + str(counter) + "\n")
                    f.write(failure + "\n")
                except Exception:
                    f.write("missing " + str(counter) + "\n")
包含以下数据的文本文件:
user_photos/images/282/onehundred/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/original/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/preview/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/thumbnail/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/282/twohundred/Capture d’écran 2012-09-07 à 2.50.31 PM20120917-37935-13g7sn1-0_1347875141.png
user_photos/images/283/onehundred/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/original/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/preview/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/thumbnail/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/283/twohundred/Capture d’écran 2012-09-11 à 6.21.50 PM20120917-38000-37awsu-0_1347875181.jpg
user_photos/images/284/onehundred/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/original/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/preview/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/thumbnail/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
user_photos/images/284/twohundred/Capture d’écran 2012-09-11 à 6.20.56 PM20120917-38101-6po8vq-0_1347875238.jpg
使用上述命令后,我能够读写文件,但脚本在以下几点失败:
with open(clean_object_name, 'wb') as f:
f.write(obj_tuple[1])
回溯:
'ascii' codec can't encode character in position 55-56: ordinal not in
range(128).
我知道这是由特殊字符(非 ASCII 字符)引起的。我可以使用 decode 方法,但我不想用无法识别的字符替换文件名。
我很困惑:如果是编码导致脚本出问题,那么脚本应该在一开始就失败。但脚本在读取和写入其他文件时工作正常,仅在创建名称中含有特殊字符的文件时失败。请提出建议,我已经为此浪费了整整两天的工作时间。代码在 Windows 和 Mac 中完美运行。
这是 Linux 中的环境变量问题。
我已经通过运行下面的python方法验证
import sys
print(sys.getfilesystemencoding())
返回值:ascii
我在 linux 终端中进行了以下更改。
$ sudo vim /etc/environment
并将 LANG、LC_MESSAGES 和 LC_ALL 设置为:
LANG="en_US.UTF-8"
LC_MESSAGES="C"
LC_ALL="en_US.UTF-8"
然后重启,并运行 locale 命令。
此更改后,上述方法返回值 'utf-8'
并且运行良好。