当我的系统编码与我的文件编码不匹配时,如何找到我的系统编码?
How do I find my system encoding when it does not match my file encoding?
我正在编写一个小实用程序,在我的 Mac OsX Yosemite 上,它使用 glob2 对我的文件系统进行 glob,并使用 py.test 测试我的代码。
我的系统区域设置是en_gb,因为这是我通常所说的,但是,我也有很多文件和文件夹的名称是法语和日语。
现在每当我通过 glob2 得到一个 "French" 字符串时,比如“/tmp/test/réc”,e 尖音符的编码是 \xcc\x81c。
但是我将我的 python 文件的编码声明为 utf-8,这使我源码中的 é(e acute)被编码为 \xc3\xa9c。显然,我的测试因此出了问题,因为两边尖音符的字节序列不一致。
如何找出我的系统在编码这个尖音符时使用的是哪种编码?除了使用类似 chardet 的库,还有其他选择吗?
谢谢
附录
失败的测试是:scope_test.py
# -*- coding: utf-8 -*-
import pytest as p
import os
import itertools
import shutil
from os import environ
# Override HOME before the library under test is imported, so that the '~'
# paths used by the tests expand under /tmp/test (os.path.expanduser consults
# HOME on POSIX systems).
environ['HOME']= '/tmp/test'
import scope as s #This is the library I am testing
@p.fixture(scope='module')
def scopes(request):
    """Create a scratch directory tree with test files and return the scopes.

    The tree is rooted at the expansion of ``~``.  A finalizer registered on
    ``request`` removes ``/tmp/test`` once the fixture is torn down.

    Returns a list of three ``Scope`` objects: 'home', 'simple' and
    'recursive', in that order.
    """
    layout = {
        'home': ['~'],
        'simple': ['~/simple1',
                   '~/simple2',
                   '~/simple3'],
        'recursive': ['~/réc',
                      '~/réc/sub11',
                      '~/réc/sub12',
                      '~/réc/sub11/sub111',
                      '~/réc/sub11/sub112',
                      '~/réc/sub12/sub 121',
                      '~/réc/sub12/sub 122'],
    }
    # Flatten once; both passes below visit the same directories.
    every_dir = [entry for group in layout.values() for entry in group]

    # Pass 1: the directories themselves.  '~' is created up front so that
    # its children can be made whatever order the groups are visited in.
    os.mkdir(os.path.expanduser('~'))
    for entry in every_dir:
        if entry == '~':
            continue
        os.mkdir(os.path.expanduser(entry))

    # Pass 2: fill every directory with the same set of empty files, both
    # visible and dot-prefixed.
    prefixes = ('', '.')
    stems = ('test', 'zest', 'hello')
    digits = ('1', '2', '3', '4', '5')
    suffixes = ('.txt', '.jpg', '.pdf', '.todo', '.otl')
    file_names = [''.join(parts) for parts in
                  itertools.product(prefixes, stems, digits, suffixes)]
    for entry in every_dir:
        folder = os.path.expanduser(entry)
        for file_name in file_names:
            open(os.path.join(folder, file_name), 'w').close()

    def remove_tree():
        shutil.rmtree('/tmp/test')

    request.addfinalizer(remove_tree)

    home_scope = s.Scope('home',
                         'no scope filtera applied')
    simple_scope = s.Scope('simple',
                           'simple scope',
                           ['~/simple1',
                            '~/simple2',
                            '~/simple3'])
    recursive_scope = s.Scope('recursive',
                              'recursive scope',
                              ['~/r*c/**', '~/réc/sub11/sub111'],
                              ['~/r*c/**/*1'])
    return [home_scope, simple_scope, recursive_scope]
class Test_Scope:
    """Checks on the directory lists produced by the scopes under test."""

    def test_recursive_paths(self, scopes):
        """The third scope yields exactly the expected directories."""
        # NOTE(review): this is an exact string comparison.  Paths reported
        # by the filesystem may spell 'é' as 'e' + combining accent while the
        # literals below use whatever form this source file stores, so
        # equal-looking paths can compare unequal.
        found = sorted(scopes[2].get_dir())
        wanted = sorted(os.path.expanduser(item) for item in
                        ('~/réc/sub12',
                         '~/réc/sub11/sub111',
                         '~/réc/sub11/sub112',
                         '~/réc/sub12/sub 122'))
        assert found == wanted
Scope实例的定义是:scope.py
class Scope(object):
    """A named selection of directories.

    The resulting directories are used to narrow the scope of searching,
    creating, chdir or any other administrative task.

    ``incl_paths`` and ``excl_paths`` take strings representing absolute
    paths or globs of absolute paths.  If the user enters a relative path,
    then an error occurs.

    User can:
      - use conventions from the glob2 library for globbing.  To
        unequivocally identify a glob, the glob string must have magic
        characters "*[]?"
        eg: "~/D?[wo]*"
        will find Downloads, Dropbox but not Documents in the user home
        directory
      - use the "~" shortcut
      - use bash variables if they were defined as environment variables in
        the .bashrc or .bash_profile files
    """

    def __init__(self, name, comment='',
                 incl_paths=None,
                 excl_paths=None):
        """Store the scope's name, comment and include/exclude path lists.

        ``incl_paths`` and ``excl_paths`` default to a fresh empty list per
        instance.  A list passed explicitly is stored as-is (not copied).
        """
        self.name = name
        self.comment = comment
        # A literal [] default would be one list shared by every Scope that
        # omits the argument; create a new list for each instance instead.
        self.incl_paths = [] if incl_paths is None else incl_paths
        self.excl_paths = [] if excl_paths is None else excl_paths
        self.dirty = False

    #...Missing details that do not harm the comprehension of this question...#

    def get_dir(self):
        """Return the list of directories selected by this scope.

        Included globs are expanded first and excluded globs are subtracted
        from them.  Explicit included paths are then added and explicit
        excluded paths removed.  The order of the returned list is
        unspecified because it is built from a set.
        """
        globber = DirGlobber()

        def expand(item):
            # Resolve environment variables first, then the '~' shortcut.
            return os.path.expanduser(os.path.expandvars(item))

        def partition(items):
            # Split entries into (explicit paths, glob patterns).  The magic
            # check is made on the raw entry, before expansion.
            explicit, patterns = [], []
            for item in items:
                bucket = patterns if has_magic(item) else explicit
                bucket.append(expand(item))
            return explicit, patterns

        def glob_all(patterns):
            matched = set()
            for pattern in patterns:
                matched.update(globber.glob(pattern))
            return matched

        inpaths, inglob = partition(self.incl_paths)
        outpaths, outglob = partition(self.excl_paths)

        # NOTE(review): glob results carry the filesystem's spelling of a
        # name, while explicit paths keep the caller's spelling; confirm
        # whether Unicode-equivalent paths should be normalised before these
        # set operations.
        # precedence of outglob over inglob
        selected = glob_all(inglob) - glob_all(outglob)
        # add specific paths and remove specific paths
        selected |= set(inpaths)
        selected -= set(outpaths)
        return list(selected)
它们都是UTF-8,只是表示字符的两种方式。
>>> import unicodedata
>>> unicodedata.name(b'\xcc\x81'.decode('utf8'))
'COMBINING ACUTE ACCENT'
>>> unicodedata.name(b'\xc3\xa9'.decode('utf8'))
'LATIN SMALL LETTER E WITH ACUTE'
>>> print(b'\xc3\xa9'.decode('utf8'))
é
>>> print(b'\xcc\x81'.decode('utf8'))
́
>>> print(b'e\xcc\x81'.decode('utf8'))
é
所以当 OS X 写入文件/目录名时,它写的是 "e" + "combining acute accent"(组合用尖音符),而你期望的是一个预组合的单字符“é”。
要解决此问题,您需要比较经过规范化的 unicode 字符串,而不是字节字符串(甚至也不是仅仅解码后的 unicode 字符串)。Python 标准库中的 unicodedata.normalize 函数可以做到这一点:
>>> s1 = unicodedata.normalize('NFC', b're\xcc\x81c'.decode('utf8'))
>>> s2 = unicodedata.normalize('NFC', b'r\xc3\xa9c'.decode('utf8'))
>>> print(s1, s2)
réc réc
>>> s1 == s2
True
我正在编写一个小实用程序,在我的 Mac OsX Yosemite 上,它使用 glob2 对我的文件系统进行 glob,并使用 py.test 测试我的代码。
我的系统区域设置是en_gb,因为这是我通常所说的,但是,我也有很多文件和文件夹的名称是法语和日语。
现在每当我通过 glob2 得到一个 "French" 字符串时,比如“/tmp/test/réc”,e 尖音符的编码是 \xcc\x81c。
但是我将我的 python 文件的编码声明为 utf-8,这使我源码中的 é(e acute)被编码为 \xc3\xa9c。显然,我的测试因此出了问题,因为两边尖音符的字节序列不一致。
如何找出我的系统在编码这个尖音符时使用的是哪种编码?除了使用类似 chardet 的库,还有其他选择吗?
谢谢
附录 失败的测试是:scope_test.py
# -*- coding: utf-8 -*-
import pytest as p
import os
import itertools
import shutil
from os import environ
# Override HOME before the library under test is imported, so that the '~'
# paths used by the tests expand under /tmp/test (os.path.expanduser consults
# HOME on POSIX systems).
environ['HOME']= '/tmp/test'
import scope as s #This is the library I am testing
@p.fixture(scope='module')
def scopes(request):
    """Create a scratch directory tree with test files and return the scopes.

    The tree is rooted at the expansion of ``~``.  A finalizer registered on
    ``request`` removes ``/tmp/test`` once the fixture is torn down.

    Returns a list of three ``Scope`` objects: 'home', 'simple' and
    'recursive', in that order.
    """
    layout = {
        'home': ['~'],
        'simple': ['~/simple1',
                   '~/simple2',
                   '~/simple3'],
        'recursive': ['~/réc',
                      '~/réc/sub11',
                      '~/réc/sub12',
                      '~/réc/sub11/sub111',
                      '~/réc/sub11/sub112',
                      '~/réc/sub12/sub 121',
                      '~/réc/sub12/sub 122'],
    }
    # Flatten once; both passes below visit the same directories.
    every_dir = [entry for group in layout.values() for entry in group]

    # Pass 1: the directories themselves.  '~' is created up front so that
    # its children can be made whatever order the groups are visited in.
    os.mkdir(os.path.expanduser('~'))
    for entry in every_dir:
        if entry == '~':
            continue
        os.mkdir(os.path.expanduser(entry))

    # Pass 2: fill every directory with the same set of empty files, both
    # visible and dot-prefixed.
    prefixes = ('', '.')
    stems = ('test', 'zest', 'hello')
    digits = ('1', '2', '3', '4', '5')
    suffixes = ('.txt', '.jpg', '.pdf', '.todo', '.otl')
    file_names = [''.join(parts) for parts in
                  itertools.product(prefixes, stems, digits, suffixes)]
    for entry in every_dir:
        folder = os.path.expanduser(entry)
        for file_name in file_names:
            open(os.path.join(folder, file_name), 'w').close()

    def remove_tree():
        shutil.rmtree('/tmp/test')

    request.addfinalizer(remove_tree)

    home_scope = s.Scope('home',
                         'no scope filtera applied')
    simple_scope = s.Scope('simple',
                           'simple scope',
                           ['~/simple1',
                            '~/simple2',
                            '~/simple3'])
    recursive_scope = s.Scope('recursive',
                              'recursive scope',
                              ['~/r*c/**', '~/réc/sub11/sub111'],
                              ['~/r*c/**/*1'])
    return [home_scope, simple_scope, recursive_scope]
class Test_Scope:
    """Checks on the directory lists produced by the scopes under test."""

    def test_recursive_paths(self, scopes):
        """The third scope yields exactly the expected directories."""
        # NOTE(review): this is an exact string comparison.  Paths reported
        # by the filesystem may spell 'é' as 'e' + combining accent while the
        # literals below use whatever form this source file stores, so
        # equal-looking paths can compare unequal.
        found = sorted(scopes[2].get_dir())
        wanted = sorted(os.path.expanduser(item) for item in
                        ('~/réc/sub12',
                         '~/réc/sub11/sub111',
                         '~/réc/sub11/sub112',
                         '~/réc/sub12/sub 122'))
        assert found == wanted
Scope实例的定义是:scope.py
class Scope(object):
    """A named selection of directories.

    The resulting directories are used to narrow the scope of searching,
    creating, chdir or any other administrative task.

    ``incl_paths`` and ``excl_paths`` take strings representing absolute
    paths or globs of absolute paths.  If the user enters a relative path,
    then an error occurs.

    User can:
      - use conventions from the glob2 library for globbing.  To
        unequivocally identify a glob, the glob string must have magic
        characters "*[]?"
        eg: "~/D?[wo]*"
        will find Downloads, Dropbox but not Documents in the user home
        directory
      - use the "~" shortcut
      - use bash variables if they were defined as environment variables in
        the .bashrc or .bash_profile files
    """

    def __init__(self, name, comment='',
                 incl_paths=None,
                 excl_paths=None):
        """Store the scope's name, comment and include/exclude path lists.

        ``incl_paths`` and ``excl_paths`` default to a fresh empty list per
        instance.  A list passed explicitly is stored as-is (not copied).
        """
        self.name = name
        self.comment = comment
        # A literal [] default would be one list shared by every Scope that
        # omits the argument; create a new list for each instance instead.
        self.incl_paths = [] if incl_paths is None else incl_paths
        self.excl_paths = [] if excl_paths is None else excl_paths
        self.dirty = False

    #...Missing details that do not harm the comprehension of this question...#

    def get_dir(self):
        """Return the list of directories selected by this scope.

        Included globs are expanded first and excluded globs are subtracted
        from them.  Explicit included paths are then added and explicit
        excluded paths removed.  The order of the returned list is
        unspecified because it is built from a set.
        """
        globber = DirGlobber()

        def expand(item):
            # Resolve environment variables first, then the '~' shortcut.
            return os.path.expanduser(os.path.expandvars(item))

        def partition(items):
            # Split entries into (explicit paths, glob patterns).  The magic
            # check is made on the raw entry, before expansion.
            explicit, patterns = [], []
            for item in items:
                bucket = patterns if has_magic(item) else explicit
                bucket.append(expand(item))
            return explicit, patterns

        def glob_all(patterns):
            matched = set()
            for pattern in patterns:
                matched.update(globber.glob(pattern))
            return matched

        inpaths, inglob = partition(self.incl_paths)
        outpaths, outglob = partition(self.excl_paths)

        # NOTE(review): glob results carry the filesystem's spelling of a
        # name, while explicit paths keep the caller's spelling; confirm
        # whether Unicode-equivalent paths should be normalised before these
        # set operations.
        # precedence of outglob over inglob
        selected = glob_all(inglob) - glob_all(outglob)
        # add specific paths and remove specific paths
        selected |= set(inpaths)
        selected -= set(outpaths)
        return list(selected)
它们都是UTF-8,只是表示字符的两种方式。
>>> import unicodedata
>>> unicodedata.name(b'\xcc\x81'.decode('utf8'))
'COMBINING ACUTE ACCENT'
>>> unicodedata.name(b'\xc3\xa9'.decode('utf8'))
'LATIN SMALL LETTER E WITH ACUTE'
>>> print(b'\xc3\xa9'.decode('utf8'))
é
>>> print(b'\xcc\x81'.decode('utf8'))
́
>>> print(b'e\xcc\x81'.decode('utf8'))
é
所以当 OS X 写入文件/目录名时,它写的是 "e" + "combining acute accent"(组合用尖音符),而你期望的是一个预组合的单字符“é”。
要解决此问题,您需要比较经过规范化的 unicode 字符串,而不是字节字符串(甚至也不是仅仅解码后的 unicode 字符串)。Python 标准库中的 unicodedata.normalize 函数可以做到这一点:
>>> s1 = unicodedata.normalize('NFC', b're\xcc\x81c'.decode('utf8'))
>>> s2 = unicodedata.normalize('NFC', b'r\xc3\xa9c'.decode('utf8'))
>>> print(s1, s2)
réc réc
>>> s1 == s2
True