当我的系统编码与我的文件编码不匹配时,如何找到我的系统编码?

How do I find my system encoding when it does not match my file encoding?

我正在编写一个小实用程序,在我的 Mac OsX Yosemite 上,它使用 glob2 对我的文件系统进行 glob,并使用 py.test 测试我的代码。

我的系统区域设置是en_gb,因为这是我通常所说的,但是,我也有很多文件和文件夹的名称是法语和日语。

现在每当我通过 glob2 得到一个 "French" 字符串时,比如“/tmp/test/réc”,e 尖音符的编码是 \xcc\x81c。

但是我将我的 Python 文件的编码声明为 utf-8,这使我的 e 尖音符变为 \xc3\xa9c。很明显,我的测试会出问题,因为两种尖音符的字节表示不一致,无法匹配。

如何找到我的系统用来编码尖音符的编码?除了使用类似 chardet 的库,还有其他选择吗?

谢谢

附录 失败的测试是:scope_test.py

# -*- coding: utf-8 -*-

import pytest as p
import os
import itertools
import shutil
from os import environ
# Point HOME at a scratch directory before `scope` is imported, so that the
# '~' paths used throughout this module expand under /tmp/test.
environ['HOME']= '/tmp/test'

import scope as s  #This is the library I am testing

@p.fixture(scope='module')
def scopes(request):
    """Build a scratch directory tree full of dummy files and return
    the list of Scope objects the tests run against.

    The tree is removed again by a finalizer once the module's tests
    have finished.
    """
    tree = dict(home=['~'],
                simple=['~/simple1',
                        '~/simple2',
                        '~/simple3'],
                recursive=['~/réc',
                           '~/réc/sub11',
                           '~/réc/sub12',
                           '~/réc/sub11/sub111',
                           '~/réc/sub11/sub112',
                           '~/réc/sub12/sub 121',
                           '~/réc/sub12/sub 122'])

    # The home directory itself has to exist before anything below it.
    os.mkdir(os.path.expanduser('~'))
    for group in tree.values():
        for directory in group:
            if directory == '~':
                continue
            os.mkdir(os.path.expanduser(directory))

    # Every directory (home included) receives the same set of empty files:
    # each combination of optional leading dot, base name, digit and extension.
    file_names = [''.join(parts) for parts in itertools.product(
        ('', '.'),
        ('test', 'zest', 'hello'),
        ('1', '2', '3', '4', '5'),
        ('.txt', '.jpg', '.pdf', '.todo', '.otl'))]
    for group in tree.values():
        for directory in group:
            target = os.path.expanduser(directory)
            for file_name in file_names:
                open(os.path.join(target, file_name), 'w').close()

    def remove_tree():
        shutil.rmtree('/tmp/test')
    request.addfinalizer(remove_tree)

    home_scope = s.Scope('home',
                         'no scope filtera applied')
    simple_scope = s.Scope('simple',
                           'simple scope',
                           ['~/simple1',
                            '~/simple2',
                            '~/simple3'])
    recursive_scope = s.Scope('recursive',
                              'recursive scope',
                              ['~/r*c/**', '~/réc/sub11/sub111'],
                              ['~/r*c/**/*1'])
    return [home_scope, simple_scope, recursive_scope]


class Test_Scope:
    """Checks on the directories a Scope resolves to."""

    @staticmethod
    def _nfc(path):
        """Return ``path`` as NFC-normalised text.

        The filesystem may hand back "e" + COMBINING ACUTE ACCENT while the
        literals in this file use the single precomposed character; both
        are valid UTF-8 for the same visible text, so they are brought to
        one canonical form before being compared.
        """
        import unicodedata
        if isinstance(path, bytes):
            # Byte strings must be decoded before they can be normalised.
            path = path.decode('utf-8')
        return unicodedata.normalize('NFC', path)

    def test_recursive_paths(self, scopes):
        """The recursive scope keeps globbed and explicitly included
        directories and drops those matched by the exclusion glob."""
        expected = [os.path.expanduser(item) for item in
                    ['~/réc/sub12',
                     '~/réc/sub11/sub111',
                     '~/réc/sub11/sub112',
                     '~/réc/sub12/sub 122']]
        found = scopes[2].get_dir()
        # Compare normalised text rather than raw strings, otherwise the
        # accented directory name never matches on OS X.
        assert sorted(self._nfc(path) for path in found) == \
                sorted(self._nfc(path) for path in expected)

Scope实例的定义是:scope.py

class Scope(object):
    """a scope object produces a list of directories.
    These directories are used to narrow the scope of searching, creating, chdir
    or any other administrative task

    incl_paths and excl_paths take strings representing absolute paths or globs
    of absolute paths. If the user enters a relative path, then an error occurs.
    User can:
        - use conventions from glob2 library for globbing. To unequivocally
        identify a glob, the glob string must have magic characters "*[]?"
        eg: "~/D?[wo]*" 
        will find Downloads, Dropbox but not Documents in user home directory
        - use "~" shortcut
        - use bash variables if they were defined as environment variables in
        the .bashrc or .bash_profile files

    """

    def __init__(self, name,comment='', 
                 incl_paths=None,
                 excl_paths=None):
        """Store the scope's name, comment and include/exclude path lists.

        ``incl_paths`` and ``excl_paths`` default to a fresh empty list per
        instance. A list passed in by the caller is stored as-is (not
        copied).
        """
        self.name = name
        self.comment = comment
        # None is the sentinel for "not given": a literal [] default would be
        # created once and shared by every Scope built without these arguments.
        self.incl_paths = incl_paths if incl_paths is not None else []
        self.excl_paths = excl_paths if excl_paths is not None else []
        self.dirty = False


    #...Missing details that do not harm the comprehension of this question...#


    def get_dir(self):
        """Return the directories selected by this scope as a list.

        Entries of ``incl_paths`` / ``excl_paths`` for which ``has_magic``
        is true are expanded through ``DirGlobber.glob``; the others are
        used as literal paths. Every entry first has environment variables
        and "~" expanded. Excluded globs are subtracted from included
        globs, then literal includes are added and literal excludes are
        removed. The result comes from a set, so its order is unspecified.
        """
        globber = DirGlobber()

        def expand(path):
            # Environment variables first, then "~", as two separate steps.
            return os.path.expanduser(os.path.expandvars(path))

        def partition(paths):
            # Split into (literal paths, glob patterns); the magic check is
            # done on the entry as written, before expansion.
            literals, patterns = [], []
            for item in paths:
                if has_magic(item):
                    patterns.append(expand(item))
                else:
                    literals.append(expand(item))
            return literals, patterns

        def glob_all(patterns):
            matches = []
            for pattern in patterns:
                matches.extend(globber.glob(pattern))
            return set(matches)

        inpaths, inglob = partition(self.incl_paths)
        outpaths, outglob = partition(self.excl_paths)

        inset = glob_all(inglob)
        outset = glob_all(outglob)
        #precedence of outglob over inglob
        inset = inset - outset
        #add specific paths and remove specific paths
        inset = inset | set(inpaths)
        inset = inset - set(outpaths)
        return list(inset)

它们都是 UTF-8,只是同一个字符的两种不同表示方式(即不同的 Unicode 规范化形式)。

>>> import unicodedata
>>> unicodedata.name(b'\xcc\x81'.decode('utf8'))
'COMBINING ACUTE ACCENT'
>>> unicodedata.name(b'\xc3\xa9'.decode('utf8'))
'LATIN SMALL LETTER E WITH ACUTE'

>>> print(b'\xc3\xa9'.decode('utf8'))
é
>>> print(b'\xcc\x81'.decode('utf8'))
 ́
>>> print(b'e\xcc\x81'.decode('utf8'))
é

所以当 OS X 写入文件或目录名时,它写的是 "e" + "combining acute accent"(组合用尖音符),而你期望的是单个预组合字符“é”。

要解决此问题,您需要比较规范化的 unicode 字符串而不是字节字符串(甚至是解码的 unicode 字符串)。 python标准库中的unicodedata.normalize函数可以做到这一点:

>>> s1 = unicodedata.normalize('NFC', b're\xcc\x81c'.decode('utf8'))
>>> s2 = unicodedata.normalize('NFC', b'r\xc3\xa9c'.decode('utf8'))
>>> print(s1, s2)
réc réc
>>> s1 == s2
True