加载 yaml 文件时编码错误

Question

我在 python 2.7.13 中使用 ruamel.yaml 版本 0.15.74。由于外部给定的限制，我必须使用这些版本。

我的最终目标是读取 yaml 文件并选取其中的某些部分，将其保存到 pandas 数据框中，最后写入 csv 文件。为此，我有以下自定义类 'DoubleMergeKeyEnabler(object)'。

import pandas as pd
import ruamel.yaml
import json
import os

# ruamel.yaml instance used to load the document.
yaml = ruamel.yaml.YAML()
# Indentation settings for mappings and sequences (relevant when dumping).
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars; the loop further down tests for
# DoubleQuotedScalarString values.
yaml.preserve_quotes=True
# Very large line width, presumably so long scalars are not wrapped on dump.
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys (``<<: ``) for numbered keys and back again.

    ``convert`` replaces every ``<<: `` in a document string with a
    sequence-style key ``[<<, N]: `` that carries a running number N,
    presumably so that several merge keys in one mapping stay distinct.
    ``revert`` undoes the substitution.
    """

    def __init__(self):
        # Number used by the most recent substitution; -1 means none so far.
        self.pat_nr = -1
        # Merge key as it appears in the text. No leading space, since the key
        # could be at the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in; documents are probably not using sequences as keys.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with each merge key replaced by a numbered key.

        Keys are numbered in order of appearance, continuing from the
        instance's current counter.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            numbered_key = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, numbered_key, 1)

    def revert(self, doc):
        """Return *doc* with numbered keys turned back into merge keys.

        Counts down from the current counter to 0, replacing one numbered
        key per step, and leaves the counter at -1 once it has counted down.
        """
        for number in range(self.pat_nr, -1, -1):
            doc = doc.replace(self.r_pat.format(number), self.pat, 1)
        # A counter that was already below zero is left untouched.
        self.pat_nr = min(self.pat_nr, -1)
        return doc


# Converter instance; the code shown here never calls convert()/revert() on it.
dmke = DoubleMergeKeyEnabler()

我使用以下方法加载 yaml 文件：

# Empty frame with two text columns; rows are added by label later on.
df = pd.DataFrame(columns=['text1', 'text2'])

# Load the YAML document through the configured ruamel.yaml instance.
with open('test.yaml') as f:
    data = yaml.load(f)

然后我选取 yaml 文件的特定部分，并尝试定义一个 id 来跟踪它（该 id 将作为 pandas 数据框中条目的名称），再将其存储到 pandas 数据框中。

# Walk the entries stored under the top-level 'items' key.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value is a double-quoted scalar are handled.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # Row label: fixed prefix plus the representation text, spaces replaced by underscores.
        # NOTE(review): on Python 2, str() applied to a unicode value holding
        # non-ASCII characters (e.g. "ä") attempts an implicit ASCII encode;
        # presumably this is where the reported UnicodeEncodeError comes from.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The text goes into 'text1'; 'text2' starts out empty.
        df.loc[_id] = [_txt_to_trans, '']

这是 yaml 文件的给出方式。这个我也改不了。

groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |


items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"

我收到以下错误消息

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-18-1fa5952ce8cf> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7455hqj''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7455hqj''');exec(compile(__code, '''/home/nicolas/Desktop/test.py''', 'exec'));

/home/nicolas/Desktop/test.py in <module>()
     39 _item = data.get('items')
     40 for i in range(0, len(_item)):
---> 41     if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
     42         _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
     43         _txt_to_trans = _item[i].get('representation')

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 5: ordinal not in range(128)

In [19]:

我需要以某种方式进行解码，但这不起作用。我该如何解决这个问题？完整测试代码如下

import pandas as pd
import ruamel.yaml
import json
import os

# ruamel.yaml instance used to load the document.
yaml = ruamel.yaml.YAML()
# Indentation settings for mappings and sequences (relevant when dumping).
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars; the loop further down tests for
# DoubleQuotedScalarString values.
yaml.preserve_quotes=True
# Very large line width, presumably so long scalars are not wrapped on dump.
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys (``<<: ``) for numbered keys and back again.

    ``convert`` replaces every ``<<: `` in a document string with a
    sequence-style key ``[<<, N]: `` that carries a running number N,
    presumably so that several merge keys in one mapping stay distinct.
    ``revert`` undoes the substitution.
    """

    def __init__(self):
        # Number used by the most recent substitution; -1 means none so far.
        self.pat_nr = -1
        # Merge key as it appears in the text. No leading space, since the key
        # could be at the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in; documents are probably not using sequences as keys.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with each merge key replaced by a numbered key.

        Keys are numbered in order of appearance, continuing from the
        instance's current counter.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            numbered_key = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, numbered_key, 1)

    def revert(self, doc):
        """Return *doc* with numbered keys turned back into merge keys.

        Counts down from the current counter to 0, replacing one numbered
        key per step, and leaves the counter at -1 once it has counted down.
        """
        for number in range(self.pat_nr, -1, -1):
            doc = doc.replace(self.r_pat.format(number), self.pat, 1)
        # A counter that was already below zero is left untouched.
        self.pat_nr = min(self.pat_nr, -1)
        return doc


# Converter instance; the code shown here never calls convert()/revert() on it.
dmke = DoubleMergeKeyEnabler()

# Empty frame with two text columns; rows are added by label below.
df = pd.DataFrame(columns=['text1', 'text2'])

# Load the YAML document through the configured ruamel.yaml instance.
with open ('/home/nicolas/Desktop/test.yaml') as f:
    data = yaml.load(f)

# Walk the entries stored under the top-level 'items' key.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value is a double-quoted scalar are handled.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # Row label: fixed prefix plus the representation text, spaces replaced by underscores.
        # NOTE(review): on Python 2, str() applied to a unicode value holding
        # non-ASCII characters (e.g. "ä") attempts an implicit ASCII encode;
        # presumably this is where the reported UnicodeEncodeError comes from.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The text goes into 'text1'; 'text2' starts out empty.
        df.loc[_id] = [_txt_to_trans, '']

Answer 1

根据你的回溯，问题是

if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)

这一行检查的是一个普通字符串（str）而不是 Unicode 字符串是否在键中（`in`），因此 Python 会尝试把键转换成 ASCII 字符串，并在遇到变音符号（如 ä）时失败。您应该改为检查 Unicode 字符串是否在键中：

if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)

同时，在接下来的几行中也要避免使用 str() 对 Unicode 进行转换。

以下适用于 2.7：

# encoding: utf-8

import ruamel.yaml
import json
import os

# ruamel.yaml instance used to load the document.
yaml = ruamel.yaml.YAML()
# Indentation settings for mappings and sequences (relevant when dumping).
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars; the loop further down tests for
# DoubleQuotedScalarString values.
yaml.preserve_quotes=True
# Very large line width, presumably so long scalars are not wrapped on dump.
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys (``<<: ``) for numbered keys and back again.

    ``convert`` replaces every ``<<: `` in a document string with a
    sequence-style key ``[<<, N]: `` that carries a running number N,
    presumably so that several merge keys in one mapping stay distinct.
    ``revert`` undoes the substitution.
    """

    def __init__(self):
        # Number used by the most recent substitution; -1 means none so far.
        self.pat_nr = -1
        # Merge key as it appears in the text. No leading space, since the key
        # could be at the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in; documents are probably not using sequences as keys.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with each merge key replaced by a numbered key.

        Keys are numbered in order of appearance, continuing from the
        instance's current counter.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            numbered_key = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, numbered_key, 1)

    def revert(self, doc):
        """Return *doc* with numbered keys turned back into merge keys.

        Counts down from the current counter to 0, replacing one numbered
        key per step, and leaves the counter at -1 once it has counted down.
        """
        for number in range(self.pat_nr, -1, -1):
            doc = doc.replace(self.r_pat.format(number), self.pat, 1)
        # A counter that was already below zero is left untouched.
        self.pat_nr = min(self.pat_nr, -1)
        return doc


# Converter instance; the code shown here never calls convert()/revert() on it.
dmke = DoubleMergeKeyEnabler()

# The sample document is passed inline as a string instead of being read from a file.
data = yaml.load("""\
groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |


items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"
""")

# Same loop as in the question, but with unicode literals throughout and
# unicode() in place of str(), so no implicit ASCII conversion is requested.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value is a double-quoted scalar are handled.
    if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # Label: fixed prefix plus the representation text, spaces replaced by underscores.
        _id = u'test' + u'_' + u'items' + u'_' + unicode(_item[i].get(u'representation')).replace(u" ", u"_")
        _txt_to_trans = _item[i].get(u'representation')

因此，for 循环需要在几处做一些调整，以保持全程使用 Unicode。您需要自行把与 pandas 相关的代码重新加回去。

加载 yaml 文件时编码错误

wrong encoding in loading yaml file

python-2.7

ruamel.yaml