使用 Pandas 从 HDFStore 读取特殊字符时出现 UnicodeDecodeError

UnicodeDecodeError when reading special characters from HDFStore with Pandas

我需要在 HDFStore 中存储大量消息,其中一些包含表情符号或特殊字符,如 éěščřžýáí。存储时一切似乎都正常,但在尝试加载数据时会因以下错误而崩溃。下面是触发该错误的示例代码:

import pandas as pd

# Minimal reproduction: build a one-row DataFrame whose only value is a
# non-ASCII character, write it to an HDFStore table, then read it back.
df = pd.DataFrame(columns=["A"])
toAppend = {"A": "é"}
df = df.append(toAppend, ignore_index = True)
# Cast column "A" to str before it is written to the store.
df['A'] = df['A'].astype(str)

store = pd.HDFStore(r'thiswillcrash.h5')
# Write in table format, passing an explicit UTF-8 encoding.
store.put('df', df, format='table', encoding="utf-8")
# Reading the frame back is the statement that raises UnicodeDecodeError
# (see the traceback below).
d = store["df"]
print(d)

# Not reached when the read above raises, so the store is not closed here.
store.close()

错误信息如下:

---------------------------------------------------------------------------
    UnicodeDecodeError                        Traceback (most recent call last)
    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _unconvert_string_array(data, nan_rep, encoding)
       4407                 dtype = "S{0}".format(itemsize)
    -> 4408             data = data.astype(dtype, copy=False).astype(object, copy=False)
       4409         except (Exception) as e:

    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)

    During handling of the above exception, another exception occurred:

    UnicodeDecodeError                        Traceback (most recent call last)
    <ipython-input-8-f2a5372d5498> in <module>()
          8 store = pd.HDFStore(r'iwillcrash18.h5')
          9 store.put('df', df, format='table', encoding="utf-8")
    ---> 10 d = store["df"]
         11 print(d)
         12 

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in __getitem__(self, key)
        416 
        417     def __getitem__(self, key):
    --> 418         return self.get(key)
        419 
        420     def __setitem__(self, key, value):

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in get(self, key)
        626         if group is None:
        627             raise KeyError('No object named %s in the file' % key)
    --> 628         return self._read_group(group)
        629 
        630     def select(self, key, where=None, start=None, stop=None, columns=None,

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _read_group(self, group, **kwargs)
       1274         s = self._create_storer(group)
       1275         s.infer_axes()
    -> 1276         return s.read(**kwargs)
       1277 
       1278 

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in read(self, where, columns, **kwargs)
       4006     def read(self, where=None, columns=None, **kwargs):
       4007 
    -> 4008         if not self.read_axes(where=where, **kwargs):
       4009             return None
       4010 

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in read_axes(self, where, **kwargs)
       3218         for a in self.axes:
       3219             a.set_info(self.info)
    -> 3220             a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding)
       3221 
       3222         return True

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in convert(self, values, nan_rep, encoding)
       2071         if _ensure_decoded(self.kind) == u('string'):
       2072             self.data = _unconvert_string_array(
    -> 2073                 self.data, nan_rep=nan_rep, encoding=encoding)
       2074 
       2075         return self

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _unconvert_string_array(data, nan_rep, encoding)
       4409         except (Exception) as e:
       4410             f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
    -> 4411             data = f(data)
       4412 
       4413     if nan_rep is None:

    C:\Users\Filip\Anaconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
       1698             vargs.extend([kwargs[_n] for _n in names])
       1699 
    -> 1700         return self._vectorize_call(func=func, args=vargs)
       1701 
       1702     def _get_ufunc_and_otypes(self, func, args):

    C:\Users\Filip\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
       1767                       for _a in args]
       1768 
    -> 1769             outputs = ufunc(*inputs)
       1770 
       1771             if ufunc.nout == 1:

    C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in <lambda>(x)
       4408             data = data.astype(dtype, copy=False).astype(object, copy=False)
       4409         except (Exception) as e:
    -> 4410             f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
       4411             data = f(data)
       4412 

    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data

我使用的是 Pandas 0.16.2 和 PyTables 3.2.2。

这是 Pandas 的一个 bug,现在应该已经修复;更多详细信息请参阅此链接。