在 Jupyter Notebook 中通过 rpy2 的 R 单元格使用 pandas 数据帧时出错

Error when using pandas dataframe in R cell, in rpy2, Jupyter Notebook

我想在 Jupyter Notebook 中使用 ggplot2。但是,当我尝试创建一个 R 魔法单元格并向其中传入一个变量时,出现了错误。

代码如下(一段表示一个单元格):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2

%matplotlib inline
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython

%%R
library(ggplot2)

data = pd.read_csv('train_titanic.csv')

%%R -i data -w 900 -h 480 -u px

最后一个单元格出现以下错误(包括回溯):

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
     54         try:
---> 55             od[name] = conversion.py2rpy(values)
     56         except Exception as e:

~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
    839 
--> 840         return dispatch(args[0].__class__)(*args, **kw)
    841 

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasseries(obj)
    125             if type(x) is not homogeneous_type:
--> 126                 raise ValueError('Series can only be of one type, or None.')
    127         # TODO: Could this be merged with obj.type.name == 'O' case above ?

ValueError: Series can only be of one type, or None.

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
    367         try:
--> 368             mv = memoryview(obj)
    369             res = cls.from_memoryview(mv)

TypeError: memoryview: a bytes-like object is required, not 'Series'

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
<ipython-input-14-75e210679e4a> in <module>
----> 1 get_ipython().run_cell_magic('R', '-i data -w 900 -h 480 -u px', '\n\n')

~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
   2360             with self.builtin_trap:
   2361                 args = (magic_arg_s, cell)
-> 2362                 result = fn(*args, **kwargs)
   2363             return result
   2364 

</home/morgan/anaconda3/envs/catenv/lib/python3.7/site-packages/decorator.py:decorator-gen-130> in R(self, line, cell, local_ns)

~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    185     # but it's overkill for just that one bit of state.
    186     def magic_deco(arg):
--> 187         call = lambda f, *a, **k: f(*a, **k)
    188 
    189         if callable(arg):

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/ipython/rmagic.py in R(self, line, cell, local_ns)
    721                         raise NameError("name '%s' is not defined" % input)
    722                 with localconverter(converter) as cv:
--> 723                     ro.r.assign(input, val)
    724 
    725         tmpd = self.setup_graphics(args)

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
    190                 kwargs[r_k] = v
    191         return (super(SignatureTranslatedFunction, self)
--> 192                 .__call__(*args, **kwargs))
    193 
    194 

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
    111 
    112     def __call__(self, *args, **kwargs):
--> 113         new_args = [conversion.py2rpy(a) for a in args]
    114         new_kwargs = {}
    115         for k, v in kwargs.items():

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in <listcomp>(.0)
    111 
    112     def __call__(self, *args, **kwargs):
--> 113         new_args = [conversion.py2rpy(a) for a in args]
    114         new_kwargs = {}
    115         for k, v in kwargs.items():

~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
    838                             '1 positional argument')
    839 
--> 840         return dispatch(args[0].__class__)(*args, **kw)
    841 
    842     funcname = getattr(func, '__name__', 'singledispatch function')

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
     59                           'The error is: %s'
     60                           % (name, str(e)))
---> 61             od[name] = StrVector(values)
     62 
     63     return DataFrame(od)

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/vectors.py in __init__(self, obj)
    382 
    383     def __init__(self, obj):
--> 384         super().__init__(obj)
    385         self._add_rops()
    386 

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in __init__(self, obj)
    286             super().__init__(obj)
    287         elif isinstance(obj, collections.abc.Sized):
--> 288             super().__init__(type(self).from_object(obj).__sexp__)
    289         else:
    290             raise TypeError('The constructor must be called '

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
    370         except (TypeError, ValueError):
    371             try:
--> 372                 res = cls.from_iterable(obj)
    373             except ValueError:
    374                 msg = ('The class methods from_memoryview() and '

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _(*args, **kwargs)
     26 def _cdata_res_to_rinterface(function):
     27     def _(*args, **kwargs):
---> 28         cdata = function(*args, **kwargs)
     29         # TODO: test cdata is of the expected CType
     30         return _cdata_to_rinterface(cdata)

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_iterable(cls, iterable, populate_func)
    317             if populate_func is None:
    318                 cls._populate_r_vector(iterable,
--> 319                                        r_vector)
    320             else:
    321                 populate_func(iterable, r_vector)

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(cls, iterable, r_vector)
    300                                   r_vector,
    301                                   cls._R_SET_VECTOR_ELT,
--> 302                                   cls._CAST_IN)
    303 
    304     @classmethod

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(iterable, r_vector, set_elt, cast_value)
    237 def _populate_r_vector(iterable, r_vector, set_elt, cast_value):
    238     for i, v in enumerate(iterable):
--> 239         set_elt(r_vector, i, cast_value(v))
    240 
    241 

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _as_charsxp_cdata(x)
    430         return x.__sexp__._cdata
    431     else:
--> 432         return conversion._str_to_charsxp(x)
    433 
    434 

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_charsxp(val)
    118         s = rlib.R_NaString
    119     else:
--> 120         cchar = _str_to_cchar(val)
    121         s = rlib.Rf_mkCharCE(cchar, _CE_UTF8)
    122     return s

~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_cchar(s, encoding)
     97 def _str_to_cchar(s, encoding: str = 'utf-8'):
     98     # TODO: use isStrinb and installTrChar
---> 99     b = s.encode(encoding)
    100     return ffi.new('char[]', b)
    101 

AttributeError: 'float' object has no attribute 'encode'

所以我发现,只要传入我的 pandas 数据帧对象,R 魔法单元格甚至无法启动。但是,我尝试在单元格内直接创建 R 向量,发现可以使用 ggplot2 绘制这些向量,没有任何问题。

我正在使用 Python 3.7.6、rpy2 3.1.0、jupyter-notebook 6.0.3,并在适用于 Linux 的 Windows 子系统(WSL)上运行 Ubuntu 18.04.2 LTS。

问题很可能是一个(或多个)列包含不止一种类型的数据,因此无法将数据转换为 R 向量(R 向量只能包含一种数据类型)。回溯信息很长,可能让人无从下手,但相关的部分在这里:

ValueError: Series can only be of one type, or None.

是哪一列?如果不查看您加载的数据集很难说,但我的一般解决方案是检查各列中的类型:

types = data.applymap(type).apply(set)
types[types.apply(len) > 1]

以上代码段返回的任何内容都可能是罪魁祸首。有许多不同的方法来处理这个问题,这取决于数据的确切性质。我经常使用的解决方法包括:

  • 调用 data = data.infer_objects() - 适用于 pandas 没有跟上 dtype 的变化、仍然把数据存储为(次优的)Python 对象的情况
  • 如果字符串列中有缺失值(例如 str_columns = str_columns.fillna('')),则用空字符串或字符串常量填充 NaN
  • 如果列中是 datetime 对象但 dtype 却是 object,则使用 dates.apply(pd.to_datetime, axis=1)
  • 如果您混合使用了 date 和 datetime 对象,则使用 df.applymap(lambda x: datetime.combine(x, datetime.min.time()) if not isinstance(x, datetime) else x)

在极少数情况下 pandas 存储数据的方式与 rpy2 预期的不同(经过某些操作);然后将数据帧写入 csv 文件并再次从磁盘读取它会有所帮助 - 但这可能不是您在此处面临的问题,因为您是从新读取的数据帧开始的。

我刚刚注意到这个问题可能有一个更简单的原因。出于某种原因,pandas2ri 要求您在导入后调用 pandas2ri.activate()。这解决了我的问题。