连接(join)两个 modin.pandas.DataFrame

Join two modin.pandas.DataFrame(s)

我尝试过对两个 modin.pandas DataFrame 进行 join/merge/concat,但都失败了。有没有人成功执行过此操作?Modin(modin-project)是面向大数据的 pandas 实现。

源码在这里: https://github.com/modin-project/modin/blob/master/modin/pandas/dataframe.py 和 https://github.com/modin-project/modin/blob/master/modin/pandas/concat.py

示例:

import modin.pandas as pd

vals = pd.DataFrame([1,2,3,4], index=['2018-01-01','2018-01-02','2018-01-03','2018-01-04'], columns=['Col1']); # print(vals) # 
table = pd.DataFrame([5,6,7,8], index=['2018-01-01','2018-01-02','2018-01-03','2018-01-04'], columns=['Col2']); # print(table)

第一次尝试:modin.pandas.DataFrame.join

result = table.join(other=vals, on=None, how='outer', sort=False); # Modin: Doesn't work

print(result)

对应的错误信息:

Suppressing duplicate error message.
Suppressing duplicate error message.
Suppressing duplicate error message.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    393                             if callable(meth):
    394                                 return meth(obj, self, cycle)
--> 395             return _default_pprint(obj, self, cycle)
    396         finally:
    397             self.end_group()

~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _default_pprint(obj, p, cycle)
    508     if _safe_getattr(klass, '__repr__', None) is not object.__repr__:
    509         # A user-provided repr. Find newlines and replace them with p.break_()
--> 510         _repr_pprint(obj, p, cycle)
    511         return
    512     p.begin_group(1, '<')

~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    699     """A pprint that just redirects to the normal repr function."""
    700     # Find newlines and replace them with p.break_()
--> 701     output = repr(obj)
    702     for idx,output_line in enumerate(output.splitlines()):
    703         if idx:

~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in __repr__(self)
    454         if len(self._row_metadata) <= 60 and \
    455            len(self._col_metadata) <= 20:
--> 456             return repr(self._repr_pandas_builder())
    457         # The split here is so that we don't repr pandas row lengths.
    458         result = self._repr_pandas_builder()

~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_pandas_builder(self)
    382         # If we don't exceed the maximum number of values on either dimension
    383         if len(self.index) <= 60 and len(self.columns) <= 20:
--> 384             return to_pandas(self)
    385 
    386         if len(self.index) >= 60:

~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in to_pandas(df)
    259     """
    260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
--> 261     pandas_df.index = df.index
    262     pandas_df.columns = df.columns
    263     return pandas_df

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
   3625         try:
   3626             object.__getattribute__(self, name)
-> 3627             return object.__setattr__(self, name, value)
   3628         except AttributeError:
   3629             pass

pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
    557 
    558     def _set_axis(self, axis, labels):
--> 559         self._data.set_axis(axis, labels)
    560         self._clear_item_cache()
    561 

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
   3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
   3073                              'new values have %d elements' %
-> 3074                              (old_len, new_len))
   3075 
   3076         self.axes[axis] = new_labels

ValueError: Length mismatch: Expected axis has 8 elements, new values have 4 elements

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
    343             method = get_real_method(obj, self.print_method)
    344             if method is not None:
--> 345                 return method()
    346             return None
    347         else:

~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_html_(self)
    473         if len(self._row_metadata) <= 60 and \
    474            len(self._col_metadata) <= 20:
--> 475             return self._repr_pandas_builder()._repr_html_()
    476         # We split so that we insert our correct dataframe dimensions.
    477         result = self._repr_pandas_builder()._repr_html_()

~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_pandas_builder(self)
    382         # If we don't exceed the maximum number of values on either dimension
    383         if len(self.index) <= 60 and len(self.columns) <= 20:
--> 384             return to_pandas(self)
    385 
    386         if len(self.index) >= 60:

~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in to_pandas(df)
    259     """
    260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
--> 261     pandas_df.index = df.index
    262     pandas_df.columns = df.columns
    263     return pandas_df

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
   3625         try:
   3626             object.__getattribute__(self, name)
-> 3627             return object.__setattr__(self, name, value)
   3628         except AttributeError:
   3629             pass

pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
    557 
    558     def _set_axis(self, axis, labels):
--> 559         self._data.set_axis(axis, labels)
    560         self._clear_item_cache()
    561 

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
   3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
   3073                              'new values have %d elements' %
-> 3074                              (old_len, new_len))
   3075 
   3076         self.axes[axis] = new_labels

ValueError: Length mismatch: Expected axis has 8 elements, new values have 4 elements

第二次尝试:modin.pandas.concat

result = pd.concat([table, vals], axis=1,); # Modin: Doesn't work
print(result)

对应的错误信息:

> Suppressing duplicate error message. Suppressing duplicate error
> message. Suppressing duplicate error message.
> 
> --------------------------------------------------------------------------- ValueError                                Traceback (most recent call
> last) <ipython-input-3-4bf001fd75fb> in <module>()
>       2 result = pd.concat([table, vals], axis=1,); # Modin: Doesn't work
>       3 
> ----> 4 print(result)
> 
> ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
> __str__(self)
>     229 
>     230     def __str__(self):
> --> 231         return repr(self)
>     232 
>     233     def _repr_pandas_builder(self):
> 
> ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
> __repr__(self)
>     454         if len(self._row_metadata) <= 60 and \
>     455            len(self._col_metadata) <= 20:
> --> 456             return repr(self._repr_pandas_builder())
>     457         # The split here is so that we don't repr pandas row lengths.
>     458         result = self._repr_pandas_builder()
> 
> ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
> _repr_pandas_builder(self)
>     382         # If we don't exceed the maximum number of values on either dimension
>     383         if len(self.index) <= 60 and len(self.columns) <= 20:
> --> 384             return to_pandas(self)
>     385 
>     386         if len(self.index) >= 60:
> 
> ~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in
> to_pandas(df)
>     259     """
>     260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
> --> 261     pandas_df.index = df.index
>     262     pandas_df.columns = df.columns
>     263     return pandas_df
> 
> ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in
> __setattr__(self, name, value)    3625         try:    3626             object.__getattribute__(self, name)
> -> 3627             return object.__setattr__(self, name, value)    3628         except AttributeError:    3629             pass
> 
> pandas/_libs/properties.pyx in
> pandas._libs.properties.AxisProperty.__set__()
> 
> ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in
> _set_axis(self, axis, labels)
>     557 
>     558     def _set_axis(self, axis, labels):
> --> 559         self._data.set_axis(axis, labels)
>     560         self._clear_item_cache()
>     561 
> 
> ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in
> set_axis(self, axis, new_labels)    3072             raise
> ValueError('Length mismatch: Expected axis has %d elements, '    3073 
> 'new values have %d elements' %
> -> 3074                              (old_len, new_len))    3075     3076         self.axes[axis] = new_labels
> 
> ValueError: Length mismatch: Expected axis has 8 elements, new values
> have 4 elements

最近后端已被重写,此问题已得到解决。以前,Modin 很难处理极小的分区。该修复尚未正式发布。如果你现在就想尝试,请按以下方式从当前的 master 分支安装 Modin:

pip install git+https://github.com/modin-project/modin

以这种方式安装后,我在本地测试了你的代码(两个版本),都可以正常运行。