为 pandas DataFrame 和原生 Python dict 创建一个 Mixin class
Creating a Mixin class for pandas DataFrame and native Python dict
如何为 pandas DataFrame 和本机 Python dict 创建混合 class 以便可以像嵌套 dict 一样访问数据框列?
按照 pandas 的常规用法,df.loc() 函数是访问所需行/列/切片(row/column/slices)的方法。
但目标是使用与本机 Python 字典相同的语法访问二维数据帧。例如
>>> import pandas as pd
>>> df = pd.DataFrame([['x', 1,2,3,4,5], ['y', 6,7,8,9,10], ['z', 11,12,13,14,15]])
>>> df.columns = ['index', 'a', 'b', 'c', 'd', 'e']
>>> df = df.set_index(['index'])
>>> df
a b c d e
index
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
>>> df['x']
[1, 2, 3, 4, 5]
>>> df['x']['a']
1
>>> df['x']['a', 'b']
(1, 2)
>>> df['x']['a', 'd', 'c']
(1, 4, 3)
我试过这样创建一个 mixin class:
from pandas import DataFrame
class VegeTable(DataFrame, dict):
def __init__(self, *args, **kwargs):
DataFrame.__init__(self, *args, **kwargs)
def __getitem__(self, row_key, column_key):
if type(row_key) != list:
row_key = [row_key]
if type(column_key) != list:
column_key = [column_key]
return df.loc[row_key, column_key]
但我认为还缺少某些东西,例如字典键访问不起作用,而且 dict.get 返回了奇怪的值:
>>> from pandas import DataFrame
>>>
>>>
>>> class VegeTable(DataFrame, dict):
... def __init__(self, *args, **kwargs):
... DataFrame.__init__(self, *args, **kwargs)
... def __getitem__(self, row_key, column_key):
... if type(row_key) != list:
... row_key = [row_key]
... if type(column_key) != list:
... column_key = [column_key]
... return df.loc[row_key, column_key]
...
>>>
>>> vt = VegeTable([['x', 1,2,3,4,5], ['y', 6,7,8,9,10], ['z', 11,12,13,14,15]])
>>> vt.columns = ['index', 'a', 'b', 'c', 'd', 'e']
>>> vt = vt.set_index(['index'])
>>> vt
a b c d e
index
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
>>> vt['x']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/site-packages/pandas/core/frame.py", line 2062, in __getitem__
return self._getitem_column(key)
File "/usr/local/lib/python2.7/site-packages/pandas/core/frame.py", line 2069, in _getitem_column
return self._get_item_cache(key)
File "/usr/local/lib/python2.7/site-packages/pandas/core/generic.py", line 1534, in _get_item_cache
values = self._data.get(item)
File "/usr/local/lib/python2.7/site-packages/pandas/core/internals.py", line 3590, in get
loc = self.items.get_loc(item)
File "/usr/local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2395, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5239)
File "pandas/_libs/index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5085)
File "pandas/_libs/hashtable_class_helper.pxi", line 1207, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20405)
File "pandas/_libs/hashtable_class_helper.pxi", line 1215, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20359)
KeyError: 'x'
>>> vt.get(['x'])
>>> vt.get('x')
>>> vt.get('x', 'a')
'a'
>>> vt.get('x', ['a', 'b'])
['a', 'b']
>>> vt.get('x', ['a', 'b'])
如何为 pandas DataFrame 和本机 Python dict 创建混合 class 以便可以像嵌套 dict 一样访问数据框列? 这可能吗?如果是,怎么做?
Errors in reasoning:
1. vt = vt.set_index(['index'])
This rebinds vt to a plain <class 'pandas.core.frame.DataFrame'>, so it is no longer a VegeTable. You have to override set_index() or cast the resulting frame back to your class.
2. def __getitem__(self, row_key, column_key=None):
Only one parameter (the key) is ever passed to __getitem__(...). Multiple keys have to go inside a single [...], where they arrive as one tuple, e.g. vt['x', ['a', 'b', 'c']]
如果您可以接受这种略有不同的写法,下面的实现可以满足您的需求:
class DataFrame2(DataFrame):
    """DataFrame that is indexed row-first, similar to a nested dict.

    Supported lookups (all labels are resolved with ``.loc``):

    * ``df[row]``                -> tuple of every value in that row
    * ``df[row, col]``           -> the single value at (row, col)
    * ``df[row, [col]]``         -> the single value at (row, col)
    * ``df[row, [c1, c2, ...]]`` -> tuple of the values, in the given order

    NOTE: this replaces the usual column-based ``DataFrame.__getitem__``,
    so ``df['a']`` looks up the *row* labelled ``'a'``, not the column.
    """

    def __getitem__(self, item):
        """Return the value(s) of one row.

        Args:
            item: either a row label, or a ``(row, columns)`` tuple where
                ``columns`` is a single column label or a list of labels.

        Returns:
            A tuple of values, or a single value when exactly one column
            is requested.

        Raises:
            NotImplementedError: if ``item`` is a tuple that is not a
                ``(row, columns)`` pair.
        """
        if not isinstance(item, tuple):
            return tuple(self.loc[item])

        if len(item) != 2:
            # Refuse malformed keys instead of silently ignoring the
            # extra elements (or failing with an opaque IndexError).
            raise NotImplementedError(
                "expected df[row] or df[row, columns], got {!r}".format(item)
            )

        row_key, column_keys = item
        row = self.loc[row_key]
        if not isinstance(column_keys, list):
            # A bare label such as df['x', 'a'] selects a single cell.
            return row.loc[column_keys]

        values = tuple(row.loc[key] for key in column_keys)
        # A one-element list is unwrapped so df['x', ['a']] gives the value.
        return values[0] if len(values) == 1 else values

    def set_index(self, index, *args, **kwargs):
        """Set the index and keep the result a ``DataFrame2``.

        ``index`` and any further arguments are forwarded unchanged to
        ``DataFrame.set_index``; its result is re-wrapped so the row-first
        lookups keep working on the returned frame.

        NOTE(review): if the base method returns ``None`` (``inplace=True``)
        that ``None`` is passed through; whether pandas' in-place path
        works together with the overridden ``__getitem__`` is untested.
        """
        result = super().set_index(index, *args, **kwargs)
        return None if result is None else DataFrame2(result)
# Usage:
# Sample rows from the question; the first field of each row becomes the
# row label once 'index' is set as the index.
data = [
    ['x', 1, 2, 3, 4, 5],
    ['y', 6, 7, 8, 9, 10],
    ['z', 11, 12, 13, 14, 15],
]
df = DataFrame2(data)
df.columns = ['index', 'a', 'b', 'c', 'd', 'e']
df = df.set_index(['index'])
# The printed labels use the question's nested notation; the lookups
# themselves use the df[row, [columns]] form that DataFrame2 accepts.
print("df['x']={}\n".format(df['x']))
print("df['x']['a']={}\n".format(df['x', ['a']]))
print("df['x']['a', 'b']={}\n".format(df['x', ['a', 'b']]))
print("df['x']['a', 'b', 'c']={}\n".format(df['x', ['a', 'b', 'c']]))
Output:
df['x']=(1, 2, 3, 4, 5)
df['x']['a']=1
df['x']['a', 'b']=(1, 2)
df['x']['a', 'b', 'c']=(1, 2, 3)
已使用 Python 3.4.2 测试。
我不认为创建一个 mixin class 是个好主意。当您使用 pandas 时,您应该以 pandas 的方式思考。而且我也怀疑原生 Python 嵌套字典本身是否支持下面这种写法:
In []: df['x']['a', 'b']
但是,如果您坚持,请先尝试此代码:
In []: df.T.to_dict()
Out[]:
{'x': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5},
'y': {'a': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10},
'z': {'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15}}
如何为 pandas DataFrame 和本机 Python dict 创建混合 class 以便可以像嵌套 dict 一样访问数据框列?
按照 pandas 的常规用法,df.loc() 函数是访问所需行/列/切片(row/column/slices)的方法。
但目标是使用与本机 Python 字典相同的语法访问二维数据帧。例如
>>> import pandas as pd
>>> df = pd.DataFrame([['x', 1,2,3,4,5], ['y', 6,7,8,9,10], ['z', 11,12,13,14,15]])
>>> df.columns = ['index', 'a', 'b', 'c', 'd', 'e']
>>> df = df.set_index(['index'])
>>> df
a b c d e
index
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
>>> df['x']
[1, 2, 3, 4, 5]
>>> df['x']['a']
1
>>> df['x']['a', 'b']
(1, 2)
>>> df['x']['a', 'd', 'c']
(1, 4, 3)
我试过这样创建一个 mixin class:
from pandas import DataFrame
class VegeTable(DataFrame, dict):
def __init__(self, *args, **kwargs):
DataFrame.__init__(self, *args, **kwargs)
def __getitem__(self, row_key, column_key):
if type(row_key) != list:
row_key = [row_key]
if type(column_key) != list:
column_key = [column_key]
return df.loc[row_key, column_key]
但我认为还缺少某些东西,例如字典键访问不起作用,而且 dict.get 返回了奇怪的值:
>>> from pandas import DataFrame
>>>
>>>
>>> class VegeTable(DataFrame, dict):
... def __init__(self, *args, **kwargs):
... DataFrame.__init__(self, *args, **kwargs)
... def __getitem__(self, row_key, column_key):
... if type(row_key) != list:
... row_key = [row_key]
... if type(column_key) != list:
... column_key = [column_key]
... return df.loc[row_key, column_key]
...
>>>
>>> vt = VegeTable([['x', 1,2,3,4,5], ['y', 6,7,8,9,10], ['z', 11,12,13,14,15]])
>>> vt.columns = ['index', 'a', 'b', 'c', 'd', 'e']
>>> vt = vt.set_index(['index'])
>>> vt
a b c d e
index
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
>>> vt['x']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/site-packages/pandas/core/frame.py", line 2062, in __getitem__
return self._getitem_column(key)
File "/usr/local/lib/python2.7/site-packages/pandas/core/frame.py", line 2069, in _getitem_column
return self._get_item_cache(key)
File "/usr/local/lib/python2.7/site-packages/pandas/core/generic.py", line 1534, in _get_item_cache
values = self._data.get(item)
File "/usr/local/lib/python2.7/site-packages/pandas/core/internals.py", line 3590, in get
loc = self.items.get_loc(item)
File "/usr/local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2395, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5239)
File "pandas/_libs/index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5085)
File "pandas/_libs/hashtable_class_helper.pxi", line 1207, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20405)
File "pandas/_libs/hashtable_class_helper.pxi", line 1215, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20359)
KeyError: 'x'
>>> vt.get(['x'])
>>> vt.get('x')
>>> vt.get('x', 'a')
'a'
>>> vt.get('x', ['a', 'b'])
['a', 'b']
>>> vt.get('x', ['a', 'b'])
如何为 pandas DataFrame 和本机 Python dict 创建混合 class 以便可以像嵌套 dict 一样访问数据框列? 这可能吗?如果是,怎么做?
Errors in reasoning:
1. vt = vt.set_index(['index'])
This rebinds vt to a plain <class 'pandas.core.frame.DataFrame'>, so it is no longer a VegeTable. You have to override set_index() or cast the resulting frame back to your class.
2. def __getitem__(self, row_key, column_key=None):
Only one parameter (the key) is ever passed to __getitem__(...). Multiple keys have to go inside a single [...], where they arrive as one tuple, e.g. vt['x', ['a', 'b', 'c']]
如果您可以接受这种略有不同的写法,下面的实现可以满足您的需求:
class DataFrame2(DataFrame):
    """DataFrame that is indexed row-first, similar to a nested dict.

    Supported lookups (all labels are resolved with ``.loc``):

    * ``df[row]``                -> tuple of every value in that row
    * ``df[row, col]``           -> the single value at (row, col)
    * ``df[row, [col]]``         -> the single value at (row, col)
    * ``df[row, [c1, c2, ...]]`` -> tuple of the values, in the given order

    NOTE: this replaces the usual column-based ``DataFrame.__getitem__``,
    so ``df['a']`` looks up the *row* labelled ``'a'``, not the column.
    """

    def __getitem__(self, item):
        """Return the value(s) of one row.

        Args:
            item: either a row label, or a ``(row, columns)`` tuple where
                ``columns`` is a single column label or a list of labels.

        Returns:
            A tuple of values, or a single value when exactly one column
            is requested.

        Raises:
            NotImplementedError: if ``item`` is a tuple that is not a
                ``(row, columns)`` pair.
        """
        if not isinstance(item, tuple):
            return tuple(self.loc[item])

        if len(item) != 2:
            # Refuse malformed keys instead of silently ignoring the
            # extra elements (or failing with an opaque IndexError).
            raise NotImplementedError(
                "expected df[row] or df[row, columns], got {!r}".format(item)
            )

        row_key, column_keys = item
        row = self.loc[row_key]
        if not isinstance(column_keys, list):
            # A bare label such as df['x', 'a'] selects a single cell.
            return row.loc[column_keys]

        values = tuple(row.loc[key] for key in column_keys)
        # A one-element list is unwrapped so df['x', ['a']] gives the value.
        return values[0] if len(values) == 1 else values

    def set_index(self, index, *args, **kwargs):
        """Set the index and keep the result a ``DataFrame2``.

        ``index`` and any further arguments are forwarded unchanged to
        ``DataFrame.set_index``; its result is re-wrapped so the row-first
        lookups keep working on the returned frame.

        NOTE(review): if the base method returns ``None`` (``inplace=True``)
        that ``None`` is passed through; whether pandas' in-place path
        works together with the overridden ``__getitem__`` is untested.
        """
        result = super().set_index(index, *args, **kwargs)
        return None if result is None else DataFrame2(result)
# Usage:
# Sample rows from the question; the first field of each row becomes the
# row label once 'index' is set as the index.
data = [
    ['x', 1, 2, 3, 4, 5],
    ['y', 6, 7, 8, 9, 10],
    ['z', 11, 12, 13, 14, 15],
]
df = DataFrame2(data)
df.columns = ['index', 'a', 'b', 'c', 'd', 'e']
df = df.set_index(['index'])
# The printed labels use the question's nested notation; the lookups
# themselves use the df[row, [columns]] form that DataFrame2 accepts.
print("df['x']={}\n".format(df['x']))
print("df['x']['a']={}\n".format(df['x', ['a']]))
print("df['x']['a', 'b']={}\n".format(df['x', ['a', 'b']]))
print("df['x']['a', 'b', 'c']={}\n".format(df['x', ['a', 'b', 'c']]))
Output:
df['x']=(1, 2, 3, 4, 5)
df['x']['a']=1
df['x']['a', 'b']=(1, 2)
df['x']['a', 'b', 'c']=(1, 2, 3)
已使用 Python 3.4.2 测试。
我不认为创建一个 mixin class 是个好主意。当您使用 pandas 时,您应该以 pandas 的方式思考。而且我也怀疑原生 Python 嵌套字典本身是否支持下面这种写法:
In []: df['x']['a', 'b']
但是,如果您坚持,请先尝试此代码:
In []: df.T.to_dict()
Out[]:
{'x': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5},
'y': {'a': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10},
'z': {'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15}}