Pandas DataFrame 子类的属性 Setter
Property Setter for Subclass of Pandas DataFrame
我正在尝试创建 `pd.DataFrame` 的子类,它在初始化时有两个必需的参数(`group` 和 `timestamp_col`)。我想对参数 `group` 和 `timestamp_col` 进行验证,所以我为每个属性设置了一个 setter 方法。这一切都正常,直到我尝试调用 `set_index()` 并得到 `TypeError: 'NoneType' object is not iterable`。在 `test_set_index` 和 `test_assignment_with_indexed_obj` 中,似乎没有参数被传递到我的 setter 函数。如果我将 `if g == None: return` 添加到我的 setter 函数,我可以通过测试用例,但我认为这不是正确的解决方案。
我应该如何对这些必需的参数实施属性验证?
下面是我的 class:
import pandas as pd
import numpy as np
class HistDollarGains(pd.DataFrame):
    """DataFrame subclass carrying validated ``group`` and ``timestamp_col`` metadata.

    Both values must name columns that exist in the frame when they are
    assigned. ``group`` is always stored as a list of column labels.
    """

    @property
    def _constructor(self):
        """Return the callable used to build new instances of this class."""
        # type(self) instead of a hard-coded class keeps further subclasses intact.
        return type(self)._internal_ctor

    # Custom attribute names registered with pandas' metadata mechanism.
    _metadata = ["group", "timestamp_col", "_group", "_timestamp_col"]

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance when the caller supplies no metadata.

        ``group`` and ``timestamp_col`` are forced to ``None`` so that the
        setters skip validation for this call.
        """
        kwargs["group"] = None
        kwargs["timestamp_col"] = None
        return cls(*args, **kwargs)

    def __init__(
        self,
        data,
        group,
        timestamp_col,
        index=None,
        columns=None,
        dtype=None,
        copy=True,
    ):
        """Create the frame, then validate and store the two required metadata values.

        Args:
            data: Anything ``pd.DataFrame`` accepts as ``data``.
            group: A column label, or an iterable of column labels.
            timestamp_col: Label of the timestamp column.
            index, columns, dtype, copy: Forwarded to ``pd.DataFrame``.

        Raises:
            ValueError: If ``group`` or ``timestamp_col`` names a column that
                is not present in the frame.
        """
        super().__init__(
            data=data, index=index, columns=columns, dtype=dtype, copy=copy
        )
        self.group = group
        self.timestamp_col = timestamp_col

    @property
    def group(self):
        """List of column labels that identify a group."""
        return self._group

    @group.setter
    def group(self, g):
        # Workaround described in the question (written there as ``g == None``):
        # ``_internal_ctor`` passes None, so validation is skipped. An identity
        # check is used because ``==`` on an array-like is evaluated elementwise.
        if g is None:
            return
        # Normalise to a fresh list so the getter always returns a list.
        group_list = [g] if isinstance(g, str) else list(g)
        if not set(group_list).issubset(self.columns):
            raise ValueError(
                "Data does not contain [" + ", ".join(map(str, group_list)) + "]"
            )
        self._group = group_list

    @property
    def timestamp_col(self):
        """Label of the timestamp column."""
        return self._timestamp_col

    @timestamp_col.setter
    def timestamp_col(self, t):
        # Same None workaround as in the ``group`` setter.
        if t is None:
            return
        if t not in self.columns:
            raise ValueError(f"Data does not contain [{t}]")
        self._timestamp_col = t
这是我的测试用例:
import pytest
import pandas as pd
import numpy as np
from myclass import *
@pytest.fixture(scope="module")
def sample():
    """Plain six-row DataFrame with timestamp, group and dollar_gains columns."""
    return pd.DataFrame(
        [
            {"timestamp": "2020-01-01", "group": "a", "dollar_gains": 100},
            {"timestamp": "2020-01-01", "group": "b", "dollar_gains": 100},
            {"timestamp": "2020-01-01", "group": "c", "dollar_gains": 110},
            {"timestamp": "2020-01-01", "group": "a", "dollar_gains": 110},
            {"timestamp": "2020-01-01", "group": "b", "dollar_gains": 90},
            {"timestamp": "2020-01-01", "group": "d", "dollar_gains": 100},
        ]
    )
@pytest.fixture(scope="module")
def sample_obj(sample):
    """HistDollarGains built from ``sample``, grouped by "group" with "timestamp"."""
    return HistDollarGains(sample, "group", "timestamp")
def test_constructor_without_args(sample):
    """Omitting the required group/timestamp_col arguments raises TypeError."""
    with pytest.raises(TypeError):
        HistDollarGains(sample)
def test_constructor_with_string_group(sample):
    """A single string group is stored as a one-element list."""
    hist_dg = HistDollarGains(sample, "group", "timestamp")
    assert hist_dg.group == ["group"]
    assert hist_dg.timestamp_col == "timestamp"
def test_constructor_with_list_group(sample):
    """A list of existing columns is accepted as the group."""
    hist_dg = HistDollarGains(sample, ["group", "timestamp"], "timestamp")
    # The original test only checked that construction did not raise.
    assert hist_dg.group == ["group", "timestamp"]
    assert hist_dg.timestamp_col == "timestamp"
def test_constructor_with_invalid_group(sample):
    """A group naming a missing column raises ValueError."""
    # Use a fixed, valid timestamp column so the test is deterministic.
    with pytest.raises(ValueError):
        HistDollarGains(sample, "invalid_group", "timestamp")
def test_constructor_with_invalid_timestamp(sample):
    """A timestamp_col naming a missing column raises ValueError."""
    # Use a fixed, valid group column so the test is deterministic.
    with pytest.raises(ValueError):
        HistDollarGains(sample, "group", "invalid_timestamp")
def test_assignment_with_indexed_obj(sample_obj):
    """The result of set_index() on the subclass can be assigned without raising."""
    indexed = sample_obj.set_index(sample_obj.group + [sample_obj.timestamp_col])
    assert isinstance(indexed, pd.DataFrame)
def test_set_index(sample_obj):
    """set_index() on the group and timestamp columns yields those index names."""
    index_cols = sample_obj.group + [sample_obj.timestamp_col]
    assert sample_obj.set_index(index_cols).index.names == ['group', 'timestamp']
`set_index()` 方法会在内部调用 `self.copy()` 来创建 DataFrame 对象的副本(请参阅此处的源代码),而 `copy()` 内部会使用你自定义的构造方法 `_internal_ctor()` 来创建新对象(见源代码)。请注意,`self._constructor()` 与 `self._internal_ctor()` 相同,它是几乎所有 pandas 类通用的内部方法,用于在深拷贝或切片等操作期间创建新实例。你的问题其实来源于这个函数:
class HistDollarGains(pd.DataFrame):
    ...

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Construct an instance with both metadata keyword arguments set to None."""
        kwargs["group"] = None
        kwargs["timestamp_col"] = None
        # This is equivalent to calling
        # HistDollarGains(data, group=None, timestamp_col=None)
        return cls(*args, **kwargs)
我猜你是从那个 GitHub issue 中复制了这段代码。`kwargs["**"] = None` 这两行(即对 `group` 和 `timestamp_col` 的赋值)明确告诉构造函数将 `group` 和 `timestamp_col` 设置为 `None`。最后 setter/验证器收到 `None` 作为新值并引发错误。
因此,您应该为 `group` 和 `timestamp_col` 设置验证器可以接受的值。
@classmethod
def _internal_ctor(cls, *args, **kwargs):
    """Construct an instance using placeholder metadata that passes validation."""
    # An empty group list is a subset of any set of columns.
    kwargs["group"] = []
    # Or whatever name makes your validator happy. NOTE: the validator in the
    # question raises ValueError when this name is not one of the frame's columns.
    kwargs["timestamp_col"] = 'timestamp'
    return cls(*args, **kwargs)
然后你就可以删除验证器中的 `if g == None: return` 这一行。
我正在尝试创建 `pd.DataFrame` 的子类,它在初始化时有两个必需的参数(`group` 和 `timestamp_col`)。我想对参数 `group` 和 `timestamp_col` 进行验证,所以我为每个属性设置了一个 setter 方法。这一切都正常,直到我尝试调用 `set_index()` 并得到 `TypeError: 'NoneType' object is not iterable`。在 `test_set_index` 和 `test_assignment_with_indexed_obj` 中,似乎没有参数被传递到我的 setter 函数。如果我将 `if g == None: return` 添加到我的 setter 函数,我可以通过测试用例,但我认为这不是正确的解决方案。
我应该如何对这些必需的参数实施属性验证?
下面是我的 class:
import pandas as pd
import numpy as np
class HistDollarGains(pd.DataFrame):
    """DataFrame subclass carrying validated ``group`` and ``timestamp_col`` metadata.

    Both values must name columns that exist in the frame when they are
    assigned. ``group`` is always stored as a list of column labels.
    """

    @property
    def _constructor(self):
        """Return the callable used to build new instances of this class."""
        # type(self) instead of a hard-coded class keeps further subclasses intact.
        return type(self)._internal_ctor

    # Custom attribute names registered with pandas' metadata mechanism.
    _metadata = ["group", "timestamp_col", "_group", "_timestamp_col"]

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance when the caller supplies no metadata.

        ``group`` and ``timestamp_col`` are forced to ``None`` so that the
        setters skip validation for this call.
        """
        kwargs["group"] = None
        kwargs["timestamp_col"] = None
        return cls(*args, **kwargs)

    def __init__(
        self,
        data,
        group,
        timestamp_col,
        index=None,
        columns=None,
        dtype=None,
        copy=True,
    ):
        """Create the frame, then validate and store the two required metadata values.

        Args:
            data: Anything ``pd.DataFrame`` accepts as ``data``.
            group: A column label, or an iterable of column labels.
            timestamp_col: Label of the timestamp column.
            index, columns, dtype, copy: Forwarded to ``pd.DataFrame``.

        Raises:
            ValueError: If ``group`` or ``timestamp_col`` names a column that
                is not present in the frame.
        """
        super().__init__(
            data=data, index=index, columns=columns, dtype=dtype, copy=copy
        )
        self.group = group
        self.timestamp_col = timestamp_col

    @property
    def group(self):
        """List of column labels that identify a group."""
        return self._group

    @group.setter
    def group(self, g):
        # Workaround described in the question (written there as ``g == None``):
        # ``_internal_ctor`` passes None, so validation is skipped. An identity
        # check is used because ``==`` on an array-like is evaluated elementwise.
        if g is None:
            return
        # Normalise to a fresh list so the getter always returns a list.
        group_list = [g] if isinstance(g, str) else list(g)
        if not set(group_list).issubset(self.columns):
            raise ValueError(
                "Data does not contain [" + ", ".join(map(str, group_list)) + "]"
            )
        self._group = group_list

    @property
    def timestamp_col(self):
        """Label of the timestamp column."""
        return self._timestamp_col

    @timestamp_col.setter
    def timestamp_col(self, t):
        # Same None workaround as in the ``group`` setter.
        if t is None:
            return
        if t not in self.columns:
            raise ValueError(f"Data does not contain [{t}]")
        self._timestamp_col = t
这是我的测试用例:
import pytest
import pandas as pd
import numpy as np
from myclass import *
@pytest.fixture(scope="module")
def sample():
    """Plain six-row DataFrame with timestamp, group and dollar_gains columns."""
    return pd.DataFrame(
        [
            {"timestamp": "2020-01-01", "group": "a", "dollar_gains": 100},
            {"timestamp": "2020-01-01", "group": "b", "dollar_gains": 100},
            {"timestamp": "2020-01-01", "group": "c", "dollar_gains": 110},
            {"timestamp": "2020-01-01", "group": "a", "dollar_gains": 110},
            {"timestamp": "2020-01-01", "group": "b", "dollar_gains": 90},
            {"timestamp": "2020-01-01", "group": "d", "dollar_gains": 100},
        ]
    )
@pytest.fixture(scope="module")
def sample_obj(sample):
    """HistDollarGains built from ``sample``, grouped by "group" with "timestamp"."""
    return HistDollarGains(sample, "group", "timestamp")
def test_constructor_without_args(sample):
    """Omitting the required group/timestamp_col arguments raises TypeError."""
    with pytest.raises(TypeError):
        HistDollarGains(sample)
def test_constructor_with_string_group(sample):
    """A single string group is stored as a one-element list."""
    hist_dg = HistDollarGains(sample, "group", "timestamp")
    assert hist_dg.group == ["group"]
    assert hist_dg.timestamp_col == "timestamp"
def test_constructor_with_list_group(sample):
    """A list of existing columns is accepted as the group."""
    hist_dg = HistDollarGains(sample, ["group", "timestamp"], "timestamp")
    # The original test only checked that construction did not raise.
    assert hist_dg.group == ["group", "timestamp"]
    assert hist_dg.timestamp_col == "timestamp"
def test_constructor_with_invalid_group(sample):
    """A group naming a missing column raises ValueError."""
    # Use a fixed, valid timestamp column so the test is deterministic.
    with pytest.raises(ValueError):
        HistDollarGains(sample, "invalid_group", "timestamp")
def test_constructor_with_invalid_timestamp(sample):
    """A timestamp_col naming a missing column raises ValueError."""
    # Use a fixed, valid group column so the test is deterministic.
    with pytest.raises(ValueError):
        HistDollarGains(sample, "group", "invalid_timestamp")
def test_assignment_with_indexed_obj(sample_obj):
    """The result of set_index() on the subclass can be assigned without raising."""
    indexed = sample_obj.set_index(sample_obj.group + [sample_obj.timestamp_col])
    assert isinstance(indexed, pd.DataFrame)
def test_set_index(sample_obj):
    """set_index() on the group and timestamp columns yields those index names."""
    index_cols = sample_obj.group + [sample_obj.timestamp_col]
    assert sample_obj.set_index(index_cols).index.names == ['group', 'timestamp']
`set_index()` 方法会在内部调用 `self.copy()` 来创建 DataFrame 对象的副本(请参阅此处的源代码),而 `copy()` 内部会使用你自定义的构造方法 `_internal_ctor()` 来创建新对象(见源代码)。请注意,`self._constructor()` 与 `self._internal_ctor()` 相同,它是几乎所有 pandas 类通用的内部方法,用于在深拷贝或切片等操作期间创建新实例。你的问题其实来源于这个函数:
class HistDollarGains(pd.DataFrame):
    ...

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Construct an instance with both metadata keyword arguments set to None."""
        kwargs["group"] = None
        kwargs["timestamp_col"] = None
        # This is equivalent to calling
        # HistDollarGains(data, group=None, timestamp_col=None)
        return cls(*args, **kwargs)
我猜你是从那个 GitHub issue 中复制了这段代码。`kwargs["**"] = None` 这两行(即对 `group` 和 `timestamp_col` 的赋值)明确告诉构造函数将 `group` 和 `timestamp_col` 设置为 `None`。最后 setter/验证器收到 `None` 作为新值并引发错误。
因此,您应该为 `group` 和 `timestamp_col` 设置验证器可以接受的值。
@classmethod
def _internal_ctor(cls, *args, **kwargs):
    """Construct an instance using placeholder metadata that passes validation."""
    # An empty group list is a subset of any set of columns.
    kwargs["group"] = []
    # Or whatever name makes your validator happy. NOTE: the validator in the
    # question raises ValueError when this name is not one of the frame's columns.
    kwargs["timestamp_col"] = 'timestamp'
    return cls(*args, **kwargs)
然后你就可以删除验证器中的 `if g == None: return` 这一行。