如何将 CustomDataAsset 传递给 DataContext,以便对批次(batch)运行自定义期望?
How to pass a CustomDataAsset to a DataContext to run custom expectations on a batch?
我有一个带有自定义期望的 CustomPandasDataset:
from great_expectations.data_asset import DataAsset
from great_expectations.dataset import PandasDataset
from datetime import date, datetime, timedelta
class CustomPandasDataset(PandasDataset):
    """PandasDataset subclass that adds an expectation on a column's maximum datetime."""

    # Mirrors the class name.
    _data_asset_type = "CustomPandasDataset"

    @DataAsset.expectation(["column", "datetime_match", "datetime_diff"])
    def expect_column_max_value_to_match_datetime(
        self,
        column: str,
        datetime_match: datetime = None,
        datetime_diff: tuple = None,
    ) -> dict:
        """Check that the maximum of ``column`` matches a datetime or a datetime window.

        Used to verify that data is constantly updated.

        Args:
            column: Name of the column whose maximum value is inspected.
            datetime_match: Reference value the maximum is compared against.
                Defaults to ``date.today()`` when omitted.
            datetime_diff: Positional arguments forwarded to ``timedelta``.
                When truthy, the maximum must lie in the closed interval
                ``[datetime_match - timedelta(*datetime_diff), datetime_match]``;
                otherwise the maximum must equal ``datetime_match`` exactly.

        Returns:
            A dict with ``"success"`` (outcome of the comparison) and
            ``"result"`` (observed maximum, stringified reference value and
            the ``datetime_diff`` that was passed in).

        NOTE(review): the default reference is a ``date`` although the
        parameter is annotated as ``datetime``; the comparisons below presumably
        need the column maximum to be of a compatible type -- confirm.
        """
        max_datetime = self[column].max()

        # `date` and `timedelta` come from the module-level datetime import,
        # so no function-local re-import is needed.
        if datetime_match is None:
            datetime_match = date.today()

        if datetime_diff:
            # Window check: newest value must fall inside the allowed lag.
            lower_bound = datetime_match - timedelta(*datetime_diff)
            success = lower_bound <= max_datetime <= datetime_match
        else:
            # Exact-match check.
            success = max_datetime == datetime_match

        return {
            "success": success,
            "result": {
                "data_max_value": max_datetime,
                "expected_max_value": str(datetime_match),
                "expected_datetime_diff": datetime_diff,
            },
        }
我想对给定的 pandas 数据帧运行期望 expect_column_max_value_to_match_datetime:
# Create the expectation suite, overwriting any existing one with this name.
expectation_suite_name = "df-raw-expectations"
suite = context.create_expectation_suite(
    expectation_suite_name,
    overwrite_existing=True,
)

# Wrap the DataFrame using the custom dataset class.
df_ge = ge.from_pandas(df, dataset_class=CustomPandasDataset)

# Batch description: the wrapped dataset plus the name of the datasource.
batch_kwargs = {
    "dataset": df_ge,
    "datasource": "df_raw_datasource",
}

# Get batch of data
batch = context.get_batch(batch_kwargs, suite)
这是我从 DataContext 获得的批次,现在我对这批数据运行该期望:
# One-element tuple; the expectation unpacks it into timedelta(*datetime_diff).
datetime_diff = (4,)
batch.expect_column_max_value_to_match_datetime(
    column="DATE",
    datetime_diff=datetime_diff,
)
我收到以下错误
AttributeError: 'PandasDataset' object has no attribute 'expect_column_max_value_to_match_datetime'
根据文档,我在构建 GreatExpectations 数据集时指定了 dataset_class=CustomPandasDataset 属性;在 df_ge 上运行该期望确实有效,但在数据批次上无效。
根据文档(docs):
To use custom expectations in a datasource or DataContext, you need to define the custom DataAsset in the datasource configuration or batch_kwargs for a specific batch.
因此,可以通过 get_batch() 函数的 data_asset_type 参数传入 CustomPandasDataset:
# Get batch of data, passing the custom dataset class as data_asset_type
batch = context.get_batch(
    batch_kwargs,
    suite,
    data_asset_type=CustomPandasDataset,
)
或者在 DataContext 的配置(context configuration)中定义:
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.data_context import BaseDataContext
data_context_config = DataContextConfig(
...
datasources={
"sales_raw_datasource": {
"data_asset_type": {
"class_name": "CustomPandasDataset",
"module_name": "custom_dataset",
},
"class_name": "PandasDatasource",
"module_name": "great_expectations.datasource",
}
},
...
)
context = BaseDataContext(project_config=data_context_config)
其中 CustomPandasDataset 可从模块/脚本 custom_dataset.py 中导入。
我有一个带有自定义期望的 CustomPandasDataset:
from great_expectations.data_asset import DataAsset
from great_expectations.dataset import PandasDataset
from datetime import date, datetime, timedelta
class CustomPandasDataset(PandasDataset):
    """PandasDataset subclass that adds an expectation on a column's maximum datetime."""

    # Mirrors the class name.
    _data_asset_type = "CustomPandasDataset"

    @DataAsset.expectation(["column", "datetime_match", "datetime_diff"])
    def expect_column_max_value_to_match_datetime(
        self,
        column: str,
        datetime_match: datetime = None,
        datetime_diff: tuple = None,
    ) -> dict:
        """Check that the maximum of ``column`` matches a datetime or a datetime window.

        Used to verify that data is constantly updated.

        Args:
            column: Name of the column whose maximum value is inspected.
            datetime_match: Reference value the maximum is compared against.
                Defaults to ``date.today()`` when omitted.
            datetime_diff: Positional arguments forwarded to ``timedelta``.
                When truthy, the maximum must lie in the closed interval
                ``[datetime_match - timedelta(*datetime_diff), datetime_match]``;
                otherwise the maximum must equal ``datetime_match`` exactly.

        Returns:
            A dict with ``"success"`` (outcome of the comparison) and
            ``"result"`` (observed maximum, stringified reference value and
            the ``datetime_diff`` that was passed in).

        NOTE(review): the default reference is a ``date`` although the
        parameter is annotated as ``datetime``; the comparisons below presumably
        need the column maximum to be of a compatible type -- confirm.
        """
        max_datetime = self[column].max()

        # `date` and `timedelta` come from the module-level datetime import,
        # so no function-local re-import is needed.
        if datetime_match is None:
            datetime_match = date.today()

        if datetime_diff:
            # Window check: newest value must fall inside the allowed lag.
            lower_bound = datetime_match - timedelta(*datetime_diff)
            success = lower_bound <= max_datetime <= datetime_match
        else:
            # Exact-match check.
            success = max_datetime == datetime_match

        return {
            "success": success,
            "result": {
                "data_max_value": max_datetime,
                "expected_max_value": str(datetime_match),
                "expected_datetime_diff": datetime_diff,
            },
        }
我想对给定的 pandas 数据帧运行期望 expect_column_max_value_to_match_datetime:
# Create the expectation suite, overwriting any existing one with this name.
expectation_suite_name = "df-raw-expectations"
suite = context.create_expectation_suite(
    expectation_suite_name,
    overwrite_existing=True,
)

# Wrap the DataFrame using the custom dataset class.
df_ge = ge.from_pandas(df, dataset_class=CustomPandasDataset)

# Batch description: the wrapped dataset plus the name of the datasource.
batch_kwargs = {
    "dataset": df_ge,
    "datasource": "df_raw_datasource",
}

# Get batch of data
batch = context.get_batch(batch_kwargs, suite)
这是我从 DataContext 获得的批次,现在我对这批数据运行该期望:
datetime_diff = 4,
# Run the custom expectation on the batch, checking the 'DATE' column.
batch.expect_column_max_value_to_match_datetime(
    column="DATE",
    datetime_diff=datetime_diff,
)
我收到以下错误
AttributeError: 'PandasDataset' object has no attribute 'expect_column_max_value_to_match_datetime'
根据文档,我在构建 GreatExpectations 数据集时指定了 dataset_class=CustomPandasDataset 属性;在 df_ge 上运行该期望确实有效,但在数据批次上无效。
根据文档(docs):
To use custom expectations in a datasource or DataContext, you need to define the custom DataAsset in the datasource configuration or batch_kwargs for a specific batch.
因此,可以通过 get_batch() 函数的 data_asset_type 参数传入 CustomPandasDataset:
# Get batch of data, passing the custom dataset class as data_asset_type
batch = context.get_batch(
    batch_kwargs,
    suite,
    data_asset_type=CustomPandasDataset,
)
或者在 DataContext 的配置(context configuration)中定义:
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.data_context import BaseDataContext
data_context_config = DataContextConfig(
...
datasources={
"sales_raw_datasource": {
"data_asset_type": {
"class_name": "CustomPandasDataset",
"module_name": "custom_dataset",
},
"class_name": "PandasDatasource",
"module_name": "great_expectations.datasource",
}
},
...
)
context = BaseDataContext(project_config=data_context_config)
其中 CustomPandasDataset 可从模块/脚本 custom_dataset.py 中导入。