如何在 Python 数据科学过程中应用模板方法模式,同时不知道重复步骤的确切数量
How to apply template method pattern in Python data science process while not knowing exactly the number of repeating steps
当我需要从大量原始受试者中选择或识别目标受试者时,我喜欢在数据科学项目中应用模板方法模式。我会根据这些受试者的不同特征(如年龄、性别、疾病状况等)创建标签。
我希望这段代码可以在未来性质类似的项目中重复使用。但是每个项目都略有不同,将受试者纳入最终筛选池的标准也各不相同。如何以灵活且可根据项目需要自定义的方式构建 subject_selection_steps?目前我的代码中只包含三个标签,但不同的项目可能需要更多或更少的标签。
import sys
from abc import ABC, abstractmethod
import pandas as pd
import datetime
import ctypes
import numpy as np
import random
import pysnooper
import var_creator.var_creator as vc
import feature_tagger.feature_tagger as ft
import data_descriptor.data_descriptor as dd
import data_transformer.data_transformer as dt
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import
class SubjectGrouping(ABC):
    '''Template-method skeleton for selecting target subjects.

    ``subject_selection_steps`` fixes the order of the workflow. Subclasses
    must implement the abstract steps and may override any of the optional
    hooks, which do nothing by default. The number of tag steps is not
    fixed: see ``tag_steps``.
    '''

    def __init__(self):
        pass

    def subject_selection_steps(self):
        '''Run the whole subject-selection workflow in its fixed order.'''
        self._pandas_output_setting()
        self.run_data_preparation()
        self.import_processed_main_data()
        self.inject_test_data()
        self.create_all_subject_list()
        # The tag stage is data-driven so that a project can use any number
        # of tags instead of exactly three.
        for step in self.tag_steps():
            step()
        self.finalize_data()

    def tag_steps(self):
        '''Return the tag-related steps in execution order.

        By default the methods ``CREATE_TAG<n>`` and
        ``FILTER_SUBJECT_BY_TAG<n>`` are collected for n = 1, 2, 3, ... and
        the numbering stops at the first n for which neither method exists.
        With only the hooks defined on this class the result is the three
        create/filter pairs. A subclass gets more steps by defining
        ``CREATE_TAG4``/``FILTER_SUBJECT_BY_TAG4`` and so on, or it can
        override this method to return any list of callables.

        Returns:
            list: zero-argument callables, called one after another by
            ``subject_selection_steps``.
        '''
        steps = []
        n = 1
        while True:
            create = getattr(self, 'CREATE_TAG{}'.format(n), None)
            subject_filter = getattr(
                self, 'FILTER_SUBJECT_BY_TAG{}'.format(n), None)
            if create is None and subject_filter is None:
                break
            # A tag may define only one half of the pair; run what exists.
            steps.extend(
                step for step in (create, subject_filter) if step is not None)
            n += 1
        return steps

    def _pandas_output_setting(self):
        '''Set pandas output display setting'''
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 180)

    @abstractmethod
    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        pass

    @abstractmethod
    def import_processed_main_data(self):
        '''Import processed main data'''
        pass

    def inject_test_data(self):
        '''For unit tests: inject mock cases that are certain to fulfil or fail the subject selection criteria'''
        pass

    def create_all_subject_list(self):
        '''Gather all the unique subject ids from all datasets and create a full subject list'''
        pass

    def CREATE_TAG1(self):
        '''Optional hook: create tag 1 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG1(self):
        '''Optional hook: filter subjects by tag 1 (no-op by default)'''
        pass

    def CREATE_TAG2(self):
        '''Optional hook: create tag 2 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG2(self):
        '''Optional hook: filter subjects by tag 2 (no-op by default)'''
        pass

    def CREATE_TAG3(self):
        '''Optional hook: create tag 3 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG3(self):
        '''Optional hook: filter subjects by tag 3 (no-op by default)'''
        pass

    def finalize_data(self):
        '''Optional hook run after all tag steps (no-op by default)'''
        pass
class SubjectGrouping_Project1(SubjectGrouping, data_prep.DataPreparation_Project1):
    '''Project 1 implementation of the subject-selection template.

    Supplies the two abstract steps of ``SubjectGrouping``: data preparation
    is delegated to ``data_preparation_steps`` (not defined in this class;
    presumably inherited from ``DataPreparation_Project1``), and the
    processed data frames are loaded through ``PreppedDataImport_Project1``.
    '''

    def __init__(self):
        # NOTE(review): neither parent __init__ is called here; confirm that
        # DataPreparation_Project1 needs no initialisation of its own.
        # NOTE(review): import_processed_main_data() assigns df_d / df_p /
        # df_n / df_r, not the df_dad / df_pc / df_nacrs / df_pin / df_reg
        # placeholders below - confirm which names are intended.
        self.df_dad = None
        self.df_pc = None
        self.df_nacrs = None
        self.df_pin = None
        self.df_reg = None
        self.df_final_subject_group1 = None
        self.df_final_subject_group2 = None
        self.df_final_subject_group3 = None
        self.control_panel = {
            'save_file_switch': False,  # WARNING: Will overwrite existing files
            'df_subsampling_switch': True,  # WARNING: Only switch to True when testing
            'df_subsampling_n': 8999,
            'random_seed': 888,
            'df_remove_dup_switch': True,
            'parse_date_switch': True,
            'result_printout_switch': True,
            'comp_loc': 'office',
            'show_df_n_switch': False,  # To be implemented. Show df length before and after record removal
            'done_switch': False,
        }

    def run_data_preparation(self):
        '''Run the data preparation steps.'''
        self.data_preparation_steps()

    def import_processed_main_data(self):
        '''Load the processed data frames and store them on this instance.

        Sets ``df_d``, ``df_p``, ``df_n`` and ``df_r`` from the ``'DF_D'``,
        ``'DF_P'``, ``'DF_N'`` and ``'DF_R'`` entries returned by the
        importer's ``return_all_dfs()``.
        '''
        importer = prepped_data_import.PreppedDataImport_Project1()
        importer.data_preparation_steps()
        importer.prepped_data_import_steps()
        df_dict = importer.return_all_dfs()
        # The original unpacking listed self.df_p / 'DF_P' twice; the second
        # occurrence only re-assigned the same value, so it is dropped.
        # All lookups still happen before any attribute is assigned.
        self.df_d, self.df_p, self.df_n, self.df_r = (
            df_dict['DF_D'], df_dict['DF_P'], df_dict['DF_N'], df_dict['DF_R'])
# Script entry point: build the Project 1 grouping and run every selection step.
if __name__ == '__main__':
    pipeline = SubjectGrouping_Project1()
    pipeline.subject_selection_steps()
可以考虑使用过滤器模式(Filter pattern)。它允许根据已定义的过滤器来过滤对象列表,并且以后只需对代码做极少的改动,就能轻松引入新的过滤器。
创建一个 Criteria 接口或抽象类。
class Criteria:
    '''Base type for filters; concrete filters override ``filter``.'''

    def filter(self, request):
        '''Placeholder that concrete filters must override.

        Args:
            request: the input to filter (unused here).

        Raises:
            NotImplementedError: always, because the base class has no rule.
        '''
        raise NotImplementedError("Should have implemented this")
然后让每个过滤器都继承 Criteria 类。假设其中一个过滤器是年龄过滤器:
class AgeFilter(Criteria):
    '''Filter that keeps items whose ``age`` exceeds a threshold.'''

    def __init__(self, age=20):
        '''Store the age threshold.

        Args:
            age: items must have an ``age`` strictly greater than this
                value to be kept (default 20).
        '''
        self.age = age

    def filter(self, list):
        '''Return the items whose ``age`` is strictly greater than ``self.age``.

        Args:
            list: iterable of objects exposing an ``age`` attribute. The
                name shadows the builtin but is kept because it is part of
                the method's public signature.

        Returns:
            A new list with the matching items, in their original order.
        '''
        # Iterate the argument itself: the instance has no ``list`` attribute,
        # and the original loop body never appended anything.
        return [item for item in list if item.age > self.age]
类似地,您可以通过继承 Criteria 接口来定义其他过滤器,例如 DiseaseFilter、GenderFilter。
您还可以通过定义 And 或 Or 过滤器,对过滤器进行逻辑组合。例如:
class AndFilter(Criteria):
    '''Filter that applies two filters one after the other.'''

    def __init__(self, filter1, filter2):
        '''Store the two filters to combine.

        Args:
            filter1: filter applied first; must provide ``filter(list)``.
            filter2: filter applied to the output of ``filter1``.
        '''
        self.filter1 = filter1
        self.filter2 = filter2

    def filter(self, list):
        '''Return ``list`` filtered by ``filter1`` and then by ``filter2``.

        Args:
            list: the input passed to ``filter1.filter``. The name shadows
                the builtin but is kept because it is part of the method's
                public signature.

        Returns:
            Whatever ``filter2.filter`` returns for the output of ``filter1``.
        '''
        # The stored filters live on the instance; the bare names filter1 /
        # filter2 used originally are undefined inside this method.
        passed_first = self.filter1.filter(list)
        return self.filter2.filter(passed_first)
假设您已经定义好了这些过滤器,那么您的 subject_selection_steps 方法将如下所示:
def subject_selection_steps(self):
    '''Apply every configured filter in sequence and return the result.

    Each filter is given the output of the previous one, starting from
    ``personList``.

    NOTE(review): ageFilter1, maleFilter, MalariaAndJaundiceFilter and
    personList are not defined in this snippet - presumably they are
    provided by the surrounding project; confirm.

    Returns:
        The value returned by the last filter in the chain.
    '''
    active_filters = [ageFilter1, maleFilter, MalariaAndJaundiceFilter]
    remaining = personList
    for criterion in active_filters:
        remaining = criterion.filter(remaining)
    return remaining
当我需要从大量原始受试者中选择或识别目标受试者时,我喜欢在数据科学项目中应用模板方法模式。我会根据这些受试者的不同特征(如年龄、性别、疾病状况等)创建标签。
我希望这段代码可以在未来性质类似的项目中重复使用。但是每个项目都略有不同,将受试者纳入最终筛选池的标准也各不相同。如何以灵活且可根据项目需要自定义的方式构建 subject_selection_steps?目前我的代码中只包含三个标签,但不同的项目可能需要更多或更少的标签。
import sys
from abc import ABC, abstractmethod
import pandas as pd
import datetime
import ctypes
import numpy as np
import random
import pysnooper
import var_creator.var_creator as vc
import feature_tagger.feature_tagger as ft
import data_descriptor.data_descriptor as dd
import data_transformer.data_transformer as dt
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import
class SubjectGrouping(ABC):
    '''Template-method skeleton for selecting target subjects.

    ``subject_selection_steps`` fixes the order of the workflow. Subclasses
    must implement the abstract steps and may override any of the optional
    hooks, which do nothing by default. The number of tag steps is not
    fixed: see ``tag_steps``.
    '''

    def __init__(self):
        pass

    def subject_selection_steps(self):
        '''Run the whole subject-selection workflow in its fixed order.'''
        self._pandas_output_setting()
        self.run_data_preparation()
        self.import_processed_main_data()
        self.inject_test_data()
        self.create_all_subject_list()
        # The tag stage is data-driven so that a project can use any number
        # of tags instead of exactly three.
        for step in self.tag_steps():
            step()
        self.finalize_data()

    def tag_steps(self):
        '''Return the tag-related steps in execution order.

        By default the methods ``CREATE_TAG<n>`` and
        ``FILTER_SUBJECT_BY_TAG<n>`` are collected for n = 1, 2, 3, ... and
        the numbering stops at the first n for which neither method exists.
        With only the hooks defined on this class the result is the three
        create/filter pairs. A subclass gets more steps by defining
        ``CREATE_TAG4``/``FILTER_SUBJECT_BY_TAG4`` and so on, or it can
        override this method to return any list of callables.

        Returns:
            list: zero-argument callables, called one after another by
            ``subject_selection_steps``.
        '''
        steps = []
        n = 1
        while True:
            create = getattr(self, 'CREATE_TAG{}'.format(n), None)
            subject_filter = getattr(
                self, 'FILTER_SUBJECT_BY_TAG{}'.format(n), None)
            if create is None and subject_filter is None:
                break
            # A tag may define only one half of the pair; run what exists.
            steps.extend(
                step for step in (create, subject_filter) if step is not None)
            n += 1
        return steps

    def _pandas_output_setting(self):
        '''Set pandas output display setting'''
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 180)

    @abstractmethod
    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        pass

    @abstractmethod
    def import_processed_main_data(self):
        '''Import processed main data'''
        pass

    def inject_test_data(self):
        '''For unit tests: inject mock cases that are certain to fulfil or fail the subject selection criteria'''
        pass

    def create_all_subject_list(self):
        '''Gather all the unique subject ids from all datasets and create a full subject list'''
        pass

    def CREATE_TAG1(self):
        '''Optional hook: create tag 1 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG1(self):
        '''Optional hook: filter subjects by tag 1 (no-op by default)'''
        pass

    def CREATE_TAG2(self):
        '''Optional hook: create tag 2 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG2(self):
        '''Optional hook: filter subjects by tag 2 (no-op by default)'''
        pass

    def CREATE_TAG3(self):
        '''Optional hook: create tag 3 (no-op by default)'''
        pass

    def FILTER_SUBJECT_BY_TAG3(self):
        '''Optional hook: filter subjects by tag 3 (no-op by default)'''
        pass

    def finalize_data(self):
        '''Optional hook run after all tag steps (no-op by default)'''
        pass
class SubjectGrouping_Project1(SubjectGrouping, data_prep.DataPreparation_Project1):
    '''Project 1 implementation of the subject-selection template.

    Supplies the two abstract steps of ``SubjectGrouping``: data preparation
    is delegated to ``data_preparation_steps`` (not defined in this class;
    presumably inherited from ``DataPreparation_Project1``), and the
    processed data frames are loaded through ``PreppedDataImport_Project1``.
    '''

    def __init__(self):
        # NOTE(review): neither parent __init__ is called here; confirm that
        # DataPreparation_Project1 needs no initialisation of its own.
        # NOTE(review): import_processed_main_data() assigns df_d / df_p /
        # df_n / df_r, not the df_dad / df_pc / df_nacrs / df_pin / df_reg
        # placeholders below - confirm which names are intended.
        self.df_dad = None
        self.df_pc = None
        self.df_nacrs = None
        self.df_pin = None
        self.df_reg = None
        self.df_final_subject_group1 = None
        self.df_final_subject_group2 = None
        self.df_final_subject_group3 = None
        self.control_panel = {
            'save_file_switch': False,  # WARNING: Will overwrite existing files
            'df_subsampling_switch': True,  # WARNING: Only switch to True when testing
            'df_subsampling_n': 8999,
            'random_seed': 888,
            'df_remove_dup_switch': True,
            'parse_date_switch': True,
            'result_printout_switch': True,
            'comp_loc': 'office',
            'show_df_n_switch': False,  # To be implemented. Show df length before and after record removal
            'done_switch': False,
        }

    def run_data_preparation(self):
        '''Run the data preparation steps.'''
        self.data_preparation_steps()

    def import_processed_main_data(self):
        '''Load the processed data frames and store them on this instance.

        Sets ``df_d``, ``df_p``, ``df_n`` and ``df_r`` from the ``'DF_D'``,
        ``'DF_P'``, ``'DF_N'`` and ``'DF_R'`` entries returned by the
        importer's ``return_all_dfs()``.
        '''
        importer = prepped_data_import.PreppedDataImport_Project1()
        importer.data_preparation_steps()
        importer.prepped_data_import_steps()
        df_dict = importer.return_all_dfs()
        # The original unpacking listed self.df_p / 'DF_P' twice; the second
        # occurrence only re-assigned the same value, so it is dropped.
        # All lookups still happen before any attribute is assigned.
        self.df_d, self.df_p, self.df_n, self.df_r = (
            df_dict['DF_D'], df_dict['DF_P'], df_dict['DF_N'], df_dict['DF_R'])
# Script entry point: build the Project 1 grouping and run every selection step.
if __name__ == '__main__':
    pipeline = SubjectGrouping_Project1()
    pipeline.subject_selection_steps()
可以考虑使用过滤器模式(Filter pattern)。它允许根据已定义的过滤器来过滤对象列表,并且以后只需对代码做极少的改动,就能轻松引入新的过滤器。
创建一个 Criteria 接口或抽象类。
class Criteria:
    '''Base type for filters; concrete filters override ``filter``.'''

    def filter(self, request):
        '''Placeholder that concrete filters must override.

        Args:
            request: the input to filter (unused here).

        Raises:
            NotImplementedError: always, because the base class has no rule.
        '''
        raise NotImplementedError("Should have implemented this")
然后让每个过滤器都继承 Criteria 类。假设其中一个过滤器是年龄过滤器:
class AgeFilter(Criteria):
    '''Filter that keeps items whose ``age`` exceeds a threshold.'''

    def __init__(self, age=20):
        '''Store the age threshold.

        Args:
            age: items must have an ``age`` strictly greater than this
                value to be kept (default 20).
        '''
        self.age = age

    def filter(self, list):
        '''Return the items whose ``age`` is strictly greater than ``self.age``.

        Args:
            list: iterable of objects exposing an ``age`` attribute. The
                name shadows the builtin but is kept because it is part of
                the method's public signature.

        Returns:
            A new list with the matching items, in their original order.
        '''
        # Iterate the argument itself: the instance has no ``list`` attribute,
        # and the original loop body never appended anything.
        return [item for item in list if item.age > self.age]
类似地,您可以通过继承 Criteria 接口来定义其他过滤器,例如 DiseaseFilter、GenderFilter。
您还可以通过定义 And 或 Or 过滤器,对过滤器进行逻辑组合。例如:
class AndFilter(Criteria):
    '''Filter that applies two filters one after the other.'''

    def __init__(self, filter1, filter2):
        '''Store the two filters to combine.

        Args:
            filter1: filter applied first; must provide ``filter(list)``.
            filter2: filter applied to the output of ``filter1``.
        '''
        self.filter1 = filter1
        self.filter2 = filter2

    def filter(self, list):
        '''Return ``list`` filtered by ``filter1`` and then by ``filter2``.

        Args:
            list: the input passed to ``filter1.filter``. The name shadows
                the builtin but is kept because it is part of the method's
                public signature.

        Returns:
            Whatever ``filter2.filter`` returns for the output of ``filter1``.
        '''
        # The stored filters live on the instance; the bare names filter1 /
        # filter2 used originally are undefined inside this method.
        passed_first = self.filter1.filter(list)
        return self.filter2.filter(passed_first)
假设您已经定义好了这些过滤器,那么您的 subject_selection_steps 方法将如下所示:
def subject_selection_steps(self):
    '''Apply every configured filter in sequence and return the result.

    Each filter is given the output of the previous one, starting from
    ``personList``.

    NOTE(review): ageFilter1, maleFilter, MalariaAndJaundiceFilter and
    personList are not defined in this snippet - presumably they are
    provided by the surrounding project; confirm.

    Returns:
        The value returned by the last filter in the chain.
    '''
    active_filters = [ageFilter1, maleFilter, MalariaAndJaundiceFilter]
    remaining = personList
    for criterion in active_filters:
        remaining = criterion.filter(remaining)
    return remaining