如何在 Python 数据科学过程中应用模板方法模式,同时不知道重复步骤的确切数量

How to apply template method pattern in Python data science process while not knowing exactly the number of repeating steps

当我需要从大量原始受试者中筛选(select)或识别目标受试者时,我喜欢在数据科学项目中应用模板方法模式。我会根据这些受试者的不同特征(例如年龄、性别、疾病状况等)创建标签。

我希望这段代码可以在未来类似性质的项目中重复使用。但是各个项目都有些不同,将受试者纳入最终筛选池的标准也各不相同。如何以灵活且可根据项目需要自定义的方式构建 subject_selection_steps?目前,我的代码中只包含三个标签,但不同的项目可能需要更多或更少的标签。

import sys
from abc import ABC, abstractmethod
import pandas as pd
import datetime
import ctypes
import numpy as np
import random
import pysnooper
import var_creator.var_creator as vc
import feature_tagger.feature_tagger as ft
import data_descriptor.data_descriptor as dd
import data_transformer.data_transformer as dt
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import

class SubjectGrouping(ABC):
    '''Template-method skeleton for a subject selection pipeline.

    ``subject_selection_steps`` fixes the order of the steps; subclasses
    supply the concrete behaviour by overriding the individual hooks.

    The number of tag/filter rounds is controlled by ``tag_count``. For each
    ``i`` in ``1..tag_count`` the pipeline calls ``CREATE_TAG<i>`` followed by
    ``FILTER_SUBJECT_BY_TAG<i>``. A subclass that needs more (or fewer) rounds
    overrides ``tag_count`` and defines the matching methods; no-op defaults
    are provided here for rounds 1 to 3 only.
    '''

    # Number of CREATE_TAG<i> / FILTER_SUBJECT_BY_TAG<i> pairs to run.
    # The default of 3 reproduces the original fixed three-tag pipeline.
    tag_count = 3

    def __init__(self):
        pass

    def subject_selection_steps(self):
        '''Run the whole selection pipeline in its fixed order.

        Raises:
            AttributeError: if ``tag_count`` asks for a round whose
                ``CREATE_TAG<i>`` or ``FILTER_SUBJECT_BY_TAG<i>`` method is
                not defined on the instance.
        '''
        self._pandas_output_setting()
        self.run_data_preparation()
        self.import_processed_main_data()
        self.inject_test_data()
        self.create_all_subject_list()
        for i in range(1, self.tag_count + 1):
            # Look each hook up right before calling it, exactly as the
            # original explicit ``self.CREATE_TAG1()`` style calls did.
            getattr(self, 'CREATE_TAG{}'.format(i))()
            getattr(self, 'FILTER_SUBJECT_BY_TAG{}'.format(i))()
        self.finalize_data()

    def _pandas_output_setting(self):
        '''Set pandas output display setting'''
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 180)

    @abstractmethod
    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        pass

    @abstractmethod
    def import_processed_main_data(self):
        '''Import processed main data'''
        pass

    def inject_test_data(self):
        '''Optional hook for unit tests: inject mock cases that are known to
        fulfill/fail the defined subject selection criteria. No-op by default.'''
        pass

    def create_all_subject_list(self):
        '''Optional hook: gather all the unique subject ids from all datasets
        and create a full subject list. No-op by default.'''
        pass

    # Default no-op hooks for the first three tag/filter rounds.
    def CREATE_TAG1(self): pass
    def FILTER_SUBJECT_BY_TAG1(self): pass
    def CREATE_TAG2(self): pass
    def FILTER_SUBJECT_BY_TAG2(self): pass
    def CREATE_TAG3(self): pass
    def FILTER_SUBJECT_BY_TAG3(self): pass

    def finalize_data(self):
        '''Optional hook called once after all tag/filter rounds. No-op by default.'''
        pass

class SubjectGrouping_Project1(SubjectGrouping, data_prep.DataPreparation_Project1):
    '''Project 1 implementation of the subject selection template.

    Supplies the two abstract hooks of ``SubjectGrouping``: data preparation
    (delegated to ``data_preparation_steps``) and loading of the prepared
    dataframes.
    '''

    def __init__(self):
        # Dataframe placeholders, filled in later.
        self.df_dad = self.df_pc = self.df_nacrs = self.df_pin = self.df_reg = None
        # Placeholders for the final subject groups.
        self.df_final_subject_group1 = None
        self.df_final_subject_group2 = None
        self.df_final_subject_group3 = None
        # Run-time switches for this project.
        self.control_panel = {
            # WARNING: enabling this will overwrite existing files
            'save_file_switch': False,
            # WARNING: meant to be True only while testing
            'df_subsampling_switch': True,
            'df_subsampling_n': 8999,
            'random_seed': 888,
            'df_remove_dup_switch': True,
            'parse_date_switch': True,
            'result_printout_switch': True,
            'comp_loc': 'office',
            # To be implemented: show df length before and after record removal
            'show_df_n_switch': False,
            'done_switch': False,
            }

    def run_data_preparation(self):
        '''Run the data preparation steps.'''
        self.data_preparation_steps()

    def import_processed_main_data(self):
        '''Load the prepared dataframes via ``PreppedDataImport_Project1``.'''
        importer = prepped_data_import.PreppedDataImport_Project1()
        importer.data_preparation_steps()
        importer.prepped_data_import_steps()
        df_dict = importer.return_all_dfs()
        # NOTE(review): `df_p` is assigned twice (both from 'DF_P'), and these
        # attribute names differ from the df_dad/df_pc/df_nacrs/df_pin/df_reg
        # placeholders set in __init__ - confirm the intended names and keys.
        frames = [df_dict[key] for key in ('DF_D', 'DF_P', 'DF_N', 'DF_P', 'DF_R')]
        self.df_d, self.df_p, self.df_n, self.df_p, self.df_r = frames
        del importer

# Script entry point: build the Project 1 grouping and run its full
# subject selection pipeline.
if __name__=='__main__':
    x = SubjectGrouping_Project1()
    x.subject_selection_steps()

考虑一个过滤器模式。它基本上允许根据定义的过滤器过滤对象列表,您可以在以后轻松引入新的过滤器,只需对代码进行最少的更改。

创建一个 Criteria 接口或抽象类。

class Criteria:
    '''Base type for filters; subclasses must override ``filter``.'''

    def filter(self, request):
        '''Return the filtered version of ``request``.

        Raises:
            NotImplementedError: always, in this base class.
        '''
        message = "Should have implemented this"
        raise NotImplementedError(message)

并让每个过滤器都继承 Criteria 类。假设其中一个过滤器是年龄过滤器:

class AgeFilter(Criteria):
    '''Filter that keeps items whose ``age`` is strictly above a threshold.'''

    def __init__(self, age=20):
        '''Store the age threshold (exclusive lower bound).'''
        self.age = age

    def filter(self, list):
        '''Return a new list of the items in ``list`` with ``item.age > self.age``.

        The input is not modified. Each item must expose an ``age`` attribute.
        '''
        # The parameter name `list` shadows the builtin; it is kept so that
        # existing keyword callers keep working. Iterate the argument itself -
        # the instance has no `list` attribute.
        return [item for item in list if item.age > self.age]

类似地,您可以通过继承 Criteria 接口来定义其他过滤器,例如 DiseaseFilter、GenderFilter。

您还可以通过定义 And、Or 过滤器对过滤器进行逻辑组合。例如:

class AndFilter(Criteria):
    '''Combine two filters so that an item must pass both of them.'''

    def __init__(self, filter1, filter2):
        '''Store the two filters; ``filter1`` is applied before ``filter2``.'''
        self.filter1 = filter1
        self.filter2 = filter2

    def filter(self, list):
        '''Return ``list`` filtered by ``filter1`` and then by ``filter2``.'''
        # The filters live on the instance; referencing them as bare names
        # would raise NameError.
        passed_first = self.filter1.filter(list)
        return self.filter2.filter(passed_first)

假设您已经定义了过滤器,之后您的 subject_selection_steps 方法将如下所示,

def subject_selection_steps(self):
    '''Apply each configured filter in turn and return what remains.

    The filters (``ageFilter1``, ``maleFilter``, ``MalariaAndJaundiceFilter``)
    and the starting collection ``personList`` are expected to be defined
    elsewhere; they are not created here.
    '''
    # Ordered pipeline of filters; add or remove entries per project.
    pipeline = [ageFilter1, maleFilter, MalariaAndJaundiceFilter]
    remaining = personList

    # Each filter receives the output of the previous one.
    for step in pipeline:
        remaining = step.filter(remaining)

    return remaining