如何使用带有列名(headers,如 pd.DataFrame)的新数据,通过 Azure 机器学习进行预测

Making predictions with Azure Machine Learning with new data that contains headers (like pd.DataFrame)

我的问题在某种程度上与 https://docs.microsoft.com/en-us/answers/questions/217305/data-input-format-call-the-service-for-azure-ml-ti.html 有关 - 但是,提供的解决方案似乎不起作用。

我正在使用 heart-disease 数据集构建一个简单模型,但我将其包装到管道中,因为我使用了一些特征化步骤(缩放、编码等)。下面是完整的脚本:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pickle

# data input
# Reads the heart-disease dataset from a CSV file in the working directory.
df = pd.read_csv('heart.csv')

# numerical variables
# These column names are later routed to the numerical preprocessing pipeline.
num_cols = ['age',
            'trestbps',
            'chol',
            'thalach',
            'oldpeak'
]

# categorical variables
# These column names are later routed to the categorical (one-hot) pipeline.
cat_cols = ['sex',
            'cp',
            'fbs',
            'restecg',
            'exang',
            'slope',
            'ca',
            'thal']

# changing format of the categorical variables
# Every categorical column is cast to the 'object' dtype before training, so
# the fitted pipeline sees these columns as objects rather than integers.
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('object'))

# target variable
y = df['target']

# features
# Everything except the 'target' column is used as model input.
X = df.drop(['target'], axis=1)

# data split:

# random seed
# train_test_split below is called without random_state, so reproducibility
# of the split relies on this global NumPy seed.
np.random.seed(42)

# splitting the data
# 20% of the rows are held out; stratify=y is passed so the split is
# stratified on the target labels.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y)

# double check
# Bare expression: only displays the shapes when run as a notebook cell.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# pipeline for numerical data
num_preprocessing = Pipeline([('num_imputer', SimpleImputer(strategy='mean')), # imputing with mean
                                                   ('minmaxscaler', MinMaxScaler())]) # scaling

# pipeline for categorical data
# Missing categories are replaced by the literal string 'missing' before encoding.
cat_preprocessing = Pipeline([('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')), # filling missing values
                                                ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))]) # One Hot Encoding

# preprocessor - combining pipelines
# Columns are selected by NAME (cat_cols / num_cols are lists of strings), so
# whatever is passed to fit/predict must carry these column names, i.e. be a
# pandas DataFrame rather than a bare array or a list of dicts.
preprocessor = ColumnTransformer([
                                  ('categorical', cat_preprocessing, cat_cols),
                                  ('numerical', num_preprocessing, num_cols)
                                                           ])

# initial model parameters
# NOTE(review): the tol and C values look like the output of an earlier
# hyper-parameter search; that search is not shown here.
log_ini_params = {'penalty': 'l2', 
                  'tol': 0.0073559740277086005, 
                  'C': 1.1592424247511928, 
                  'fit_intercept': True, 
                  'solver': 'liblinear'}

# model - Pipeline
# Preprocessing and classifier are bundled so the persisted object accepts
# raw feature columns at prediction time.
log_clf = Pipeline([('preprocessor', preprocessor),
                  ('clf', LogisticRegression(**log_ini_params))])

log_clf.fit(X_train, y_train)

# dumping the model
# The file is read back with joblib.load (both below and in score.py), so it
# is written with joblib.dump as well to keep writer and reader symmetric.
f = 'model/log.pkl'
joblib.dump(log_clf, f)

# loading it
loaded_model = joblib.load(f)

# double check on a single datapoint
# One record expressed as a dict of column name -> value; wrapping it in a
# list yields a one-row DataFrame whose columns carry the training names.
new_data = pd.DataFrame(
    [
        {
            'age': 71,
            'sex': 0,
            'cp': 0,
            'trestbps': 112,
            'chol': 203,
            'fbs': 0,
            'restecg': 1,
            'thalach': 185,
            'exang': 0,
            'oldpeak': 0.1,
            'slope': 2,
            'ca': 0,
            'thal': 2,
        }
    ],
    index=[0],
)

loaded_model.predict(new_data)

...而且效果很好。然后我使用以下步骤将模型部署到 Azure Web 服务:

  1. 我创建 score.py 文件
import joblib
from azureml.core.model import Model
import json

def init():
    """Load the model registered as 'log' into the module-level ``model``.

    The path is resolved through ``Model.get_model_path`` and the file is
    deserialized with ``joblib.load``; ``run`` reads the resulting global.
    """
    global model
    registered_path = Model.get_model_path('log')  # logistic
    print('Model Path is  ', registered_path)
    model = joblib.load(registered_path)


def run(data):
    """Score one request and return a JSON-serializable dict.

    ``data`` is the raw JSON request body. Its ``'data'`` entry is handed to
    ``model.predict`` unchanged. Any exception is caught and reported in the
    returned dict instead of being raised, so the caller always receives a
    normal response.

    NOTE(review): when the client sends ``to_dict(orient='records')``, the
    ``'data'`` entry is a list of dicts, not a DataFrame; the pipeline then
    fails with "Expected 2D array, got 1D array instead".
    """
    try:
        payload = json.loads(data)
        predictions = model.predict(payload['data']).tolist()
        # any data type, as long as it is JSON serializable.
        return {'data': predictions, 'message': 'Successfully classified heart diseases'}
    except Exception as exc:
        return {'data': str(exc), 'message': 'Failed to classify heart diseases'}
  2. 我部署模型:
from azureml.core import Workspace
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core.conda_dependencies import CondaDependencies

# Connect to the Azure ML workspace described by the local config file.
ws = Workspace.from_config()

# Register the pickled pipeline. The model_name 'log' must match the name
# passed to Model.get_model_path('log') in score.py.
model = Model.register(workspace = ws,
              model_path ='model/log.pkl',
              model_name = 'log',
              tags = {'version': '1'},
              description = 'Heart disease classification',
              )

# to install required packages
# Versions are pinned so the service unpickles the model with the same
# libraries (presumably those used for training; confirm locally).
env = Environment('env')
cd = CondaDependencies.create(pip_packages=['pandas==1.1.5', 'azureml-defaults','joblib==0.17.0'], conda_packages = ['scikit-learn==0.23.2'])
env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = ws)
print('Registered Environment')

# Fetch the environment back by name and keep a local copy of its definition.
myenv = Environment.get(workspace=ws, name='env')

myenv.save_to_directory('./environ', overwrite=True)

# Azure Container Instance sizing: 1 CPU core and 1 GB of memory.
aciconfig = AciWebservice.deploy_configuration(
            cpu_cores=1,
            memory_gb=1,
            tags={'data':'heart disease classifier'},
            description='Classification of heart diseases',
            )

# score.py supplies the init()/run() entry points for the service.
inference_config = InferenceConfig(entry_script='score.py', environment=myenv)

# overwrite=True is passed together with the fixed service name 'hd-model-log'.
service = Model.deploy(workspace=ws,
                name='hd-model-log',
                models=[model],
                inference_config=inference_config,
                deployment_config=aciconfig, 
                overwrite = True)

# Wait for the deployment, then expose the scoring endpoint URL that the
# client snippets post to.
service.wait_for_deployment(show_output=True)
url = service.scoring_uri
print(url)

部署正常:

Succeeded ACI service creation operation finished, operation "Succeeded"

但我无法用新数据做出任何预测。我尝试使用:

import pandas as pd

new_data = pd.DataFrame([[71, 0, 0, 112, 203, 0, 1, 185, 0, 0.1, 2, 0, 2],
                         [80, 0, 0, 115, 203, 0, 1, 185, 0, 0.1, 2, 0, 0]],
                         columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'])

根据这个主题的答案 (https://docs.microsoft.com/en-us/answers/questions/217305/data-input-format-call-the-service-for-azure-ml-ti.html) 我转换数据:

test_sample = json.dumps({'data': new_data.to_dict(orient='records')})

并尝试做出一些预测:

import json
import requests
# test_sample is already a JSON string, so it is sent as the raw request body.
data = test_sample
headers = {'Content-Type':'application/json'}
r = requests.post(url, data=data, headers = headers)
# A 200 status does not imply success here: run() in score.py catches every
# exception and returns the error text inside a normal response.
print(r.status_code)
print(r.json())

但是,我遇到了一个错误:

200 {'data': "Expected 2D array, got 1D array instead:\narray=[{'age': 71, 'sex': 0, 'cp': 0, 'trestbps': 112, 'chol': 203, 'fbs': 0, 'restecg': 1, 'thalach': 185, 'exang': 0, 'oldpeak': 0.1, 'slope': 2, 'ca': 0, 'thal': 2}\n {'age': 80, 'sex': 0, 'cp': 0, 'trestbps': 115, 'chol': 203, 'fbs': 0, 'restecg': 1, 'thalach': 185, 'exang': 0, 'oldpeak': 0.1, 'slope': 2, 'ca': 0, 'thal': 0}].\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.", 'message': 'Failed to classify heart diseases'}

如何将输入数据调整为这种形式的预测并添加其他输出,如 predict_proba,以便我可以将它们存储在单独的输出数据集中?

我知道这个错误与 score.py 文件中的 run 函数,或者与调用 Web 服务的最后一个代码单元有某种关系,但我无法找到具体原因。

非常感谢您的帮助。

主要问题是分类变量的转换。处理分类变量的传统方法是使用 OneHotEncoder

# changing format of the categorical variables
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('object'))

转换数据需要像下面提到的那样应用:

from sklearn.preprocessing import MinMaxScaler
# Categorical feature columns to be expanded into dummy (0/1) indicator columns.
cat_col =['sex',
            'cp',
            'fbs',
            'restecg',
            'exang',
            'slope',
            'ca',
            'thal']

# `df` is the DataFrame whose categorical columns were cast to object above;
# drop_first=True drops the first level of each variable.
df_2 = pd.get_dummies(df[cat_col], drop_first = True)

应用 get_dummies(虚拟变量编码)之后,会生成取值为 [0,1] 的列,然后

new_data = pd.DataFrame([[71, 0, 0, 112, 203, 0, 1, 185, 0, 0.1, 2, 0, 2],
                         [80, 0, 0, 115, 203, 0, 1, 185, 0, 0.1, 2, 0, 0]],
                         columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'])

这可以通过较少的语法更改来应用。

编辑:

new_data = {
  "Inputs": {
    "data": [
      {
        "age": 71,
        "sex": "0",
        "cp": "0",
        "trestbps": 112,
        "chol": 203,
        "fbs": "0",
        "restecg": "1",
        "thalach": 185,
        "exang": "0",
        "oldpeak": 0.1,
        "slope": "2",
        "ca": "0",
        "thal": "2"
      }
    ]
  }
}

我相信我设法解决了问题 - 尽管我遇到了一些严重的问题。 :)

  1. 如此处(here)所述,我编辑了 score.py 脚本:
import joblib
from azureml.core.model import Model
import numpy as np
import json
import pandas as pd
import numpy as np

from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType
    
# Sample DataFrame declaring the expected input columns and their dtypes.
# The categorical columns are declared as 'object', which is why the client
# payload sends them as strings; numeric columns are int64 / float64.
data_sample = PandasParameterType(pd.DataFrame({'age': pd.Series([0], dtype='int64'),
                                                'sex': pd.Series(['example_value'], dtype='object'),
                                                'cp': pd.Series(['example_value'], dtype='object'),
                                                'trestbps': pd.Series([0], dtype='int64'),
                                                'chol': pd.Series([0], dtype='int64'),
                                                'fbs': pd.Series(['example_value'], dtype='object'),
                                                'restecg': pd.Series(['example_value'], dtype='object'),
                                                'thalach': pd.Series([0], dtype='int64'),
                                                'exang': pd.Series(['example_value'], dtype='object'),
                                                'oldpeak': pd.Series([0.0], dtype='float64'),
                                                'slope': pd.Series(['example_value'], dtype='object'),
                                                'ca': pd.Series(['example_value'], dtype='object'),
                                                'thal': pd.Series(['example_value'], dtype='object')}))

# The request is a dict with a 'data' key holding the DataFrame; run() reads
# it as Inputs['data'].
input_sample = StandardPythonParameterType({'data': data_sample})
# Declared output: a dict with a 'Results' key holding a NumPy array.
result_sample = NumpyParameterType(np.array([0]))
output_sample = StandardPythonParameterType({'Results':result_sample})

def init():
    """Load the model registered as 'log' into the module-level ``model``.

    The path is resolved through ``Model.get_model_path`` and the file is
    deserialized with ``joblib.load``; ``run`` reads the resulting global.
    """
    global model
    # Example when the model is a file
    registered_path = Model.get_model_path('log')  # logistic
    print('Model Path is  ', registered_path)
    model = joblib.load(registered_path)

@input_schema('Inputs', input_sample)
@output_schema(output_sample)
def run(Inputs):
    """Return class probabilities for the rows supplied under ``Inputs['data']``.

    On success the result of ``model.predict_proba`` is returned as a nested
    list, one inner list per row. On any exception the error text is returned
    as a plain string instead of being raised.

    NOTE(review): the declared output sample is a dict with a 'Results' key,
    but this function returns a bare list (or a string on error); confirm
    whether the schema or the return value should change.
    """
    try:
        probabilities = model.predict_proba(Inputs['data'])
        return probabilities.tolist()
    except Exception as exc:
        return str(exc)
  2. 在部署步骤中我调整了 CondaDependencies:
# to install required packages
# Compared with the first deployment, 'inference-schema==1.3.0' is added (the
# revised score.py imports inference_schema) and scikit-learn is pinned to
# 0.22.2.post1 instead of 0.23.2.
env = Environment('env')
cd = CondaDependencies.create(pip_packages=['pandas==1.1.5', 'azureml-defaults','joblib==0.17.0', 'inference-schema==1.3.0'], conda_packages = ['scikit-learn==0.22.2.post1'])
env.python.conda_dependencies = cd
# Register environment to re-use later
env.register(workspace = ws)
print('Registered Environment')

原因如下:

a) 需要在依赖项(CondaDependencies)中包含 inference-schema;

b) 由于 this issue 中描述的问题,我将 scikit-learn 降级为 scikit-learn==0.22.2.post1 版本。

现在,当我为模型提供新数据时:

new_data = {
  "Inputs": {
    "data": [
      {
        "age": 71,
        "sex": "0",
        "cp": "0",
        "trestbps": 112,
        "chol": 203,
        "fbs": "0",
        "restecg": "1",
        "thalach": 185,
        "exang": "0",
        "oldpeak": 0.1,
        "slope": "2",
        "ca": "0",
        "thal": "2"
      }
    ]
  }
}

并将其用于预测:

import json
import requests
# new_data is a plain dict of the form {"Inputs": {"data": [...]}}; the
# top-level key matches the 'Inputs' name given to @input_schema in score.py.
data = new_data
headers = {'Content-Type':'application/json'}
# The dict is serialized to JSON and encoded to bytes for the request body.
r = requests.post(url, str.encode(json.dumps(data)), headers = headers)
print(r.status_code)
# The body is the nested list returned by run(): one probability list per row.
print(r.json())

我得到:

200 [[0.02325369841858338, 0.9767463015814166]]

呃!也许有人会从我痛苦的学习之路中受益! :)