Making predictions with Azure Machine Learning with new data that contains headers (like pd.DataFrame)
My question is somewhat related to https://docs.microsoft.com/en-us/answers/questions/217305/data-input-format-call-the-service-for-azure-ml-ti.html - however, the solution provided there does not seem to work.
I am building a simple model on the heart-disease dataset, but I wrap it in a Pipeline since I use some featurization steps (scaling, encoding, etc.). Here is the full script:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pickle
# data input
df = pd.read_csv('heart.csv')
# numerical variables
num_cols = ['age',
'trestbps',
'chol',
'thalach',
'oldpeak'
]
# categorical variables
cat_cols = ['sex',
'cp',
'fbs',
'restecg',
'exang',
'slope',
'ca',
'thal']
# changing format of the categorical variables
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('object'))
# target variable
y = df['target']
# features
X = df.drop(['target'], axis=1)
# data split:
# random seed
np.random.seed(42)
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
stratify=y)
# double check
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# pipeline for numerical data
num_preprocessing = Pipeline([('num_imputer', SimpleImputer(strategy='mean')), # imputing with mean
('minmaxscaler', MinMaxScaler())]) # scaling
# pipeline for categorical data
cat_preprocessing = Pipeline([('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')), # filling missing values
('onehot', OneHotEncoder(drop='first', handle_unknown='error'))]) # One Hot Encoding
# preprocessor - combining pipelines
preprocessor = ColumnTransformer([
('categorical', cat_preprocessing, cat_cols),
('numerical', num_preprocessing, num_cols)
])
# initial model parameters
log_ini_params = {'penalty': 'l2',
'tol': 0.0073559740277086005,
'C': 1.1592424247511928,
'fit_intercept': True,
'solver': 'liblinear'}
# model - Pipeline
log_clf = Pipeline([('preprocessor', preprocessor),
('clf', LogisticRegression(**log_ini_params))])
log_clf.fit(X_train, y_train)
# dumping the model
f = 'model/log.pkl'
with open(f, 'wb') as file:
    pickle.dump(log_clf, file)
# loading it
loaded_model = joblib.load(f)
# double check on a single datapoint
new_data = pd.DataFrame({'age': 71,
'sex': 0,
'cp': 0,
'trestbps': 112,
'chol': 203,
'fbs': 0,
'restecg': 1,
'thalach': 185,
'exang': 0,
'oldpeak': 0.1,
'slope': 2,
'ca': 0,
'thal': 2}, index=[0])
loaded_model.predict(new_data)
...and it works fine. Then I deploy the model to an Azure web service using the following steps:
- I create the score.py file:
import joblib
from azureml.core.model import Model
import json
def init():
    global model
    model_path = Model.get_model_path('log') # logistic
    print('Model Path is ', model_path)
    model = joblib.load(model_path)

def run(data):
    try:
        data = json.loads(data)
        result = model.predict(data['data'])
        # any data type, as long as it is JSON serializable.
        return {'data' : result.tolist() , 'message' : 'Successfully classified heart diseases'}
    except Exception as e:
        error = str(e)
        return {'data' : error , 'message' : 'Failed to classify heart diseases'}
- I deploy the model:
from azureml.core import Workspace
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.model import Model
from azureml.core.conda_dependencies import CondaDependencies
ws = Workspace.from_config()
model = Model.register(workspace = ws,
model_path ='model/log.pkl',
model_name = 'log',
tags = {'version': '1'},
description = 'Heart disease classification',
)
# to install required packages
env = Environment('env')
cd = CondaDependencies.create(pip_packages=['pandas==1.1.5', 'azureml-defaults','joblib==0.17.0'], conda_packages = ['scikit-learn==0.23.2'])
env.python.conda_dependencies = cd
# Register environment to re-use later
env.register(workspace = ws)
print('Registered Environment')
myenv = Environment.get(workspace=ws, name='env')
myenv.save_to_directory('./environ', overwrite=True)
aciconfig = AciWebservice.deploy_configuration(
cpu_cores=1,
memory_gb=1,
tags={'data':'heart disease classifier'},
description='Classification of heart diseases',
)
inference_config = InferenceConfig(entry_script='score.py', environment=myenv)
service = Model.deploy(workspace=ws,
name='hd-model-log',
models=[model],
inference_config=inference_config,
deployment_config=aciconfig,
overwrite = True)
service.wait_for_deployment(show_output=True)
url = service.scoring_uri
print(url)
The deployment goes fine:
Succeeded
ACI service creation operation finished, operation "Succeeded"
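As a side note (not part of the original steps; service is the object returned by Model.deploy above): even after a "Succeeded" deployment, the scoring container's logs are the quickest way to diagnose errors that only show up at call time:
# Print the scoring container's logs to see server-side tracebacks.
print(service.get_logs())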
However, I cannot make any predictions with new data. I tried:
import pandas as pd
new_data = pd.DataFrame([[71, 0, 0, 112, 203, 0, 1, 185, 0, 0.1, 2, 0, 2],
[80, 0, 0, 115, 203, 0, 1, 185, 0, 0.1, 2, 0, 0]],
columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'])
Following the answer from this thread (https://docs.microsoft.com/en-us/answers/questions/217305/data-input-format-call-the-service-for-azure-ml-ti.html), I convert the data:
test_sample = json.dumps({'data': new_data.to_dict(orient='records')})
and try to make some predictions:
import json
import requests
data = test_sample
headers = {'Content-Type':'application/json'}
r = requests.post(url, data=data, headers = headers)
print(r.status_code)
print(r.json())
However, I get an error:
200
{'data': "Expected 2D array, got 1D array instead:\narray=[{'age': 71, 'sex': 0, 'cp': 0, 'trestbps': 112, 'chol': 203, 'fbs': 0, 'restecg': 1, 'thalach': 185, 'exang': 0, 'oldpeak': 0.1, 'slope': 2, 'ca': 0, 'thal': > 2}\n {'age': 80, 'sex': 0, 'cp': 0, 'trestbps': 115, 'chol': 203, 'fbs': 0, 'restecg': 1, 'thalach': 185, 'exang': 0, 'oldpeak': 0.1, 'slope': 2, 'ca': 0, 'thal': 0}].\nReshape your data either using array.reshape(-1, > 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.", 'message': 'Failed to classify heart diseases'}
How do I adjust the input data for predictions in this form, and how do I add other outputs such as predict_proba, so that I can store them in a separate output dataset?
I know the error is somehow related to the 'run' section of the score.py file, or to the last code cell that calls the web service, but I cannot locate it.
Thank you very much for your help.
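For context on where the error comes from: run passes the list of record dicts straight to model.predict, and scikit-learn coerces a list of dicts into a 1D object array, hence the "Expected 2D array" message. A minimal sketch of a corrected run (a sketch only, assuming the payload keeps the {'data': [...]} records shape used above) rebuilds a DataFrame first, which also makes it easy to return predict_proba:
import json
import pandas as pd

def run(data):
    try:
        records = json.loads(data)['data']
        df_in = pd.DataFrame(records)        # 2D again, with named columns for the ColumnTransformer
        preds = model.predict(df_in)
        probas = model.predict_proba(df_in)  # the extra output asked about above
        return {'predictions': preds.tolist(),
                'probabilities': probas.tolist(),
                'message': 'Successfully classified heart diseases'}
    except Exception as e:
        return {'data': str(e), 'message': 'Failed to classify heart diseases'}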
The main problem is the transformation of the categorical variables. The traditional way of handling categorical variables is using OneHotEncoder:
# changing format of the categorical variables
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('object'))
The transformation of the data needs to be applied as below:
from sklearn.preprocessing import MinMaxScaler
cat_col =['sex',
'cp',
'fbs',
'restecg',
'exang',
'slope',
'ca',
'thal']
df_2 = pd.get_dummies(df[cat_col], drop_first = True)
Columns of [0, 1] values will be formed after the dummies are applied, and then
new_data = pd.DataFrame([[71, 0, 0, 112, 203, 0, 1, 185, 0, 0.1, 2, 0, 2],
[80, 0, 0, 115, 203, 0, 1, 185, 0, 0.1, 2, 0, 0]],
columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'])
this can be applied with fewer syntax changes.
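One caveat with the get_dummies route (a sketch, assuming df_2 above holds the training-time dummy columns): dummies computed on a small scoring batch can miss categories that were present in training, so the new frame should be reindexed against the training columns:
# Cast to object so get_dummies encodes these integer-coded columns, then
# align with the training-time dummy columns, filling absent categories with 0.
new_cat = pd.get_dummies(new_data[cat_col].astype('object'), drop_first=True)
new_cat = new_cat.reindex(columns=df_2.columns, fill_value=0)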
Edit:
new_data = {
"Inputs": {
"data": [
{
"age": 71,
"sex": "0",
"cp": "0",
"trestbps": 112,
"chol": 203,
"fbs": "0",
"restecg": "1",
"thalach": 185,
"exang": "0",
"oldpeak": 0.1,
"slope": "2",
"ca": "0",
"thal": "2"
}
]
}
}
I believe I managed to solve the problem - although it took some serious struggle. :)
- As described here, I edited the score.py script:
import joblib
from azureml.core.model import Model
import numpy as np
import json
import pandas as pd
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType
data_sample = PandasParameterType(pd.DataFrame({'age': pd.Series([0], dtype='int64'),
'sex': pd.Series(['example_value'], dtype='object'),
'cp': pd.Series(['example_value'], dtype='object'),
'trestbps': pd.Series([0], dtype='int64'),
'chol': pd.Series([0], dtype='int64'),
'fbs': pd.Series(['example_value'], dtype='object'),
'restecg': pd.Series(['example_value'], dtype='object'),
'thalach': pd.Series([0], dtype='int64'),
'exang': pd.Series(['example_value'], dtype='object'),
'oldpeak': pd.Series([0.0], dtype='float64'),
'slope': pd.Series(['example_value'], dtype='object'),
'ca': pd.Series(['example_value'], dtype='object'),
'thal': pd.Series(['example_value'], dtype='object')}))
input_sample = StandardPythonParameterType({'data': data_sample})
result_sample = NumpyParameterType(np.array([0]))
output_sample = StandardPythonParameterType({'Results':result_sample})
def init():
    global model
    # Example when the model is a file
    model_path = Model.get_model_path('log') # logistic
    print('Model Path is ', model_path)
    model = joblib.load(model_path)

@input_schema('Inputs', input_sample)
@output_schema(output_sample)
def run(Inputs):
    try:
        data = Inputs['data']
        result = model.predict_proba(data)
        return result.tolist()
    except Exception as e:
        error = str(e)
        return error
- In the deployment step, I adjusted the CondaDependencies:
# to install required packages
env = Environment('env')
cd = CondaDependencies.create(pip_packages=['pandas==1.1.5', 'azureml-defaults','joblib==0.17.0', 'inference-schema==1.3.0'], conda_packages = ['scikit-learn==0.22.2.post1'])
env.python.conda_dependencies = cd
# Register environment to re-use later
env.register(workspace = ws)
print('Registered Environment')
because:
a) inference-schema needs to be included in the dependencies;
b) due to this issue, I downgraded scikit-learn to the scikit-learn==0.22.2.post1 version.
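A side benefit worth noting (an aside; service is the object returned by Model.deploy above): once the inference-schema decorators are in place, the deployed service publishes a swagger document describing the exact JSON shape it expects, which is handy for double-checking the payload:
# The swagger document is generated from the input/output schema decorators.
print(service.swagger_uri)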
Now, when I feed the model new data:
new_data = {
"Inputs": {
"data": [
{
"age": 71,
"sex": "0",
"cp": "0",
"trestbps": 112,
"chol": 203,
"fbs": "0",
"restecg": "1",
"thalach": 185,
"exang": "0",
"oldpeak": 0.1,
"slope": "2",
"ca": "0",
"thal": "2"
}
]
}
}
and use it to make a prediction:
import json
import requests
data = new_data
headers = {'Content-Type':'application/json'}
r = requests.post(url, str.encode(json.dumps(data)), headers = headers)
print(r.status_code)
print(r.json())
I get:
200 [[0.02325369841858338, 0.9767463015814166]]
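To close the loop on the original question about storing predict_proba outputs in a separate output dataset, the returned lists convert straight into a DataFrame (a sketch; the column names are my own assumption, each inner list being [P(class 0), P(class 1)]):
import pandas as pd

proba_df = pd.DataFrame(r.json(), columns=['proba_no_disease', 'proba_disease'])
proba_df.to_csv('predictions_proba.csv', index=False)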
Phew! Maybe someone will benefit from my painful learning path! :)