Vertex Pipelines:CustomPythonPackageTrainingJobRunOp 未提供 WorkerPoolSpecs
Vertex Pipeline: CustomPythonPackageTrainingJobRunOp not supplying WorkerPoolSpecs
我正在尝试在 Vertex AI 上使用 Kubeflow Pipelines 运行一个自定义 Python 包训练管道。我已将训练代码打包并存放在 Google Cloud Storage 中,我的管道定义如下:
import kfp
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google import experimental
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
# Pipeline with three steps: run a custom Python-package training job,
# upload the resulting model, then deploy it to an endpoint.
#
# NOTE: the training op below sets neither replica_count nor machine_type.
# This is the configuration reported to fail with
# "At least one worker pool should be specified."
@kfp.dsl.pipeline(name=pipeline_name, pipeline_root=pipeline_root_path)
def pipeline():
    training_job_run_op = gcc_aip.CustomPythonPackageTrainingJobRunOp(
        project=project_id,
        display_name=training_job_name,
        model_display_name=model_display_name,
        python_package_gcs_uri=python_package_gcs_uri,
        python_module=python_module,
        container_uri=container_uri,
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=model_serving_container_image_uri)

    # Upload model
    model_upload_op = gcc_aip.ModelUploadOp(
        project=project_id,
        display_name=model_display_name,
        artifact_uri=output_dir,
        serving_container_image_uri=model_serving_container_image_uri,
    )
    # The upload step takes no output of the training step as input, so the
    # ordering between the two is declared explicitly.
    model_upload_op.after(training_job_run_op)

    # Deploy model
    model_deploy_op = gcc_aip.ModelDeployOp(
        project=project_id,
        model=model_upload_op.outputs["model"],
        # The endpoint resource name is resolved here, while the pipeline
        # function body is being executed, not inside a pipeline step.
        endpoint=aiplatform.Endpoint(
            endpoint_name='0000000000').resource_name,
        deployed_model_display_name=model_display_name,
        machine_type="n1-standard-2",
        traffic_percentage=100)
# Compile the pipeline function and write the result to pipeline_spec_path.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=pipeline_spec_path,
)
当我尝试在 Vertex AI 上运行此管道时,出现以下错误:
{
"insertId": "qd9wxrfnoviyr",
"jsonPayload": {
"levelname": "ERROR",
"message": "google.api_core.exceptions.InvalidArgument: 400 List of found errors:\t1.Field: job_spec.worker_pool_specs; Message: At least one worker pool should be specified.\t\n"
}
}
我原来的 CustomPythonPackageTrainingJobRunOp 没有定义 worker_pool_spec,这就是出错的原因。在我指定 replica_count 和 machine_type 之后,错误就解决了。最终的训练操作是:
# Final training op: same call as before, with base_output_dir,
# replica_count and machine_type now passed explicitly.
training_job_run_op = gcc_aip.CustomPythonPackageTrainingJobRunOp(
    project=project_id,
    display_name=training_job_name,
    model_display_name=model_display_name,
    python_package_gcs_uri=python_package_gcs_uri,
    python_module=python_module,
    container_uri=container_uri,
    staging_bucket=staging_bucket,
    base_output_dir=output_dir,
    model_serving_container_image_uri=model_serving_container_image_uri,
    # Specifying these two arguments is what resolved the
    # "At least one worker pool should be specified" error.
    replica_count=1,
    machine_type="n1-standard-4",
)
我正在尝试在 Vertex AI 上使用 Kubeflow Pipelines 运行一个自定义 Python 包训练管道。我已将训练代码打包并存放在 Google Cloud Storage 中,我的管道定义如下:
import kfp
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google import experimental
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
# Pipeline with three steps: run a custom Python-package training job,
# upload the resulting model, then deploy it to an endpoint.
#
# NOTE: the training op below sets neither replica_count nor machine_type.
# This is the configuration reported to fail with
# "At least one worker pool should be specified."
@kfp.dsl.pipeline(name=pipeline_name, pipeline_root=pipeline_root_path)
def pipeline():
    training_job_run_op = gcc_aip.CustomPythonPackageTrainingJobRunOp(
        project=project_id,
        display_name=training_job_name,
        model_display_name=model_display_name,
        python_package_gcs_uri=python_package_gcs_uri,
        python_module=python_module,
        container_uri=container_uri,
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=model_serving_container_image_uri)

    # Upload model
    model_upload_op = gcc_aip.ModelUploadOp(
        project=project_id,
        display_name=model_display_name,
        artifact_uri=output_dir,
        serving_container_image_uri=model_serving_container_image_uri,
    )
    # The upload step takes no output of the training step as input, so the
    # ordering between the two is declared explicitly.
    model_upload_op.after(training_job_run_op)

    # Deploy model
    model_deploy_op = gcc_aip.ModelDeployOp(
        project=project_id,
        model=model_upload_op.outputs["model"],
        # The endpoint resource name is resolved here, while the pipeline
        # function body is being executed, not inside a pipeline step.
        endpoint=aiplatform.Endpoint(
            endpoint_name='0000000000').resource_name,
        deployed_model_display_name=model_display_name,
        machine_type="n1-standard-2",
        traffic_percentage=100)
# Compile the pipeline function and write the result to pipeline_spec_path.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=pipeline_spec_path,
)
当我尝试在 Vertex AI 上运行此管道时,出现以下错误:
{
"insertId": "qd9wxrfnoviyr",
"jsonPayload": {
"levelname": "ERROR",
"message": "google.api_core.exceptions.InvalidArgument: 400 List of found errors:\t1.Field: job_spec.worker_pool_specs; Message: At least one worker pool should be specified.\t\n"
}
}
我原来的 CustomPythonPackageTrainingJobRunOp 没有定义 worker_pool_spec,这就是出错的原因。在我指定 replica_count 和 machine_type 之后,错误就解决了。最终的训练操作是:
# Final training op: same call as before, with base_output_dir,
# replica_count and machine_type now passed explicitly.
training_job_run_op = gcc_aip.CustomPythonPackageTrainingJobRunOp(
    project=project_id,
    display_name=training_job_name,
    model_display_name=model_display_name,
    python_package_gcs_uri=python_package_gcs_uri,
    python_module=python_module,
    container_uri=container_uri,
    staging_bucket=staging_bucket,
    base_output_dir=output_dir,
    model_serving_container_image_uri=model_serving_container_image_uri,
    # Specifying these two arguments is what resolved the
    # "At least one worker pool should be specified" error.
    replica_count=1,
    machine_type="n1-standard-4",
)