作业云调度程序(Google Cloud)无法运行已调度的管道
Jobs-Cloud Scheduler (Google Cloud) fails to run scheduled pipelines
我来这里是因为我在 Google Cloud 中遇到了预定作业的问题。
在 Vertex AI Workbench 中,我用 Python 3 创建了一个笔记本,它构建了一个管道,使用公开信用卡数据集的数据来训练 AutoML 模型。
如果我在创建管道后立即运行作业,一切正常。但是,如果我按照文档所述在 Cloud Scheduler 中调度该作业,管道虽已启用,但运行失败。
这是我的代码:
import os
# import sys
import google.cloud.aiplatform as aip
import kfp
# from kfp.v2.dsl import component
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
# from kfp.v2.google.client import AIPlatformClient
# --- Project / environment configuration ---
PROJECT_ID = "fraud-detection-project-329506"
REGION = "us-central1"
# Local path to the service-account key file used for authentication.
credential_path = r"C:\Users\...\fraud-detection-project-329506-4d16889a494a.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
BUCKET_NAME = "gs://..."
SERVICE_ACCOUNT = "...@fraud-detection-project-329506.iam.gserviceaccount.com"
# Regional Vertex AI API endpoint, e.g. "us-central1-aiplatform.googleapis.com".
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)
# GCS folder that holds the input CSVs and acts as the pipeline root.
PIPELINE_ROOT = "{}/dataset".format(BUCKET_NAME)
aip.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)
# file names
TRAIN_FILE_NAME = "creditcard_train.csv"
TEST_FILE_NAME = "creditcard_test.csv"
# path for train and test dataset
gcs_csv_path_train = f"{PIPELINE_ROOT}/{TRAIN_FILE_NAME}"
gcs_csv_path_test = f"{PIPELINE_ROOT}/{TEST_FILE_NAME}"
#gcs location where the output is to be written to
gcs_destination_prefix = "{}/output".format(BUCKET_NAME)
@kfp.dsl.pipeline(name="automl-tab-training-v2")
def pipeline(project: str = PROJECT_ID):
    """Dataset creation -> AutoML tabular training -> batch prediction.

    Registers the training CSV as a Vertex AI tabular dataset, trains an
    AutoML classification model on it (target column ``Class``), then runs
    a batch prediction on the test CSV, writing results under
    ``gcs_destination_prefix``.
    """
    # Step 1: register the training CSV as a Vertex AI tabular dataset.
    dataset_create_op = gcc_aip.TabularDatasetCreateOp(
        project=project, display_name="creditcard", gcs_source=gcs_csv_path_train
    )

    # Every feature column is numeric: Time, V1..V28, Amount.
    feature_columns = ["Time"] + [f"V{i}" for i in range(1, 29)] + ["Amount"]
    transformations = [{"numeric": {"column_name": col}} for col in feature_columns]

    # Step 2: AutoML tabular classification training on the new dataset.
    training_op = gcc_aip.AutoMLTabularTrainingJobRunOp(
        project=project,
        display_name="train-automl-fraud-detection",
        optimization_prediction_type="classification",
        column_transformations=transformations,
        dataset=dataset_create_op.outputs["dataset"],
        target_column="Class",
        budget_milli_node_hours=1000,
    )

    # Step 3: batch prediction on the held-out test CSV using the trained model.
    batchprediction_op = gcc_aip.ModelBatchPredictOp(
        model=training_op.outputs["model"],
        job_display_name='prediction1',
        gcs_source=gcs_csv_path_test,
        project=project,
        machine_type="n1-standard-2",
        gcs_destination_prefix=gcs_destination_prefix,
    )
# Local output path for the compiled pipeline definition (JSON job spec).
COMPILED_PIPELINE_PATH = r"C:\Users\...\tabular_classification_pipeline.json"
# Cron expression: run every day at 05:05.
SCHEDULE = "5 5 * * *"
DISPLAY_NAME = 'fraud_detection'
# compile pipeline
compiler.Compiler().compile(
pipeline_func=pipeline,
package_path=COMPILED_PIPELINE_PATH,
)
# job run after its creation
# Submit the compiled spec as a one-off Vertex AI PipelineJob (blocks until done).
job = aip.PipelineJob(
display_name=DISPLAY_NAME,
template_path=COMPILED_PIPELINE_PATH,
pipeline_root=PIPELINE_ROOT,
)
job.run()
# api_client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)
# schedule training/prediction every day at a certain hour
# NOTE(review): this commented-out scheduler path deploys a Cloud Function behind
# the scenes; the 404 in the question's log points at that function being missing.
# api_client.create_schedule_from_job_spec(
# job_spec_path=COMPILED_PIPELINE_PATH,
# pipeline_root=PIPELINE_ROOT,
# schedule=SCHEDULE,
# )
查看错误日志,发现:
{
httpRequest: {
status: 404
}
insertId: "13yj575g2rylrz9"
jsonPayload: {
@type: "type.googleapis.com/google.cloud.scheduler.logging.AttemptFinished"
jobName: "projects/fraud-detection-project-329506/locations/us-central1/jobs/pipeline_pipeline_179e648c_0-11-a-a-a"
status: "NOT_FOUND"
targetType: "HTTP"
url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
}
logName: "projects/fraud-detection-project-329506/logs/cloudscheduler.googleapis.com%2Fexecutions"
receiveTimestamp: "2021-10-19T18:00:00.309225533Z"
resource: {
labels: {
job_id: "pipeline_pipeline_179e648c_0-11-a-a-a"
location: "us-central1"
project_id: "fraud-detection-project-329506"
}
type: "cloud_scheduler_job"
}
severity: "ERROR"
timestamp: "2021-10-19T18:00:00.309225533Z"
}
这是否意味着我必须在运行笔记本之前先创建该 URL?我不知道该如何继续。
提前谢谢你。
根据您分享的错误,显然 Cloud Function 无法创建作业。
status: "NOT_FOUND"
targetType: "HTTP"
url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
Cloud Function 方面的一个可能原因可能是 Cloud Build API 之前未在您的项目中使用或被禁用。你能检查它是否启用并重试吗?如果您最近启用了此 API,请等待几分钟让操作传播到系统并重试。
我来这里是因为我在 Google Cloud 中遇到了预定作业的问题。在 Vertex AI Workbench 中,我用 Python 3 创建了一个笔记本,它构建了一个管道,使用公开信用卡数据集的数据来训练 AutoML 模型。如果我在创建管道后立即运行作业,一切正常。但是,如果我按照文档所述在 Cloud Scheduler 中调度该作业,管道虽已启用,但运行失败。
这是我的代码:
import os
# import sys
import google.cloud.aiplatform as aip
import kfp
# from kfp.v2.dsl import component
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
# from kfp.v2.google.client import AIPlatformClient
# --- Project / environment configuration ---
PROJECT_ID = "fraud-detection-project-329506"
REGION = "us-central1"
# Local path to the service-account key file used for authentication.
credential_path = r"C:\Users\...\fraud-detection-project-329506-4d16889a494a.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
BUCKET_NAME = "gs://..."
SERVICE_ACCOUNT = "...@fraud-detection-project-329506.iam.gserviceaccount.com"
# Regional Vertex AI API endpoint, e.g. "us-central1-aiplatform.googleapis.com".
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)
# GCS folder that holds the input CSVs and acts as the pipeline root.
PIPELINE_ROOT = "{}/dataset".format(BUCKET_NAME)
aip.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)
# file names
TRAIN_FILE_NAME = "creditcard_train.csv"
TEST_FILE_NAME = "creditcard_test.csv"
# path for train and test dataset
gcs_csv_path_train = f"{PIPELINE_ROOT}/{TRAIN_FILE_NAME}"
gcs_csv_path_test = f"{PIPELINE_ROOT}/{TEST_FILE_NAME}"
#gcs location where the output is to be written to
gcs_destination_prefix = "{}/output".format(BUCKET_NAME)
@kfp.dsl.pipeline(name="automl-tab-training-v2")
def pipeline(project: str = PROJECT_ID):
    """Dataset creation -> AutoML tabular training -> batch prediction.

    Registers the training CSV as a Vertex AI tabular dataset, trains an
    AutoML classification model on it (target column ``Class``), then runs
    a batch prediction on the test CSV, writing results under
    ``gcs_destination_prefix``.
    """
    # Step 1: register the training CSV as a Vertex AI tabular dataset.
    dataset_create_op = gcc_aip.TabularDatasetCreateOp(
        project=project, display_name="creditcard", gcs_source=gcs_csv_path_train
    )

    # Every feature column is numeric: Time, V1..V28, Amount.
    feature_columns = ["Time"] + [f"V{i}" for i in range(1, 29)] + ["Amount"]
    transformations = [{"numeric": {"column_name": col}} for col in feature_columns]

    # Step 2: AutoML tabular classification training on the new dataset.
    training_op = gcc_aip.AutoMLTabularTrainingJobRunOp(
        project=project,
        display_name="train-automl-fraud-detection",
        optimization_prediction_type="classification",
        column_transformations=transformations,
        dataset=dataset_create_op.outputs["dataset"],
        target_column="Class",
        budget_milli_node_hours=1000,
    )

    # Step 3: batch prediction on the held-out test CSV using the trained model.
    batchprediction_op = gcc_aip.ModelBatchPredictOp(
        model=training_op.outputs["model"],
        job_display_name='prediction1',
        gcs_source=gcs_csv_path_test,
        project=project,
        machine_type="n1-standard-2",
        gcs_destination_prefix=gcs_destination_prefix,
    )
# Local output path for the compiled pipeline definition (JSON job spec).
COMPILED_PIPELINE_PATH = r"C:\Users\...\tabular_classification_pipeline.json"
# Cron expression: run every day at 05:05.
SCHEDULE = "5 5 * * *"
DISPLAY_NAME = 'fraud_detection'
# compile pipeline
compiler.Compiler().compile(
pipeline_func=pipeline,
package_path=COMPILED_PIPELINE_PATH,
)
# job run after its creation
# Submit the compiled spec as a one-off Vertex AI PipelineJob (blocks until done).
job = aip.PipelineJob(
display_name=DISPLAY_NAME,
template_path=COMPILED_PIPELINE_PATH,
pipeline_root=PIPELINE_ROOT,
)
job.run()
# api_client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)
# schedule training/prediction every day at a certain hour
# NOTE(review): this commented-out scheduler path deploys a Cloud Function behind
# the scenes; the 404 in the question's log points at that function being missing.
# api_client.create_schedule_from_job_spec(
# job_spec_path=COMPILED_PIPELINE_PATH,
# pipeline_root=PIPELINE_ROOT,
# schedule=SCHEDULE,
# )
查看错误日志,发现:
{
httpRequest: {
status: 404
}
insertId: "13yj575g2rylrz9"
jsonPayload: {
@type: "type.googleapis.com/google.cloud.scheduler.logging.AttemptFinished"
jobName: "projects/fraud-detection-project-329506/locations/us-central1/jobs/pipeline_pipeline_179e648c_0-11-a-a-a"
status: "NOT_FOUND"
targetType: "HTTP"
url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
}
logName: "projects/fraud-detection-project-329506/logs/cloudscheduler.googleapis.com%2Fexecutions"
receiveTimestamp: "2021-10-19T18:00:00.309225533Z"
resource: {
labels: {
job_id: "pipeline_pipeline_179e648c_0-11-a-a-a"
location: "us-central1"
project_id: "fraud-detection-project-329506"
}
type: "cloud_scheduler_job"
}
severity: "ERROR"
timestamp: "2021-10-19T18:00:00.309225533Z"
}
这是否意味着我必须在运行笔记本之前先创建该 URL?我不知道该如何继续。提前谢谢。
根据您分享的错误,显然 Cloud Function 无法创建作业。
status: "NOT_FOUND"
targetType: "HTTP"
url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
Cloud Function 方面的一个可能原因可能是 Cloud Build API 之前未在您的项目中使用或被禁用。你能检查它是否启用并重试吗?如果您最近启用了此 API,请等待几分钟让操作传播到系统并重试。