Airflow xcom pull only returns string
I have an Airflow pipeline where I need to get a filename from a Pub/Sub subscription and then import that file into a Cloud SQL instance. I use the CloudSqlInstanceImportOperator to import the CSV file. This operator needs a body that contains the filename and other parameters. Since I only know the filename at runtime, I also have to define the body at runtime. That all works. But when I pull the body from XCom, it returns a string instead of a Python dictionary, so the CloudSqlInstanceImportOperator gives me the following error (my guess is that this happens because the body is a string and not a dict):
Traceback (most recent call last):
  File "/usr/local/lib/airflow/airflow/models/taskinstance.py", line 984, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/airflow/airflow/contrib/operators/gcp_sql_operator.py", line 715, in execute
    self._validate_body_fields()
  File "/usr/local/lib/airflow/airflow/contrib/operators/gcp_sql_operator.py", line 712, in _validate_body_fields
    api_version=self.api_version).validate(self.body)
  File "/usr/local/lib/airflow/airflow/contrib/utils/gcp_field_validator.py", line 420, in validate
    dictionary_to_validate=body_to_validate)
  File "/usr/local/lib/airflow/airflow/contrib/utils/gcp_field_validator.py", line 341, in _validate_field
    value = dictionary_to_validate.get(field_name)
AttributeError: 'str' object has no attribute 'get'
Here is the code I'm using:
import json
import os
from datetime import datetime, timedelta
import ast

from airflow import DAG
from airflow.contrib.operators.gcs_to_gcs import GoogleCloudStorageToGoogleCloudStorageOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.sensors.pubsub_sensor import PubSubPullSensor
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStoragePrefixSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.contrib.operators.gcp_sql_operator import CloudSqlInstanceImportOperator


def create_dag(dag_id, default_args):
    BUCKET = "{{ var.value.gp2pg_bucket }}"
    GCP_PROJECT_ID = "{{ var.value.gp2pg_project_id }}"
    INSTANCE_NAME = "{{ var.value.gp2pg_instance_name }}"

    def define_import_body(file, **kwargs):
        import_body = {
            "importContext": {
                "importUser": "databasename",
                "database": "databaseuser",
                "fileType": "csv",
                "uri": "bucketname" + file,
                "csvImportOptions": {
                    "table": "schema.tablename",
                    "columns": ["columns1",
                                "column2"]}
            }
        }
        task_instance = kwargs['task_instance']
        task_instance.xcom_push(key='import_body', value=import_body)
        print(import_body)

    def get_filename(var, **kwargs):
        message = ast.literal_eval(var)
        file = message[0].get('message').get('attributes').get('objectId')
        task_instance = kwargs['task_instance']
        task_instance.xcom_push(key='filename', value=file)
        print(file)

    dag = DAG(dag_id=dag_id, schedule_interval=None, default_args=default_args)

    with dag:
        t1 = PubSubPullSensor(task_id='pull-messages',
                              project="projectname",
                              ack_messages=True,
                              max_messages=1,
                              subscription="subscribtionname")

        message = "{{ task_instance.xcom_pull() }}"

        t2 = PythonOperator(
            task_id='get_filename',
            python_callable=get_filename,
            op_kwargs={'var': message},
            provide_context=True,
        )

        file = "{{ task_instance.xcom_pull(task_ids='get_filename', key='filename') }}"

        t3 = PythonOperator(
            task_id='define_import_body',
            python_callable=define_import_body,
            op_kwargs={'file': file},
            provide_context=True,
        )

        import_body = "{{ task_instance.xcom_pull(task_ids='define_import_body', key='import_body') }}"

        t4 = CloudSqlInstanceImportOperator(
            project_id=GCP_PROJECT_ID,
            body=import_body,
            instance=INSTANCE_NAME,
            gcp_conn_id='postgres_default',
            task_id='sql_import_task',
            validate_body=True,
        )

        t5 = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id='copy_files',
            source_bucket=BUCKET,
            source_object=file,
            destination_bucket=BUCKET,
            destination_object='processed/import/' + file,
        )

        t1 >> t2 >> t3 >> t4 >> t5

    return dag


dags_folder = os.getenv('DAGS_FOLDER', "./dags")
flow_config = open(f'{dags_folder}/gp2pg/flow_config.json', 'r').read()

for key, values in json.loads(flow_config).items():
    default_args = {
        "owner": "owner",
        "start_date": datetime(2020, 1, 1),
        "email": [],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 0,
        "retry_delay": timedelta(minutes=5),
    }

    dag_id = f"gp2pg_{key}_data_to_pg"
    globals()[dag_id] = create_dag(dag_id, default_args)
Any idea how to solve this?
First, CloudSqlInstanceImportOperator is deprecated. You should use CloudSQLImportInstanceOperator from the providers package.
The body parameter needs to be a dictionary, as explained in the docs.
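For reference, a minimal sketch of the newer import (this assumes the apache-airflow-providers-google package is installed; the exact module path may vary between provider versions):

from airflow.providers.google.cloud.operators.cloud_sql import CloudSQLImportInstanceOperator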
XCom is a table in the database, and the data is saved as a string. You cannot store a dict in the database, because a dict is an in-memory Python object. What you probably have is JSON (a string). Try converting it to a dictionary:
body=json.loads(import_body)
Edit: (after discussion in the comments)
You need to wrap your operator with a PythonOperator so that you can convert the XCom value to a dictionary before using it.
def my_func(ds, **kwargs):
    ti = kwargs['ti']
    # pull the body pushed by the previous task and convert the JSON string back to a dict
    body = ti.xcom_pull(task_ids='previous_task_id')
    import_body = json.loads(body)
    op = CloudSqlInstanceImportOperator(
        project_id=GCP_PROJECT_ID,
        body=import_body,
        instance=INSTANCE_NAME,
        gcp_conn_id='postgres_default',
        task_id='sql_import_task',
        validate_body=True,
    )
    # run the wrapped operator with the current task's context
    op.execute(context=kwargs)

p = PythonOperator(task_id='python_task', python_callable=my_func, provide_context=True)
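In the question's DAG, this wrapper would simply take the place of t4, for example (a sketch reusing the task ids from the question; provide_context=True is only needed on Airflow 1.x):

t4 = PythonOperator(
    task_id='sql_import_task',
    python_callable=my_func,
    provide_context=True,
)
t1 >> t2 >> t3 >> t4 >> t5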
Edit: For Airflow >= 2.1.0:
Airflow added the ability to render template fields as native Python objects. You need to set render_template_as_native_obj=True in the DAG constructor. You can follow this documentation example.
Starting with Airflow 2.1 you can pass render_template_as_native_obj=True to the DAG, and Airflow will render the templated string as native Python types (dict, int, etc.). No other code changes are needed. See this pull request.
dag = DAG(
    dag_id="example_template_as_python_object",
    schedule_interval=None,
    start_date=days_ago(2),
    render_template_as_native_obj=True,
)
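With that flag set, the templated value from the question should render as a real dict at runtime, so the original operator can be used unchanged. A sketch based on the question's code (task ids and variables taken from above):

import_body = "{{ task_instance.xcom_pull(task_ids='define_import_body', key='import_body') }}"

t4 = CloudSqlInstanceImportOperator(
    project_id=GCP_PROJECT_ID,
    body=import_body,  # rendered as a dict, not a str, when render_template_as_native_obj=True
    instance=INSTANCE_NAME,
    gcp_conn_id='postgres_default',
    task_id='sql_import_task',
    validate_body=True,
)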