将 PCollection 作为 Apache Beam 的侧输入传递时出现 KeyError
KeyError on passing PCollection as side input on Apache Beam
我将 side_input
PCollection 作为侧输入传递给 ParDo
转换,但得到相同的 KeyError
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db
from processors.appendcol import AppendCol
from side_inputs.config import sideinput_bq_config
from source.config import source_config
with beam.Pipeline(options=PipelineOptions()) as si:
side_input = si | "Reading from BQ side input" >> relational_db.ReadFromDB(
source_config=sideinput_bq_config,
table_name='abc',
query="SELECT * FROM abc"
)
with beam.Pipeline(options=PipelineOptions()) as p:
PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
source_config=source_config,
table_name='xyzzy',
query="SELECT * FROM xyzzy",
) | beam.ParDo(
AppendCol(), beam.pvalue.AsIter(side_input)
)
下面是错误
Traceback (most recent call last):
File "athena/etl.py", line 40, in <module>
extract()
File "athena/etl.py", line 22, in extract
PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 555, in __exit__
self.result = self.run()
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 534, in run
return self.runner.run_pipeline(self, self._options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/direct/direct_runner.py", line 119, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 175, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 186, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 329, in run_stages
runner_execution_context = execution.FnApiRunnerExecutionContext(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 323, in __init__
self._build_data_side_inputs_map(stages))
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 386, in _build_data_side_inputs_map
producing_stage = producing_stages_by_pcoll[side_pc]
KeyError: 'ref_PCollection_PCollection_5'
我正在从 PostgreSQL table 读取数据,PCollection 的每个元素都是一个字典。
我认为问题在于您有两个单独的管道试图协同工作。您应该将所有转换作为单个管道的一部分执行:
with beam.Pipeline(options=PipelineOptions()) as p:
side_input = p | "Reading from BQ side input" >> relational_db.ReadFromDB(
source_config=sideinput_bq_config,
table_name='abc',
query="SELECT * FROM abc")
my_pcoll = p | "Reading records from database" >> relational_db.ReadFromDB(
source_config=source_config,
table_name='xyzzy',
query="SELECT * FROM xyzzy",
) | beam.ParDo(
AppendCol(), beam.pvalue.AsIter(side_input))
我将 side_input
PCollection 作为侧输入传递给 ParDo
转换,但得到相同的 KeyError
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db
from processors.appendcol import AppendCol
from side_inputs.config import sideinput_bq_config
from source.config import source_config
with beam.Pipeline(options=PipelineOptions()) as si:
side_input = si | "Reading from BQ side input" >> relational_db.ReadFromDB(
source_config=sideinput_bq_config,
table_name='abc',
query="SELECT * FROM abc"
)
with beam.Pipeline(options=PipelineOptions()) as p:
PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
source_config=source_config,
table_name='xyzzy',
query="SELECT * FROM xyzzy",
) | beam.ParDo(
AppendCol(), beam.pvalue.AsIter(side_input)
)
下面是错误
Traceback (most recent call last):
File "athena/etl.py", line 40, in <module>
extract()
File "athena/etl.py", line 22, in extract
PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 555, in __exit__
self.result = self.run()
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 534, in run
return self.runner.run_pipeline(self, self._options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/direct/direct_runner.py", line 119, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 175, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 186, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 329, in run_stages
runner_execution_context = execution.FnApiRunnerExecutionContext(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 323, in __init__
self._build_data_side_inputs_map(stages))
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 386, in _build_data_side_inputs_map
producing_stage = producing_stages_by_pcoll[side_pc]
KeyError: 'ref_PCollection_PCollection_5'
我正在从 PostgreSQL table 读取数据,PCollection 的每个元素都是一个字典。
我认为问题在于您有两个单独的管道试图协同工作。您应该将所有转换作为单个管道的一部分执行:
with beam.Pipeline(options=PipelineOptions()) as p:
side_input = p | "Reading from BQ side input" >> relational_db.ReadFromDB(
source_config=sideinput_bq_config,
table_name='abc',
query="SELECT * FROM abc")
my_pcoll = p | "Reading records from database" >> relational_db.ReadFromDB(
source_config=source_config,
table_name='xyzzy',
query="SELECT * FROM xyzzy",
) | beam.ParDo(
AppendCol(), beam.pvalue.AsIter(side_input))