将其他语言的文本输出到 Cloud Pub/Sub 主题
Outputting other languages to Cloud Pub/Sub topics
我正在使用 Google Cloud Functions,结合 Vision API 和 Translate API 从图像中提取文本,然后将提取的文本翻译成各种语言。输出会被发送到 Pub/Sub 主题,然后存储在 Cloud Storage 存储桶中。所有组件似乎都工作正常,只是输出没有按照对应语言正确编码,所以我得到的字符串是 "Pâté Oeufs Mimosa",而不是 "Pâté – Oeufs Mimosa"。
这是我正在使用的相关代码:
# [START functions_ocr_detect]
def detect_text(bucket, filename):
    """Extract text from an image and publish it for each target language.

    Runs Vision API text detection on ``gs://<bucket>/<filename>``, detects
    the language of the extracted text with the Translate client, and
    publishes one JSON message per entry of the module-level ``TO_LANG``.
    A message goes to ``RESULT_TOPIC`` when no translation is needed (source
    language equals the target, or the language is undetermined, ``'und'``)
    and to ``TRANSLATE_TOPIC`` otherwise. The function blocks until every
    publish has completed.

    Args:
        bucket: Name of the Cloud Storage bucket that holds the image.
        filename: Object name of the image inside ``bucket``.

    Returns:
        None.
    """
    print('Looking for text in image {}'.format(filename))

    text_detection_response = vision_client.text_detection({
        'source': {'image_uri': 'gs://{}/{}'.format(bucket, filename)}
    })
    annotations = text_detection_response.text_annotations
    # The first annotation carries the full detected text, when there is one.
    text = annotations[0].description if len(annotations) > 0 else ''
    print('Extracted text {} from image ({} chars).'.format(text, len(text)))

    if not text:
        # Both consumers (translate_text / save_result) validate the 'text'
        # field with validate_message, which rejects an empty string, so
        # publishing here would only produce failing invocations downstream.
        print('No text found in image {}; nothing to publish.'.format(
            filename))
        return

    detect_language_response = translate_client.detect_language(text)
    src_lang = detect_language_response['language']
    print('Detected language {} for text {}.'.format(src_lang, text))

    # Submit a message to the bus for each target language
    futures = []
    for target_lang in TO_LANG:
        if src_lang == target_lang or src_lang == 'und':
            # Nothing to translate: hand the text straight to the saver.
            topic_name = RESULT_TOPIC
        else:
            topic_name = TRANSLATE_TOPIC
        message = {
            'text': text,
            'filename': filename,
            'lang': target_lang,
            'src_lang': src_lang,
        }
        message_data = json.dumps(message).encode('utf-8')
        topic_path = publisher.topic_path(project_id, topic_name)
        futures.append(publisher.publish(topic_path, data=message_data))

    # Wait for every publish to finish before the function returns.
    for future in futures:
        future.result()
# [END functions_ocr_detect]
# [START message_validatation_helper]
def validate_message(message, param):
    """Return the value stored under ``param`` in ``message``.

    Args:
        message: Mapping (anything with ``.get``) holding the request fields.
        param: Name of the required field.

    Returns:
        The value of ``message[param]``.

    Raises:
        ValueError: If the field is absent or its value is falsy
            (``None``, empty string, ``0``, ...).
    """
    var = message.get(param)
    if not var:
        # Implicit concatenation instead of a backslash continuation inside
        # the literal, so source indentation never leaks into the message.
        raise ValueError(
            '{} is not provided. Make sure you have '
            'property {} in the request'.format(param, param)
        )
    return var
# [END message_validatation_helper]
# [START functions_ocr_process]
def process_image(file, context):
    """Cloud Storage trigger: run text detection on the changed file.

    Args:
        file (dict): Metadata of the changed file from the triggering Cloud
            Storage event; must contain ``'bucket'`` and ``'name'``.
        context (google.cloud.functions.Context): Metadata of the triggering
            event. Not used by this function.

    Returns:
        None; progress is written to stdout.

    Raises:
        ValueError: Propagated from ``validate_message`` when ``'bucket'``
            or ``'name'`` is missing from ``file``.
    """
    bucket_name = validate_message(file, 'bucket')
    object_name = validate_message(file, 'name')

    detect_text(bucket_name, object_name)

    print('File {} processed.'.format(file['name']))
# [END functions_ocr_process]
# [START functions_ocr_translate]
def translate_text(event, context):
    """Pub/Sub trigger: translate the text in a message and publish it.

    Decodes the base64-encoded JSON payload in ``event['data']``, translates
    its ``'text'`` from ``'src_lang'`` into ``'lang'`` and publishes the
    result (``text``, ``filename``, ``lang``) to the module-level
    ``RESULT_TOPIC``, waiting for the publish to complete.

    Args:
        event (dict): Pub/Sub event; ``event['data']`` holds the payload.
        context: Metadata of the triggering event. Not used by this function.

    Returns:
        None.

    Raises:
        ValueError: If ``event`` has no data, or a required field
            (``text``, ``filename``, ``lang``, ``src_lang``) is missing.
    """
    if not event.get('data'):
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    message_data = base64.b64decode(event['data']).decode('utf-8')
    message = json.loads(message_data)

    text = validate_message(message, 'text')
    filename = validate_message(message, 'filename')
    target_lang = validate_message(message, 'lang')
    src_lang = validate_message(message, 'src_lang')

    print('Translating text into {}.'.format(target_lang))
    # format_='text': the input is plain OCR text. The API's default format
    # is HTML, which returns HTML entities (e.g. &#39;) in the translation.
    translated_text = translate_client.translate(
        text,
        target_language=target_lang,
        source_language=src_lang,
        format_='text',
    )

    topic_name = RESULT_TOPIC
    message = {
        'text': translated_text['translatedText'],
        'filename': filename,
        'lang': target_lang,
    }
    message_data = json.dumps(message).encode('utf-8')
    topic_path = publisher.topic_path(project_id, topic_name)
    future = publisher.publish(topic_path, data=message_data)
    # Block until Pub/Sub has accepted the message.
    future.result()
# [END functions_ocr_translate]
# [START functions_ocr_save]
def save_result(event, context):
    """Pub/Sub trigger: store a (translated) text in Cloud Storage.

    Decodes the base64-encoded JSON payload in ``event['data']`` and writes
    its ``'text'`` to ``<filename>_<lang>.txt`` in the module-level
    ``RESULT_BUCKET``.

    Args:
        event (dict): Pub/Sub event; ``event['data']`` holds the payload.
        context: Metadata of the triggering event. Not used by this function.

    Returns:
        None.

    Raises:
        ValueError: If ``event`` has no data, or a required field
            (``text``, ``filename``, ``lang``) is missing.
    """
    if not event.get('data'):
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    message_data = base64.b64decode(event['data']).decode('utf-8')
    message = json.loads(message_data)

    text = validate_message(message, 'text')
    filename = validate_message(message, 'filename')
    lang = validate_message(message, 'lang')

    print('Received request to save file {}.'.format(filename))

    bucket_name = RESULT_BUCKET
    result_filename = '{}_{}.txt'.format(filename, lang)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(result_filename)

    print('Saving result to {} in bucket {}.'.format(result_filename,
                                                     bucket_name))

    # The string is uploaded as UTF-8 bytes. Declaring the charset lets
    # browsers and other clients decode it as UTF-8; without it, non-ASCII
    # characters (e.g. "Pâté") show up as mojibake.
    blob.upload_from_string(text, content_type='text/plain; charset=utf-8')

    print('File saved.')
# [END functions_ocr_save]
Blob(对象)具有 content_encoding、content_type 和 content_language 这几个 header……这里可能应该设置为:content_type='text/plain; charset=utf-8'、content_encoding='utf-8'、content_language='fr'。
我正在使用 Google Cloud Functions,结合 Vision API 和 Translate API 从图像中提取文本,然后将提取的文本翻译成各种语言。输出会被发送到 Pub/Sub 主题,然后存储在 Cloud Storage 存储桶中。所有组件似乎都工作正常,只是输出没有按照对应语言正确编码,所以我得到的字符串是 "Pâté Oeufs Mimosa",而不是 "Pâté – Oeufs Mimosa"。
这是我正在使用的相关代码:
# [START functions_ocr_detect]
def detect_text(bucket, filename):
    """Extract text from an image and publish it for each target language.

    Runs Vision API text detection on ``gs://<bucket>/<filename>``, detects
    the language of the extracted text with the Translate client, and
    publishes one JSON message per entry of the module-level ``TO_LANG``.
    A message goes to ``RESULT_TOPIC`` when no translation is needed (source
    language equals the target, or the language is undetermined, ``'und'``)
    and to ``TRANSLATE_TOPIC`` otherwise. The function blocks until every
    publish has completed.

    Args:
        bucket: Name of the Cloud Storage bucket that holds the image.
        filename: Object name of the image inside ``bucket``.

    Returns:
        None.
    """
    print('Looking for text in image {}'.format(filename))

    text_detection_response = vision_client.text_detection({
        'source': {'image_uri': 'gs://{}/{}'.format(bucket, filename)}
    })
    annotations = text_detection_response.text_annotations
    # The first annotation carries the full detected text, when there is one.
    text = annotations[0].description if len(annotations) > 0 else ''
    print('Extracted text {} from image ({} chars).'.format(text, len(text)))

    if not text:
        # Both consumers (translate_text / save_result) validate the 'text'
        # field with validate_message, which rejects an empty string, so
        # publishing here would only produce failing invocations downstream.
        print('No text found in image {}; nothing to publish.'.format(
            filename))
        return

    detect_language_response = translate_client.detect_language(text)
    src_lang = detect_language_response['language']
    print('Detected language {} for text {}.'.format(src_lang, text))

    # Submit a message to the bus for each target language
    futures = []
    for target_lang in TO_LANG:
        if src_lang == target_lang or src_lang == 'und':
            # Nothing to translate: hand the text straight to the saver.
            topic_name = RESULT_TOPIC
        else:
            topic_name = TRANSLATE_TOPIC
        message = {
            'text': text,
            'filename': filename,
            'lang': target_lang,
            'src_lang': src_lang,
        }
        message_data = json.dumps(message).encode('utf-8')
        topic_path = publisher.topic_path(project_id, topic_name)
        futures.append(publisher.publish(topic_path, data=message_data))

    # Wait for every publish to finish before the function returns.
    for future in futures:
        future.result()
# [END functions_ocr_detect]
# [START message_validatation_helper]
def validate_message(message, param):
    """Return the value stored under ``param`` in ``message``.

    Args:
        message: Mapping (anything with ``.get``) holding the request fields.
        param: Name of the required field.

    Returns:
        The value of ``message[param]``.

    Raises:
        ValueError: If the field is absent or its value is falsy
            (``None``, empty string, ``0``, ...).
    """
    var = message.get(param)
    if not var:
        # Implicit concatenation instead of a backslash continuation inside
        # the literal, so source indentation never leaks into the message.
        raise ValueError(
            '{} is not provided. Make sure you have '
            'property {} in the request'.format(param, param)
        )
    return var
# [END message_validatation_helper]
# [START functions_ocr_process]
def process_image(file, context):
    """Cloud Storage trigger: run text detection on the changed file.

    Args:
        file (dict): Metadata of the changed file from the triggering Cloud
            Storage event; must contain ``'bucket'`` and ``'name'``.
        context (google.cloud.functions.Context): Metadata of the triggering
            event. Not used by this function.

    Returns:
        None; progress is written to stdout.

    Raises:
        ValueError: Propagated from ``validate_message`` when ``'bucket'``
            or ``'name'`` is missing from ``file``.
    """
    bucket_name = validate_message(file, 'bucket')
    object_name = validate_message(file, 'name')

    detect_text(bucket_name, object_name)

    print('File {} processed.'.format(file['name']))
# [END functions_ocr_process]
# [START functions_ocr_translate]
def translate_text(event, context):
    """Pub/Sub trigger: translate the text in a message and publish it.

    Decodes the base64-encoded JSON payload in ``event['data']``, translates
    its ``'text'`` from ``'src_lang'`` into ``'lang'`` and publishes the
    result (``text``, ``filename``, ``lang``) to the module-level
    ``RESULT_TOPIC``, waiting for the publish to complete.

    Args:
        event (dict): Pub/Sub event; ``event['data']`` holds the payload.
        context: Metadata of the triggering event. Not used by this function.

    Returns:
        None.

    Raises:
        ValueError: If ``event`` has no data, or a required field
            (``text``, ``filename``, ``lang``, ``src_lang``) is missing.
    """
    if not event.get('data'):
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    message_data = base64.b64decode(event['data']).decode('utf-8')
    message = json.loads(message_data)

    text = validate_message(message, 'text')
    filename = validate_message(message, 'filename')
    target_lang = validate_message(message, 'lang')
    src_lang = validate_message(message, 'src_lang')

    print('Translating text into {}.'.format(target_lang))
    # format_='text': the input is plain OCR text. The API's default format
    # is HTML, which returns HTML entities (e.g. &#39;) in the translation.
    translated_text = translate_client.translate(
        text,
        target_language=target_lang,
        source_language=src_lang,
        format_='text',
    )

    topic_name = RESULT_TOPIC
    message = {
        'text': translated_text['translatedText'],
        'filename': filename,
        'lang': target_lang,
    }
    message_data = json.dumps(message).encode('utf-8')
    topic_path = publisher.topic_path(project_id, topic_name)
    future = publisher.publish(topic_path, data=message_data)
    # Block until Pub/Sub has accepted the message.
    future.result()
# [END functions_ocr_translate]
# [START functions_ocr_save]
def save_result(event, context):
    """Pub/Sub trigger: store a (translated) text in Cloud Storage.

    Decodes the base64-encoded JSON payload in ``event['data']`` and writes
    its ``'text'`` to ``<filename>_<lang>.txt`` in the module-level
    ``RESULT_BUCKET``.

    Args:
        event (dict): Pub/Sub event; ``event['data']`` holds the payload.
        context: Metadata of the triggering event. Not used by this function.

    Returns:
        None.

    Raises:
        ValueError: If ``event`` has no data, or a required field
            (``text``, ``filename``, ``lang``) is missing.
    """
    if not event.get('data'):
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    message_data = base64.b64decode(event['data']).decode('utf-8')
    message = json.loads(message_data)

    text = validate_message(message, 'text')
    filename = validate_message(message, 'filename')
    lang = validate_message(message, 'lang')

    print('Received request to save file {}.'.format(filename))

    bucket_name = RESULT_BUCKET
    result_filename = '{}_{}.txt'.format(filename, lang)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(result_filename)

    print('Saving result to {} in bucket {}.'.format(result_filename,
                                                     bucket_name))

    # The string is uploaded as UTF-8 bytes. Declaring the charset lets
    # browsers and other clients decode it as UTF-8; without it, non-ASCII
    # characters (e.g. "Pâté") show up as mojibake.
    blob.upload_from_string(text, content_type='text/plain; charset=utf-8')

    print('File saved.')
# [END functions_ocr_save]
Blob(对象)具有 content_encoding、content_type 和 content_language 这几个 header……这里可能应该设置为:content_type='text/plain; charset=utf-8'、content_encoding='utf-8'、content_language='fr'。