如何使用 Google Cloud Translate API 来翻译批量数据?

how to use Google Cloud Translate API for translating bulk data?

我有一个包含数千行多种语言文本的 csv 文件,正在考虑使用 Google Cloud Translation API 将其中的外语文本翻译成英语。我先用了一段简单的代码来确认一切是否正常,代码运行顺利。

from google.cloud import translate_v2 as translate
from time import sleep
from tqdm.notebook import tqdm
import multiprocessing as mp
import os

# Tell the Google client library where the credentials file lives. The value
# "file path.py" is a placeholder for the real path.
# NOTE(review): presumably this should point at a service-account key file --
# confirm the expected file type before running.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "file path.py"
# Create the client only after the environment variable above has been set.
translate_client = translate.Client()
text = "Good Morning, My Name is X."
target ="ja"  # language code handed to translate() as target_language
# Smoke test: translate the single sample string and print the raw result.
output = translate_client.translate(text, target_language=target)
print(output)

我现在想(使用 pandas)导入 csv 文件、翻译其中的文本,并将输出保存为 csv 文件,但不知道该怎么做。我找到的大多数示例都像上面那样,只演示到翻译一段示例文本为止。

有人能建议我该怎么做吗?

要使用 Google Cloud Translation API 翻译 csv 文件中的文本并将输出保存在同一个 CSV 文件中,您可以使用以下代码:

import csv
from pathlib import Path


def _get_translate_client():
    """Return a shared Translation client, creating it on first use.

    translate_text() is called once per input line, so constructing a new
    client on every call is needless repeated work; the instance is cached
    on this function object instead.
    """
    from google.cloud import translate_v2 as translate

    client = getattr(_get_translate_client, "client", None)
    if client is None:
        client = translate.Client()
        _get_translate_client.client = client
    return client


def translate_text(target, text):
    """Translate text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        target: Language code to translate into (e.g. "es").
        text: A str, UTF-8 encoded bytes, or a sequence of strings.

    Returns:
        The translated string, or a list of translated strings (in input
        order) when ``text`` is a sequence of strings.
    """
    # The file targets Python 3, so the builtin ``bytes`` check replaces the
    # former dependency on ``six.binary_type``.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # format_="text" marks the input as plain text; the API's default format
    # is HTML, which returns characters such as apostrophes HTML-escaped.
    result = _get_translate_client().translate(
        text, target_language=target, format_="text"
    )

    # A sequence input yields one result dict per element rather than a
    # single dict, so unwrap each of them.
    if isinstance(result, list):
        return [item["translatedText"] for item in result]
    return result["translatedText"]


def main(input_file, translate_to):
    """Translate a text file line by line and save the result as a CSV file.

    Each line of ``input_file`` is translated with translate_text() and
    written as one ``id, original_text, translated_text`` row. The output
    path is the input path with its suffix replaced by ``.csv``; when the
    input already ends in ``.csv`` the output therefore overwrites it. That
    is safe because the input is read completely and closed before the
    output file is opened.

    Args:
        input_file: Path of the file whose lines should be translated.
        translate_to: Target language code passed on to translate_text().
    """
    input_file_path = Path(input_file)
    output_file_path = input_file_path.with_suffix('.csv')

    # Explicit UTF-8: the data is multilingual, and the platform default
    # encoding is not guaranteed to be able to decode it.
    with open(input_file_path, encoding='utf-8') as f:
        list_lines = [line.rstrip('\n') for line in f]
    total_lines = len(list_lines)

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an empty line on Windows.
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        my_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        my_writer.writerow(['id', 'original_text', 'translated_text'])

        for i, original_text in enumerate(list_lines):
            line_id = f'{i + 1:04}'
            # Translate the same newline-stripped text that is written to
            # the CSV, not the raw line with its trailing line break.
            translated_text = translate_text(
                target=translate_to,
                text=original_text)
            my_writer.writerow([line_id, original_text, translated_text])
            # Progress monitor, non-essential.
            print(f"""
{line_id}/{total_lines:04}
  {original_text}
  {translated_text}""")


if __name__ == '__main__':
    # Script entry point. Keyword arguments are evaluated left to right, so
    # the user is asked for the input file first and the language second.
    main(
        input_file=input('Input text file? >> '),
        translate_to=input('Output language? >> '),
    )

示例:

将输入文件中的文本翻译成目标语言“es”,输出存储在同一个 csv 文件中。

输入:

new.csv

How are you doing,Is everything fine there
Do it today

输出:

new.csv

id,original_text,translated_text
0001,"How are you doing,Is everything fine there",¿Cómo estás? ¿Está todo bien allí?
0002,Do it today,Hazlo hoy