从 Python 中的 download_as_string 访问 blob 对象中的数据

Accessing data in blob object from download_as_string in Python

我正在 Google Cloud Functions 中尝试访问并修改一个从 Google Cloud Storage 读取的换行符分隔的 JSON(newline-delimited JSON)文件中的数据。但结果总是显示为数字,而这些数字并不是 JSON 文件中的数据。

我看到 blob 对象的 download_as_string() 返回的是字节(Bytes)(https://googleapis.github.io/google-cloud-python/latest/_modules/google/cloud/storage/blob.html#Blob.download_as_string),但在我看到的所有参考资料中,其他人都能正常访问他们的数据。

我正在 Cloud Functions 中执行此操作,但我认为我的问题适用于任何 GCP 工具。

我下面的示例本应只是加载换行符分隔的 JSON 数据,将其添加到列表中,选取前两个字典条目,再转换回换行符分隔的 JSON,并输出到 GCS 上的一个 JSON 文件。下面列出了示例输入、代码和错误的输出。

换行符分隔的 JSON 输入示例

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}
{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}
{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}

云函数中的代码

import requests
import json
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):
    """Copy the first two items of a downloaded blob to a new GCS object.

    Downloads ``sample_json.json`` from ``sample_bucket``, iterates over the
    downloaded payload, keeps the first two items, serialises each one with
    ``json.dumps`` and uploads the newline-joined text to
    ``sample_json_output.json`` in the same bucket.

    NOTE: per the library documentation, ``download_as_string()`` returns
    ``bytes``. Iterating over ``bytes`` yields one integer per byte, so the
    two items kept here are byte values (e.g. 123 for ``{`` and 34 for ``"``)
    rather than parsed JSON records.

    Args:
        request: Not used by this function.
        execution: Not used by this function.
    """
    # Destination bucket and object name for the upload step.
    target_bucket = "sample_bucket"
    target_object = 'sample_json_output.json'

    # Fetch the raw payload of the source object.
    reader = storage.Client()
    source_blob = reader.get_bucket('sample_bucket').get_blob('sample_json.json')
    payload = source_blob.download_as_string()

    # Materialise the payload item by item and keep only the first two.
    leading_items = list(payload)[:2]

    # One JSON-encoded item per line.
    encoded_lines = [json.dumps(entry) for entry in leading_items]
    results_ready = '\n'.join(encoded_lines)

    # Upload the text through a separate client.
    writer = storage.Client()
    destination = writer.get_bucket(target_bucket).blob(target_object)
    destination.upload_from_string(results_ready)

sample_json_output.json 文件中的当前输出

123
34

预期输出

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}

更新 6/6:如果我把 download_as_string 得到的 blob 内容直接写入文件,JSON 文件可以被完美写入,但我需要先访问其中的内容。

import requests
import json
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):
    """Copy a blob's downloaded content unchanged to a new GCS object.

    Downloads ``sample_json.json`` from ``sample_bucket`` and uploads the
    downloaded payload, unmodified, to ``sample_json_output.json`` in the
    same bucket.

    Args:
        request: Not used by this function.
        execution: Not used by this function.
    """
    # Destination bucket and object name for the upload step.
    target_bucket = "sample_bucket"
    target_object = 'sample_json_output.json'

    # Download the source object's payload.
    reader = storage.Client()
    source_blob = reader.get_bucket('sample_bucket').get_blob('sample_json.json')
    payload = source_blob.download_as_string()

    # Upload the payload as-is through a separate client.
    writer = storage.Client()
    destination = writer.get_bucket(target_bucket).blob(target_object)
    destination.upload_from_string(payload)

更新 6/6 输出

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}
{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}
{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}

当您把 blob 读入 json_data 时,得到的是一个字节(bytes)对象;对它进行迭代时,得到的是每个字节的数值表示。下面是一个从字节对象创建字典列表的示例:
json_data                                                                                                                                                                                                 
b'{"Website": "Google", "URL": "Google.com", "ID": 1}\n{"Website": "Bing", "URL": "Bing.com", "ID": 2}\n{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}\n{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}\n'

type(json_data)                                                                                                                                                                                           
bytes

website_list = [json.loads(row.decode('utf-8')) for row in json_data.split(b'\n') if row]                                                                                                                 

website_list                                                                                                                                                                                              
[{'Website': 'Google', 'URL': 'Google.com', 'ID': 1},
 {'Website': 'Bing', 'URL': 'Bing.com', 'ID': 2},
 {'Website': 'Yahoo', 'URL': 'Yahoo.com', 'ID': 3},
 {'Website': 'Yandex', 'URL': 'Yandex.com', 'ID': 4}]

我采用了与您类似的方法,并借助用于处理换行符分隔 JSON 的 ndjson 库,通过下面的代码得到了您想要的 JSON 结果。

import requests
import json
import ndjson
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):
    """Write the first two records of a newline-delimited JSON blob to GCS.

    Downloads ``sample_json.json`` from the bucket, parses it with
    ``ndjson.loads``, keeps the first two records and uploads them as
    newline-delimited JSON to ``sample_json_output.json`` in the same bucket.
    Each output line is terminated by a newline.

    Args:
        request: Not used by this function.
        execution: Not used by this function.
    """
    # Set buckets and filenames
    bucket_name = "bucket-name"
    filename = "sample_json_output.json"

    # Read the data from Google Cloud Storage
    read_storage_client = storage.Client()
    bucket = read_storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob("sample_json.json")
    json_data_string = blob.download_as_string()

    # Parse the newline-delimited payload into Python objects.
    records = ndjson.loads(json_data_string)

    # Keep only the first two records.
    selected = list(records)[0:2]

    # Serialise with json.dumps rather than str(): str() on a dict emits a
    # Python repr with single quotes, which is not valid JSON.
    result = "".join(json.dumps(record) + "\n" for record in selected)

    # Write the data to Google Cloud Storage
    write_storage_client = storage.Client()

    write_storage_client.get_bucket(bucket_name) \
        .blob(filename) \
        .upload_from_string(result)