使用服务主体(service principal)从 ADLS gen2 读取 parquet 文件
Reading parquet file from ADLS gen2 using service principal
我正在使用 azure-storage-file-datalake 包连接 ADLS gen2
# Authenticate to ADLS Gen2 with a service principal and build the service client.
from azure.identity import ClientSecretCredential
# Bug fix: DataLakeServiceClient is used below but was never imported.
from azure.storage.filedatalake import DataLakeServiceClient

# Service principal credentials (placeholders — fill in real values).
tenant_id = 'xxxxxxx'
client_id = 'xxxxxxxxx'
client_secret = 'xxxxxxxx'
storage_account_name = 'xxxxxxxx'

credential = ClientSecretCredential(tenant_id, client_id, client_secret)

# ADLS Gen2 file-system operations go through the "dfs" endpoint;
# the "blob" endpoint is for the Blob API, not DataLakeServiceClient.
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
    credential=credential,
)
我需要从 ADLS gen2 读取 parquet 文件,其文件夹结构如下所示:在 ADLS gen2 的容器中,folder_a 包含 folder_b,parquet 文件存放在 folder_b 中。
folder_a
|-folder_b
parquet_file1
从 gen1 存储我们曾经像这样读取 parquet 文件。
# Gen1 pattern: authenticate with the service principal, mount the store,
# then hand the open file object straight to pyarrow.
from azure.datalake.store import lib
from azure.datalake.store.core import AzureDLFileSystem
import pyarrow.parquet as pq

# Obtain an OAuth token for the service principal.
token = lib.auth(
    tenant_id=directory_id,
    client_id=app_id,
    client_secret=app_key,
)

# Filesystem view over the Gen1 store.
fs = AzureDLFileSystem(token, store_name=adls_name)

# 'file' is the parquet file path, e.g. 'folder_a/folder_b/parquet_file1'.
handle = fs.open(file, 'rb')
table = pq.read_table(handle)
对于 gen2 存储我们该如何实现同样的读取?我们卡在了这一步。
我们参考的链接是 http://peter-hoffmann.com/2020/azure-data-lake-storage-gen-2-with-python.html 。
注意 - 我们没有使用数据块来执行此操作
问题请参考以下代码
# Read a parquet file from ADLS Gen2 using a service principal.
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
import pyarrow.parquet as pq
import io

# Service principal credentials (fill in real values).
client_id = ''
client_secret = ''
tenant_id = ''
credential = ClientSecretCredential(tenant_id, client_id, client_secret)

storage_account_name = 'testadls05'
# ADLS Gen2 requires the "dfs" endpoint for filedatalake clients.
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
    credential=credential,
)

file_system = '<container name>'
file_system_client = service_client.get_file_system_client(file_system)

# Full path inside the container, e.g. 'folder_a/folder_b/parquet_file1'.
file_path = ''
file_client = file_system_client.get_file_client(file_path)

# Bug fix 1: download_file(offset, length) — passing 0 positionally sets
# the offset, which is misleading; call it with no arguments to download
# the whole file.
data = file_client.download_file()

with io.BytesIO() as b:
    data.readinto(b)
    # Bug fix 2: readinto() leaves the buffer positioned at EOF; rewind
    # before handing it to pyarrow, otherwise read_table sees an empty
    # stream ("Invalid parquet file" / zero bytes).
    b.seek(0)
    # Must happen inside the `with`, while the buffer is still open.
    table = pq.read_table(b)
    print(table)
我正在使用 azure-storage-file-datalake 包连接 ADLS gen2
# Authenticate to ADLS Gen2 with a service principal and build the service client.
from azure.identity import ClientSecretCredential
# Bug fix: DataLakeServiceClient is used below but was never imported.
from azure.storage.filedatalake import DataLakeServiceClient

# Service principal credentials (placeholders — fill in real values).
tenant_id = 'xxxxxxx'
client_id = 'xxxxxxxxx'
client_secret = 'xxxxxxxx'
storage_account_name = 'xxxxxxxx'

credential = ClientSecretCredential(tenant_id, client_id, client_secret)

# ADLS Gen2 file-system operations go through the "dfs" endpoint;
# the "blob" endpoint is for the Blob API, not DataLakeServiceClient.
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
    credential=credential,
)
我需要从 ADLS gen2 读取 parquet 文件,其文件夹结构如下所示:在 ADLS gen2 的容器中,folder_a 包含 folder_b,parquet 文件存放在 folder_b 中。
folder_a
|-folder_b
parquet_file1
从 gen1 存储我们曾经像这样读取 parquet 文件。
# Gen1 pattern: authenticate with the service principal, mount the store,
# then hand the open file object straight to pyarrow.
from azure.datalake.store import lib
from azure.datalake.store.core import AzureDLFileSystem
import pyarrow.parquet as pq

# Obtain an OAuth token for the service principal.
token = lib.auth(
    tenant_id=directory_id,
    client_id=app_id,
    client_secret=app_key,
)

# Filesystem view over the Gen1 store.
fs = AzureDLFileSystem(token, store_name=adls_name)

# 'file' is the parquet file path, e.g. 'folder_a/folder_b/parquet_file1'.
handle = fs.open(file, 'rb')
table = pq.read_table(handle)
对于 gen2 存储我们该如何实现同样的读取?我们卡在了这一步。
我们参考的链接是 http://peter-hoffmann.com/2020/azure-data-lake-storage-gen-2-with-python.html 。
注意 - 我们没有使用数据块来执行此操作
问题请参考以下代码
# Read a parquet file from ADLS Gen2 using a service principal.
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
import pyarrow.parquet as pq
import io

# Service principal credentials (fill in real values).
client_id = ''
client_secret = ''
tenant_id = ''
credential = ClientSecretCredential(tenant_id, client_id, client_secret)

storage_account_name = 'testadls05'
# ADLS Gen2 requires the "dfs" endpoint for filedatalake clients.
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
    credential=credential,
)

file_system = '<container name>'
file_system_client = service_client.get_file_system_client(file_system)

# Full path inside the container, e.g. 'folder_a/folder_b/parquet_file1'.
file_path = ''
file_client = file_system_client.get_file_client(file_path)

# Bug fix 1: download_file(offset, length) — passing 0 positionally sets
# the offset, which is misleading; call it with no arguments to download
# the whole file.
data = file_client.download_file()

with io.BytesIO() as b:
    data.readinto(b)
    # Bug fix 2: readinto() leaves the buffer positioned at EOF; rewind
    # before handing it to pyarrow, otherwise read_table sees an empty
    # stream ("Invalid parquet file" / zero bytes).
    b.seek(0)
    # Must happen inside the `with`, while the buffer is still open.
    table = pq.read_table(b)
    print(table)