获取我的 Databricks 工作区中所有笔记本的列表
Get list of all notebooks in my databricks workspace
如何获取工作区中所有笔记本的列表并将它们的名称连同完整路径一起存储在 csv 文件中,我已尝试使用 Databricks CLI 选项,但似乎没有递归操作。
正如我们在代码中看到的,没有递归选项:
https://github.com/databricks/databricks-cli/blob/master/databricks_cli/workspace/cli.py (def ls_cli)
示例解决方案是在 python 中导入 cli 并扩展它:
from databricks_cli.sdk import ApiClient
from databricks_cli.sdk import service
# Connection settings: replace both placeholders with real values before running.
host = "your_host"
token = "your_token"
client = ApiClient(host=host, token=token)

# Accumulates every object visited by list_workspace_objects(), directories included.
objects = []
workspace = service.WorkspaceService(client)


def list_workspace_objects(path):
    """Recursively append every workspace object under ``path`` to ``objects``.

    Each entry returned by ``workspace.list(path)`` is recorded, and entries
    whose ``object_type`` is ``'DIRECTORY'`` are descended into using their
    ``path``.

    Args:
        path: Workspace path to start listing from.

    Returns:
        None. Results are collected in the module-level ``objects`` list.
    """
    elements = workspace.list(path).get('objects')
    # A listing without an 'objects' key contributes nothing
    # (presumably an empty directory; confirm against the API).
    if elements is None:
        return
    for entry in elements:
        objects.append(entry)
        if entry['object_type'] == 'DIRECTORY':
            list_workspace_objects(entry['path'])


list_workspace_objects("/")
print(objects)
您可以直接使用下面的代码。注意:此代码已经过测试。
from pyspark.sql.types import IntegerType
from pyspark.sql.types import *
from pyspark.sql import Row
import base64
import requests
import json
# Base URL of the workspace; replace the placeholder with the real instance URL.
databricks_instance = "databricks Instance"

# Workspace API endpoints derived from the base URL.
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"

# Request body asking for the contents of the workspace root.
payload = json.dumps({"path": "/"})

# Replace 'token' with a real access token.
headers = {
    'Authorization': 'Bearer token',
    'Content-Type': 'application/json',
}

# Parsed JSON of the root listing; used as the starting point of the walk.
response = requests.request(
    method="GET",
    url=url_list,
    headers=headers,
    data=payload,
).json()
# Accumulates every notebook entry found by list_notebooks().
notebooks = []


def list_notebooks(mylist):
    """Collect every notebook reachable from a workspace listing.

    Entries are read from the ``'objects'`` key of ``mylist``. Notebook
    entries are appended to the module-level ``notebooks`` list; for each
    directory entry the list endpoint is queried with that directory's path
    and the result is walked recursively.

    Args:
        mylist: Parsed JSON of a workspace list response (a dict).

    Returns:
        The module-level ``notebooks`` list, which keeps accumulating across
        calls.
    """
    # A listing without 'objects' is treated as having no entries instead of
    # raising KeyError; this also covers the top-level call, which the
    # original length check on nested responses did not protect.
    for element in mylist.get('objects', []):
        object_type = element['object_type']
        if object_type == 'NOTEBOOK':
            notebooks.append(element)
        elif object_type == 'DIRECTORY':
            payload_inner = json.dumps({"path": element['path']})
            response_inner = requests.request(
                "GET", url_list, headers=headers, data=payload_inner
            ).json()
            list_notebooks(response_inner)
    return notebooks
result = list_notebooks(response)
# Print the first notebook as a quick sanity check; guard against a walk that
# found no notebooks, where indexing [0] would raise IndexError.
if result:
    print(result[0])
else:
    print("No notebooks found")
如何获取工作区中所有笔记本的列表并将它们的名称连同完整路径一起存储在 csv 文件中,我已尝试使用 Databricks CLI 选项,但似乎没有递归操作。
正如我们在代码中看到的,没有递归选项: https://github.com/databricks/databricks-cli/blob/master/databricks_cli/workspace/cli.py (def ls_cli)
示例解决方案是在 python 中导入 cli 并扩展它:
from databricks_cli.sdk import ApiClient
from databricks_cli.sdk import service
# Connection settings: replace both placeholders with real values before running.
host = "your_host"
token = "your_token"
client = ApiClient(host=host, token=token)

# Accumulates every object visited by list_workspace_objects(), directories included.
objects = []
workspace = service.WorkspaceService(client)


def list_workspace_objects(path):
    """Recursively append every workspace object under ``path`` to ``objects``.

    Each entry returned by ``workspace.list(path)`` is recorded, and entries
    whose ``object_type`` is ``'DIRECTORY'`` are descended into using their
    ``path``.

    Args:
        path: Workspace path to start listing from.

    Returns:
        None. Results are collected in the module-level ``objects`` list.
    """
    elements = workspace.list(path).get('objects')
    # A listing without an 'objects' key contributes nothing
    # (presumably an empty directory; confirm against the API).
    if elements is None:
        return
    for entry in elements:
        objects.append(entry)
        if entry['object_type'] == 'DIRECTORY':
            list_workspace_objects(entry['path'])


list_workspace_objects("/")
print(objects)
您可以直接使用下面的代码。注意:此代码已经过测试。
from pyspark.sql.types import IntegerType
from pyspark.sql.types import *
from pyspark.sql import Row
import base64
import requests
import json
# Base URL of the workspace; replace the placeholder with the real instance URL.
databricks_instance = "databricks Instance"

# Workspace API endpoints derived from the base URL.
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"

# Request body asking for the contents of the workspace root.
payload = json.dumps({"path": "/"})

# Replace 'token' with a real access token.
headers = {
    'Authorization': 'Bearer token',
    'Content-Type': 'application/json',
}

# Parsed JSON of the root listing; used as the starting point of the walk.
response = requests.request(
    method="GET",
    url=url_list,
    headers=headers,
    data=payload,
).json()
# Accumulates every notebook entry found by list_notebooks().
notebooks = []


def list_notebooks(mylist):
    """Collect every notebook reachable from a workspace listing.

    Entries are read from the ``'objects'`` key of ``mylist``. Notebook
    entries are appended to the module-level ``notebooks`` list; for each
    directory entry the list endpoint is queried with that directory's path
    and the result is walked recursively.

    Args:
        mylist: Parsed JSON of a workspace list response (a dict).

    Returns:
        The module-level ``notebooks`` list, which keeps accumulating across
        calls.
    """
    # A listing without 'objects' is treated as having no entries instead of
    # raising KeyError; this also covers the top-level call, which the
    # original length check on nested responses did not protect.
    for element in mylist.get('objects', []):
        object_type = element['object_type']
        if object_type == 'NOTEBOOK':
            notebooks.append(element)
        elif object_type == 'DIRECTORY':
            payload_inner = json.dumps({"path": element['path']})
            response_inner = requests.request(
                "GET", url_list, headers=headers, data=payload_inner
            ).json()
            list_notebooks(response_inner)
    return notebooks
result = list_notebooks(response)
# Print the first notebook as a quick sanity check; guard against a walk that
# found no notebooks, where indexing [0] would raise IndexError.
if result:
    print(result[0])
else:
    print("No notebooks found")