AWS S3 - 在嵌套 JSON 文件中搜索特定文本
AWS S3 - Search for specific text in a nested JSON file
{
"repo": [
{
"book": 47,
"version": 1,
"bookName": "Book1",
"chapters": [
{
"chapterId": 1,
"chapterContents": [
{
"line": 1,
"Text": "1. The first text of chapter 1 book 1"
},
{
"line": 2,
"Text": "2. The second text of chapter 1 book 1"
}
]
},
{
"chapterId": 2,
"chapterContents": [
{
"line": 1,
"Text": "1. The first text of chapter 2 book 1"
}
]
}
]
}
]
}
以上是存储在 Amazon S3 中的 JSON 文件的数据模型格式。其中有很多本书，每本书包含多个章节和文本内容。需求是在所有书籍中搜索特定文本，并列出该文本所在的行、章节、书和版本。如何对包含嵌套数组的 JSON 文件进行 S3 查询？
您可以使用 Amazon Athena 从 S3 查询嵌套的 JSON 数据。
应该可以在 JSON 中搜索值。下面是一个示例：
-- Inline sample data: three JSON documents, one per row, exposed as the
-- single column `users`.
WITH dataset AS (
    SELECT t.users
    FROM (
        VALUES
            (JSON '{"name": "Bob Smith", "org": "legal", "projects": ["project1"]}'),
            (JSON '{"name": "Susan Smith", "org": "engineering", "projects": ["project1", "project2", "project3"]}'),
            (JSON '{"name": "Jane Smith", "org": "finance", "projects": ["project1", "project2"]}')
    ) AS t (users)
)
-- Return the name of every user whose `projects` JSON array contains the
-- value 'project2'.
SELECT
    json_extract_scalar(users, '$.name') AS user
FROM dataset
WHERE
    json_array_contains(json_extract(users, '$.projects'), 'project2')
{
"repo": [
{
"book": 47,
"version": 1,
"bookName": "Book1",
"chapters": [
{
"chapterId": 1,
"chapterContents": [
{
"line": 1,
"Text": "1. The first text of chapter 1 book 1"
},
{
"line": 2,
"Text": "2. The second text of chapter 1 book 1"
}
]
},
{
"chapterId": 2,
"chapterContents": [
{
"line": 1,
"Text": "1. The first text of chapter 2 book 1"
}
]
}
]
}
]
}
以上是存储在 Amazon S3 中的 JSON 文件的数据模型格式。其中有很多本书，每本书包含多个章节和文本内容。需求是在所有书籍中搜索特定文本，并列出该文本所在的行、章节、书和版本。如何对包含嵌套数组的 JSON 文件进行 S3 查询？
您可以使用 Amazon Athena 从 S3 查询嵌套的 JSON 数据。
应该可以在 JSON 中搜索值。下面是一个示例：
-- Inline sample data: three JSON documents, one per row, exposed as the
-- single column `users`.
WITH dataset AS (
    SELECT t.users
    FROM (
        VALUES
            (JSON '{"name": "Bob Smith", "org": "legal", "projects": ["project1"]}'),
            (JSON '{"name": "Susan Smith", "org": "engineering", "projects": ["project1", "project2", "project3"]}'),
            (JSON '{"name": "Jane Smith", "org": "finance", "projects": ["project1", "project2"]}')
    ) AS t (users)
)
-- Return the name of every user whose `projects` JSON array contains the
-- value 'project2'.
SELECT
    json_extract_scalar(users, '$.name') AS user
FROM dataset
WHERE
    json_array_contains(json_extract(users, '$.projects'), 'project2')