如何使用 BigQuery 高效地 select 记录另一个 table 中匹配的子字符串?
How to efficiently select records matching substring in another table using BigQuery?
我有一个包含几百万个字符串的 table,想把它与另一个包含大约两万个字符串的 table 进行匹配,如下所示:
#standardSQL
-- Baseline query: return every record whose name contains a fragment name as
-- a substring. A record is emitted once for each fragment it contains.
SELECT r.*
FROM `record` AS r
INNER JOIN `fragment` AS f
  ON r.name LIKE CONCAT('%', f.name, '%')
不幸的是,这需要很长时间。
考虑到 fragment table 只有 2 万条记录,我能否用 UDF 将其加载到 JavaScript 数组中,并以这种方式进行匹配?我目前正在研究如何实现这一点,但也许已经有某种技巧可以让上面的查询更快。我尝试过 CROSS JOIN,但很快就超出了资源限制。我也尝试过使用 EXISTS,但在该子查询的 WHERE 中引用 record.name 时总会报错。
示例使用 Public 数据
下面的示例所反映的数据量与我的实际数据大致相同……
#standardSQL
-- Reproduces the slow substring join on public datasets.
WITH record AS (
  -- Strings to search in: lower-cased text of each comment.
  SELECT LOWER(text) AS name
  FROM `bigquery-public-data.hacker_news.comments`
), fragment AS (
  -- Substrings to search for: each distinct lower-cased name, exactly once.
  -- DISTINCT is used instead of `GROUP BY name` because that GROUP BY could
  -- refer either to the source column or to the LOWER(name) alias; it also
  -- removes the unnamed COUNT(*) column that nothing read.
  SELECT DISTINCT LOWER(name) AS name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
)
-- One output row per (record, fragment) pair where the fragment is a substring.
SELECT record.*
FROM record
INNER JOIN fragment
  ON record.name LIKE CONCAT('%', fragment.name, '%')
以下适用于 BigQuery 标准 SQL
#standardSQL
-- Speeds up the substring match by pairing records and fragments through an
-- equality join on shared words, and only then applying LIKE to those pairs.
-- NOTE: a record can only match a fragment if one of the fragment's \w+ words
-- appears in the record as a whole word, so this returns fewer rows than a
-- plain LIKE join (e.g. a fragment embedded inside a longer word is skipped).
WITH record AS (
SELECT LOWER(text) AS name
FROM `bigquery-public-data.hacker_news.comments`
), fragment AS (
SELECT DISTINCT LOWER(name) AS name
FROM `bigquery-public-data.usa_names.usa_1910_current`
), temp_record AS (
-- One row per (record, word). `record` carries the whole row as a value and
-- `id` is its JSON serialization, used below as the grouping key; records
-- with identical content therefore share one id.
SELECT record, TO_JSON_STRING(record) id, name, item
FROM record, UNNEST(REGEXP_EXTRACT_ALL(name, r'\w+')) item
), temp_fragment AS (
-- One row per (fragment, word), tokenised with the same pattern.
SELECT name, item FROM fragment, UNNEST(REGEXP_EXTRACT_ALL(name, r'\w+')) item
)
-- Outer query: keep pairs where the full fragment name is a substring of the
-- record name, then emit one record value per id.
SELECT AS VALUE ANY_VALUE(record) FROM (
-- Candidate pairs: records and fragments sharing at least one word, reduced
-- to one row per (id, record name, fragment name).
SELECT ANY_VALUE(record) record, id, r.name name, f.name fragment_name
FROM temp_record r
JOIN temp_fragment f
USING(item)
GROUP BY id, name, fragment_name
)
WHERE name LIKE CONCAT('%', fragment_name, '%')
GROUP BY id
以上查询在 375 秒内完成,而原始查询运行了 2740 秒仍未结束,所以我甚至没有等它跑完。
Mikhail 的回答似乎更快 - 但让我们有一个不需要 SPLIT
也不需要将文本分成单词的答案。
首先,用所有要搜索的词计算一个正则表达式:
#standardSQL
-- Builds a single alternation pattern of the form "(name1|name2|...)" from
-- every distinct name. The resulting string is meant to be copied into a
-- REGEXP_CONTAINS call.
-- The unused `record` CTE and the unused COUNT(*) column were dropped; the
-- generated string is unchanged.
-- NOTE(review): names are inserted verbatim, which assumes they contain no
-- regex metacharacters -- confirm for other fragment sources.
WITH fragment AS (
  SELECT DISTINCT name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
)
SELECT FORMAT('(%s)', STRING_AGG(name, '|'))
FROM fragment
现在您可以取用生成的字符串,并在忽略大小写的正则表达式(REGEX)中使用它:
#standardSQL
-- Keeps the comments whose text matches the combined name pattern.
WITH record AS (
  SELECT text AS name
  FROM `bigquery-public-data.hacker_news.comments`
),
largestring AS (
  -- Alternation of names with the (?i) flag prepended; `more_names` stands in
  -- for the elided middle of the list.
  SELECT '(?i)(mary|margaret|helen|more_names|more_names|more_names|josniel|khaiden|sergi)' AS pattern
)
SELECT record.name
FROM record
WHERE REGEXP_CONTAINS(
  record.name,
  (SELECT pattern FROM largestring)
)
(~510 秒)
正如我在问题中所暗示的那样,我使用 JavaScript UDF 开发了一个版本,它解决了这个问题,尽管速度比我接受的答案慢。为了完整起见,我将其发布在这里,因为也许有人(比如将来的我自己)会发现它有用。
-- Returns the first element of `fragments` (in array order) that occurs in
-- `str` as a case-sensitive substring, or NULL when no element does.
-- A NULL `str`, a NULL `fragments` array, and NULL elements produce no match
-- instead of a JavaScript error or a search for the literal text "null".
CREATE TEMPORARY FUNCTION CONTAINS_ANY(str STRING, fragments ARRAY<STRING>)
RETURNS STRING
LANGUAGE js AS """
  if (str == null || fragments == null) {
    return null;
  }
  // Indexed loop: for...in enumerates property keys, not array positions.
  for (var i = 0; i < fragments.length; i++) {
    var fragment = fragments[i];
    if (fragment != null && str.indexOf(fragment) >= 0) {
      return fragment;
    }
  }
  return null;
""";
-- For each distinct comment text that contains at least one name, returns the
-- text together with one matching name, using the CONTAINS_ANY UDF.
WITH record AS (
  SELECT text AS name
  FROM `bigquery-public-data.hacker_news.comments`
  WHERE text IS NOT NULL
), fragment AS (
  -- Distinct names; the per-name COUNT(*) was never read and is dropped.
  SELECT DISTINCT name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
  WHERE name IS NOT NULL
), fragment_array AS (
  -- One array of names per name length. Per the author's note, this split
  -- keeps each array handed to the UDF small enough to avoid UDF timeouts.
  SELECT ARRAY_AGG(name) AS names
  FROM fragment
  GROUP BY LENGTH(name)
), records_with_fragments AS (
  -- One row per (record, name array) pair in which the UDF found a match, so
  -- a record can appear several times and fragment_name is never NULL here.
  SELECT
    record.name,
    CONTAINS_ANY(record.name, fragment_array.names) AS fragment_name
  FROM record
  INNER JOIN fragment_array
    ON CONTAINS_ANY(record.name, fragment_array.names) IS NOT NULL
)
SELECT * EXCEPT (rownum)
FROM (
  SELECT
    record.name,
    records_with_fragments.fragment_name,
    -- ORDER BY makes the surviving row deterministic: the alphabetically
    -- first matched fragment is reported for each name.
    ROW_NUMBER() OVER (
      PARTITION BY record.name
      ORDER BY records_with_fragments.fragment_name
    ) AS rownum
  FROM record
  INNER JOIN records_with_fragments
    ON records_with_fragments.name = record.name
)
WHERE rownum = 1
想法是片段列表相对较小,可以放在数组中处理,类似于 Felipe 使用正则表达式的答案。我做的第一件事是创建一个 fragment_array table,它按片段长度分组……这是一种防止数组过大的廉价方法,我发现过大的数组会导致 UDF 超时。
接下来,我创建一个名为 records_with_fragments 的 table,它将这些数组连接到原始记录,并使用 JavaScript UDF CONTAINS_ANY() 仅保留包含匹配片段的记录。这会导致该 table 包含一些重复项,因为一条记录可能匹配多个片段。
最后的 SELECT 再拉入原来的 record table,将其与 records_with_fragments 连接以确定匹配的是哪个片段,同时使用 ROW_NUMBER() 函数来防止重复,即对于由其 name 唯一标识的每条记录只显示第一行。
现在,我之所以在最终查询中进行连接,是因为在我的实际数据中,除了要匹配的字符串之外,我还需要更多字段。在我的实际数据中,我在更早的步骤里创建了一个只含 DISTINCT 字符串的 table,之后需要重新连接(re-join)回去。
瞧! 不是最优雅的,但它完成了工作。
我有一个包含几百万个字符串的 table,想把它与另一个包含大约两万个字符串的 table 进行匹配,如下所示:
#standardSQL
-- Baseline query: return every record whose name contains a fragment name as
-- a substring. A record is emitted once for each fragment it contains.
SELECT r.*
FROM `record` AS r
INNER JOIN `fragment` AS f
  ON r.name LIKE CONCAT('%', f.name, '%')
不幸的是,这需要很长时间。
考虑到 fragment table 只有 2 万条记录,我能否用 UDF 将其加载到 JavaScript 数组中,并以这种方式进行匹配?我目前正在研究如何实现这一点,但也许已经有某种技巧可以让上面的查询更快。我尝试过 CROSS JOIN,但很快就超出了资源限制。我也尝试过使用 EXISTS,但在该子查询的 WHERE 中引用 record.name 时总会报错。
示例使用 Public 数据
下面的示例所反映的数据量与我的实际数据大致相同……
#standardSQL
-- Reproduces the slow substring join on public datasets.
WITH record AS (
  -- Strings to search in: lower-cased text of each comment.
  SELECT LOWER(text) AS name
  FROM `bigquery-public-data.hacker_news.comments`
), fragment AS (
  -- Substrings to search for: each distinct lower-cased name, exactly once.
  -- DISTINCT is used instead of `GROUP BY name` because that GROUP BY could
  -- refer either to the source column or to the LOWER(name) alias; it also
  -- removes the unnamed COUNT(*) column that nothing read.
  SELECT DISTINCT LOWER(name) AS name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
)
-- One output row per (record, fragment) pair where the fragment is a substring.
SELECT record.*
FROM record
INNER JOIN fragment
  ON record.name LIKE CONCAT('%', fragment.name, '%')
以下适用于 BigQuery 标准 SQL
#standardSQL
-- Speeds up the substring match by pairing records and fragments through an
-- equality join on shared words, and only then applying LIKE to those pairs.
-- NOTE: a record can only match a fragment if one of the fragment's \w+ words
-- appears in the record as a whole word, so this returns fewer rows than a
-- plain LIKE join (e.g. a fragment embedded inside a longer word is skipped).
WITH record AS (
SELECT LOWER(text) AS name
FROM `bigquery-public-data.hacker_news.comments`
), fragment AS (
SELECT DISTINCT LOWER(name) AS name
FROM `bigquery-public-data.usa_names.usa_1910_current`
), temp_record AS (
-- One row per (record, word). `record` carries the whole row as a value and
-- `id` is its JSON serialization, used below as the grouping key; records
-- with identical content therefore share one id.
SELECT record, TO_JSON_STRING(record) id, name, item
FROM record, UNNEST(REGEXP_EXTRACT_ALL(name, r'\w+')) item
), temp_fragment AS (
-- One row per (fragment, word), tokenised with the same pattern.
SELECT name, item FROM fragment, UNNEST(REGEXP_EXTRACT_ALL(name, r'\w+')) item
)
-- Outer query: keep pairs where the full fragment name is a substring of the
-- record name, then emit one record value per id.
SELECT AS VALUE ANY_VALUE(record) FROM (
-- Candidate pairs: records and fragments sharing at least one word, reduced
-- to one row per (id, record name, fragment name).
SELECT ANY_VALUE(record) record, id, r.name name, f.name fragment_name
FROM temp_record r
JOIN temp_fragment f
USING(item)
GROUP BY id, name, fragment_name
)
WHERE name LIKE CONCAT('%', fragment_name, '%')
GROUP BY id
以上查询在 375 秒内完成,而原始查询运行了 2740 秒仍未结束,所以我甚至没有等它跑完。
Mikhail 的回答似乎更快 - 但让我们有一个不需要 SPLIT
也不需要将文本分成单词的答案。
首先,用所有要搜索的词计算一个正则表达式:
#standardSQL
-- Builds a single alternation pattern of the form "(name1|name2|...)" from
-- every distinct name. The resulting string is meant to be copied into a
-- REGEXP_CONTAINS call.
-- The unused `record` CTE and the unused COUNT(*) column were dropped; the
-- generated string is unchanged.
-- NOTE(review): names are inserted verbatim, which assumes they contain no
-- regex metacharacters -- confirm for other fragment sources.
WITH fragment AS (
  SELECT DISTINCT name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
)
SELECT FORMAT('(%s)', STRING_AGG(name, '|'))
FROM fragment
现在您可以取用生成的字符串,并在忽略大小写的正则表达式(REGEX)中使用它:
#standardSQL
-- Keeps the comments whose text matches the combined name pattern.
WITH record AS (
  SELECT text AS name
  FROM `bigquery-public-data.hacker_news.comments`
),
largestring AS (
  -- Alternation of names with the (?i) flag prepended; `more_names` stands in
  -- for the elided middle of the list.
  SELECT '(?i)(mary|margaret|helen|more_names|more_names|more_names|josniel|khaiden|sergi)' AS pattern
)
SELECT record.name
FROM record
WHERE REGEXP_CONTAINS(
  record.name,
  (SELECT pattern FROM largestring)
)
(~510 秒)
正如我在问题中所暗示的那样,我使用 JavaScript UDF 开发了一个版本,它解决了这个问题,尽管速度比我接受的答案慢。为了完整起见,我将其发布在这里,因为也许有人(比如将来的我自己)会发现它有用。
-- Returns the first element of `fragments` (in array order) that occurs in
-- `str` as a case-sensitive substring, or NULL when no element does.
-- A NULL `str`, a NULL `fragments` array, and NULL elements produce no match
-- instead of a JavaScript error or a search for the literal text "null".
CREATE TEMPORARY FUNCTION CONTAINS_ANY(str STRING, fragments ARRAY<STRING>)
RETURNS STRING
LANGUAGE js AS """
  if (str == null || fragments == null) {
    return null;
  }
  // Indexed loop: for...in enumerates property keys, not array positions.
  for (var i = 0; i < fragments.length; i++) {
    var fragment = fragments[i];
    if (fragment != null && str.indexOf(fragment) >= 0) {
      return fragment;
    }
  }
  return null;
""";
-- For each distinct comment text that contains at least one name, returns the
-- text together with one matching name, using the CONTAINS_ANY UDF.
WITH record AS (
  SELECT text AS name
  FROM `bigquery-public-data.hacker_news.comments`
  WHERE text IS NOT NULL
), fragment AS (
  -- Distinct names; the per-name COUNT(*) was never read and is dropped.
  SELECT DISTINCT name
  FROM `bigquery-public-data.usa_names.usa_1910_current`
  WHERE name IS NOT NULL
), fragment_array AS (
  -- One array of names per name length. Per the author's note, this split
  -- keeps each array handed to the UDF small enough to avoid UDF timeouts.
  SELECT ARRAY_AGG(name) AS names
  FROM fragment
  GROUP BY LENGTH(name)
), records_with_fragments AS (
  -- One row per (record, name array) pair in which the UDF found a match, so
  -- a record can appear several times and fragment_name is never NULL here.
  SELECT
    record.name,
    CONTAINS_ANY(record.name, fragment_array.names) AS fragment_name
  FROM record
  INNER JOIN fragment_array
    ON CONTAINS_ANY(record.name, fragment_array.names) IS NOT NULL
)
SELECT * EXCEPT (rownum)
FROM (
  SELECT
    record.name,
    records_with_fragments.fragment_name,
    -- ORDER BY makes the surviving row deterministic: the alphabetically
    -- first matched fragment is reported for each name.
    ROW_NUMBER() OVER (
      PARTITION BY record.name
      ORDER BY records_with_fragments.fragment_name
    ) AS rownum
  FROM record
  INNER JOIN records_with_fragments
    ON records_with_fragments.name = record.name
)
WHERE rownum = 1
想法是片段列表相对较小,可以放在数组中处理,类似于 Felipe 使用正则表达式的答案。我做的第一件事是创建一个 fragment_array table,它按片段长度分组……这是一种防止数组过大的廉价方法,我发现过大的数组会导致 UDF 超时。
接下来,我创建一个名为 records_with_fragments 的 table,它将这些数组连接到原始记录,并使用 JavaScript UDF CONTAINS_ANY() 仅保留包含匹配片段的记录。这会导致该 table 包含一些重复项,因为一条记录可能匹配多个片段。
最后的 SELECT 再拉入原来的 record table,将其与 records_with_fragments 连接以确定匹配的是哪个片段,同时使用 ROW_NUMBER() 函数来防止重复,即对于由其 name 唯一标识的每条记录只显示第一行。
现在,我之所以在最终查询中进行连接,是因为在我的实际数据中,除了要匹配的字符串之外,我还需要更多字段。在我的实际数据中,我在更早的步骤里创建了一个只含 DISTINCT 字符串的 table,之后需要重新连接(re-join)回去。
瞧! 不是最优雅的,但它完成了工作。