How can I compute TF/IDF with SQL (BigQuery)
I'm doing text analysis of reddit comments, and I want to compute TF-IDF in BigQuery.
This query works in 5 stages:
- Get all the reddit posts I'm interested in. Normalize the words (LOWER, only letters and ', unescape some HTML). Split those words into an array.
- Compute the tf (term frequency) of each word in each document: how many times it appears in that document, relative to the number of words in the document.
- For each word, count the number of documents that contain it.
- From (3), derive the idf (inverse document frequency): the "inverse fraction of the documents that contain the word, obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient".
- Multiply tf*idf to get the tf-idf.
The query manages to do all of this in one pass, by carrying the computed values up the chain.
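In symbols (the standard textbook definitions, matching the stages above), with f_{t,d} the number of occurrences of term t in document d, |d| the number of words in d, and N the total number of documents:

\mathrm{tf}(t,d) = \frac{f_{t,d}}{|d|}, \qquad
\mathrm{idf}(t) = \log \frac{N}{\lvert\{d : t \in d\}\rvert}, \qquad
\mathrm{tfidf}(t,d) = \mathrm{tf}(t,d) \cdot \mathrm{idf}(t)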
#standardSQL
WITH words_by_post AS (
  # 1. posts of interest; normalize words and split them into an array
  SELECT CONCAT(link_id, '/', id) id,
    REGEXP_EXTRACT_ALL(
      REGEXP_REPLACE(REGEXP_REPLACE(LOWER(body), '&amp;', '&'), r'&[a-z]{2,4};', '*'),
      r'[a-z]{2,20}\'?[a-z]+') words,
    COUNT(*) OVER() docs_n
  FROM `fh-bigquery.reddit_comments.2017_07`
  WHERE body NOT IN ('[deleted]', '[removed]')
  AND subreddit = 'movies'
  AND score > 100
), words_tf AS (
  # 2. tf: occurrences of each word, relative to the document's word count
  SELECT id, word,
    COUNT(*) / ARRAY_LENGTH(ANY_VALUE(words)) tf,
    ARRAY_LENGTH(ANY_VALUE(words)) words_in_doc,
    ANY_VALUE(docs_n) docs_n
  FROM words_by_post, UNNEST(words) word
  GROUP BY id, word
  HAVING words_in_doc > 30
), docs_idf AS (
  # 3. + 4. number of documents containing each word, and idf
  SELECT tf.id, word, tf.tf,
    ARRAY_LENGTH(tfs) docs_with_word,
    LOG(docs_n / ARRAY_LENGTH(tfs)) idf
  FROM (
    SELECT word, ARRAY_AGG(STRUCT(tf, id, words_in_doc)) tfs, ANY_VALUE(docs_n) docs_n
    FROM words_tf
    GROUP BY 1
  ), UNNEST(tfs) tf
)
# 5. tf * idf
SELECT *, tf * idf tfidf
FROM docs_idf
WHERE docs_with_word > 1
ORDER BY tfidf DESC
LIMIT 1000
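If a per-document ranking is more useful than one global top list, the final SELECT can be replaced by an aggregation. A sketch reusing the docs_idf CTE above (the LIMIT of 5 words per post is an arbitrary choice):

SELECT id,
  # keep only each post's 5 highest-scoring words
  ARRAY_AGG(STRUCT(word, tf * idf AS tfidf) ORDER BY tf * idf DESC LIMIT 5) top_words
FROM docs_idf
WHERE docs_with_word > 1
GROUP BY id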
Stack Overflow dataset version:
#standardSQL
WITH words_by_post AS (
  # posts of interest; unescape entities, strip HTML tags, split words into an array
  SELECT id,
    REGEXP_EXTRACT_ALL(
      REGEXP_REPLACE(
        REGEXP_REPLACE(
          REGEXP_REPLACE(
            LOWER(CONCAT(title, ' ', body)),
            r'&amp;', '&'),
          r'&[a-z]*;', ''),
        r'<[= \-:a-z0-9/\."]*>', ''),
      r'[a-z]{2,20}\'?[a-z]+') words,
    title, body,
    COUNT(*) OVER() docs_n
  FROM `bigquery-public-data.stackoverflow.posts_questions`
  WHERE score >= 150
), words_tf AS (
  # tf for every distinct word in each document, via a correlated ARRAY subquery
  SELECT id, words,
    ARRAY(
      SELECT AS STRUCT w word, COUNT(*) / ARRAY_LENGTH(words) tf
      FROM UNNEST(words) a
      JOIN (SELECT DISTINCT w FROM UNNEST(words) w) b
      ON a = b.w
      WHERE w NOT IN ('the', 'and', 'for', 'this', 'that', 'can', 'but')
      GROUP BY word ORDER BY word
    ) tfs,
    ARRAY_LENGTH(words) words_in_doc,
    docs_n,
    title, body
  FROM words_by_post
  WHERE ARRAY_LENGTH(words) > 20
), docs_idf AS (
  # count documents containing each word with a window function, then idf
  SELECT *, LOG(docs_n / docs_with_word) idf
  FROM (
    SELECT id, word, tf.tf,
      COUNTIF(word IN UNNEST(words)) OVER(PARTITION BY word) docs_with_word,
      docs_n,
      title, body
    FROM words_tf, UNNEST(tfs) tf
  )
)
SELECT id, ARRAY_AGG(STRUCT(word, tf * idf AS tf_idf, docs_with_word) ORDER BY tf * idf DESC) tfidfs
# , ANY_VALUE(title) title, ANY_VALUE(body) body # makes query slower
FROM docs_idf
WHERE docs_with_word > 1
GROUP BY 1
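The per-document tf here leans on BigQuery's correlated ARRAY subquery: the subquery can reference the outer row's words column, which stays constant per row even under GROUP BY. A minimal self-contained sketch of the same pattern, with a made-up inline array:

WITH t AS (
  SELECT 1 id, ['a', 'b', 'a', 'c'] words
)
SELECT id,
  ARRAY(
    # count each distinct word relative to the row's total word count
    SELECT AS STRUCT w word, COUNT(*) / ARRAY_LENGTH(words) tf
    FROM UNNEST(words) w
    GROUP BY w
  ) tfs
FROM t
# tfs: a -> 0.5, b -> 0.25, c -> 0.25 (array order not guaranteed)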
Improvement over the previous answer: it needs one fewer GROUP BY over the whole dataset, which helps the query run faster.
This one might be easier to understand: it takes a dataset that already contains word counts per TV station and per day:
# in this query the combination of date+station represents a "document"
WITH data AS (
  SELECT *
  FROM `gdelt-bq.gdeltv2.iatv_1grams`
  WHERE DATE BETWEEN 20190601 AND 20190629
  AND station NOT IN ('KSTS', 'KDTV')
)
, word_day_station AS (
  # how many times a word is mentioned in each "document"
  SELECT word, SUM(count) counts, date, station
  FROM data
  GROUP BY 1, 3, 4
)
, day_station AS (
  # total # of words in each "document"
  SELECT SUM(count) counts, date, station
  FROM data
  GROUP BY 2, 3
)
, tf AS (
  # TF for a word in a "document"
  SELECT word, date, station, a.counts / b.counts tf
  FROM word_day_station a
  JOIN day_station b
  USING(date, station)
)
, word_in_docs AS (
  # how many "documents" have a word
  SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
  FROM word_day_station
  GROUP BY 1
)
, total_docs AS (
  # total # of docs
  SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
  FROM data
)
, idf AS (
  # IDF for a word
  SELECT word, LOG(total_docs.total_docs / indocs) idf
  FROM word_in_docs
  CROSS JOIN total_docs
)
SELECT date,
  ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
FROM (
  SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
  FROM (
    SELECT word, date, station, tf.tf * idf.idf tfidf
    FROM tf
    JOIN idf
    USING(word)
  )
  GROUP BY date, station
)
GROUP BY date
ORDER BY date DESC
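A note on the FORMAT('%i %s', date, station) calls above: they only build a composite string key so that COUNT(DISTINCT ...) can count date+station pairs as "documents". For example (made-up literals):

SELECT FORMAT('%i %s', 20190601, 'KGO')  # -> '20190601 KGO'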