我如何计算 TF/IDF 与 SQL (BigQuery)

How can I compute TF/IDF with SQL (BigQuery)

我正在对 reddit 评论进行文本分析,我想在 BigQuery 中计算 TF-IDF。

此查询分为 5 个阶段:

  1. 获取我感兴趣的所有reddit帖子。规范化单词(LOWER,只有字母和',取消转义一些HTML)。将这些单词拆分成一个数组。
  2. 计算每个文档中每个词的 tf(词频)- 计算它在每个文档中出现的次数,相对于所述文档中的词数。
  3. 对于每个单词,计算包含它的文档数。
  4. 从(3.)中得到idf(逆向文档频率):"inverse fraction of the documents that contain the word, obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient"
  5. 将tf*idf相乘得到tf-idf。

这个查询通过将获得的值向上传递到链中,设法一次完成。

#standardSQL
WITH words_by_post AS (
  SELECT CONCAT(link_id, '/', id) id, REGEXP_EXTRACT_ALL(
    REGEXP_REPLACE(REGEXP_REPLACE(LOWER(body), '&', '&'), r'&[a-z]{2,4};', '*')
      , r'[a-z]{2,20}\'?[a-z]+') words
  , COUNT(*) OVER() docs_n
  FROM `fh-bigquery.reddit_comments.2017_07`  
  WHERE body NOT IN ('[deleted]', '[removed]')
  AND subreddit = 'movies'
  AND score > 100
), words_tf AS (
  SELECT id, word, COUNT(*) / ARRAY_LENGTH(ANY_VALUE(words)) tf, ARRAY_LENGTH(ANY_VALUE(words)) words_in_doc
    , ANY_VALUE(docs_n) docs_n
  FROM words_by_post, UNNEST(words) word
  GROUP BY id, word
  HAVING words_in_doc>30
), docs_idf AS (
  SELECT tf.id, word, tf.tf, ARRAY_LENGTH(tfs) docs_with_word, LOG(docs_n/ARRAY_LENGTH(tfs)) idf
  FROM (
    SELECT word, ARRAY_AGG(STRUCT(tf, id, words_in_doc)) tfs, ANY_VALUE(docs_n) docs_n
    FROM words_tf
    GROUP BY 1
  ), UNNEST(tfs) tf
)    


SELECT *, tf*idf tfidf
FROM docs_idf
WHERE docs_with_word > 1
ORDER BY tfidf DESC
LIMIT 1000

Stack Overflow 数据集版本:

#standardSQL
WITH words_by_post AS (
  SELECT id, REGEXP_EXTRACT_ALL(
       REGEXP_REPLACE(
       REGEXP_REPLACE(
       REGEXP_REPLACE(
         LOWER(CONCAT(title, ' ', body))
         , r'&', '&')
         , r'&[a-z]*;', '')
         , r'<[= \-:a-z0-9/\."]*>', '')
      , r'[a-z]{2,20}\'?[a-z]+') words
     , title, body
  , COUNT(*) OVER() docs_n
  FROM `bigquery-public-data.Whosebug.posts_questions`  
  WHERE score >= 150
), words_tf AS (
  SELECT id, words
  , ARRAY(
     SELECT AS STRUCT w word, COUNT(*)/ARRAY_LENGTH(words) tf
     FROM UNNEST(words) a 
     JOIN (SELECT DISTINCT w FROM UNNEST(words) w) b 
     ON a=b.w 
     WHERE w NOT IN ('the', 'and', 'for', 'this', 'that', 'can', 'but') 
     GROUP BY word ORDER BY word
   ) tfs
   , ARRAY_LENGTH((words)) words_in_doc
   , docs_n
   , title, body
  FROM words_by_post
  WHERE ARRAY_LENGTH(words)>20
), docs_idf AS (
  SELECT *, LOG(docs_n/docs_with_word) idf
  FROM (
    SELECT id, word, tf.tf, COUNTIF(word IN UNNEST(words)) OVER(PARTITION BY word) docs_with_word, docs_n
     , title, body
    FROM words_tf, UNNEST(tfs) tf
  )
)    


SELECT id, ARRAY_AGG(STRUCT(word, tf*idf AS tf_idf, docs_with_word) ORDER BY tf*idf DESC) tfidfs
#  , ANY_VALUE(title) title, ANY_VALUE(body) body # makes query slower
FROM docs_idf
WHERE docs_with_word > 1
GROUP BY 1

与上一个答案相比的改进:需要跨数据集少一个 GROUP BY,帮助查询 运行 更快。

这个可能更容易理解 - 采用已经包含每个电视台和每天的单词数的数据集:

# in this query the combination of date+station represents a "document"

WITH data AS (
  SELECT *
  FROM `gdelt-bq.gdeltv2.iatv_1grams`
  WHERE DATE BETWEEN 20190601 AND 20190629
  AND station NOT IN ('KSTS', 'KDTV')
)
, word_day_station AS (
  # how many times a word is mentioned in each "document"
  SELECT word, SUM(count) counts, date, station
  FROM data
  GROUP BY 1, 3, 4
)
, day_station AS (
  # total # of words in each "document" 
  SELECT SUM(count) counts, date, station
  FROM data
  GROUP BY 2,3
)
, tf AS (
  # TF for a word in a "document"
  SELECT word, date, station, a.counts/b.counts tf
  FROM word_day_station a
  JOIN day_station b
  USING(date, station)
)
, word_in_docs AS (
  # how many "documents" have a word
  SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
  FROM word_day_station
  GROUP BY 1
)
, total_docs AS (
  # total # of docs
  SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
  FROM data
)
, idf AS (
  # IDF for a word
  SELECT word, LOG(total_docs.total_docs/indocs) idf
  FROM word_in_docs
  CROSS JOIN total_docs
)

SELECT date,
  ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
FROM (
  SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
  FROM (
    SELECT word, date, station, tf.tf * idf.idf tfidf
    FROM tf
    JOIN idf
    USING(word)
  )
  GROUP BY date, station
)
GROUP BY date
ORDER BY date DESC