如何在 Google BigQuery 中执行三元组运算?
How to perform trigram operations in Google BigQuery?
我在 PostgreSQL 中使用 pg_trgm
模块,通过三元组(trigram)计算两个字符串之间的相似度。具体来说,我使用的是:
similarity(text, text)
它返回一个数字,表示两个参数的相似程度(介于 0 和 1 之间)。
如何在 Google BigQuery 中实现类似(或等效)的函数?
请尝试下面的查询(BigQuery 旧版 SQL;它基于 Levenshtein 编辑距离,而不是三元组)。它至少可以作为进一步改进的蓝图:
// BigQuery legacy SQL: scores each (text1, text2) pair with an inline
// JavaScript UDF. The score is 1 - levenshtein(text1, text2) / max(length),
// so it always lies in [0, 1]; 1 means the normalised strings are equal.
// NOTE: this is an edit-distance similarity, not a trigram similarity.
SELECT text1, text2, similarity FROM
JS(
  // input table
  // (in legacy SQL a comma between subselects means UNION ALL)
  (
    SELECT text1, text2 FROM
      (SELECT 'mikhail' AS text1, 'mikhail' AS text2),
      (SELECT 'mikhail' AS text1, 'mike' AS text2),
      (SELECT 'mikhail' AS text1, 'michael' AS text2),
      (SELECT 'mikhail' AS text1, 'javier' AS text2),
      (SELECT 'mikhail' AS text1, 'thomas' AS text2)
  ),
  // input columns
  text1, text2,
  // output schema
  "[{name: 'text1', type:'string'},
    {name: 'text2', type:'string'},
    {name: 'similarity', type:'float'}]
  ",
  // function
  "function(r, emit) {
    /**
     * Calculate the Levenshtein distance of the two strings.
     * Only one row of the distance matrix is kept and updated in place.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the Levenshtein distance (0 and above).
     */
    var levenshtein = function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // distance from the empty prefix of str1 to each prefix of str2
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate the current row from the previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;
        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;
          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);
          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // the old value at j is no longer needed: store the current row
          prevRow[j] = curCol;
        }
        // store the last column of the current row
        prevRow[j] = nextCol;
      }
      return nextCol;
    };

    /**
     * Lower-case a string, URI-decoding it first when it is valid
     * percent-encoded text; malformed escapes fall back to the raw value.
     */
    var normalise = function(text) {
      try {
        return decodeURI(text).toLowerCase();
      } catch (ex) {
        return text.toLowerCase();
      }
    };

    // A NULL input has no meaningful similarity: emit NULL instead of
    // comparing against the literal string null.
    if (r.text1 === null || r.text1 === undefined ||
        r.text2 === null || r.text2 === undefined) {
      emit({text1: r.text1, text2: r.text2, similarity: null});
      return;
    }

    var the_text1 = normalise(String(r.text1));
    var the_text2 = normalise(String(r.text2));

    // Normalise by the longer string so the score stays within [0, 1]
    // and does not depend on argument order; two empty strings are equal.
    var longest = Math.max(the_text1.length, the_text2.length);
    var similarity = (longest === 0)
      ? 1
      : 1 - levenshtein(the_text1, the_text2) / longest;

    emit({text1: the_text1, text2: the_text2, similarity: similarity});
  }"
)
ORDER BY similarity DESC
这是在 @thomaspark 的
https://storage.googleapis.com/thomaspark-sandbox/udf-examples/pataky.js 基础上稍作修改得到的。
我是这样做的:
-- Returns the share of the distinct character trigrams of `a` that also occur
-- in `b` (case-insensitive), as a FLOAT64 in [0, 1].
-- Returns NULL when `a` yields no trigrams (NULL or fewer than 3 characters),
-- instead of failing with a division-by-zero error.
-- Differences from pg_trgm similarity(): the strings are not blank-padded and
-- the denominator is the trigram count of `a` only (not the union of both
-- sets), so the result is asymmetric and differs slightly from pg_trgm.
CREATE TEMP FUNCTION trigram_similarity(a STRING, b STRING) AS (
  (
    WITH a_trigrams AS (
      -- distinct trigrams of `a`; SPLIT with '' yields one element per character
      SELECT DISTINCT tri_a
      FROM UNNEST(ML.NGRAMS(SPLIT(LOWER(a), ''), [3, 3])) AS tri_a
    ),
    b_trigrams AS (
      -- distinct trigrams of `b`
      SELECT DISTINCT tri_b
      FROM UNNEST(ML.NGRAMS(SPLIT(LOWER(b), ''), [3, 3])) AS tri_b
    )
    SELECT
      -- matched trigrams of `a` / all trigrams of `a`
      SAFE_DIVIDE(COUNTIF(tri_b IS NOT NULL), COUNT(*))
    FROM a_trigrams
    LEFT JOIN b_trigrams
      ON tri_a = tri_b
  )
);
下面是与 PostgreSQL 的 pg_trgm 结果的比较:
-- Result: 0.25 (pg_trgm similarity() gives 0.272727)
SELECT trigram_similarity('saemus', 'seamus');
-- Result: 0.5 (pg_trgm similarity() gives 0.4)
SELECT trigram_similarity('shamus', 'seamus');
我在 PostgreSQL 中使用 pg_trgm
模块,通过三元组(trigram)计算两个字符串之间的相似度。具体来说,我使用的是:
similarity(text, text)
它返回一个数字,表示两个参数的相似程度(介于 0 和 1 之间)。
如何在 Google BigQuery 中实现类似(或等效)的函数?
请尝试下面的查询(BigQuery 旧版 SQL;它基于 Levenshtein 编辑距离,而不是三元组)。它至少可以作为进一步改进的蓝图:
// BigQuery legacy SQL: scores each (text1, text2) pair with an inline
// JavaScript UDF. The score is 1 - levenshtein(text1, text2) / max(length),
// so it always lies in [0, 1]; 1 means the normalised strings are equal.
// NOTE: this is an edit-distance similarity, not a trigram similarity.
SELECT text1, text2, similarity FROM
JS(
  // input table
  // (in legacy SQL a comma between subselects means UNION ALL)
  (
    SELECT text1, text2 FROM
      (SELECT 'mikhail' AS text1, 'mikhail' AS text2),
      (SELECT 'mikhail' AS text1, 'mike' AS text2),
      (SELECT 'mikhail' AS text1, 'michael' AS text2),
      (SELECT 'mikhail' AS text1, 'javier' AS text2),
      (SELECT 'mikhail' AS text1, 'thomas' AS text2)
  ),
  // input columns
  text1, text2,
  // output schema
  "[{name: 'text1', type:'string'},
    {name: 'text2', type:'string'},
    {name: 'similarity', type:'float'}]
  ",
  // function
  "function(r, emit) {
    /**
     * Calculate the Levenshtein distance of the two strings.
     * Only one row of the distance matrix is kept and updated in place.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the Levenshtein distance (0 and above).
     */
    var levenshtein = function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // distance from the empty prefix of str1 to each prefix of str2
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate the current row from the previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;
        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;
          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);
          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // the old value at j is no longer needed: store the current row
          prevRow[j] = curCol;
        }
        // store the last column of the current row
        prevRow[j] = nextCol;
      }
      return nextCol;
    };

    /**
     * Lower-case a string, URI-decoding it first when it is valid
     * percent-encoded text; malformed escapes fall back to the raw value.
     */
    var normalise = function(text) {
      try {
        return decodeURI(text).toLowerCase();
      } catch (ex) {
        return text.toLowerCase();
      }
    };

    // A NULL input has no meaningful similarity: emit NULL instead of
    // comparing against the literal string null.
    if (r.text1 === null || r.text1 === undefined ||
        r.text2 === null || r.text2 === undefined) {
      emit({text1: r.text1, text2: r.text2, similarity: null});
      return;
    }

    var the_text1 = normalise(String(r.text1));
    var the_text2 = normalise(String(r.text2));

    // Normalise by the longer string so the score stays within [0, 1]
    // and does not depend on argument order; two empty strings are equal.
    var longest = Math.max(the_text1.length, the_text2.length);
    var similarity = (longest === 0)
      ? 1
      : 1 - levenshtein(the_text1, the_text2) / longest;

    emit({text1: the_text1, text2: the_text2, similarity: similarity});
  }"
)
ORDER BY similarity DESC
这是在 @thomaspark 的
https://storage.googleapis.com/thomaspark-sandbox/udf-examples/pataky.js 基础上稍作修改得到的。
我是这样做的:
-- Returns the share of the distinct character trigrams of `a` that also occur
-- in `b` (case-insensitive), as a FLOAT64 in [0, 1].
-- Returns NULL when `a` yields no trigrams (NULL or fewer than 3 characters),
-- instead of failing with a division-by-zero error.
-- Differences from pg_trgm similarity(): the strings are not blank-padded and
-- the denominator is the trigram count of `a` only (not the union of both
-- sets), so the result is asymmetric and differs slightly from pg_trgm.
CREATE TEMP FUNCTION trigram_similarity(a STRING, b STRING) AS (
  (
    WITH a_trigrams AS (
      -- distinct trigrams of `a`; SPLIT with '' yields one element per character
      SELECT DISTINCT tri_a
      FROM UNNEST(ML.NGRAMS(SPLIT(LOWER(a), ''), [3, 3])) AS tri_a
    ),
    b_trigrams AS (
      -- distinct trigrams of `b`
      SELECT DISTINCT tri_b
      FROM UNNEST(ML.NGRAMS(SPLIT(LOWER(b), ''), [3, 3])) AS tri_b
    )
    SELECT
      -- matched trigrams of `a` / all trigrams of `a`
      SAFE_DIVIDE(COUNTIF(tri_b IS NOT NULL), COUNT(*))
    FROM a_trigrams
    LEFT JOIN b_trigrams
      ON tri_a = tri_b
  )
);
下面是与 PostgreSQL 的 pg_trgm 结果的比较:
-- Result: 0.25 (pg_trgm similarity() gives 0.272727)
SELECT trigram_similarity('saemus', 'seamus');
-- Result: 0.5 (pg_trgm similarity() gives 0.4)
SELECT trigram_similarity('shamus', 'seamus');