SQL余弦相似度计算
SQL Computation of Cosine Similarity
假设您在数据库中有一个 table 构造如下:
create table data (v int, base int, w_td float);
insert into data values (99,1,4);
insert into data values (99,2,3);
insert into data values (99,3,4);
insert into data values (1234,2,5);
insert into data values (1234,3,2);
insert into data values (1234,4,3);
要清楚select * from data
应该输出:
v |base|w_td
--------------
99 |1 |4.0
99 |2 |3.0
99 |3 |4.0
1234|2 |5.0
1234|3 |2.0
1234|4 |3.0
请注意,由于向量存储在数据库中,我们只需要存储非零条目。在这个例子中,我们在 $\mathbb{R} 中只有两个向量 $v_{99} = (4,3,4,0)$ 和 $v_{1234} = (0,5,2,3)$ ^4$.
这些向量的余弦相似度应该是 $\displaystyle \frac{23}{\sqrt{41 \cdot 38}} = 0.5826987807288609$.
如何几乎只使用 SQL
来计算余弦相似度?
我说几乎是因为您将需要 sqrt
函数,它在基本 SQL
实现中并不总是提供,例如 sqlite3
!
中没有
with norms as (
select v,
sum(w_td * w_td) as w2
from data
group by v
)
select
x.v as ego,y.v as v,nx.w2 as x2, ny.w2 as y2,
sum(x.w_td * y.w_td) as innerproduct,
sum(x.w_td * y.w_td) / sqrt(nx.w2 * ny.w2) as cosinesimilarity
from data as x
join data as y
on (x.base=y.base)
join norms as nx
on (nx.v=x.v)
join norms as ny
on (ny.v=y.v)
where x.v < y.v
group by 1,2,3,4
order by 6 desc
产量
ego|v |x2 |y2 |innerproduct|cosinesimilarity
--------------------------------------------------
99 |1234|41.0|38.0|23.0 |0.5826987807288609
假设您在数据库中有一个 table 构造如下:
create table data (v int, base int, w_td float);
insert into data values (99,1,4);
insert into data values (99,2,3);
insert into data values (99,3,4);
insert into data values (1234,2,5);
insert into data values (1234,3,2);
insert into data values (1234,4,3);
要清楚select * from data
应该输出:
v |base|w_td
--------------
99 |1 |4.0
99 |2 |3.0
99 |3 |4.0
1234|2 |5.0
1234|3 |2.0
1234|4 |3.0
请注意,由于向量存储在数据库中,我们只需要存储非零条目。在这个例子中,我们在 $\mathbb{R} 中只有两个向量 $v_{99} = (4,3,4,0)$ 和 $v_{1234} = (0,5,2,3)$ ^4$.
这些向量的余弦相似度应该是 $\displaystyle \frac{23}{\sqrt{41 \cdot 38}} = 0.5826987807288609$.
如何几乎只使用 SQL
来计算余弦相似度?
我说几乎是因为您将需要 sqrt
函数,它在基本 SQL
实现中并不总是提供,例如 sqlite3
!
with norms as (
select v,
sum(w_td * w_td) as w2
from data
group by v
)
select
x.v as ego,y.v as v,nx.w2 as x2, ny.w2 as y2,
sum(x.w_td * y.w_td) as innerproduct,
sum(x.w_td * y.w_td) / sqrt(nx.w2 * ny.w2) as cosinesimilarity
from data as x
join data as y
on (x.base=y.base)
join norms as nx
on (nx.v=x.v)
join norms as ny
on (ny.v=y.v)
where x.v < y.v
group by 1,2,3,4
order by 6 desc
产量
ego|v |x2 |y2 |innerproduct|cosinesimilarity
--------------------------------------------------
99 |1234|41.0|38.0|23.0 |0.5826987807288609