SQL 中的条件概率
Conditional probability in SQL
我想我有点陷入了死胡同。
假设我有一个数据集,这很简单 -
person_id 和 book_id。这几乎是事实 table 说某人 X 买了书 A、B 和 C。
我知道如何找出有多少人同时购买了 Book X 和 Book Y。
这是
select a.book_id as B1, b.book_id as B2, count(b.person_id) as
Bought_Together
from dbo.data a
cross join dbo.data b
where a.book_id != b.book_id and a.person_id = b.person_id
group by a.book_id, b.book_id
再一次,这是我的大脑决定关闭的地方。我知道我可能需要这样做
count(b.person_id) / all the people that bought book A * 100
但我不完全确定。
希望我说得够清楚了。
EDIT1:我目前正在使用 SQL Server 2017,所以我认为正确答案是 T-SQL?。
最后,格式应该与此类似。也不存在 A 可以购买三本 X 书的情况。
Book1 Book2 HowManyPeopleBoughtBook2
1 2 50%
1 3 7%
2 3 15%
2 1 40%
3 1 60%
3 2 20%
EDIT2:假设数据库中有数十万行。是的,这与我正在上的数据科学课程有点相关 - 因此有大量数据。
如果您想生成一起购买的成对书籍的所有可能组合以及购买该组合的人的百分比,以下内容可以提供帮助
create table data1(book_id int, person_id int)
insert into data1
select *
from (values(1,300)
,(2,300)
,(2,301)
,(1,301)
,(3,301)
)t(book_id,person_id)
with books
as (select distinct book_id
from data1 a
)
,tot_persons
as (select count(distinct person_id) as tot_cnt
from data1
)
,pairs
as (
select a.book_id as col1 /* This block generates all possible pair combinations of books*/
,b.book_id as col2
from books a
join books b
on a.book_id<b.book_id
)
select a.col1,a.col2
,count(b.person_id)*100/(select tot_cnt from tot_persons) as percent_of_persons_buying_both
from pairs a
join data1 b
on a.col1=b.book_id
where exists(select 1
from data1 b1
where b.person_id=b1.person_id
and a.col2=b1.book_id)
group by a.col1,a.col2
关于我的 phone,对打字错误表示歉意
SELECT
SUM(bought_b) * 100.0 / COUNT(*)
FROM
(
SELECT
person_id,
MAX(CASE WHEN book_id = 'A' THEN 1 END) AS bought_a,
MAX(CASE WHEN book_id = 'B' THEN 1 END) AS bought_b
FROM
data
WHERE
book_id IN ('A', 'B')
GROUP BY
person_id
)
person_stats
WHERE
bought_a = 1
关于我的 phone,对打字错误表示歉意
编辑:刚刚看到您想要所有组合,只需要一组组合。
WITH
book AS
(
SELECT DISTINCT book_id FROM data
)
SELECT
book_a_id,
book_b_id,
bought_b * 100.0 / bought_b
FROM
(
SELECT
book_a.book_id AS book_a_id,
book_b.book_id AS book_b_id,
COUNT(DISTINCT data_a.person_id) AS bought_a,
COUNT(DISTINCT data_b.person_id) AS bought_b
FROM
book AS book_a
CROSS JOIN
book AS book_b
INNER JOIN
data AS data_a
ON data_a.book_id = book_a.book_id
LEFT JOIN
data AS data_b
ON data_b.book_id = book_b.book_id
GROUP BY
book_a.book_id,
book_b.book_id
)
stats
您可以扩展您的逻辑来执行此操作:
select a.book_id as B1, b.book_id as B2,
count(b.book_id) as bought_second_book,
count(b.book_id) * 1.0 / book_cnt as ratio_Bought_Together
from (select a.*, count(*) over (partition by a.book_id) as book_cnt
from dbo.data a
) a left join
dbo.data b
on a.person_id = b.person_id and a.book_id <> b.book_id
group by a.book_id, b.book_id, a.book_cnt;
这是假设人们只买一次书。如果有重复项,那么 count(distinct)
会对此进行调整。
我想我有点陷入了死胡同。
假设我有一个数据集,这很简单 - person_id 和 book_id。这几乎是事实 table 说某人 X 买了书 A、B 和 C。
我知道如何找出有多少人同时购买了 Book X 和 Book Y。 这是
select a.book_id as B1, b.book_id as B2, count(b.person_id) as
Bought_Together
from dbo.data a
cross join dbo.data b
where a.book_id != b.book_id and a.person_id = b.person_id
group by a.book_id, b.book_id
再一次,这是我的大脑决定关闭的地方。我知道我可能需要这样做
count(b.person_id) / all the people that bought book A * 100
但我不完全确定。
希望我说得够清楚了。
EDIT1:我目前正在使用 SQL Server 2017,所以我认为正确答案是 T-SQL?。 最后,格式应该与此类似。也不存在 A 可以购买三本 X 书的情况。
Book1 Book2 HowManyPeopleBoughtBook2
1 2 50%
1 3 7%
2 3 15%
2 1 40%
3 1 60%
3 2 20%
EDIT2:假设数据库中有数十万行。是的,这与我正在上的数据科学课程有点相关 - 因此有大量数据。
如果您想生成一起购买的成对书籍的所有可能组合以及购买该组合的人的百分比,以下内容可以提供帮助
create table data1(book_id int, person_id int)
insert into data1
select *
from (values(1,300)
,(2,300)
,(2,301)
,(1,301)
,(3,301)
)t(book_id,person_id)
with books
as (select distinct book_id
from data1 a
)
,tot_persons
as (select count(distinct person_id) as tot_cnt
from data1
)
,pairs
as (
select a.book_id as col1 /* This block generates all possible pair combinations of books*/
,b.book_id as col2
from books a
join books b
on a.book_id<b.book_id
)
select a.col1,a.col2
,count(b.person_id)*100/(select tot_cnt from tot_persons) as percent_of_persons_buying_both
from pairs a
join data1 b
on a.col1=b.book_id
where exists(select 1
from data1 b1
where b.person_id=b1.person_id
and a.col2=b1.book_id)
group by a.col1,a.col2
关于我的 phone,对打字错误表示歉意
SELECT
SUM(bought_b) * 100.0 / COUNT(*)
FROM
(
SELECT
person_id,
MAX(CASE WHEN book_id = 'A' THEN 1 END) AS bought_a,
MAX(CASE WHEN book_id = 'B' THEN 1 END) AS bought_b
FROM
data
WHERE
book_id IN ('A', 'B')
GROUP BY
person_id
)
person_stats
WHERE
bought_a = 1
关于我的 phone,对打字错误表示歉意
编辑:刚刚看到您想要所有组合,只需要一组组合。
WITH
book AS
(
SELECT DISTINCT book_id FROM data
)
SELECT
book_a_id,
book_b_id,
bought_b * 100.0 / bought_b
FROM
(
SELECT
book_a.book_id AS book_a_id,
book_b.book_id AS book_b_id,
COUNT(DISTINCT data_a.person_id) AS bought_a,
COUNT(DISTINCT data_b.person_id) AS bought_b
FROM
book AS book_a
CROSS JOIN
book AS book_b
INNER JOIN
data AS data_a
ON data_a.book_id = book_a.book_id
LEFT JOIN
data AS data_b
ON data_b.book_id = book_b.book_id
GROUP BY
book_a.book_id,
book_b.book_id
)
stats
您可以扩展您的逻辑来执行此操作:
select a.book_id as B1, b.book_id as B2,
count(b.book_id) as bought_second_book,
count(b.book_id) * 1.0 / book_cnt as ratio_Bought_Together
from (select a.*, count(*) over (partition by a.book_id) as book_cnt
from dbo.data a
) a left join
dbo.data b
on a.person_id = b.person_id and a.book_id <> b.book_id
group by a.book_id, b.book_id, a.book_cnt;
这是假设人们只买一次书。如果有重复项,那么 count(distinct)
会对此进行调整。