在 Oracle 中计算中位数的置信区间
Calculating confidence intervals for a median in Oracle
我正在尝试编写一个 Oracle SQL 查询,它不仅要计算年龄的中位数,还要计算该中位数的 95% 置信区间。更复杂的是,这些结果需要按性别分组计算。
我有一张人员表,其中记录了每个人的年龄和性别。我想确定每个性别组的年龄中位数及其 95% 置信区间。
我目前失败的尝试如下。
-- Asker's original attempt: per-gender median age, row count, and the ages at the
-- lower/upper positions of an approximate 95% confidence interval for the median.
-- NOTE: this statement is the one reported to fail with "not a GROUP BY expression";
-- the fraction handed to PERCENTILE_CONT is built from the aggregate COUNT(*).
SELECT
    gender,
    MEDIAN(age),
    COUNT(*),
    PERCENTILE_CONT(ROUND(COUNT(*) / 2 - 1.96 * SQRT(COUNT(*)) / 2) / COUNT(*))
        WITHIN GROUP (ORDER BY age) AS lowmedianage,
    PERCENTILE_CONT(ROUND(COUNT(*) / 2 + 1.96 * SQRT(COUNT(*)) / 2) / COUNT(*))
        WITHIN GROUP (ORDER BY age) AS highmedianage
FROM persontable
GROUP BY gender
我得到的错误是“not a GROUP BY expression”(ORA-00979,不是 GROUP BY 表达式)。
这里的问题在于,您把聚合函数 COUNT 用作了 PERCENTILE_CONT 的参数;该参数在每个分组内必须是常量,因此只能由常量或 GROUP BY 子句中的表达式构成。您可以改用子查询来解决,类似下面这样:
-- Per-gender median age, row count, and approximate 95% confidence bounds.
-- The group size comes from an analytic COUNT(*) OVER (PARTITION BY gender), so every
-- person row is kept and low/high are derived from the size of the whole gender group.
-- (Grouping the inner query by age, gender would count rows per (age, gender) pair and
-- collapse duplicate ages before MEDIAN / PERCENTILE_CONT see them.)
-- low and high are identical for all rows of one gender, so listing them in GROUP BY
-- does not split the groups; it only makes them legal PERCENTILE_CONT arguments.
-- NOTE(review): COUNT(*) also counts rows whose age is NULL, while MEDIAN and
-- PERCENTILE_CONT skip them -- confirm age is NOT NULL or filter such rows first.
SELECT
    gender,
    MEDIAN(age),
    COUNT(*),
    PERCENTILE_CONT(low) WITHIN GROUP (ORDER BY age) AS lowmedianage,
    PERCENTILE_CONT(high) WITHIN GROUP (ORDER BY age) AS highmedianage
FROM (
    SELECT
        age,
        gender,
        -- rank positions n/2 -/+ 1.96*sqrt(n)/2, rounded, expressed as a fraction of n
        ROUND(cnt / 2 - 1.96 * SQRT(cnt) / 2) / cnt AS low,
        ROUND(cnt / 2 + 1.96 * SQRT(cnt) / 2) / cnt AS high
    FROM (
        SELECT
            age,
            gender,
            COUNT(*) OVER (PARTITION BY gender) AS cnt
        FROM persontable
    )
)
GROUP BY gender, low, high
使用 this book 中的公式,我最终得到了下面的查询(我不确定您对下限和上限的处理是否正确;按我的理解,公式算出的是排序后序号(位置)的范围,还需要再到这些位置上查找对应的值)。
-- Median age per gender, plus the ages found at the lower (r) and upper (s) rank
-- positions of an approximate 95% confidence interval for that median.
WITH ranked_person AS (
    -- number the rows of each gender in ascending age order
    SELECT
        gender,
        age,
        ROW_NUMBER() OVER (PARTITION BY gender ORDER BY age) AS seq
    FROM persontable
),
gender_size AS (
    -- number of rows per gender
    SELECT
        gender,
        COUNT(*) AS cnt
    FROM persontable
    GROUP BY gender
),
ci_rank AS (
    -- rank positions of the interval ends, rounded to whole sequence numbers
    SELECT
        gender,
        ROUND(cnt / 2 - (1.96 * SQRT(cnt) / 2)) AS r,
        ROUND(1 + cnt / 2 + (1.96 * SQRT(cnt) / 2)) AS s
    FROM gender_size
),
gender_median AS (
    -- median age per gender
    SELECT
        gender,
        MEDIAN(age) AS median_age
    FROM persontable
    GROUP BY gender
),
summary AS (
    -- one row per gender: median together with both rank positions
    SELECT
        gm.gender,
        gm.median_age,
        cr.r,
        cr.s
    FROM gender_median gm
    INNER JOIN ci_rank cr
        ON gm.gender = cr.gender
)
-- Look up the ages sitting at positions r and s. LEFT JOIN keeps the gender row
-- (with a NULL bound) when a position has no matching sequence number.
SELECT
    summary.gender,
    summary.median_age,
    lower_row.age AS ci_from,
    upper_row.age AS ci_to
FROM summary
LEFT JOIN ranked_person lower_row
    ON lower_row.gender = summary.gender
    AND lower_row.seq = summary.r
LEFT JOIN ranked_person upper_row
    ON upper_row.gender = summary.gender
    AND upper_row.seq = summary.s
;
另请注意,该公式给出的只是置信区间(CI)的近似值。您还可以参阅 this thread 了解其他计算方法。
我正在尝试编写一个 Oracle SQL 查询,它不仅要计算年龄的中位数,还要计算该中位数的 95% 置信区间。更复杂的是,这些结果需要按性别分组计算。我有一张人员表,其中记录了每个人的年龄和性别。我想确定每个性别组的年龄中位数及其 95% 置信区间。我目前失败的尝试如下。
-- Asker's original attempt: per-gender median age, row count, and the ages at the
-- lower/upper positions of an approximate 95% confidence interval for the median.
-- NOTE: this statement is the one reported to fail with "not a GROUP BY expression";
-- the fraction handed to PERCENTILE_CONT is built from the aggregate COUNT(*).
SELECT
    gender,
    MEDIAN(age),
    COUNT(*),
    PERCENTILE_CONT(ROUND(COUNT(*) / 2 - 1.96 * SQRT(COUNT(*)) / 2) / COUNT(*))
        WITHIN GROUP (ORDER BY age) AS lowmedianage,
    PERCENTILE_CONT(ROUND(COUNT(*) / 2 + 1.96 * SQRT(COUNT(*)) / 2) / COUNT(*))
        WITHIN GROUP (ORDER BY age) AS highmedianage
FROM persontable
GROUP BY gender
我得到的错误是“not a GROUP BY expression”(ORA-00979,不是 GROUP BY 表达式)。
这里的问题在于,您把聚合函数 COUNT 用作了 PERCENTILE_CONT 的参数;该参数在每个分组内必须是常量,因此只能由常量或 GROUP BY 子句中的表达式构成。您可以改用子查询来解决,类似下面这样:
-- Per-gender median age, row count, and approximate 95% confidence bounds.
-- The group size comes from an analytic COUNT(*) OVER (PARTITION BY gender), so every
-- person row is kept and low/high are derived from the size of the whole gender group.
-- (Grouping the inner query by age, gender would count rows per (age, gender) pair and
-- collapse duplicate ages before MEDIAN / PERCENTILE_CONT see them.)
-- low and high are identical for all rows of one gender, so listing them in GROUP BY
-- does not split the groups; it only makes them legal PERCENTILE_CONT arguments.
-- NOTE(review): COUNT(*) also counts rows whose age is NULL, while MEDIAN and
-- PERCENTILE_CONT skip them -- confirm age is NOT NULL or filter such rows first.
SELECT
    gender,
    MEDIAN(age),
    COUNT(*),
    PERCENTILE_CONT(low) WITHIN GROUP (ORDER BY age) AS lowmedianage,
    PERCENTILE_CONT(high) WITHIN GROUP (ORDER BY age) AS highmedianage
FROM (
    SELECT
        age,
        gender,
        -- rank positions n/2 -/+ 1.96*sqrt(n)/2, rounded, expressed as a fraction of n
        ROUND(cnt / 2 - 1.96 * SQRT(cnt) / 2) / cnt AS low,
        ROUND(cnt / 2 + 1.96 * SQRT(cnt) / 2) / cnt AS high
    FROM (
        SELECT
            age,
            gender,
            COUNT(*) OVER (PARTITION BY gender) AS cnt
        FROM persontable
    )
)
GROUP BY gender, low, high
使用 this book 中的公式,我最终得到了下面的查询(我不确定您对下限和上限的处理是否正确;按我的理解,公式算出的是排序后序号(位置)的范围,还需要再到这些位置上查找对应的值)。
-- Median age per gender, plus the ages found at the lower (r) and upper (s) rank
-- positions of an approximate 95% confidence interval for that median.
WITH ranked_person AS (
    -- number the rows of each gender in ascending age order
    SELECT
        gender,
        age,
        ROW_NUMBER() OVER (PARTITION BY gender ORDER BY age) AS seq
    FROM persontable
),
gender_size AS (
    -- number of rows per gender
    SELECT
        gender,
        COUNT(*) AS cnt
    FROM persontable
    GROUP BY gender
),
ci_rank AS (
    -- rank positions of the interval ends, rounded to whole sequence numbers
    SELECT
        gender,
        ROUND(cnt / 2 - (1.96 * SQRT(cnt) / 2)) AS r,
        ROUND(1 + cnt / 2 + (1.96 * SQRT(cnt) / 2)) AS s
    FROM gender_size
),
gender_median AS (
    -- median age per gender
    SELECT
        gender,
        MEDIAN(age) AS median_age
    FROM persontable
    GROUP BY gender
),
summary AS (
    -- one row per gender: median together with both rank positions
    SELECT
        gm.gender,
        gm.median_age,
        cr.r,
        cr.s
    FROM gender_median gm
    INNER JOIN ci_rank cr
        ON gm.gender = cr.gender
)
-- Look up the ages sitting at positions r and s. LEFT JOIN keeps the gender row
-- (with a NULL bound) when a position has no matching sequence number.
SELECT
    summary.gender,
    summary.median_age,
    lower_row.age AS ci_from,
    upper_row.age AS ci_to
FROM summary
LEFT JOIN ranked_person lower_row
    ON lower_row.gender = summary.gender
    AND lower_row.seq = summary.r
LEFT JOIN ranked_person upper_row
    ON upper_row.gender = summary.gender
    AND upper_row.seq = summary.s
;
另请注意,该公式给出的只是置信区间(CI)的近似值。您还可以参阅 this thread 了解其他计算方法。