加权随机选择
Weighted Random Selection
你好。我有两张表,分别存放最常见的名字和姓氏。每张表基本上有两个字段:
表
-- Lookup table of the most common first names and their relative frequency.
CREATE TABLE "common_first_name" (
    "first_name" text PRIMARY KEY, -- the text representing the name
    "ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names
    -- now() already returns timestamptz (an absolute instant). The previous
    -- default, timezone('utc', now()), produced a timestamp WITHOUT time zone
    -- that was re-interpreted in the session time zone on assignment, so any
    -- non-UTC session stored a shifted instant.
    "inserted_at" timestamp WITH time zone DEFAULT now() NOT NULL,
    -- Defaults to the insertion instant; nothing in this definition refreshes
    -- it on UPDATE.
    "updated_at" timestamp WITH time zone DEFAULT now() NOT NULL
);
-- Lookup table of the most common last names and their relative frequency.
CREATE TABLE "common_last_name" (
    "last_name" text PRIMARY KEY, -- the text representing the name
    "ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names
    -- now() already returns timestamptz (an absolute instant). The previous
    -- default, timezone('utc', now()), produced a timestamp WITHOUT time zone
    -- that was re-interpreted in the session time zone on assignment, so any
    -- non-UTC session stored a shifted instant.
    "inserted_at" timestamp WITH time zone DEFAULT now() NOT NULL,
    -- Defaults to the insertion instant; nothing in this definition refreshes
    -- it on UPDATE.
    "updated_at" timestamp WITH time zone DEFAULT now() NOT NULL
);
附注:排名第一的名字的出现比例也只有约 1.8%。两张表各有 1000 行。
函数(伪,未就绪)
-- Placeholder for the sample-data generator: iterates p_number_of_records
-- times (a NULL argument is treated as 0) but performs no work yet.
CREATE OR REPLACE FUNCTION create_sample_data(p_number_of_records INT)
RETURNS VOID
LANGUAGE plpgsql
VOLATILE
AS $body$
DECLARE
    -- Total of all ratios per name table, in percent; not referenced yet.
    weight_total CONSTANT integer := 100;
    -- Number of iterations; NULL input collapses to zero.
    record_count CONSTANT integer := coalesce(p_number_of_records, 0);
BEGIN
    FOR record_no IN 1..record_count LOOP
        -- TODO: pick a random first and last name, taking their probability
        --       (ratio) into account, e.g. via round(random() * weight_total).
        -- TODO: create_person(random_first_name || ' ' || random_last_name);
        NULL;
    END LOOP;
END;
$body$;
附注:每张表中所有名字的比率之和为 100%。
我想把一个函数运行 N 次,每次得到一个名字和一个姓氏,用来创建示例数据……两张表各有 1000 行。
样本大小可以是 1000 到 1000000 个全名之间的任意值,所以如果有一种“快速”的方法来执行这个加权随机函数,那就更好了。
在 PL/PGSQL 中有关于如何操作的建议吗?
我在 SUPABASE.IO 上使用 PG 13.3。
谢谢
鉴于输入数据集较小,用纯 SQL 就能轻松实现。先用 CTE 为每个 common_FOO_name 表的每一行计算下限和上限列,再用 generate_series() 生成一组随机数。最后把它们连接在一起,并在 WHERE 子句中筛选出随机值落在上下限之间的行。
-- Draw weighted-random (first_name, last_name) pairs.
-- Every name owns the half-open slice [lower_bound, upper_bound) of the
-- running total of ratio, so the slice width equals the name's weight.
-- Half-open slices never overlap: a draw that lands exactly on a boundary
-- matches one name only, and a name with ratio 0 (empty slice) is never
-- returned. random() yields values in [0, 1), so a draw scaled by the table
-- total stays below the final upper_bound.
with first_names_weighted as (
    -- Explicit ROWS frame: running total row by row in first_name order.
    select
        first_name,
        sum(ratio) over (order by first_name rows unbounded preceding) - ratio as lower_bound,
        sum(ratio) over (order by first_name rows unbounded preceding) as upper_bound
    from common_first_name
),
last_names_weighted as (
    -- Same cumulative slices for last names.
    select
        last_name,
        sum(ratio) over (order by last_name rows unbounded preceding) - ratio as lower_bound,
        sum(ratio) over (order by last_name rows unbounded preceding) as upper_bound
    from common_last_name
),
randoms as (
    -- One independent draw per name kind for each requested row; change the
    -- upper argument of generate_series() to control how many names come back.
    select
        random() * (select sum(ratio) from common_first_name) as f_random,
        random() * (select sum(ratio) from common_last_name) as l_random
    from generate_series(1, 32)
)
select
    r,
    f.first_name,
    l.last_name
from randoms as r
inner join first_names_weighted as f
    on f.lower_bound <= r.f_random and r.f_random < f.upper_bound
inner join last_names_weighted as l
    on l.lower_bound <= r.l_random and r.l_random < l.upper_bound;
更改传递给 generate_series() 的值,即可控制要生成的姓名数量。如果必须把它做成函数,只需使用 LANGUAGE SQL 的函数定义,把该数量作为参数即可:
你好。我有两张表,分别存放最常见的名字和姓氏。每张表基本上有两个字段:
表
-- Lookup table of the most common first names and their relative frequency.
CREATE TABLE "common_first_name" (
    "first_name" text PRIMARY KEY, -- the text representing the name
    "ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names
    -- now() already returns timestamptz (an absolute instant). The previous
    -- default, timezone('utc', now()), produced a timestamp WITHOUT time zone
    -- that was re-interpreted in the session time zone on assignment, so any
    -- non-UTC session stored a shifted instant.
    "inserted_at" timestamp WITH time zone DEFAULT now() NOT NULL,
    -- Defaults to the insertion instant; nothing in this definition refreshes
    -- it on UPDATE.
    "updated_at" timestamp WITH time zone DEFAULT now() NOT NULL
);
-- Lookup table of the most common last names and their relative frequency.
CREATE TABLE "common_last_name" (
    "last_name" text PRIMARY KEY, -- the text representing the name
    "ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names
    -- now() already returns timestamptz (an absolute instant). The previous
    -- default, timezone('utc', now()), produced a timestamp WITHOUT time zone
    -- that was re-interpreted in the session time zone on assignment, so any
    -- non-UTC session stored a shifted instant.
    "inserted_at" timestamp WITH time zone DEFAULT now() NOT NULL,
    -- Defaults to the insertion instant; nothing in this definition refreshes
    -- it on UPDATE.
    "updated_at" timestamp WITH time zone DEFAULT now() NOT NULL
);
附注:排名第一的名字的出现比例也只有约 1.8%。两张表各有 1000 行。
函数(伪,未就绪)
-- Placeholder for the sample-data generator: iterates p_number_of_records
-- times (a NULL argument is treated as 0) but performs no work yet.
CREATE OR REPLACE FUNCTION create_sample_data(p_number_of_records INT)
RETURNS VOID
LANGUAGE plpgsql
VOLATILE
AS $body$
DECLARE
    -- Total of all ratios per name table, in percent; not referenced yet.
    weight_total CONSTANT integer := 100;
    -- Number of iterations; NULL input collapses to zero.
    record_count CONSTANT integer := coalesce(p_number_of_records, 0);
BEGIN
    FOR record_no IN 1..record_count LOOP
        -- TODO: pick a random first and last name, taking their probability
        --       (ratio) into account, e.g. via round(random() * weight_total).
        -- TODO: create_person(random_first_name || ' ' || random_last_name);
        NULL;
    END LOOP;
END;
$body$;
附注:每张表中所有名字的比率之和为 100%。
我想把一个函数运行 N 次,每次得到一个名字和一个姓氏,用来创建示例数据……两张表各有 1000 行。
样本大小可以是 1000 到 1000000 个全名之间的任意值,所以如果有一种“快速”的方法来执行这个加权随机函数,那就更好了。
在 PL/PGSQL 中有关于如何操作的建议吗?
我在 SUPABASE.IO 上使用 PG 13.3。
谢谢
鉴于输入数据集较小,用纯 SQL 就能轻松实现。先用 CTE 为每个 common_FOO_name 表的每一行计算下限和上限列,再用 generate_series() 生成一组随机数。最后把它们连接在一起,并在 WHERE 子句中筛选出随机值落在上下限之间的行。
-- Draw weighted-random (first_name, last_name) pairs.
-- Every name owns the half-open slice [lower_bound, upper_bound) of the
-- running total of ratio, so the slice width equals the name's weight.
-- Half-open slices never overlap: a draw that lands exactly on a boundary
-- matches one name only, and a name with ratio 0 (empty slice) is never
-- returned. random() yields values in [0, 1), so a draw scaled by the table
-- total stays below the final upper_bound.
with first_names_weighted as (
    -- Explicit ROWS frame: running total row by row in first_name order.
    select
        first_name,
        sum(ratio) over (order by first_name rows unbounded preceding) - ratio as lower_bound,
        sum(ratio) over (order by first_name rows unbounded preceding) as upper_bound
    from common_first_name
),
last_names_weighted as (
    -- Same cumulative slices for last names.
    select
        last_name,
        sum(ratio) over (order by last_name rows unbounded preceding) - ratio as lower_bound,
        sum(ratio) over (order by last_name rows unbounded preceding) as upper_bound
    from common_last_name
),
randoms as (
    -- One independent draw per name kind for each requested row; change the
    -- upper argument of generate_series() to control how many names come back.
    select
        random() * (select sum(ratio) from common_first_name) as f_random,
        random() * (select sum(ratio) from common_last_name) as l_random
    from generate_series(1, 32)
)
select
    r,
    f.first_name,
    l.last_name
from randoms as r
inner join first_names_weighted as f
    on f.lower_bound <= r.f_random and r.f_random < f.upper_bound
inner join last_names_weighted as l
    on l.lower_bound <= r.l_random and r.l_random < l.upper_bound;
更改传递给 generate_series() 的值,即可控制要生成的姓名数量。如果必须把它做成函数,只需使用 LANGUAGE SQL 的函数定义,把该数量作为参数即可: