在 ClickHouse 中使用相同 table 的最接近匹配文本搜索
Nearest matching text search using the same table in ClickHouse
CREATE TABLE DB.table
(
`date` Date,
`path` String,
`visits` UInt64,
`users` UInt64
) ENGINE = MergeTree
PARTITION BY toYYYYMM( `date`)
ORDER BY `date`
SETTINGS index_granularity = 8192
插入数据
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofa',14837,12444);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','regalo',11917,8268);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','puertas',7498,6279);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','armario',7078,6230);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofa-cama',6771,5442);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofas',6182,5161);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','tractor-cortacesped',5740,3407);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','cocina',5634,4721);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sillas',5448,4738);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','silla',448,478);
我想使用相同的 table
为每个 path
字段获取最接近的匹配文本
I read this example,但在这种情况下,“clickhouse”的值与 table...
不同
你知道这是一个交叉连接,如果是 10 行,它是 100 个 ngramDistance 调用?
SELECT DISTINCT
arraySort([t1.path, t2.path]) AS pair,
ngramDistance(t1.path, t2.path) AS dist
FROM t AS t1, t AS t2
WHERE t1.path != t2.path
ORDER BY dist ASC
Query id: 3cdeb2af-3628-4289-9d83-91d571aa70e7
┌─pair────────────────────────────────┬───────dist─┐
│ ['silla','sillas'] │ 0.2 │
│ ['sofa','sofas'] │ 0.33333334 │
│ ['sofa','sofa-cama'] │ 0.71428573 │
│ ['sofa-cama','sofas'] │ 0.75 │
│ ['puertas','sillas'] │ 1 │
│ ['sofas','tractor-cortacesped'] │ 1 │
│ ['cocina','sofa'] │ 1 │
│ ['sillas','sofa'] │ 1 │
│ ['silla','sofa'] │ 1 │
│ ['puertas','regalo'] │ 1 │
│ ['armario','regalo'] │ 1 │
│ ['regalo','sofa-cama'] │ 1 │
│ ['regalo','sofas'] │ 1 │
│ ['regalo','tractor-cortacesped'] │ 1 │
│ ['cocina','regalo'] │ 1 │
│ ['sofa','tractor-cortacesped'] │ 1 │
│ ['regalo','silla'] │ 1 │
│ ['armario','puertas'] │ 1 │
│ ['puertas','sofa-cama'] │ 1 │
│ ['puertas','sofas'] │ 1 │
│ ['puertas','tractor-cortacesped'] │ 1 │
│ ['cocina','puertas'] │ 1 │
│ ['armario','sofa'] │ 1 │
│ ['puertas','silla'] │ 1 │
│ ['armario','sofa-cama'] │ 1 │
│ ['armario','sofas'] │ 1 │
│ ['armario','tractor-cortacesped'] │ 1 │
│ ['armario','cocina'] │ 1 │
│ ['armario','sillas'] │ 1 │
│ ['armario','silla'] │ 1 │
│ ['puertas','sofa'] │ 1 │
│ ['sofa-cama','tractor-cortacesped'] │ 1 │
│ ['cocina','sofa-cama'] │ 1 │
│ ['sillas','sofa-cama'] │ 1 │
│ ['silla','sofa-cama'] │ 1 │
│ ['regalo','sofa'] │ 1 │
│ ['cocina','sofas'] │ 1 │
│ ['sillas','sofas'] │ 1 │
│ ['silla','sofas'] │ 1 │
│ ['cocina','tractor-cortacesped'] │ 1 │
│ ['sillas','tractor-cortacesped'] │ 1 │
│ ['silla','tractor-cortacesped'] │ 1 │
│ ['cocina','sillas'] │ 1 │
│ ['cocina','silla'] │ 1 │
│ ['regalo','sillas'] │ 1 │
└─────────────────────────────────────┴────────────┘
CREATE TABLE DB.table
(
`date` Date,
`path` String,
`visits` UInt64,
`users` UInt64
) ENGINE = MergeTree
PARTITION BY toYYYYMM( `date`)
ORDER BY `date`
SETTINGS index_granularity = 8192
插入数据
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofa',14837,12444);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','regalo',11917,8268);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','puertas',7498,6279);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','armario',7078,6230);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofa-cama',6771,5442);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sofas',6182,5161);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','tractor-cortacesped',5740,3407);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','cocina',5634,4721);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','sillas',5448,4738);
INSERT INTO DB.Table (`date`,`path`,visits,users) VALUES ('2022-04-01','silla',448,478);
我想使用相同的 table
为每个path
字段获取最接近的匹配文本
I read this example,但在这种情况下,“clickhouse”的值与 table...
不同你知道这是一个交叉连接,如果是 10 行,它是 100 个 ngramDistance 调用?
SELECT DISTINCT
arraySort([t1.path, t2.path]) AS pair,
ngramDistance(t1.path, t2.path) AS dist
FROM t AS t1, t AS t2
WHERE t1.path != t2.path
ORDER BY dist ASC
Query id: 3cdeb2af-3628-4289-9d83-91d571aa70e7
┌─pair────────────────────────────────┬───────dist─┐
│ ['silla','sillas'] │ 0.2 │
│ ['sofa','sofas'] │ 0.33333334 │
│ ['sofa','sofa-cama'] │ 0.71428573 │
│ ['sofa-cama','sofas'] │ 0.75 │
│ ['puertas','sillas'] │ 1 │
│ ['sofas','tractor-cortacesped'] │ 1 │
│ ['cocina','sofa'] │ 1 │
│ ['sillas','sofa'] │ 1 │
│ ['silla','sofa'] │ 1 │
│ ['puertas','regalo'] │ 1 │
│ ['armario','regalo'] │ 1 │
│ ['regalo','sofa-cama'] │ 1 │
│ ['regalo','sofas'] │ 1 │
│ ['regalo','tractor-cortacesped'] │ 1 │
│ ['cocina','regalo'] │ 1 │
│ ['sofa','tractor-cortacesped'] │ 1 │
│ ['regalo','silla'] │ 1 │
│ ['armario','puertas'] │ 1 │
│ ['puertas','sofa-cama'] │ 1 │
│ ['puertas','sofas'] │ 1 │
│ ['puertas','tractor-cortacesped'] │ 1 │
│ ['cocina','puertas'] │ 1 │
│ ['armario','sofa'] │ 1 │
│ ['puertas','silla'] │ 1 │
│ ['armario','sofa-cama'] │ 1 │
│ ['armario','sofas'] │ 1 │
│ ['armario','tractor-cortacesped'] │ 1 │
│ ['armario','cocina'] │ 1 │
│ ['armario','sillas'] │ 1 │
│ ['armario','silla'] │ 1 │
│ ['puertas','sofa'] │ 1 │
│ ['sofa-cama','tractor-cortacesped'] │ 1 │
│ ['cocina','sofa-cama'] │ 1 │
│ ['sillas','sofa-cama'] │ 1 │
│ ['silla','sofa-cama'] │ 1 │
│ ['regalo','sofa'] │ 1 │
│ ['cocina','sofas'] │ 1 │
│ ['sillas','sofas'] │ 1 │
│ ['silla','sofas'] │ 1 │
│ ['cocina','tractor-cortacesped'] │ 1 │
│ ['sillas','tractor-cortacesped'] │ 1 │
│ ['silla','tractor-cortacesped'] │ 1 │
│ ['cocina','sillas'] │ 1 │
│ ['cocina','silla'] │ 1 │
│ ['regalo','sillas'] │ 1 │
└─────────────────────────────────────┴────────────┘