在 Oracle 中聚合 clob 值的最有效方法
Most effective way to aggregate clob value in Oracle
我有一个类似这样的表:
SOME_ID FIRST_CLOB ANOTHER_CLOB
0 1.5|3.6|0.3 5.5|9.6
1 0.2 4.0|7.2
2 3.1|0.7|1.2 9.2|8.8|6.3
clob 的长度未定义。它可能(实际上会)比示例中的长得多。
我需要这样的东西:
SOME_ID FIRST_CLOB_MEAN ANOTHER_CLOB_MEAN
0 1.8 7.55
1 0.2 5.6
2 1.66 8.1
它不仅可以是平均值,还可以是计数。现在我们是在 pandas 中处理数据时计算它,但是加载带有 clob 值的 table 需要很长时间。
我只看到一种方法:在输入上创建一个触发器,它将:
1. 将每个 CLOB 字段拆分成如下形式:
CLOB
1.5
3.6
0.3
2. 计算平均值
我不确定这是否是最好的方法,也不确定如果我从 csv 导入 table 它是否仍然有效。如能提供任何帮助,将不胜感激。
P.S. 我还有另一个解决方案:只拆分 CLOB,而不做聚合,像这样:
NEW_ID SOME_ID FIRST_CLOB ANOTHER_CLOB
0 0 1.5 5.5
1 0 3.6 9.6
2 0 0.3 NULL
3 1 0.2 4.0
4 1 NULL 7.2
5 2 3.1 9.2
6 2 0.7 8.8
7 2 1.2 6.3
这样加载到 pandas 会更快,但是 table 会大大增大(一个 CLOB 可以包含 10、25、50、100 个项目),这同样是不可取的。
您可以使用:
-- Averages the pipe-delimited numbers held in FIRST_CLOB and ANOTHER_CLOB,
-- one result row per SOME_ID.
--
-- token_bounds walks both CLOBs in lockstep, one token per recursion level:
--   *_from = position of the current token's first character (0 = list exhausted)
--   *_to   = position of the '|' that ends the token (0 = no further delimiter)
WITH token_bounds ( some_id, first_clob, first_from, first_to, another_clob, another_from, another_to ) AS (
  -- Anchor member: the first token starts at position 1.
  SELECT some_id,
         first_clob,
         1,
         INSTR( first_clob, '|' ),
         another_clob,
         1,
         INSTR( another_clob, '|' )
  FROM   table_name
  UNION ALL
  -- Recursive member: step past the previous delimiter; a list that has
  -- already run out of delimiters is marked with 0 / 0.
  SELECT some_id,
         first_clob,
         CASE WHEN first_to = 0 THEN 0 ELSE first_to + 1 END,
         CASE WHEN first_to = 0 THEN 0 ELSE INSTR( first_clob, '|', first_to + 1 ) END,
         another_clob,
         CASE WHEN another_to = 0 THEN 0 ELSE another_to + 1 END,
         CASE WHEN another_to = 0 THEN 0 ELSE INSTR( another_clob, '|', another_to + 1 ) END
  FROM   token_bounds
  -- Keep recursing while at least one of the two lists still has a delimiter.
  WHERE  first_to > 0
  OR     another_to > 0
),
-- Cut the current token out of each CLOB; an exhausted list yields NULL,
-- which AVG ignores.
tokens AS (
  SELECT some_id,
         CASE
           WHEN first_from = 0 THEN NULL
           WHEN first_to = 0 THEN SUBSTR( first_clob, first_from )
           ELSE SUBSTR( first_clob, first_from, first_to - first_from )
         END AS first_token,
         CASE
           WHEN another_from = 0 THEN NULL
           WHEN another_to = 0 THEN SUBSTR( another_clob, another_from )
           ELSE SUBSTR( another_clob, another_from, another_to - another_from )
         END AS another_token
  FROM   token_bounds
)
SELECT   some_id,
         AVG( TO_NUMBER( first_token ) ) AS first_clob_mean,
         AVG( TO_NUMBER( another_token ) ) AS another_clob_mean
FROM     tokens
GROUP BY some_id
ORDER BY some_id
或
-- Per-row average of the pipe-delimited numbers in each CLOB.
-- Each CLOB is rewritten from  a|b|c  to  "a","b","c"  and passed to XMLTABLE;
-- the items are then read back through COLUMN_VALUE and averaged.
SELECT   t.some_id,
         f.first_clob_mean,
         a.another_clob_mean
FROM     table_name t
         CROSS APPLY (
           SELECT AVG( TO_NUMBER( column_value ) ) AS first_clob_mean
           FROM   XMLTABLE( ( '"' || REPLACE( t.first_clob, '|', '","' ) || '"' ) )
         ) f
         CROSS APPLY (
           SELECT AVG( TO_NUMBER( column_value ) ) AS another_clob_mean
           FROM   XMLTABLE( ( '"' || REPLACE( t.another_clob, '|', '","' ) || '"' ) )
         ) a
-- Sort explicitly: without ORDER BY the order of the result rows is not guaranteed.
ORDER BY t.some_id
其中,对于示例数据:
-- Sample data: three rows of pipe-delimited numeric lists, loaded in one statement.
INSERT ALL
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 0, '1.5|3.6|0.3', '5.5|9.6' )
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 1, '0.2', '4.0|7.2' )
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 2, '3.1|0.7|1.2', '9.2|8.8|6.3' )
SELECT 1 FROM DUAL;
-- Builds a long pipe-delimited CLOB ('1|2|...|4000') and stores it as row 4,
-- so the queries are exercised with a value far longer than the short samples.
DECLARE
  v_clob CLOB;
BEGIN
  FOR i IN 1 .. 4000 LOOP
    -- Put a delimiter in front of every element except the first.
    IF v_clob IS NOT NULL THEN
      v_clob := v_clob || '|';
    END IF;
    v_clob := v_clob || i;
  END LOOP;
  -- Name the target columns so the insert does not depend on the table's column order.
  INSERT INTO table_name ( some_id, first_clob, another_clob )
  VALUES ( 4, v_clob, NULL );
END;
/
两个查询都输出:
SOME_ID | FIRST_CLOB_MEAN | ANOTHER_CLOB_MEAN
------: | ---------------------------------------: | ----------------:
0 | 1.8 | 7.55
1 | .2 | 5.6
2 | 1.66666666666666666666666666666666666667 | 8.1
4 | 2000.5 | null
db<>fiddle here
像这样的查询可用于通过竖线分隔符拆分 CLOB,然后取平均值。每个 CLOB 都需要拆分成自己的 sub-query,因此 CLOB 中的数字计数不会影响同一行中另一个 CLOB 中的数字计数。
-- Per-row average of the pipe-delimited numbers in each CLOB of CLOBS.
-- Each CLOB is split in its own subquery so that the number of items in one
-- CLOB cannot affect the rows generated for the other.
WITH first_means AS (
    SELECT   some_id,
             AVG( TO_NUMBER( REGEXP_SUBSTR( first_clob, '[^|]+', 1, LEVEL ) ) ) AS first_clob_mean
    FROM     clobs
    -- One level per item: number of delimiters + 1.
    CONNECT BY LEVEL <= REGEXP_COUNT( first_clob, '\|' ) + 1
           -- A value that differs on every call, so no cycle is detected.
           AND PRIOR SYS_GUID() IS NOT NULL
           -- Keep each row's expansion within its own SOME_ID.
           AND PRIOR some_id = some_id
    GROUP BY some_id
),
another_means AS (
    SELECT   some_id,
             AVG( TO_NUMBER( REGEXP_SUBSTR( another_clob, '[^|]+', 1, LEVEL ) ) ) AS another_clob_mean
    FROM     clobs
    CONNECT BY LEVEL <= REGEXP_COUNT( another_clob, '\|' ) + 1
           AND PRIOR SYS_GUID() IS NOT NULL
           AND PRIOR some_id = some_id
    GROUP BY some_id
)
SELECT   c1.some_id,
         c1.first_clob_mean,
         c2.another_clob_mean
FROM     first_means c1
         INNER JOIN another_means c2
             ON c2.some_id = c1.some_id
-- Qualified, because both row sources expose SOME_ID.
ORDER BY c1.some_id;
我有一个类似这样的表:
SOME_ID FIRST_CLOB ANOTHER_CLOB
0 1.5|3.6|0.3 5.5|9.6
1 0.2 4.0|7.2
2 3.1|0.7|1.2 9.2|8.8|6.3
clob 的长度未定义。它可能(实际上会)比示例中的长得多。
我需要这样的东西:
SOME_ID FIRST_CLOB_MEAN ANOTHER_CLOB_MEAN
0 1.8 7.55
1 0.2 5.6
2 1.66 8.1
它不仅可以是平均值,还可以是计数。现在我们是在 pandas 中处理数据时计算它,但是加载带有 clob 值的 table 需要很长时间。
我只看到一种方法:在输入上创建一个触发器,它将:
1. 将每个 CLOB 字段拆分成如下形式:
CLOB
1.5
3.6
0.3
2. 计算平均值
我不确定这是否是最好的方法,也不确定如果我从 csv 导入 table 它是否仍然有效。如能提供任何帮助,将不胜感激。
P.S. 我还有另一个解决方案:只拆分 CLOB,而不做聚合,像这样:
NEW_ID SOME_ID FIRST_CLOB ANOTHER_CLOB
0 0 1.5 5.5
1 0 3.6 9.6
2 0 0.3 NULL
3 1 0.2 4.0
4 1 NULL 7.2
5 2 3.1 9.2
6 2 0.7 8.8
7 2 1.2 6.3
这样加载到 pandas 会更快,但是 table 会大大增大(一个 CLOB 可以包含 10、25、50、100 个项目),这同样是不可取的。
您可以使用:
-- Averages the pipe-delimited numbers held in FIRST_CLOB and ANOTHER_CLOB,
-- one result row per SOME_ID.
--
-- token_bounds walks both CLOBs in lockstep, one token per recursion level:
--   *_from = position of the current token's first character (0 = list exhausted)
--   *_to   = position of the '|' that ends the token (0 = no further delimiter)
WITH token_bounds ( some_id, first_clob, first_from, first_to, another_clob, another_from, another_to ) AS (
  -- Anchor member: the first token starts at position 1.
  SELECT some_id,
         first_clob,
         1,
         INSTR( first_clob, '|' ),
         another_clob,
         1,
         INSTR( another_clob, '|' )
  FROM   table_name
  UNION ALL
  -- Recursive member: step past the previous delimiter; a list that has
  -- already run out of delimiters is marked with 0 / 0.
  SELECT some_id,
         first_clob,
         CASE WHEN first_to = 0 THEN 0 ELSE first_to + 1 END,
         CASE WHEN first_to = 0 THEN 0 ELSE INSTR( first_clob, '|', first_to + 1 ) END,
         another_clob,
         CASE WHEN another_to = 0 THEN 0 ELSE another_to + 1 END,
         CASE WHEN another_to = 0 THEN 0 ELSE INSTR( another_clob, '|', another_to + 1 ) END
  FROM   token_bounds
  -- Keep recursing while at least one of the two lists still has a delimiter.
  WHERE  first_to > 0
  OR     another_to > 0
),
-- Cut the current token out of each CLOB; an exhausted list yields NULL,
-- which AVG ignores.
tokens AS (
  SELECT some_id,
         CASE
           WHEN first_from = 0 THEN NULL
           WHEN first_to = 0 THEN SUBSTR( first_clob, first_from )
           ELSE SUBSTR( first_clob, first_from, first_to - first_from )
         END AS first_token,
         CASE
           WHEN another_from = 0 THEN NULL
           WHEN another_to = 0 THEN SUBSTR( another_clob, another_from )
           ELSE SUBSTR( another_clob, another_from, another_to - another_from )
         END AS another_token
  FROM   token_bounds
)
SELECT   some_id,
         AVG( TO_NUMBER( first_token ) ) AS first_clob_mean,
         AVG( TO_NUMBER( another_token ) ) AS another_clob_mean
FROM     tokens
GROUP BY some_id
ORDER BY some_id
或
-- Per-row average of the pipe-delimited numbers in each CLOB.
-- Each CLOB is rewritten from  a|b|c  to  "a","b","c"  and passed to XMLTABLE;
-- the items are then read back through COLUMN_VALUE and averaged.
SELECT   t.some_id,
         f.first_clob_mean,
         a.another_clob_mean
FROM     table_name t
         CROSS APPLY (
           SELECT AVG( TO_NUMBER( column_value ) ) AS first_clob_mean
           FROM   XMLTABLE( ( '"' || REPLACE( t.first_clob, '|', '","' ) || '"' ) )
         ) f
         CROSS APPLY (
           SELECT AVG( TO_NUMBER( column_value ) ) AS another_clob_mean
           FROM   XMLTABLE( ( '"' || REPLACE( t.another_clob, '|', '","' ) || '"' ) )
         ) a
-- Sort explicitly: without ORDER BY the order of the result rows is not guaranteed.
ORDER BY t.some_id
其中,对于示例数据:
-- Sample data: three rows of pipe-delimited numeric lists, loaded in one statement.
INSERT ALL
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 0, '1.5|3.6|0.3', '5.5|9.6' )
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 1, '0.2', '4.0|7.2' )
  INTO table_name ( some_id, first_clob, another_clob ) VALUES ( 2, '3.1|0.7|1.2', '9.2|8.8|6.3' )
SELECT 1 FROM DUAL;
-- Builds a long pipe-delimited CLOB ('1|2|...|4000') and stores it as row 4,
-- so the queries are exercised with a value far longer than the short samples.
DECLARE
  v_clob CLOB;
BEGIN
  FOR i IN 1 .. 4000 LOOP
    -- Put a delimiter in front of every element except the first.
    IF v_clob IS NOT NULL THEN
      v_clob := v_clob || '|';
    END IF;
    v_clob := v_clob || i;
  END LOOP;
  -- Name the target columns so the insert does not depend on the table's column order.
  INSERT INTO table_name ( some_id, first_clob, another_clob )
  VALUES ( 4, v_clob, NULL );
END;
/
两个查询都输出:
SOME_ID | FIRST_CLOB_MEAN | ANOTHER_CLOB_MEAN
------: | ---------------------------------------: | ----------------:
0 | 1.8 | 7.55
1 | .2 | 5.6
2 | 1.66666666666666666666666666666666666667 | 8.1
4 | 2000.5 | null
db<>fiddle here
像这样的查询可用于通过竖线分隔符拆分 CLOB,然后取平均值。每个 CLOB 都需要拆分成自己的 sub-query,因此 CLOB 中的数字计数不会影响同一行中另一个 CLOB 中的数字计数。
-- Per-row average of the pipe-delimited numbers in each CLOB of CLOBS.
-- Each CLOB is split in its own subquery so that the number of items in one
-- CLOB cannot affect the rows generated for the other.
WITH first_means AS (
    SELECT   some_id,
             AVG( TO_NUMBER( REGEXP_SUBSTR( first_clob, '[^|]+', 1, LEVEL ) ) ) AS first_clob_mean
    FROM     clobs
    -- One level per item: number of delimiters + 1.
    CONNECT BY LEVEL <= REGEXP_COUNT( first_clob, '\|' ) + 1
           -- A value that differs on every call, so no cycle is detected.
           AND PRIOR SYS_GUID() IS NOT NULL
           -- Keep each row's expansion within its own SOME_ID.
           AND PRIOR some_id = some_id
    GROUP BY some_id
),
another_means AS (
    SELECT   some_id,
             AVG( TO_NUMBER( REGEXP_SUBSTR( another_clob, '[^|]+', 1, LEVEL ) ) ) AS another_clob_mean
    FROM     clobs
    CONNECT BY LEVEL <= REGEXP_COUNT( another_clob, '\|' ) + 1
           AND PRIOR SYS_GUID() IS NOT NULL
           AND PRIOR some_id = some_id
    GROUP BY some_id
)
SELECT   c1.some_id,
         c1.first_clob_mean,
         c2.another_clob_mean
FROM     first_means c1
         INNER JOIN another_means c2
             ON c2.some_id = c1.some_id
-- Qualified, because both row sources expose SOME_ID.
ORDER BY c1.some_id;