如何在 Oracle 中解析字符串中以特定子串开头、到第一个空格为止的部分
How to parse a specific part of string that starts with a specific str to the first space in Oracle
我有这样的数据集:
-- Sample data: one row per text_id.
-- The two rows are distinct, so UNION ALL is used to avoid the sort/de-duplication pass that UNION implies.
SELECT
    1 AS text_id,
    'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' AS full_text
FROM DUAL
UNION ALL
SELECT
    2,
    'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .'
FROM DUAL
我需要解析出所有跟在“is”之后的短语;每个短语在遇到第一个空格字符时结束。
所以我想从 full_text
获得的结果是:
text_id
parsed_part
1
A.ACCOUNT_ID
1
B.IDENTITY_NO
1
plate_number
2
ARC.PREV_RECORD
2
ARC.NEXT_RECORD
可能少于或多于 3 个短语,因此结果的行数可能会发生变化。
我尝试先反转文本,再找出“si”和空格之间的部分,但没有成功:
reverse(regexp_substr(reverse(full_text), ' si ([^_]*) ',1, 1))
根据您发布的内容,看看这是否有帮助。
示例数据
SQL> with test as
2 (SELECT -- sample data: one row per text_id
3 1 as text_id,
4 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' as full_text
5 FROM DUAL
6 UNION ALL -- rows are distinct, so no de-duplication pass is needed
7 SELECT
8 2,
9 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .'
10 FROM DUAL
11 )
查询本身
12 select text_id,
13 -- Return capture group 1 (the token after ' is ') directly.
14 -- LTRIM(str, 'is ') strips a character SET, so it would also eat leading
15 -- i/s characters of the token itself (e.g. 'status_id' -> 'tatus_id').
16 regexp_substr(full_text, ' is (\S+)', 1, column_value, null, 1) parsed_part
17 from test cross join
18 -- One generated row per occurrence; the same pattern is used for counting
19 -- and for extracting so the two can never disagree.
20 table(cast(multiset(select level from dual
21 connect by level <= regexp_count(full_text, ' is \S+')
22 ) as sys.odcinumberlist))
23 order by text_id, column_value;
TEXT_ID PARSED_PART
---------- --------------------
1 A.ACCOUNT_ID
1 B.IDENTITY_NO
1 plate_number
2 ARC.PREV_RECORD
2 ARC.NEXT_RECORD
SQL>
regexp_substr
搜索 is
字符串以及紧随其后的一串非空白字符(\S+),例如用点分隔的两个单词
- ltrim 删除前导
is
虽然 Littlefoot 先于我,但这是我的方法:
-- Emit one row per token that follows ' is ' in each full_text.
WITH tbl(text_id, full_text) AS (
  SELECT 1, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' FROM DUAL UNION ALL
  SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
)
SELECT text_id,
       -- Sub-expression 1 is the text between ' is ' and the next space (or end of string).
       REGEXP_SUBSTR(full_text, ' is (.*?)( |$)', 1, LEVEL, NULL, 1) parsed_part
FROM tbl
-- LEVEL runs from 1 up to the number of matches in the current row.
CONNECT BY LEVEL <= REGEXP_COUNT(full_text, ' is .*?( |$)')
       -- Keep each hierarchy inside its own source row.
       AND PRIOR text_id = text_id
       -- A non-deterministic PRIOR expression stops Oracle from reporting a CONNECT BY loop.
       AND PRIOR SYS_GUID() IS NOT NULL
-- Without an explicit ORDER BY the order of the returned rows is not guaranteed.
ORDER BY text_id, LEVEL;
TEXT_ID PARSED_PART
---------- --------------------
1 A.ACCOUNT_ID
1 B.IDENTITY_NO
1 plate_number
2 ARC.PREV_RECORD
2 ARC.NEXT_RECORD
5 rows selected.
对通过 connect by 展开的各个单词进行横向连接(lateral join)。
-- Emit one row per token that follows ' is ' (case-insensitive) in each full_text.
WITH DATA AS (
  SELECT 1 AS textid, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' AS full_text FROM DUAL
  UNION ALL SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
)
SELECT t.textid, w.word
FROM DATA t
CROSS JOIN LATERAL (
  -- [^ ]+ takes everything up to the first space, so tokens containing digits
  -- or other characters are no longer truncated as they were with [A-Z._]+.
  SELECT LEVEL AS lvl, REGEXP_SUBSTR(t.full_text, ' is ([^ ]+)', 1, LEVEL, 'i', 1) AS word
  FROM DUAL
  -- Same pattern for counting and extracting, one generated row per match.
  CONNECT BY LEVEL <= REGEXP_COUNT(t.full_text, ' is ([^ ]+)', 1, 'i')
) w
-- lvl keeps the tokens of one text in their order of appearance.
ORDER BY t.textid, w.lvl;
TEXTID | WORD
-----: | :--------------
1 | A.ACCOUNT_ID
1 | B.IDENTITY_NO
1 | plate_number
2 | ARC.PREV_RECORD
2 | ARC.NEXT_RECORD
db<>fiddle here
您不需要使用(较慢的)正则表达式,可以在递归子查询中使用简单的字符串函数来实现:
-- Walk each full_text with plain string functions: start_pos is the position of
-- a ' is ' marker, end_pos is the position just past the token that follows it.
WITH bounds (text_id, full_text, start_pos, end_pos) AS (
  -- Anchor member: first ' is ' marker of every row that has one.
  SELECT text_id,
         full_text,
         INSTR(full_text, ' is ', 1),
         -- If no space follows the token, it runs to the end of the string:
         -- use LENGTH + 1 instead of 0 so the last token is not dropped.
         COALESCE(
           NULLIF(INSTR(full_text, ' ', INSTR(full_text, ' is ', 1) + 4), 0),
           LENGTH(full_text) + 1
         )
  FROM   table_name
  WHERE  INSTR(full_text, ' is ', 1) > 0
UNION ALL
  -- Recursive member: look for the next marker starting at the space that
  -- ended the previous token (that space may be shared with the next ' is ').
  SELECT text_id,
         full_text,
         INSTR(full_text, ' is ', end_pos),
         COALESCE(
           NULLIF(INSTR(full_text, ' ', INSTR(full_text, ' is ', end_pos) + 4), 0),
           LENGTH(full_text) + 1
         )
  FROM   bounds
  WHERE  start_pos > 0
  AND    end_pos > 0
)
SEARCH DEPTH FIRST BY text_id SET order_rn
SELECT text_id,
       -- Skip the 4 characters of ' is ' and take everything up to end_pos.
       SUBSTR(full_text, start_pos + 4, end_pos - start_pos - 4) AS parsed_part
FROM   bounds
-- The last recursion step of each text has start_pos = 0 (no further marker).
WHERE  start_pos > 0
AND    end_pos > 0
-- order_rn comes from the SEARCH clause; without this ORDER BY the row order is not guaranteed.
ORDER BY order_rn;
其中,对于示例数据:
-- Build the two-row sample table; column types are derived from the SELECT list.
CREATE TABLE table_name (text_id, full_text) AS
  SELECT 1, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' FROM DUAL
  UNION ALL
  SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
输出:
TEXT_ID
PARSED_PART
1
A.ACCOUNT_ID
1
B.IDENTITY_NO
1
plate_number
2
ARC.PREV_RECORD
2
ARC.NEXT_RECORD
db<>fiddle here
我有这样的数据集:
-- Sample data: one row per text_id.
-- The two rows are distinct, so UNION ALL is used to avoid the sort/de-duplication pass that UNION implies.
SELECT
    1 AS text_id,
    'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' AS full_text
FROM DUAL
UNION ALL
SELECT
    2,
    'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .'
FROM DUAL
我需要解析出所有跟在“is”之后的短语;每个短语在遇到第一个空格字符时结束。
所以我想从 full_text
获得的结果是:
text_id | parsed_part |
---|---|
1 | A.ACCOUNT_ID |
1 | B.IDENTITY_NO |
1 | plate_number |
2 | ARC.PREV_RECORD |
2 | ARC.NEXT_RECORD |
可能少于或多于 3 个短语,因此结果的行数可能会发生变化。
我尝试先反转文本,再找出“si”和空格之间的部分,但没有成功:
reverse(regexp_substr(reverse(full_text), ' si ([^_]*) ',1, 1))
根据您发布的内容,看看这是否有帮助。
示例数据
SQL> with test as
2 (SELECT -- sample data: one row per text_id
3 1 as text_id,
4 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' as full_text
5 FROM DUAL
6 UNION ALL -- rows are distinct, so no de-duplication pass is needed
7 SELECT
8 2,
9 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .'
10 FROM DUAL
11 )
查询本身
12 select text_id,
13 -- Return capture group 1 (the token after ' is ') directly.
14 -- LTRIM(str, 'is ') strips a character SET, so it would also eat leading
15 -- i/s characters of the token itself (e.g. 'status_id' -> 'tatus_id').
16 regexp_substr(full_text, ' is (\S+)', 1, column_value, null, 1) parsed_part
17 from test cross join
18 -- One generated row per occurrence; the same pattern is used for counting
19 -- and for extracting so the two can never disagree.
20 table(cast(multiset(select level from dual
21 connect by level <= regexp_count(full_text, ' is \S+')
22 ) as sys.odcinumberlist))
23 order by text_id, column_value;
TEXT_ID PARSED_PART
---------- --------------------
1 A.ACCOUNT_ID
1 B.IDENTITY_NO
1 plate_number
2 ARC.PREV_RECORD
2 ARC.NEXT_RECORD
SQL>
regexp_substr
搜索 is
字符串以及紧随其后的一串非空白字符(\S+),例如用点分隔的两个单词
- ltrim 删除前导
is
虽然 Littlefoot 先于我,但这是我的方法:
-- Emit one row per token that follows ' is ' in each full_text.
WITH tbl(text_id, full_text) AS (
  SELECT 1, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' FROM DUAL UNION ALL
  SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
)
SELECT text_id,
       -- Sub-expression 1 is the text between ' is ' and the next space (or end of string).
       REGEXP_SUBSTR(full_text, ' is (.*?)( |$)', 1, LEVEL, NULL, 1) parsed_part
FROM tbl
-- LEVEL runs from 1 up to the number of matches in the current row.
CONNECT BY LEVEL <= REGEXP_COUNT(full_text, ' is .*?( |$)')
       -- Keep each hierarchy inside its own source row.
       AND PRIOR text_id = text_id
       -- A non-deterministic PRIOR expression stops Oracle from reporting a CONNECT BY loop.
       AND PRIOR SYS_GUID() IS NOT NULL
-- Without an explicit ORDER BY the order of the returned rows is not guaranteed.
ORDER BY text_id, LEVEL;
TEXT_ID PARSED_PART
---------- --------------------
1 A.ACCOUNT_ID
1 B.IDENTITY_NO
1 plate_number
2 ARC.PREV_RECORD
2 ARC.NEXT_RECORD
5 rows selected.
对通过 connect by 展开的各个单词进行横向连接(lateral join)。
-- Emit one row per token that follows ' is ' (case-insensitive) in each full_text.
WITH DATA AS (
  SELECT 1 AS textid, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' AS full_text FROM DUAL
  UNION ALL SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
)
SELECT t.textid, w.word
FROM DATA t
CROSS JOIN LATERAL (
  -- [^ ]+ takes everything up to the first space, so tokens containing digits
  -- or other characters are no longer truncated as they were with [A-Z._]+.
  SELECT LEVEL AS lvl, REGEXP_SUBSTR(t.full_text, ' is ([^ ]+)', 1, LEVEL, 'i', 1) AS word
  FROM DUAL
  -- Same pattern for counting and extracting, one generated row per match.
  CONNECT BY LEVEL <= REGEXP_COUNT(t.full_text, ' is ([^ ]+)', 1, 'i')
) w
-- lvl keeps the tokens of one text in their order of appearance.
ORDER BY t.textid, w.lvl;
TEXTID | WORD
-----: | :--------------
1 | A.ACCOUNT_ID
1 | B.IDENTITY_NO
1 | plate_number
2 | ARC.PREV_RECORD
2 | ARC.NEXT_RECORD
db<>fiddle here
您不需要使用(较慢的)正则表达式,可以在递归子查询中使用简单的字符串函数来实现:
-- Walk each full_text with plain string functions: start_pos is the position of
-- a ' is ' marker, end_pos is the position just past the token that follows it.
WITH bounds (text_id, full_text, start_pos, end_pos) AS (
  -- Anchor member: first ' is ' marker of every row that has one.
  SELECT text_id,
         full_text,
         INSTR(full_text, ' is ', 1),
         -- If no space follows the token, it runs to the end of the string:
         -- use LENGTH + 1 instead of 0 so the last token is not dropped.
         COALESCE(
           NULLIF(INSTR(full_text, ' ', INSTR(full_text, ' is ', 1) + 4), 0),
           LENGTH(full_text) + 1
         )
  FROM   table_name
  WHERE  INSTR(full_text, ' is ', 1) > 0
UNION ALL
  -- Recursive member: look for the next marker starting at the space that
  -- ended the previous token (that space may be shared with the next ' is ').
  SELECT text_id,
         full_text,
         INSTR(full_text, ' is ', end_pos),
         COALESCE(
           NULLIF(INSTR(full_text, ' ', INSTR(full_text, ' is ', end_pos) + 4), 0),
           LENGTH(full_text) + 1
         )
  FROM   bounds
  WHERE  start_pos > 0
  AND    end_pos > 0
)
SEARCH DEPTH FIRST BY text_id SET order_rn
SELECT text_id,
       -- Skip the 4 characters of ' is ' and take everything up to end_pos.
       SUBSTR(full_text, start_pos + 4, end_pos - start_pos - 4) AS parsed_part
FROM   bounds
-- The last recursion step of each text has start_pos = 0 (no further marker).
WHERE  start_pos > 0
AND    end_pos > 0
-- order_rn comes from the SEARCH clause; without this ORDER BY the row order is not guaranteed.
ORDER BY order_rn;
其中,对于示例数据:
-- Build the two-row sample table; column types are derived from the SELECT list.
CREATE TABLE table_name (text_id, full_text) AS
  SELECT 1, 'The first is A.ACCOUNT_ID and the second one is B.IDENTITY_NO and third one is plate_number .' FROM DUAL
  UNION ALL
  SELECT 2, 'The first is ARC.PREV_RECORD and the second one is ARC.NEXT_RECORD .' FROM DUAL
输出:
TEXT_ID | PARSED_PART |
---|---|
1 | A.ACCOUNT_ID |
1 | B.IDENTITY_NO |
1 | plate_number |
2 | ARC.PREV_RECORD |
2 | ARC.NEXT_RECORD |
db<>fiddle here