Oracle - 用正则表达式列表替换记录的子字符串
Oracle - Substitute substrings of records by a list of regular expressions
我想清理一张表中的字母数字(varchar2)记录。需要检查每条记录是否包含一组“脏词”中的某个词,如果包含,就将其替换。匹配模式及其替换内容存储在另一张单独的表中。
示例:
-- Sample data for the question.
-- to_clean holds the texts to be cleaned, one row per text.
create table to_clean (
    text_id    number,
    dirty_text varchar2(4000)
);

-- Explicit column lists keep these inserts valid if columns are later added or reordered.
insert into to_clean (text_id, dirty_text) values (1, 'hello this is my dirtyword1 text.');
insert into to_clean (text_id, dirty_text) values (2, 'hello this is my dirtyword3 text.');
insert into to_clean (text_id, dirty_text) values (3, 'hello this is my dirtyword2 dirtyword1 text.');

-- regex_list holds the cleaning rules: a pattern and the text that replaces it.
create table regex_list (
    pattern     varchar2(400),
    replacement varchar2(400)
);

insert into regex_list (pattern, replacement) values ('dirtyword1', 'clean1');
insert into regex_list (pattern, replacement) values ('dirtyword2', ' '); -- remove totally (replaced by a single space)
insert into regex_list (pattern, replacement) values ('dirtyword3', 'clean3');
伪代码:
for each dirty_text in to_clean
    for pattern, replacement in regex_list
        dirty_text = regex_replace(dirty_text, pattern, replacement)
在 Oracle 中,解决这个问题最高效的方法是什么?regex_list 中的模式既有正则表达式,也有纯字符串。
我只想替换完整的单词,而不是单词的一部分(也就是说,单词之间以空格分隔)。
试试这个:
-- Clean to_clean.dirty_text in place by applying every rule in regex_list.
-- One set-based UPDATE runs per rule, so a text that contains several
-- patterns gets all of them replaced, one after another.
-- A single correlated scalar subquery cannot do this: a text containing more
-- than one pattern makes the subquery return several rows (ORA-01427), and
-- texts without any match would be overwritten with NULL.
BEGIN
    -- ORDER BY makes the order in which rules are applied deterministic.
    FOR cur_rule IN (
        SELECT pattern, replacement
        FROM regex_list
        ORDER BY pattern, replacement
    )
    LOOP
        UPDATE to_clean
        SET dirty_text = REGEXP_REPLACE (dirty_text, cur_rule.pattern, cur_rule.replacement)
        -- Only touch rows that actually contain the pattern as a plain substring.
        WHERE INSTR (dirty_text, cur_rule.pattern) <> 0;
    END LOOP;
END;
/
如果模式是正则表达式而不是普通单词,INSTR 就不起作用了,这时请改用 REGEXP_LIKE(正如 Justin Cave 所说):
-- Return the cleaned text (column "replaced") for every to_clean row that
-- matches at least one regex_list pattern, with ALL rules applied in turn.
-- Uses recursive subquery factoring (Oracle 11gR2 or later): step 0 is the
-- raw text, step k is the text after rule number k has been applied.
-- A scalar subquery that picks "the" matching pattern fails with ORA-01427
-- as soon as one text matches more than one pattern.
WITH numbered_rule AS (
    -- Number the rules so they can be applied one per recursion step.
    SELECT
        pattern,
        replacement,
        ROW_NUMBER() OVER (ORDER BY pattern, replacement) AS rule_no
    FROM regex_list
),
cleaning_step (text_id, replaced, rule_no) AS (
    SELECT
        t.text_id,
        t.dirty_text,
        0
    FROM to_clean t
    WHERE EXISTS (
        SELECT 1
        FROM regex_list m
        WHERE REGEXP_LIKE (t.dirty_text, m.pattern)
    )
    UNION ALL
    SELECT
        s.text_id,
        REGEXP_REPLACE (s.replaced, r.pattern, r.replacement),
        r.rule_no
    FROM cleaning_step s
    INNER JOIN numbered_rule r
        ON r.rule_no = s.rule_no + 1
)
SELECT replaced
FROM cleaning_step
-- Keep only the final step, i.e. the text after the last rule was applied.
WHERE rule_no = (SELECT COUNT(*) FROM regex_list);
编辑:
在这种情况下,您可以使用 PL/SQL。请看下面的例子:
--Create oracle objects
-- clean_o is the row shape piped out by clean_text: a text id and its text.
-- dirty_text is as wide as to_clean.dirty_text (varchar2(4000)) so that long
-- texts fit into the object attribute.
create or replace type clean_o as object (
    text_id    number,
    dirty_text varchar2(4000)
);
/
-- clean_t is the collection type returned by the pipelined function clean_text.
create or replace type clean_t as table of clean_o;
/
--Function
-- Pipelined table function: for each row of to_clean, applies every
-- regex_list rule to dirty_text with REGEXP_REPLACE and pipes one clean_o row
-- (text_id, cleaned text). The tables themselves are not modified.
create or replace function clean_text return clean_t pipelined is
    type rule_list is table of regex_list%rowtype;
    -- Rules are loaded once instead of being re-queried for every text row.
    rule_tab     rule_list;
    -- Sized from the source column; a fixed varchar2(250) buffer raises
    -- ORA-06502 for longer texts.
    cleaned_text to_clean.dirty_text%type;
begin
    -- ORDER BY makes the order in which rules are applied deterministic.
    select pattern, replacement
    bulk collect into rule_tab
    from regex_list
    order by pattern, replacement;

    for src in (select text_id, dirty_text from to_clean) loop
        cleaned_text := src.dirty_text;
        -- Each rule works on the output of the previous one.
        for i in 1 .. rule_tab.count loop
            cleaned_text := regexp_replace(cleaned_text, rule_tab(i).pattern, rule_tab(i).replacement);
        end loop;
        pipe row (clean_o(src.text_id, cleaned_text));
    end loop;
    -- A pipelined function ends with a RETURN that carries no value.
    return;
end;
/
现在您可以这样做了:
-- Query the pipelined function like a table; columns are the clean_o attributes.
select text_id, dirty_text from table(clean_text);
我想清理一张表中的字母数字(varchar2)记录。需要检查每条记录是否包含一组“脏词”中的某个词,如果包含,就将其替换。匹配模式及其替换内容存储在另一张单独的表中。
示例:
-- Sample data for the question.
-- to_clean holds the texts to be cleaned, one row per text.
create table to_clean (
    text_id    number,
    dirty_text varchar2(4000)
);

-- Explicit column lists keep these inserts valid if columns are later added or reordered.
insert into to_clean (text_id, dirty_text) values (1, 'hello this is my dirtyword1 text.');
insert into to_clean (text_id, dirty_text) values (2, 'hello this is my dirtyword3 text.');
insert into to_clean (text_id, dirty_text) values (3, 'hello this is my dirtyword2 dirtyword1 text.');

-- regex_list holds the cleaning rules: a pattern and the text that replaces it.
create table regex_list (
    pattern     varchar2(400),
    replacement varchar2(400)
);

insert into regex_list (pattern, replacement) values ('dirtyword1', 'clean1');
insert into regex_list (pattern, replacement) values ('dirtyword2', ' '); -- remove totally (replaced by a single space)
insert into regex_list (pattern, replacement) values ('dirtyword3', 'clean3');
伪代码:
for each dirty_text in to_clean
    for pattern, replacement in regex_list
        dirty_text = regex_replace(dirty_text, pattern, replacement)
在 Oracle 中,解决这个问题最高效的方法是什么?regex_list 中的模式既有正则表达式,也有纯字符串。
我只想替换完整的单词,而不是单词的一部分(也就是说,单词之间以空格分隔)。
试试这个:
-- Clean to_clean.dirty_text in place by applying every rule in regex_list.
-- One set-based UPDATE runs per rule, so a text that contains several
-- patterns gets all of them replaced, one after another.
-- A single correlated scalar subquery cannot do this: a text containing more
-- than one pattern makes the subquery return several rows (ORA-01427), and
-- texts without any match would be overwritten with NULL.
BEGIN
    -- ORDER BY makes the order in which rules are applied deterministic.
    FOR cur_rule IN (
        SELECT pattern, replacement
        FROM regex_list
        ORDER BY pattern, replacement
    )
    LOOP
        UPDATE to_clean
        SET dirty_text = REGEXP_REPLACE (dirty_text, cur_rule.pattern, cur_rule.replacement)
        -- Only touch rows that actually contain the pattern as a plain substring.
        WHERE INSTR (dirty_text, cur_rule.pattern) <> 0;
    END LOOP;
END;
/
如果模式是正则表达式而不是普通单词,INSTR 就不起作用了,这时请改用 REGEXP_LIKE(正如 Justin Cave 所说):
-- Return the cleaned text (column "replaced") for every to_clean row that
-- matches at least one regex_list pattern, with ALL rules applied in turn.
-- Uses recursive subquery factoring (Oracle 11gR2 or later): step 0 is the
-- raw text, step k is the text after rule number k has been applied.
-- A scalar subquery that picks "the" matching pattern fails with ORA-01427
-- as soon as one text matches more than one pattern.
WITH numbered_rule AS (
    -- Number the rules so they can be applied one per recursion step.
    SELECT
        pattern,
        replacement,
        ROW_NUMBER() OVER (ORDER BY pattern, replacement) AS rule_no
    FROM regex_list
),
cleaning_step (text_id, replaced, rule_no) AS (
    SELECT
        t.text_id,
        t.dirty_text,
        0
    FROM to_clean t
    WHERE EXISTS (
        SELECT 1
        FROM regex_list m
        WHERE REGEXP_LIKE (t.dirty_text, m.pattern)
    )
    UNION ALL
    SELECT
        s.text_id,
        REGEXP_REPLACE (s.replaced, r.pattern, r.replacement),
        r.rule_no
    FROM cleaning_step s
    INNER JOIN numbered_rule r
        ON r.rule_no = s.rule_no + 1
)
SELECT replaced
FROM cleaning_step
-- Keep only the final step, i.e. the text after the last rule was applied.
WHERE rule_no = (SELECT COUNT(*) FROM regex_list);
编辑:
在这种情况下,您可以使用 PL/SQL。请看下面的例子:
--Create oracle objects
-- clean_o is the row shape piped out by clean_text: a text id and its text.
-- dirty_text is as wide as to_clean.dirty_text (varchar2(4000)) so that long
-- texts fit into the object attribute.
create or replace type clean_o as object (
    text_id    number,
    dirty_text varchar2(4000)
);
/
-- clean_t is the collection type returned by the pipelined function clean_text.
create or replace type clean_t as table of clean_o;
/
--Function
-- Pipelined table function: for each row of to_clean, applies every
-- regex_list rule to dirty_text with REGEXP_REPLACE and pipes one clean_o row
-- (text_id, cleaned text). The tables themselves are not modified.
create or replace function clean_text return clean_t pipelined is
    type rule_list is table of regex_list%rowtype;
    -- Rules are loaded once instead of being re-queried for every text row.
    rule_tab     rule_list;
    -- Sized from the source column; a fixed varchar2(250) buffer raises
    -- ORA-06502 for longer texts.
    cleaned_text to_clean.dirty_text%type;
begin
    -- ORDER BY makes the order in which rules are applied deterministic.
    select pattern, replacement
    bulk collect into rule_tab
    from regex_list
    order by pattern, replacement;

    for src in (select text_id, dirty_text from to_clean) loop
        cleaned_text := src.dirty_text;
        -- Each rule works on the output of the previous one.
        for i in 1 .. rule_tab.count loop
            cleaned_text := regexp_replace(cleaned_text, rule_tab(i).pattern, rule_tab(i).replacement);
        end loop;
        pipe row (clean_o(src.text_id, cleaned_text));
    end loop;
    -- A pipelined function ends with a RETURN that carries no value.
    return;
end;
/
现在您可以这样做了:
-- Query the pipelined function like a table; columns are the clean_o attributes.
select text_id, dirty_text from table(clean_text);