在 oracle sql 中使用 regexp_replace 标准化地址
Standardizing addresses using regexp_replace in oracle sql
非常感谢大家的帮助。
我们正在对客户的地址进行标准化。
我有一个主要客户table,其中包含客户数据,要标准化的地址字段。
我有一个映射 table,其中包含 from_str 到 to_str 值的映射。
我需要将对应于from_str的to_str获取到地址数据中。
如果标准化后地址的长度超过35个字符,则从前面的space(' ')字符串的右边到末尾,被创建为一个单独的字段address2
你能帮忙 sql 或 pl/sql 吗?使用 Oracle 12c 数据库。
我在下面编写的代码不适用于所有 from_str 值...仅适用于前 2 行。
感谢任何帮助。
到目前为止代码:
with addresses as
(
select cust_id,address addr from
(
select 10 cust_id,'9 Help Street, Level 4' address from dual union all
select 11 cust_id,'22 Victoria Street' address from dual union all
select 12 cust_id,'1495 Franklin Str' address from dual union all
select 13 cust_id,'30 Hasivim St.,Petah-Tikva' address from dual union all
select 14 cust_id,'2 Jakaranda St' address from dual union all
select 15 cust_id,'61, Science Park Rd' address from dual union all
select 16 cust_id,'61, Social park road' address from dual union all
select 17 cust_id,'Av. Hermanos Escobar 5756' address from dual union all
select 18 cust_id,'Ave. Hermanos Escobar 5756' address from dual union all
select 19 cust_id,'8000 W FLORISSANT AVE' address from dual union all
select 20 cust_id,'8600 MEMORIAL PKWY SW' address from dual union all
select 21 cust_id,'8200 FLORISSANTMEMORIALWAYABOVE SW' address from dual union all
select 22 cust_id,'8600 MEMORIALFLORISSANT PKWY SW' address from dual
) t1
),
replacements as
(
select id,to_str,from_string from_str from
(
select 1 id,'St' to_str,'Street' from_string from dual union all
select 2 id,'St' to_str,'St.' from_string from dual union all
select 3 id,'St' to_str,'Str' from_string from dual union all
select 4 id,'St' to_str,'St' from_string from dual union all
select 5 id,'Rd' to_str,'Rd.' from_string from dual union all
select 6 id,'Rd' to_str,'road' from_string from dual union all
select 7 id,'Av' to_str,'Av.' from_string from dual union all
select 8 id,'Av' to_str,'Ave.' from_string from dual union all
select 9 id,'Av' to_str,'Avenue' from_string from dual union all
select 10 id,'Av' to_str,'Aven.' from_string from dual union all
select 11 id,'West' to_str,'W' from_string from dual union all
select 12 id,'South West' to_str,'SW.' from_string from dual
) t2
),
r(addr,test_addr,l) as
(
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
id - 1
from
addresses,
replacements
where id = (select count(*) from replacements)
union all
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
l - 1
from r,
replacements
where id = l
)
select addr,test_addr,l
from r
where l=0
;
预期输出:
cust_id address
10 9 Help St, Level 4
11 22 Victoria St
12 1495 Franklin St
13 30 Hasivim St ,Petah-Tikva
14 2 Jakaranda St
15 61, Science Park Rd
16 61, Social park Rd
17 Av Hermanos Escobar 5756
18 Av Hermanos Escobar 5756
19 8000 West FLORISSANT Ave
20 8600 MEMORIAL PKWY South West
如果地址长度超过 35 个字符,则预期输出为:
cust_id address address2
21 8200 FLORISSANTMEMORIALWAYABOVE South West
22 8600 MEMORIALFLORISSANT PKWY South West
这是一种应用多个替换操作的...有趣的方式。所以 - 正如您提到的,您有 2 个问题。对于第一个,CTE 的递归部分是在 addr
上执行 regexp_replace()
而不是 test_addr
(上一个递归步骤的修改输出)。因此,只会应用列表中的最后一条规则。
r(addr,test_addr, l) as
(
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
id - 1
from
addresses,
replacements
where id = (select count(*) from replacements)
union all
-- if you do regexp_replace on addr, it throws out the previous replace (which is in r.test_addr)
select addr,regexp_replace(test_addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
l - 1
from r,
replacements
where id = l
)
对于“超过 35 个字符”的问题,我建议使用 substr/instr - 虽然难以阅读,但它们通常很快。
select addr,test_addr,l,
case when length(test_addr) > 35 then
substr(test_addr, 1, instr(substr(test_addr,1,35), ' ', -1))
else test_addr
end as addr1,
case when length(test_addr) > 35 then
substr(test_addr, instr(substr(test_addr,1,35), ' ', -1))
else null
end as addr2
from r
where l=0
;
可能有更优雅的方式来完成这部分,这只是我想到的第一件事。
非常感谢大家的帮助。
我们正在对客户的地址进行标准化。
我有一个主要客户table,其中包含客户数据,要标准化的地址字段。
我有一个映射 table,其中包含 from_str 到 to_str 值的映射。
我需要将对应于from_str的to_str获取到地址数据中。
如果标准化后地址的长度超过35个字符,则从前面的space(' ')字符串的右边到末尾,被创建为一个单独的字段address2
你能帮忙 sql 或 pl/sql 吗?使用 Oracle 12c 数据库。
我在下面编写的代码不适用于所有 from_str 值...仅适用于前 2 行。
感谢任何帮助。
到目前为止代码:
with addresses as
(
select cust_id,address addr from
(
select 10 cust_id,'9 Help Street, Level 4' address from dual union all
select 11 cust_id,'22 Victoria Street' address from dual union all
select 12 cust_id,'1495 Franklin Str' address from dual union all
select 13 cust_id,'30 Hasivim St.,Petah-Tikva' address from dual union all
select 14 cust_id,'2 Jakaranda St' address from dual union all
select 15 cust_id,'61, Science Park Rd' address from dual union all
select 16 cust_id,'61, Social park road' address from dual union all
select 17 cust_id,'Av. Hermanos Escobar 5756' address from dual union all
select 18 cust_id,'Ave. Hermanos Escobar 5756' address from dual union all
select 19 cust_id,'8000 W FLORISSANT AVE' address from dual union all
select 20 cust_id,'8600 MEMORIAL PKWY SW' address from dual union all
select 21 cust_id,'8200 FLORISSANTMEMORIALWAYABOVE SW' address from dual union all
select 22 cust_id,'8600 MEMORIALFLORISSANT PKWY SW' address from dual
) t1
),
replacements as
(
select id,to_str,from_string from_str from
(
select 1 id,'St' to_str,'Street' from_string from dual union all
select 2 id,'St' to_str,'St.' from_string from dual union all
select 3 id,'St' to_str,'Str' from_string from dual union all
select 4 id,'St' to_str,'St' from_string from dual union all
select 5 id,'Rd' to_str,'Rd.' from_string from dual union all
select 6 id,'Rd' to_str,'road' from_string from dual union all
select 7 id,'Av' to_str,'Av.' from_string from dual union all
select 8 id,'Av' to_str,'Ave.' from_string from dual union all
select 9 id,'Av' to_str,'Avenue' from_string from dual union all
select 10 id,'Av' to_str,'Aven.' from_string from dual union all
select 11 id,'West' to_str,'W' from_string from dual union all
select 12 id,'South West' to_str,'SW.' from_string from dual
) t2
),
r(addr,test_addr,l) as
(
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
id - 1
from
addresses,
replacements
where id = (select count(*) from replacements)
union all
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
l - 1
from r,
replacements
where id = l
)
select addr,test_addr,l
from r
where l=0
;
预期输出:
cust_id address
10 9 Help St, Level 4
11 22 Victoria St
12 1495 Franklin St
13 30 Hasivim St ,Petah-Tikva
14 2 Jakaranda St
15 61, Science Park Rd
16 61, Social park Rd
17 Av Hermanos Escobar 5756
18 Av Hermanos Escobar 5756
19 8000 West FLORISSANT Ave
20 8600 MEMORIAL PKWY South West
如果地址长度超过 35 个字符,则预期输出为:
cust_id address address2
21 8200 FLORISSANTMEMORIALWAYABOVE South West
22 8600 MEMORIALFLORISSANT PKWY South West
这是一种应用多个替换操作的...有趣的方式。所以 - 正如您提到的,您有 2 个问题。对于第一个,CTE 的递归部分是在 addr
上执行 regexp_replace()
而不是 test_addr
(上一个递归步骤的修改输出)。因此,只会应用列表中的最后一条规则。
r(addr,test_addr, l) as
(
select addr,regexp_replace(addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
id - 1
from
addresses,
replacements
where id = (select count(*) from replacements)
union all
-- if you do regexp_replace on addr, it throws out the previous replace (which is in r.test_addr)
select addr,regexp_replace(test_addr,'(^|\W)' || from_str || '(\W|$)','' || to_str || '') test_addr,
l - 1
from r,
replacements
where id = l
)
对于“超过 35 个字符”的问题,我建议使用 substr/instr - 虽然难以阅读,但它们通常很快。
select addr,test_addr,l,
case when length(test_addr) > 35 then
substr(test_addr, 1, instr(substr(test_addr,1,35), ' ', -1))
else test_addr
end as addr1,
case when length(test_addr) > 35 then
substr(test_addr, instr(substr(test_addr,1,35), ' ', -1))
else null
end as addr2
from r
where l=0
;
可能有更优雅的方式来完成这部分,这只是我想到的第一件事。