R,stringr - 替换数据框中行的多个字符
R, stringr - replace multiple characters from rows in a dataframe
我在商店数据框中的 "address" 列中存储了地址,我想创建一个新列,对现有地址进行以下更正:
{"ST": "STREET",
"RD": "ROAD",
"AVE": "AVENUE",
"N": "NORTH",
"W": "WEST",
"S": "SOUTH",
"E": "EAST",
"STE": "SUITE",
"HWY": "HIGHWAY",
"DR": "DRIVE",
"NW": "NORTH WEST",
"NE": "NORTH EAST",
"SW": "SOUTH WEST",
"SE": "SOUTH EAST",
"LN": "LANE",
"WAY": "WAY"}
我该如何推进?
预期输出:
101 ST LN -> 101 STREET LANE
解决此问题的一种方法是使用 stringi
中的 stri_replace_all_regex
。它接受矢量化模式和替换。
我们可以使用 \b
元字符来匹配单词边界,在 R 字符串中它本身需要转义为 \\b
。为了处理缩写以 .
结尾的情况,我们可以用 (\\.|\\b)
来匹配字面量 .
或单词边界 \b
。
我在答案的末尾根据您的数据制作了模式和替换向量。
library(stringi)
# Expand every abbreviation in a single address string.
# `terms[[1]]` holds the regex patterns and `terms[[2]]` the replacements.
# With vectorize_all = FALSE each pattern/replacement pair is applied in
# turn to the input instead of being recycled element-wise against it.
stri_replace_all_regex(
  "101 ST. LN",
  pattern = terms[[1]],
  replacement = terms[[2]],
  vectorize_all = FALSE
)
# [1] "101 STREET LANE"
同样适用于要进行替换的字符串向量。
# Sample addresses, one per row, to show the same replacement on a column.
data <- data.frame(
  address = c("1 N ST", "2 E AVE", "3 S RD", "4 SE LN")
)

# Each pattern/replacement pair is applied in turn to every address.
stri_replace_all_regex(
  data[["address"]],
  pattern = terms[[1]],
  replacement = terms[[2]],
  vectorize_all = FALSE
)
# [1] "1 NORTH STREET" "2 EAST AVENUE" "3 SOUTH ROAD" "4 SOUTH EAST LANE"
数据
# Flat vector of alternating abbreviation / expansion pairs.
terms <- c(
  "ST", "STREET", "RD", "ROAD", "AVE", "AVENUE", "N", "NORTH",
  "W", "WEST", "S", "SOUTH", "E", "EAST", "STE", "SUITE", "HWY",
  "HIGHWAY", "DR", "DRIVE", "NW", "NORTH WEST", "NE", "NORTH EAST",
  "SW", "SOUTH WEST", "SE", "SOUTH EAST", "LN", "LANE", "WAY",
  "WAY"
)

# De-interleave the pairs: element 1 collects the odd positions
# (abbreviations), element 2 the even positions (expansions).
terms <- split(terms, rep(1:2, times = length(terms) / 2))

# Wrap each abbreviation in a regex: a leading word boundary, then either a
# literal trailing "." or a closing word boundary. Backslashes are doubled
# because "\b" in an R string literal is a backspace character and "\." is
# not a valid string escape at all.
terms[[1]] <- paste0("\\b", terms[[1]], "(\\.|\\b)")
terms[[1]]
# [1] "\\bST(\\.|\\b)"  "\\bRD(\\.|\\b)"  "\\bAVE(\\.|\\b)" "\\bN(\\.|\\b)"
# [5] "\\bW(\\.|\\b)"   "\\bS(\\.|\\b)"   "\\bE(\\.|\\b)"   "\\bSTE(\\.|\\b)"
# [9] "\\bHWY(\\.|\\b)" "\\bDR(\\.|\\b)"  "\\bNW(\\.|\\b)"  "\\bNE(\\.|\\b)"
#[13] "\\bSW(\\.|\\b)"  "\\bSE(\\.|\\b)"  "\\bLN(\\.|\\b)"  "\\bWAY(\\.|\\b)"
terms[[2]]
# [1] "STREET" "ROAD" "AVENUE" "NORTH" "WEST" "SOUTH" "EAST" "SUITE" "HIGHWAY" "DRIVE"
#[11] "NORTH WEST" "NORTH EAST" "SOUTH WEST" "SOUTH EAST" "LANE" "WAY"
下面的方法应该也可行,使用 str_replace_all
(来自 stringr 包)
:
library(stringr)

# Sample addresses containing abbreviations to expand.
df <- data.frame(address = c("12 ST W", "333 AVE", "45 RD", "666 STE E"))

# Named character vector: each name is a regex pattern, each value its
# replacement. Backslashes are doubled because "\b" in an R string literal
# is a backspace character; the regex word boundary must be written "\\b".
str_replace_all(
  df$address,
  c(
    "\\bST\\b" = "STREET",
    "\\bRD\\b" = "ROAD",
    "\\bAVE\\b" = "AVENUE",
    "\\bN\\b" = "NORTH",
    "\\bW\\b" = "WEST",
    "\\bE\\b" = "EAST",
    "\\bSTE\\b" = "SUITE"
  )
)
# [1] "12 STREET WEST" "333 AVENUE" "45 ROAD" "666 SUITE EAST"
我在商店数据框中的 "address" 列中存储了地址,我想创建一个新列,对现有地址进行以下更正:
{"ST": "STREET",
"RD": "ROAD",
"AVE": "AVENUE",
"N": "NORTH",
"W": "WEST",
"S": "SOUTH",
"E": "EAST",
"STE": "SUITE",
"HWY": "HIGHWAY",
"DR": "DRIVE",
"NW": "NORTH WEST",
"NE": "NORTH EAST",
"SW": "SOUTH WEST",
"SE": "SOUTH EAST",
"LN": "LANE",
"WAY": "WAY"}
我该如何推进?
预期输出:
101 ST LN -> 101 STREET LANE
解决此问题的一种方法是使用 stringi
中的 stri_replace_all_regex
。它接受矢量化模式和替换。
我们可以使用 \b
元字符来匹配单词边界,在 R 字符串中它本身需要转义为 \\b
。为了处理缩写以 .
结尾的情况,我们可以用 (\\.|\\b)
来匹配字面量 .
或单词边界 \b
。
我在答案的末尾根据您的数据制作了模式和替换向量。
library(stringi)
# Expand every abbreviation in a single address string.
# `terms[[1]]` holds the regex patterns and `terms[[2]]` the replacements.
# With vectorize_all = FALSE each pattern/replacement pair is applied in
# turn to the input instead of being recycled element-wise against it.
stri_replace_all_regex(
  "101 ST. LN",
  pattern = terms[[1]],
  replacement = terms[[2]],
  vectorize_all = FALSE
)
# [1] "101 STREET LANE"
同样适用于要进行替换的字符串向量。
# Sample addresses, one per row, to show the same replacement on a column.
data <- data.frame(
  address = c("1 N ST", "2 E AVE", "3 S RD", "4 SE LN")
)

# Each pattern/replacement pair is applied in turn to every address.
stri_replace_all_regex(
  data[["address"]],
  pattern = terms[[1]],
  replacement = terms[[2]],
  vectorize_all = FALSE
)
# [1] "1 NORTH STREET" "2 EAST AVENUE" "3 SOUTH ROAD" "4 SOUTH EAST LANE"
数据
# Flat vector of alternating abbreviation / expansion pairs.
terms <- c(
  "ST", "STREET", "RD", "ROAD", "AVE", "AVENUE", "N", "NORTH",
  "W", "WEST", "S", "SOUTH", "E", "EAST", "STE", "SUITE", "HWY",
  "HIGHWAY", "DR", "DRIVE", "NW", "NORTH WEST", "NE", "NORTH EAST",
  "SW", "SOUTH WEST", "SE", "SOUTH EAST", "LN", "LANE", "WAY",
  "WAY"
)

# De-interleave the pairs: element 1 collects the odd positions
# (abbreviations), element 2 the even positions (expansions).
terms <- split(terms, rep(1:2, times = length(terms) / 2))

# Wrap each abbreviation in a regex: a leading word boundary, then either a
# literal trailing "." or a closing word boundary. Backslashes are doubled
# because "\b" in an R string literal is a backspace character and "\." is
# not a valid string escape at all.
terms[[1]] <- paste0("\\b", terms[[1]], "(\\.|\\b)")
terms[[1]]
# [1] "\\bST(\\.|\\b)"  "\\bRD(\\.|\\b)"  "\\bAVE(\\.|\\b)" "\\bN(\\.|\\b)"
# [5] "\\bW(\\.|\\b)"   "\\bS(\\.|\\b)"   "\\bE(\\.|\\b)"   "\\bSTE(\\.|\\b)"
# [9] "\\bHWY(\\.|\\b)" "\\bDR(\\.|\\b)"  "\\bNW(\\.|\\b)"  "\\bNE(\\.|\\b)"
#[13] "\\bSW(\\.|\\b)"  "\\bSE(\\.|\\b)"  "\\bLN(\\.|\\b)"  "\\bWAY(\\.|\\b)"
terms[[2]]
# [1] "STREET" "ROAD" "AVENUE" "NORTH" "WEST" "SOUTH" "EAST" "SUITE" "HIGHWAY" "DRIVE"
#[11] "NORTH WEST" "NORTH EAST" "SOUTH WEST" "SOUTH EAST" "LANE" "WAY"
下面的方法应该也可行,使用 str_replace_all
(来自 stringr 包)
:
library(stringr)

# Sample addresses containing abbreviations to expand.
df <- data.frame(address = c("12 ST W", "333 AVE", "45 RD", "666 STE E"))

# Named character vector: each name is a regex pattern, each value its
# replacement. Backslashes are doubled because "\b" in an R string literal
# is a backspace character; the regex word boundary must be written "\\b".
str_replace_all(
  df$address,
  c(
    "\\bST\\b" = "STREET",
    "\\bRD\\b" = "ROAD",
    "\\bAVE\\b" = "AVENUE",
    "\\bN\\b" = "NORTH",
    "\\bW\\b" = "WEST",
    "\\bE\\b" = "EAST",
    "\\bSTE\\b" = "SUITE"
  )
)
# [1] "12 STREET WEST" "333 AVENUE" "45 ROAD" "666 SUITE EAST"