R - 具有编辑距离的多列文本挖掘
R - text mining over multiple columns with levenshtein distance
我想通过编辑距离(adist
R
中的函数)跨多个列多次比较文本字符串。我想做的是比较 source1$name
和 source2$name
。如果没有匹配项(即,如果 match.s1.s2$s2.i
返回 NA
,则使用两个数据帧中列出的地址(source1$address
和 source2$address
)执行相同的功能。基本上我正在寻找一种方法来优化我在多个领域的匹配。下面是一个示例。
name <- c("holiday inn", "geico", "zgf", "morton phillips")
address <- c("400 lafayette pl tupelo ms", "227 geico plaza chevy chase md",
"811 quincy st washington dc", "1911 1st st rockville md")
source1 <- data.frame(name, address)
name <- c("williams sonoma", "mamas bbq", "davis polk", "hop a long diner",
"joes crag shack", "mike lowry place", "holiday inn", "zummer")
name2 <- c(NA, NA, NA, NA, NA, NA, "hi express", "zummer gunsul frasca")
address <- c("2 reads way new castle de", "248 w 4th st newark de",
"1100 21st st nw washington dc", "1804 w 5th st wilmington de",
"1208 kenwood parkway holdridge nb", "4203 ocean drive miami fl",
"400 lafayette pl tupelo ms", "811 quincy st washington dc")
source2 <- data.frame(name, name2, address)
removeSPE <- function(x) gsub("[[:punct:]]", " ", x)
cleanup <- function(x){
x <- as.character(x) # convert to character
x <- tolower(x) # make all lowercase
x <- sapply(x, removeSPE) # remove special characters
x <- trimws(x) # remove extra white space
#x <- sapply(x, removeStopWords) # remove stopwords, defined above
#x <- trimws(x) # since stopwords have been removed, there is extra white space left, this removes it
x <- gsub("^. .$", "", x)
return(x)
}
source1$name <- cleanup(source1$name)
source2$name <- cleanup(source2$name)
source2$name2 <- cleanup(source2$name2)
source1$address <- cleanup(source1$address)
source2$address <- cleanup(source2$address)
source1$name <- cleanup(source1$name)
source2$name <- cleanup(source2$name)
source2$name2 <- cleanup(source2$name2)
dist.name<- adist(source1$name,source2$name, partial = TRUE, ignore.case = TRUE)
dist.name2 <- adist(source1$name, source2$name2, partial = TRUE, ignore.case = TRUE)
dist.address <- adist(source1$address, source2$address, partial = TRUE, ignore.case = TRUE)
min.name<-apply(dist.name, 2, min)
min.name2 <- apply(dist.name2, 2, min)
match.s1.s2<-NULL
for(i in 1:nrow(dist.address))
{
s2.i<-match(min.name[i],dist.name[i,])
s1.i<-i
match.s1.s2<-
rbind(data.frame(s2.i=s2.i,s1.i=s1.i,s2name=source2[s2.i,]$name, s1name=source1[s1.i,]$name,
adist=min.name[i], s1.i.address = source1[s1.i,]$address,
s2.i.address = source2[s2.i,]$address),match.s1.s2)
}
match.s1.s2
期望的结果是 source1
中的第 3 行与 source2
中的第 8 行匹配,source1
中的第 4 行与 source2
中的第 7 行匹配。有没有办法在上面的 for-loop
中合并 dist.name2
和 dist.address
(上面定义的)?也许是一段时间的陈述?我将使用的实际数据框大约有 500 行和 24,000 行。
余弦距离似乎做得很好:
out_df <- c()
for(x in source1$name) {
for(y in source2$full2) {
if (is.na(source2[source2$full2 == y, "name2"])) {
x2 <- source1[source1$name == x, "address"]
y2 <- source2[source2$full2 == y, "address"]
row <- data.frame(x, y2, stringdist(x, y, method="cosine", q = 1))
names(row) <- c("name1", "full2", "distance")
out_df <- rbind(out_df, row)
} else {
row <- data.frame(x, y, stringdist(x, y, method="cosine", q = 1))
names(row) <- c("name1", "full2", "distance")
out_df <- rbind(out_df, row)
}
}
}
names(out_df) <- c("name1", "full2", "distance")
grab <- aggregate(distance ~ name1, data = out_df, FUN = min)
merge(out_df, grab)
您仍然需要弄清楚如何排除不需要的结果。
我想通过编辑距离(adist
R
中的函数)跨多个列多次比较文本字符串。我想做的是比较 source1$name
和 source2$name
。如果没有匹配项(即,如果 match.s1.s2$s2.i
返回 NA
,则使用两个数据帧中列出的地址(source1$address
和 source2$address
)执行相同的功能。基本上我正在寻找一种方法来优化我在多个领域的匹配。下面是一个示例。
name <- c("holiday inn", "geico", "zgf", "morton phillips")
address <- c("400 lafayette pl tupelo ms", "227 geico plaza chevy chase md",
"811 quincy st washington dc", "1911 1st st rockville md")
source1 <- data.frame(name, address)
name <- c("williams sonoma", "mamas bbq", "davis polk", "hop a long diner",
"joes crag shack", "mike lowry place", "holiday inn", "zummer")
name2 <- c(NA, NA, NA, NA, NA, NA, "hi express", "zummer gunsul frasca")
address <- c("2 reads way new castle de", "248 w 4th st newark de",
"1100 21st st nw washington dc", "1804 w 5th st wilmington de",
"1208 kenwood parkway holdridge nb", "4203 ocean drive miami fl",
"400 lafayette pl tupelo ms", "811 quincy st washington dc")
source2 <- data.frame(name, name2, address)
removeSPE <- function(x) gsub("[[:punct:]]", " ", x)
cleanup <- function(x){
x <- as.character(x) # convert to character
x <- tolower(x) # make all lowercase
x <- sapply(x, removeSPE) # remove special characters
x <- trimws(x) # remove extra white space
#x <- sapply(x, removeStopWords) # remove stopwords, defined above
#x <- trimws(x) # since stopwords have been removed, there is extra white space left, this removes it
x <- gsub("^. .$", "", x)
return(x)
}
source1$name <- cleanup(source1$name)
source2$name <- cleanup(source2$name)
source2$name2 <- cleanup(source2$name2)
source1$address <- cleanup(source1$address)
source2$address <- cleanup(source2$address)
source1$name <- cleanup(source1$name)
source2$name <- cleanup(source2$name)
source2$name2 <- cleanup(source2$name2)
dist.name<- adist(source1$name,source2$name, partial = TRUE, ignore.case = TRUE)
dist.name2 <- adist(source1$name, source2$name2, partial = TRUE, ignore.case = TRUE)
dist.address <- adist(source1$address, source2$address, partial = TRUE, ignore.case = TRUE)
min.name<-apply(dist.name, 2, min)
min.name2 <- apply(dist.name2, 2, min)
match.s1.s2<-NULL
for(i in 1:nrow(dist.address))
{
s2.i<-match(min.name[i],dist.name[i,])
s1.i<-i
match.s1.s2<-
rbind(data.frame(s2.i=s2.i,s1.i=s1.i,s2name=source2[s2.i,]$name, s1name=source1[s1.i,]$name,
adist=min.name[i], s1.i.address = source1[s1.i,]$address,
s2.i.address = source2[s2.i,]$address),match.s1.s2)
}
match.s1.s2
期望的结果是 source1
中的第 3 行与 source2
中的第 8 行匹配,source1
中的第 4 行与 source2
中的第 7 行匹配。有没有办法在上面的 for-loop
中合并 dist.name2
和 dist.address
(上面定义的)?也许是一段时间的陈述?我将使用的实际数据框大约有 500 行和 24,000 行。
余弦距离似乎做得很好:
out_df <- c()
for(x in source1$name) {
for(y in source2$full2) {
if (is.na(source2[source2$full2 == y, "name2"])) {
x2 <- source1[source1$name == x, "address"]
y2 <- source2[source2$full2 == y, "address"]
row <- data.frame(x, y2, stringdist(x, y, method="cosine", q = 1))
names(row) <- c("name1", "full2", "distance")
out_df <- rbind(out_df, row)
} else {
row <- data.frame(x, y, stringdist(x, y, method="cosine", q = 1))
names(row) <- c("name1", "full2", "distance")
out_df <- rbind(out_df, row)
}
}
}
names(out_df) <- c("name1", "full2", "distance")
grab <- aggregate(distance ~ name1, data = out_df, FUN = min)
merge(out_df, grab)
您仍然需要弄清楚如何排除不需要的结果。