在 R 中编程层次聚类算法的教学方法
Pedagogical way of programming hierarchical clustering algorithm in R
我正在准备关于 R 机器学习的讲座,我想参加 hierarchical clustering as one example. I found this very instructive page here: http://home.deib.polimi.it/matteucc/Clustering/tutorial_html/hierarchical.html
它从以下距离 table 开始(在读取数据时请注意 NA
作为 column/row 名称,另见下文):
最短的距离是MI
和TO
之间的138
,所以我们想把那些列和行合并到一个新的column/rowMI/TO
中这个新复合对象 MI/TO
到所有剩余城市的距离等于原始城市之一的最短距离 MI
或 TO
,例如MI/TO
到 RM
是 564
(来自 MI
),因为它比 669
(来自 TO
)短。 (这种进行聚合的方式称为 single-linkage clustering)。所以我们有一个新的 table:
我的问题
我开始用 R 对此进行编码,很快发现代码变得越来越混乱——远非初出茅庐的程序员可以轻松理解的东西。您知道可以以自然直观的方式进行此类数据操作的方法或程序包吗?
所以这是 R 中的起始 table:
D <- matrix(c(0,662,877,255,412,996,
662,0,295,468,268,400,
877,295,0,754,564,138,
255,468,754,0,219,869,
412,268,564,219,0,669,
996,400,138,869,669,0), ncol=6, byrow=T)
rownames(D) <- colnames(D) <- c("BA","FI","MI","Na","RM","TO")
D
## BA FI MI Na RM TO
## BA 0 662 877 255 412 996
## FI 662 0 295 468 268 400
## MI 877 295 0 754 564 138
## Na 255 468 754 0 219 869
## RM 412 268 564 219 0 669
## TO 996 400 138 869 669 0
内置函数 "hclust" 已经是一个很好用的函数。
hc1 = hclust(as.dist(D), method = "single")
hc1$merge
plot(hc1)
如果你想澄清,我可以详细描述。
按照hclust的逻辑,你可以试试:
savemat = list()
D1 = D; diag(D1) = Inf # a trick to make zero a infinity
m = 1
while(dim(D1)[1] > 2) {
# get the location of minimum distance
minloc = which(D1 == min(D1), arr.ind = T)[1,]
# make a two-column matrix then find out the minimum value of each row
u = apply(cbind(D1[minloc[2],],D1[minloc[1],]),1,min)
# updating the matrix
D1[minloc[2],] = u
D1[,minloc[2]] = u
u = paste0(rownames(D1)[minloc[2]],'/',rownames(D1)[minloc[1]])
rownames(D1)[minloc[2]] = u
colnames(D1)[minloc[2]] = u
# deleting the merged column/row
D1 = D1[-minloc[1],-minloc[1]]
diag(D1) = Inf
# save the steps into a list element mth
savemat[[m]] = D1
m = m + 1
}
savemat
将代码更新为递归函数和单独的打印函数,以便更好地了解正在发生的事情。与 hcl(<data.frame>,<log_level>)
一起使用。日志级别可以是 0 仅表示最终结果,1 表示打印中间数据集,2 表示打印每个步骤
# To be allowed to add column later, don't know a better way than coercing to data.frame
d <- data.frame(D,stringsAsFactors=F)
myprt <- function(message,var) {
print(message)
print(var)
}
hcl <- function(d,prt=0) {
if (prt) myprt("Starting dataset:",d)
# 1) Get the shortest distance informations:
Ref <- which( d==min(d[d>0]), useNames=T, arr.ind=T )
if (prt>1) myprt("Ref is:",Ref)
# 2) Subset the original entry to remove thoose towns:
res <- d[-Ref[,1],-Ref[,1]]
if (prt>1) myprt("Res is:", res)
# 3) Get the subset for the two nearest towns:
tmp <- d[-Ref[,1],Ref[,1]]
if (prt>1) myprt("Tmp is:",tmp)
# 4) Get the vector of minimal distance from original dataset with the two town (row by row on t)
dists <- apply( tmp, 1, function(x) { x[x==min(x)] } )
#dists <- tmp[ tmp == pmin( tmp[,1], tmp[,2] ) ]
if (prt>1) myprt("Dists is:",dists)
# 5) Let's build the resulting matrix:
tnames <- paste(rownames(Ref),collapse="/") # Get the names of town to the new name
if (length(res) == 1) {
# Nothing left in the original dataset just concat the names and return
tnames <- paste(c(tnames,names(dists)),collapse="/")
Finalres <- data.frame(tnames = dists) # build the df
names(Finalres) <- rownames(Finalres) <- tnames # Name it
if (prt>0) myprt("Final result:",Finalres)
return(Finalres) # Last iteration
} else {
Finalres <- res
Finalres[tnames,tnames] <- 0 # Set the diagonal to 0
Finalres[is.na(Finalres)] <- dists # the previous assignment has set NAs, replae them by the dists values
if (prt>0) myprt("Dataset before recursive call:",Finalres)
return(hcl(Finalres,prt)) # we're not at end, recall ourselves with actual result
}
}
另一个想法的步骤:
# To be allowed to add column later, don't know a better way than coercing to data.frame
d <- data.frame(D,stringsAsFactors=F)
# 1) Get the shortest distance informations:
Ref <- which( d==min(d[d>0]), useNames=T, arr.ind=T )
# 2) Subset the original entry to remove thoose towns:
res <-d[-Ref[,1],-Ref[,1]]
# 3) Get the subset for the two nearest towns:
tmp <- d[-Ref[,1],Ref[,1]]
# 4) Get the vector of minimal distance from original dataset with the two town (row by row on tpm), didn't find a proper way to avoid apply
dists <- apply( tmp, 1, function(x) { x[x==min(x)] } )
dists <- dists <- tmp[ tmp == pmin( tmp[,1], tmp[,2] ) ]
# 5) Let's build the resulting matrix:
tnames <- paste(rownames(Ref),collapse="/") # Get the names of town to the new name
Finalres <- res
Finalres[tnames,tnames] <- 0 # Set the diagonal to 0
Finalres[is.na(Finalres)] <- dists # the previous assignment has set NAs, replae them by the dists values
输出:
> Finalres
BA FI Na RM TO/MI
BA 0 662 255 412 877
FI 662 0 468 268 295
Na 255 468 0 219 754
RM 412 268 219 0 564
TO/MI 877 295 754 564 0
以及每一步的输出:
> #Steps:
>
> Ref
row col
TO 6 3
MI 3 6
> res
BA FI Na RM
BA 0 662 255 412
FI 662 0 468 268
Na 255 468 0 219
RM 412 268 219 0
> tmp
TO MI
BA 996 877
FI 400 295
Na 869 754
RM 669 564
> dists
[1] 877 295 754 564
这里有很多对象复制可以避免以节省性能,我让它有一个更好的逐步视图。
我正在准备关于 R 机器学习的讲座,我想参加 hierarchical clustering as one example. I found this very instructive page here: http://home.deib.polimi.it/matteucc/Clustering/tutorial_html/hierarchical.html
它从以下距离 table 开始(在读取数据时请注意 NA
作为 column/row 名称,另见下文):
最短的距离是MI
和TO
之间的138
,所以我们想把那些列和行合并到一个新的column/rowMI/TO
中这个新复合对象 MI/TO
到所有剩余城市的距离等于原始城市之一的最短距离 MI
或 TO
,例如MI/TO
到 RM
是 564
(来自 MI
),因为它比 669
(来自 TO
)短。 (这种进行聚合的方式称为 single-linkage clustering)。所以我们有一个新的 table:
我的问题
我开始用 R 对此进行编码,很快发现代码变得越来越混乱——远非初出茅庐的程序员可以轻松理解的东西。您知道可以以自然直观的方式进行此类数据操作的方法或程序包吗?
所以这是 R 中的起始 table:
D <- matrix(c(0,662,877,255,412,996,
662,0,295,468,268,400,
877,295,0,754,564,138,
255,468,754,0,219,869,
412,268,564,219,0,669,
996,400,138,869,669,0), ncol=6, byrow=T)
rownames(D) <- colnames(D) <- c("BA","FI","MI","Na","RM","TO")
D
## BA FI MI Na RM TO
## BA 0 662 877 255 412 996
## FI 662 0 295 468 268 400
## MI 877 295 0 754 564 138
## Na 255 468 754 0 219 869
## RM 412 268 564 219 0 669
## TO 996 400 138 869 669 0
内置函数 "hclust" 已经是一个很好用的函数。
hc1 = hclust(as.dist(D), method = "single")
hc1$merge
plot(hc1)
如果你想澄清,我可以详细描述。
按照hclust的逻辑,你可以试试:
savemat = list()
D1 = D; diag(D1) = Inf # a trick to make zero a infinity
m = 1
while(dim(D1)[1] > 2) {
# get the location of minimum distance
minloc = which(D1 == min(D1), arr.ind = T)[1,]
# make a two-column matrix then find out the minimum value of each row
u = apply(cbind(D1[minloc[2],],D1[minloc[1],]),1,min)
# updating the matrix
D1[minloc[2],] = u
D1[,minloc[2]] = u
u = paste0(rownames(D1)[minloc[2]],'/',rownames(D1)[minloc[1]])
rownames(D1)[minloc[2]] = u
colnames(D1)[minloc[2]] = u
# deleting the merged column/row
D1 = D1[-minloc[1],-minloc[1]]
diag(D1) = Inf
# save the steps into a list element mth
savemat[[m]] = D1
m = m + 1
}
savemat
将代码更新为递归函数和单独的打印函数,以便更好地了解正在发生的事情。与 hcl(<data.frame>,<log_level>)
一起使用。日志级别可以是 0 仅表示最终结果,1 表示打印中间数据集,2 表示打印每个步骤
# To be allowed to add column later, don't know a better way than coercing to data.frame
d <- data.frame(D,stringsAsFactors=F)
myprt <- function(message,var) {
print(message)
print(var)
}
hcl <- function(d,prt=0) {
if (prt) myprt("Starting dataset:",d)
# 1) Get the shortest distance informations:
Ref <- which( d==min(d[d>0]), useNames=T, arr.ind=T )
if (prt>1) myprt("Ref is:",Ref)
# 2) Subset the original entry to remove thoose towns:
res <- d[-Ref[,1],-Ref[,1]]
if (prt>1) myprt("Res is:", res)
# 3) Get the subset for the two nearest towns:
tmp <- d[-Ref[,1],Ref[,1]]
if (prt>1) myprt("Tmp is:",tmp)
# 4) Get the vector of minimal distance from original dataset with the two town (row by row on t)
dists <- apply( tmp, 1, function(x) { x[x==min(x)] } )
#dists <- tmp[ tmp == pmin( tmp[,1], tmp[,2] ) ]
if (prt>1) myprt("Dists is:",dists)
# 5) Let's build the resulting matrix:
tnames <- paste(rownames(Ref),collapse="/") # Get the names of town to the new name
if (length(res) == 1) {
# Nothing left in the original dataset just concat the names and return
tnames <- paste(c(tnames,names(dists)),collapse="/")
Finalres <- data.frame(tnames = dists) # build the df
names(Finalres) <- rownames(Finalres) <- tnames # Name it
if (prt>0) myprt("Final result:",Finalres)
return(Finalres) # Last iteration
} else {
Finalres <- res
Finalres[tnames,tnames] <- 0 # Set the diagonal to 0
Finalres[is.na(Finalres)] <- dists # the previous assignment has set NAs, replae them by the dists values
if (prt>0) myprt("Dataset before recursive call:",Finalres)
return(hcl(Finalres,prt)) # we're not at end, recall ourselves with actual result
}
}
另一个想法的步骤:
# To be allowed to add column later, don't know a better way than coercing to data.frame
d <- data.frame(D,stringsAsFactors=F)
# 1) Get the shortest distance informations:
Ref <- which( d==min(d[d>0]), useNames=T, arr.ind=T )
# 2) Subset the original entry to remove thoose towns:
res <-d[-Ref[,1],-Ref[,1]]
# 3) Get the subset for the two nearest towns:
tmp <- d[-Ref[,1],Ref[,1]]
# 4) Get the vector of minimal distance from original dataset with the two town (row by row on tpm), didn't find a proper way to avoid apply
dists <- apply( tmp, 1, function(x) { x[x==min(x)] } )
dists <- dists <- tmp[ tmp == pmin( tmp[,1], tmp[,2] ) ]
# 5) Let's build the resulting matrix:
tnames <- paste(rownames(Ref),collapse="/") # Get the names of town to the new name
Finalres <- res
Finalres[tnames,tnames] <- 0 # Set the diagonal to 0
Finalres[is.na(Finalres)] <- dists # the previous assignment has set NAs, replae them by the dists values
输出:
> Finalres
BA FI Na RM TO/MI
BA 0 662 255 412 877
FI 662 0 468 268 295
Na 255 468 0 219 754
RM 412 268 219 0 564
TO/MI 877 295 754 564 0
以及每一步的输出:
> #Steps:
>
> Ref
row col
TO 6 3
MI 3 6
> res
BA FI Na RM
BA 0 662 255 412
FI 662 0 468 268
Na 255 468 0 219
RM 412 268 219 0
> tmp
TO MI
BA 996 877
FI 400 295
Na 869 754
RM 669 564
> dists
[1] 877 295 754 564
这里有很多对象复制可以避免以节省性能,我让它有一个更好的逐步视图。