为什么聚类的代码中有NA的介绍?
Why is there an NA introduction in the code for clustering?
该代码尝试使用简单的相异性度量(即对于 0-0 匹配是完美匹配的地方)编写二进制变量的聚类代码。为了确保它不会以局部最小值结束,我需要 运行 这几次。但偶尔,我会收到与引入 NA 值有关的错误消息。代码中没有使用强制转换。我不知道 NA 值是怎么长出来的。
dissim<-function(a,b){
match<-sum(a==b)
unmatch<-sum(a!=b)
sim<-match/(match+unmatch)
dissim<-1-sim
return(dissim)
}
findmode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
pleasecluster<-function(df){
##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it.
create<-function(dataframe){
proto1<-NULL
for(i in 1:length(dataframe[1,])){
proto1<-c(proto1, sample(c(0,1), 1))
}
proto2<-as.numeric(proto1==0)
return(list(proto1, proto2))
}
##This function will assign a cluster index to each entry in the original data frame.
clusterassign<-function(proto1, proto2, dataframe){
clustervector<-NULL
for(i in 1:length(dataframe[,1])){
dis1<-dissim(dataframe[i,], proto1)
dis2<-dissim(dataframe[i,], proto2)
clusterindex<-which.min(c(dis1, dis2))
clustervector<-c(clustervector, clusterindex)
}
return(clustervector)
}
##Based on the created clusters, this will then find the centres of those clusters
updproto<-function(clvec, dataframe){
cluster1<-(dataframe[clvec==1,])
cluster2<-(dataframe[clvec==2,])
newproto1<-NULL
newproto2<-NULL
for(i in 1:length(dataframe[1,])){
mode1<-findmode(cluster1[,i])
newproto1<-c(newproto1, mode1)
mode2<-findmode(cluster2[,i])
newproto2<-c(newproto2, mode2)
}
return(list(newproto1, newproto2))
}
##This will match the centres found of the current clusters and the initial centres used
checkproto<-function(oldproto1, olproto2, newproto1, newproto2){
if (sum(oldproto1!=newproto1)>0){a1<-FALSE}
else{a1<-TRUE}
if (sum(oldproto2!=newproto2)>0){a2<-FALSE}
else{a2<-T}
return(c(a1,a2))
}
##The main function
starter<-create(df)
proto1<-starter[[1]]
proto2<-starter[[2]]
count<-1
repeat{
clvec<-clusterassign(proto1, proto2, df)
oldproto1<-proto1
oldproto2<-proto2
upd<-updproto(clvec, df)
proto1<-upd[[1]]
proto2<-upd[[2]]
check<-checkproto(oldproto1, oldproto2, proto1, proto2)
count<-count+1
#calc total dissimilarity
totdiss1<-NULL
totdiss2<-NULL
cluster1<-df[clvec==1,]
for(i in 1:sum(clvec==1)){
dissi1<-dissim(cluster1[i,],proto1)
totdiss1<-sum(totdiss1, dissi1)
}
cluster2<-df[clvec==2,]
for(i in 1:sum(clvec==2)){
dissi2<-dissim(cluster2[i,],proto2)
totdiss2<-sum(totdiss2, dissi2)
}
totdiss<-totdiss1+totdiss2
if((all(check))|count>50){break}
}
return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2 ))
}
对于测试数据集,我得到了正确的值和错误消息。
a
c.1..1. c.1..0. c.1..1..1 c.0..0. c.0..0..1 c.0..0..2 c.1..1..2 c.1..1..3
1 1 1 1 0 0 0 1 1
2 1 0 1 0 0 0 1 1
3 1 1 1 1 0 0 1 1
4 1 1 1 0 0 0 1 1
5 1 1 0 0 0 0 1 1
6 0 0 0 1 1 1 1 1
7 0 1 0 1 1 1 1 1
8 0 0 0 1 1 1 1 1
9 0 0 0 1 0 1 1 1
pleasecluster(a)
[[1]]
[1] 1 1 1 0 0 0 1 1
[[2]]
[1] 0 0 0 1 1 1 1 1
[[3]]
[1] 1 1 1 1 1 2 2 2 2
[[4]]
[1] 4
[[5]]
[1] 0.625
[[6]]
[1] 0.375
[[7]]
[1] 0.25
pleasecluster(a)
Error in if (sum(oldproto2 != newproto2) > 0) { :
missing value where TRUE/FALSE needed
如果这更适合代码审查或数据科学 SE,请告诉我。
附录
考虑到错误可能是由于其中一个簇为空的情况(每当调用一行时导致 NA 值),我修改了代码。 (我还使用 ncol 和 nrow 修改了所有 for 循环 运行ning 的长度)。现在我得到一个不同的错误。
findmode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
pleasecluster<-function(df){
##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it.
create<-function(dataframe){
repeat{proto1<-NULL
for(i in 1:ncol(dataframe)){
proto1<-c(proto1, sample(c(0,1), 1))
}
proto2<-as.numeric(proto1==0)
if (length(unique(proto1))>1){break}
}
return(list(proto1, proto2))
}
##This function will assign a cluster index to each entry in the original data frame.
clusterassign<-function(proto1, proto2, dataframe){
clustervector<-NULL
for(i in 1:nrow(dataframe)){
dis1<-dissim(dataframe[i,], proto1)
dis2<-dissim(dataframe[i,], proto2)
clusterindex<-which.min(c(dis1, dis2))
clustervector<-c(clustervector, clusterindex)
}
return(clustervector)
}
##Based on the created clusters, this will then find the centres of those clusters
updproto<-function(clvec, dataframe){
cluster1<-(dataframe[clvec==1,])
cluster2<-(dataframe[clvec==2,])
newproto1<-NULL
newproto2<-NULL
if (nrow(cluster2)>0&nrow(cluster1)>0) {for(i in 1:ncol(dataframe)){
mode1<-findmode(cluster1[,i])
newproto1<-c(newproto1, mode1)
mode2<-findmode(cluster2[,i])
newproto2<-c(newproto2, mode2)
}}
else {starter<-create(dataframe)
newproto1<-starter[[1]]
newproto2<-starter[[2]]}
return(list(newproto1, newproto2))
}
##This will match the centres found of the current clusters and the initial centres used
checkproto<-function(oldproto1, olproto2, newproto1, newproto2){
if (sum(oldproto1!=newproto1)>0){a1<-FALSE}
else{a1<-TRUE}
if (sum(oldproto2!=newproto2)>0){a2<-FALSE}
else{a2<-T}
return(c(a1,a2))
}
##The main function
starter<-create(df)
proto1<-starter[[1]]
proto2<-starter[[2]]
count<-1
repeat{
clvec<-clusterassign(proto1, proto2, df)
oldproto1<-proto1
oldproto2<-proto2
upd<-updproto(clvec, df)
proto1<-upd[[1]]
proto2<-upd[[2]]
check<-checkproto(oldproto1, oldproto2, proto1, proto2)
count<-count+1
#calc total dissimilarity
totdiss1<-NULL
totdiss2<-NULL
cluster1<-df[clvec==1,]
for(i in 1:nrow(cluster1)){
dissi1<-dissim(cluster1[i,],proto1)
totdiss1<-sum(totdiss1, dissi1)
}
cluster2<-df[clvec==2,]
for(i in 1:nrow(cluster2)){
dissi2<-dissim(cluster2[i,],proto2)
totdiss2<-sum(totdiss2, dissi2)
}
totdiss<-totdiss1+totdiss2
if((all(check))|count>50){break}
}
return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2 ))
}
pleasecluster(a)
Hide Traceback
Rerun with Debug
Error in e2[[j]] : subscript out of bounds
3 Ops.data.frame(a, b)
2 dissim(cluster2[i, ], proto2)
1 pleasecluster(a)
问题是由于簇向量偶尔会全1或全2,导致其中一个簇为空。因此,在进一步的循环中,当调用这些空集群时,要么引入 NA's
,要么像第二种情况一样,调用失败,因为集群是空的。如果 clusterassign
函数中出现这种情况,一个简单的随机化循环应该可以解决问题。
#to check for and remove empty clusters
if (length(unique(clustervector))==1){
repeat{ clustervector<-NULL
for (i in 1:nrow(dataframe)){
add<-sample(c(1,2), 1)
clustervector<-c(clustervector, add)
}
if (length(unique(clustervector))==2){break}
}
}
该代码尝试使用简单的相异性度量(即对于 0-0 匹配是完美匹配的地方)编写二进制变量的聚类代码。为了确保它不会以局部最小值结束,我需要 运行 这几次。但偶尔,我会收到与引入 NA 值有关的错误消息。代码中没有使用强制转换。我不知道 NA 值是怎么长出来的。
dissim<-function(a,b){
match<-sum(a==b)
unmatch<-sum(a!=b)
sim<-match/(match+unmatch)
dissim<-1-sim
return(dissim)
}
findmode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
pleasecluster<-function(df){
##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it.
create<-function(dataframe){
proto1<-NULL
for(i in 1:length(dataframe[1,])){
proto1<-c(proto1, sample(c(0,1), 1))
}
proto2<-as.numeric(proto1==0)
return(list(proto1, proto2))
}
##This function will assign a cluster index to each entry in the original data frame.
clusterassign<-function(proto1, proto2, dataframe){
clustervector<-NULL
for(i in 1:length(dataframe[,1])){
dis1<-dissim(dataframe[i,], proto1)
dis2<-dissim(dataframe[i,], proto2)
clusterindex<-which.min(c(dis1, dis2))
clustervector<-c(clustervector, clusterindex)
}
return(clustervector)
}
##Based on the created clusters, this will then find the centres of those clusters
updproto<-function(clvec, dataframe){
cluster1<-(dataframe[clvec==1,])
cluster2<-(dataframe[clvec==2,])
newproto1<-NULL
newproto2<-NULL
for(i in 1:length(dataframe[1,])){
mode1<-findmode(cluster1[,i])
newproto1<-c(newproto1, mode1)
mode2<-findmode(cluster2[,i])
newproto2<-c(newproto2, mode2)
}
return(list(newproto1, newproto2))
}
##This will match the centres found of the current clusters and the initial centres used
checkproto<-function(oldproto1, olproto2, newproto1, newproto2){
if (sum(oldproto1!=newproto1)>0){a1<-FALSE}
else{a1<-TRUE}
if (sum(oldproto2!=newproto2)>0){a2<-FALSE}
else{a2<-T}
return(c(a1,a2))
}
##The main function
starter<-create(df)
proto1<-starter[[1]]
proto2<-starter[[2]]
count<-1
repeat{
clvec<-clusterassign(proto1, proto2, df)
oldproto1<-proto1
oldproto2<-proto2
upd<-updproto(clvec, df)
proto1<-upd[[1]]
proto2<-upd[[2]]
check<-checkproto(oldproto1, oldproto2, proto1, proto2)
count<-count+1
#calc total dissimilarity
totdiss1<-NULL
totdiss2<-NULL
cluster1<-df[clvec==1,]
for(i in 1:sum(clvec==1)){
dissi1<-dissim(cluster1[i,],proto1)
totdiss1<-sum(totdiss1, dissi1)
}
cluster2<-df[clvec==2,]
for(i in 1:sum(clvec==2)){
dissi2<-dissim(cluster2[i,],proto2)
totdiss2<-sum(totdiss2, dissi2)
}
totdiss<-totdiss1+totdiss2
if((all(check))|count>50){break}
}
return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2 ))
}
对于测试数据集,我得到了正确的值和错误消息。
a
c.1..1. c.1..0. c.1..1..1 c.0..0. c.0..0..1 c.0..0..2 c.1..1..2 c.1..1..3
1 1 1 1 0 0 0 1 1
2 1 0 1 0 0 0 1 1
3 1 1 1 1 0 0 1 1
4 1 1 1 0 0 0 1 1
5 1 1 0 0 0 0 1 1
6 0 0 0 1 1 1 1 1
7 0 1 0 1 1 1 1 1
8 0 0 0 1 1 1 1 1
9 0 0 0 1 0 1 1 1
pleasecluster(a)
[[1]]
[1] 1 1 1 0 0 0 1 1
[[2]]
[1] 0 0 0 1 1 1 1 1
[[3]]
[1] 1 1 1 1 1 2 2 2 2
[[4]]
[1] 4
[[5]]
[1] 0.625
[[6]]
[1] 0.375
[[7]]
[1] 0.25
pleasecluster(a)
Error in if (sum(oldproto2 != newproto2) > 0) { :
missing value where TRUE/FALSE needed
如果这更适合代码审查或数据科学 SE,请告诉我。
附录
考虑到错误可能是由于其中一个簇为空的情况(每当调用一行时导致 NA 值),我修改了代码。 (我还使用 ncol 和 nrow 修改了所有 for 循环 运行ning 的长度)。现在我得到一个不同的错误。
findmode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
pleasecluster<-function(df){
##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it.
create<-function(dataframe){
repeat{proto1<-NULL
for(i in 1:ncol(dataframe)){
proto1<-c(proto1, sample(c(0,1), 1))
}
proto2<-as.numeric(proto1==0)
if (length(unique(proto1))>1){break}
}
return(list(proto1, proto2))
}
##This function will assign a cluster index to each entry in the original data frame.
clusterassign<-function(proto1, proto2, dataframe){
clustervector<-NULL
for(i in 1:nrow(dataframe)){
dis1<-dissim(dataframe[i,], proto1)
dis2<-dissim(dataframe[i,], proto2)
clusterindex<-which.min(c(dis1, dis2))
clustervector<-c(clustervector, clusterindex)
}
return(clustervector)
}
##Based on the created clusters, this will then find the centres of those clusters
updproto<-function(clvec, dataframe){
cluster1<-(dataframe[clvec==1,])
cluster2<-(dataframe[clvec==2,])
newproto1<-NULL
newproto2<-NULL
if (nrow(cluster2)>0&nrow(cluster1)>0) {for(i in 1:ncol(dataframe)){
mode1<-findmode(cluster1[,i])
newproto1<-c(newproto1, mode1)
mode2<-findmode(cluster2[,i])
newproto2<-c(newproto2, mode2)
}}
else {starter<-create(dataframe)
newproto1<-starter[[1]]
newproto2<-starter[[2]]}
return(list(newproto1, newproto2))
}
##This will match the centres found of the current clusters and the initial centres used
checkproto<-function(oldproto1, olproto2, newproto1, newproto2){
if (sum(oldproto1!=newproto1)>0){a1<-FALSE}
else{a1<-TRUE}
if (sum(oldproto2!=newproto2)>0){a2<-FALSE}
else{a2<-T}
return(c(a1,a2))
}
##The main function
starter<-create(df)
proto1<-starter[[1]]
proto2<-starter[[2]]
count<-1
repeat{
clvec<-clusterassign(proto1, proto2, df)
oldproto1<-proto1
oldproto2<-proto2
upd<-updproto(clvec, df)
proto1<-upd[[1]]
proto2<-upd[[2]]
check<-checkproto(oldproto1, oldproto2, proto1, proto2)
count<-count+1
#calc total dissimilarity
totdiss1<-NULL
totdiss2<-NULL
cluster1<-df[clvec==1,]
for(i in 1:nrow(cluster1)){
dissi1<-dissim(cluster1[i,],proto1)
totdiss1<-sum(totdiss1, dissi1)
}
cluster2<-df[clvec==2,]
for(i in 1:nrow(cluster2)){
dissi2<-dissim(cluster2[i,],proto2)
totdiss2<-sum(totdiss2, dissi2)
}
totdiss<-totdiss1+totdiss2
if((all(check))|count>50){break}
}
return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2 ))
}
pleasecluster(a)
Hide Traceback
Rerun with Debug
Error in e2[[j]] : subscript out of bounds
3 Ops.data.frame(a, b)
2 dissim(cluster2[i, ], proto2)
1 pleasecluster(a)
问题是由于簇向量偶尔会全1或全2,导致其中一个簇为空。因此,在进一步的循环中,当调用这些空集群时,要么引入 NA's
,要么像第二种情况一样,调用失败,因为集群是空的。如果 clusterassign
函数中出现这种情况,一个简单的随机化循环应该可以解决问题。
#to check for and remove empty clusters
if (length(unique(clustervector))==1){
repeat{ clustervector<-NULL
for (i in 1:nrow(dataframe)){
add<-sample(c(1,2), 1)
clustervector<-c(clustervector, add)
}
if (length(unique(clustervector))==2){break}
}
}