2016-04-30 48 views
0

該代碼是嘗試使用簡單的相異性度量(即0-0匹配完美匹配的地方)編寫二進制變量的聚類代碼。爲了確保它不會以局部最小值結束,我需要運行幾次。但偶爾,我會得到與引入NA值有關的錯誤消息。代碼中沒有使用強制。我不知道NA價值如何增長。爲什麼在集羣代碼中有一個NA介紹?

dissim<-function(a,b){ 
match<-sum(a==b) 
unmatch<-sum(a!=b) 
sim<-match/(match+unmatch) 
dissim<-1-sim 
return(dissim) 
} 

findmode <- function(x) { 
    ux <- unique(x) 
    ux[which.max(tabulate(match(x, ux)))] 
} 


pleasecluster<-function(df){ 
    ##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it. 
    create<-function(dataframe){ 
    proto1<-NULL 
    for(i in 1:length(dataframe[1,])){ 
     proto1<-c(proto1, sample(c(0,1), 1)) 
    } 
    proto2<-as.numeric(proto1==0) 
    return(list(proto1, proto2)) 
    } 
    ##This function will assign a cluster index to each entry in the original data frame. 
    clusterassign<-function(proto1, proto2, dataframe){ 
    clustervector<-NULL 
    for(i in 1:length(dataframe[,1])){ 
     dis1<-dissim(dataframe[i,], proto1) 
     dis2<-dissim(dataframe[i,], proto2) 
     clusterindex<-which.min(c(dis1, dis2)) 
     clustervector<-c(clustervector, clusterindex) 
    } 
    return(clustervector) 
    } 
    ##Based on the created clusters, this will then find the centres of those clusters 
    updproto<-function(clvec, dataframe){ 
    cluster1<-(dataframe[clvec==1,]) 
    cluster2<-(dataframe[clvec==2,]) 
    newproto1<-NULL 
    newproto2<-NULL 
    for(i in 1:length(dataframe[1,])){ 
     mode1<-findmode(cluster1[,i]) 
     newproto1<-c(newproto1, mode1) 
     mode2<-findmode(cluster2[,i]) 
     newproto2<-c(newproto2, mode2) 
    } 
    return(list(newproto1, newproto2)) 
    } 
    ##This will match the centres found of the current clusters and the initial centres used 
    checkproto<-function(oldproto1, olproto2, newproto1, newproto2){ 
    if (sum(oldproto1!=newproto1)>0){a1<-FALSE} 
    else{a1<-TRUE} 
    if (sum(oldproto2!=newproto2)>0){a2<-FALSE} 
    else{a2<-T} 
    return(c(a1,a2)) 
    } 
    ##The main function 
    starter<-create(df) 
    proto1<-starter[[1]] 
    proto2<-starter[[2]] 
    count<-1 
    repeat{ 
    clvec<-clusterassign(proto1, proto2, df) 
    oldproto1<-proto1 
    oldproto2<-proto2 
    upd<-updproto(clvec, df) 
    proto1<-upd[[1]] 
    proto2<-upd[[2]] 
    check<-checkproto(oldproto1, oldproto2, proto1, proto2) 
    count<-count+1 
#calc total dissimilarity 
    totdiss1<-NULL 
    totdiss2<-NULL 
    cluster1<-df[clvec==1,] 
    for(i in 1:sum(clvec==1)){ 

    dissi1<-dissim(cluster1[i,],proto1) 
    totdiss1<-sum(totdiss1, dissi1) 
    } 
    cluster2<-df[clvec==2,] 
    for(i in 1:sum(clvec==2)){ 

    dissi2<-dissim(cluster2[i,],proto2) 
    totdiss2<-sum(totdiss2, dissi2) 
    } 
    totdiss<-totdiss1+totdiss2 
    if((all(check))|count>50){break} 
    } 

    return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2)) 
} 

對於測試數據集,我得到了正確的值和錯誤信息。

a 
    c.1..1. c.1..0. c.1..1..1 c.0..0. c.0..0..1 c.0..0..2 c.1..1..2 c.1..1..3 
1  1  1   1  0   0   0   1   1 
2  1  0   1  0   0   0   1   1 
3  1  1   1  1   0   0   1   1 
4  1  1   1  0   0   0   1   1 
5  1  1   0  0   0   0   1   1 
6  0  0   0  1   1   1   1   1 
7  0  1   0  1   1   1   1   1 
8  0  0   0  1   1   1   1   1 
9  0  0   0  1   0   1   1   1 

pleasecluster(a) 
[[1]] 
[1] 1 1 1 0 0 0 1 1 

[[2]] 
[1] 0 0 0 1 1 1 1 1 

[[3]] 
[1] 1 1 1 1 1 2 2 2 2 

[[4]] 
[1] 4 

[[5]] 
[1] 0.625 

[[6]] 
[1] 0.375 

[[7]] 
[1] 0.25 

pleasecluster(a) 
Error in if (sum(oldproto2 != newproto2) > 0) { : 
    missing value where TRUE/FALSE needed 

請讓我知道這是否更適合代碼審查或數據科學SE。

附錄

考慮,可能出現錯誤,由於情況下集羣之一將是空的(導致NA每當行稱爲值),我修改的代碼。 (我也修改了所有for循環使用ncol和nrow運行的長度)。現在我得到一個不同的錯誤。

findmode <- function(x) { 
    ux <- unique(x) 
    ux[which.max(tabulate(match(x, ux)))] 
} 


pleasecluster<-function(df){ 
    ##create will create the first prototypes i.e. initial cluster centres. First prototype is random, second is the centre farthest from it. 
    create<-function(dataframe){ 
    repeat{proto1<-NULL 
    for(i in 1:ncol(dataframe)){ 
     proto1<-c(proto1, sample(c(0,1), 1)) 
    } 
    proto2<-as.numeric(proto1==0) 
    if (length(unique(proto1))>1){break} 
    } 
    return(list(proto1, proto2)) 
    } 
    ##This function will assign a cluster index to each entry in the original data frame. 
    clusterassign<-function(proto1, proto2, dataframe){ 
    clustervector<-NULL 
    for(i in 1:nrow(dataframe)){ 
     dis1<-dissim(dataframe[i,], proto1) 
     dis2<-dissim(dataframe[i,], proto2) 
     clusterindex<-which.min(c(dis1, dis2)) 
     clustervector<-c(clustervector, clusterindex) 
    } 
    return(clustervector) 
    } 
    ##Based on the created clusters, this will then find the centres of those clusters 
    updproto<-function(clvec, dataframe){ 
    cluster1<-(dataframe[clvec==1,]) 
    cluster2<-(dataframe[clvec==2,]) 
    newproto1<-NULL 
    newproto2<-NULL 
    if (nrow(cluster2)>0&nrow(cluster1)>0) {for(i in 1:ncol(dataframe)){ 
     mode1<-findmode(cluster1[,i]) 
     newproto1<-c(newproto1, mode1) 
     mode2<-findmode(cluster2[,i]) 
     newproto2<-c(newproto2, mode2) 
    }} 
    else {starter<-create(dataframe) 
    newproto1<-starter[[1]] 
    newproto2<-starter[[2]]} 
    return(list(newproto1, newproto2)) 
    } 
    ##This will match the centres found of the current clusters and the initial centres used 
    checkproto<-function(oldproto1, olproto2, newproto1, newproto2){ 
    if (sum(oldproto1!=newproto1)>0){a1<-FALSE} 
    else{a1<-TRUE} 
    if (sum(oldproto2!=newproto2)>0){a2<-FALSE} 
    else{a2<-T} 
    return(c(a1,a2)) 
    } 
    ##The main function 
    starter<-create(df) 
    proto1<-starter[[1]] 
    proto2<-starter[[2]] 
    count<-1 
    repeat{ 
    clvec<-clusterassign(proto1, proto2, df) 
    oldproto1<-proto1 
    oldproto2<-proto2 
    upd<-updproto(clvec, df) 
    proto1<-upd[[1]] 
    proto2<-upd[[2]] 
    check<-checkproto(oldproto1, oldproto2, proto1, proto2) 
    count<-count+1 
#calc total dissimilarity 
    totdiss1<-NULL 
    totdiss2<-NULL 
    cluster1<-df[clvec==1,] 
    for(i in 1:nrow(cluster1)){ 

    dissi1<-dissim(cluster1[i,],proto1) 
    totdiss1<-sum(totdiss1, dissi1) 
    } 
    cluster2<-df[clvec==2,] 
    for(i in 1:nrow(cluster2)){ 

    dissi2<-dissim(cluster2[i,],proto2) 
    totdiss2<-sum(totdiss2, dissi2) 
    } 
    totdiss<-totdiss1+totdiss2 
    if((all(check))|count>50){break} 
    } 

    return(list(oldproto1, oldproto2, clvec, count, totdiss, totdiss1, totdiss2)) 
} 




pleasecluster(a) 
Hide Traceback 

Rerun with Debug 
Error in e2[[j]] : subscript out of bounds 
3 Ops.data.frame(a, b) 
2 dissim(cluster2[i, ], proto2) 
1 pleasecluster(a) 
+0

當您嘗試使用'create'函數時,您應該收到錯誤消息。它不會在'globalenv()'中。 –

+0

@ 42-這個錯誤信息在腳本的罕見演繹中出現。那怎麼會是由於環境分配不當造成的?我沒有得到如何不能在.glodalenv()導致問題。 –

+0

@ 42-對不起,但我自己學習R。因此,我可能不知道什麼會是相當常見的知識! –

回答

0

問題是由於這樣一個事實,即偶爾聚類向量變成全1或全2,因此其中一個聚類爲空。因此,在進一步的循環中,當調用這些空集羣時,可以引入NA's,或者如第二種情況那樣,由於集羣爲空,調用失敗。一個簡單的隨機循環,以防在clusterassign函數中出現這種情況應該解決這個問題。

#to check for and remove empty clusters 
    if (length(unique(clustervector))==1){ 
     repeat{ clustervector<-NULL 
     for (i in 1:nrow(dataframe)){ 
     add<-sample(c(1,2), 1) 
     clustervector<-c(clustervector, add) 
     } 
     if (length(unique(clustervector))==2){break} 
     } 
    }