I###################################################
### chunk number 1: data
###################################################
#this code was originally integrated as part of the LaTeX file that created the paper 'A non-Euclidean clustering method using relational data' by Matthew J. Vavrek. 
#The following is released under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported license
#full terms of the license can be found at: http://creativecommons.org/licenses/by-nc-sa/3.0/

#This code was tested on an OS X 10.10 distribution running R version 3.2.2

#To set up the environment, be sure that you have the following packages installed:
#-fossil
#-ecodist
#-vegan
#-doMC
#-multicore
#-foreach

#please note that some of this code (i.e. the simulated data section) may not run on a Windows computer, as the doMC package does not support Windows machines at this time. To run this section in parallel on a Windows computer, you can use the doSMP package instead. Please consult the doSMP package page at http://cran.r-project.org/web/packages/doSMP/ for full instructions on setting up a machine with doSMP.

#relational clustering paper
library(fossil)
library(ecodist)
library(vegan)
#Enable parallel processing
library(doMC)
registerDoMC()
options(cores=4)


###################################################
### chunk number 2: multiclust
###################################################
#note that the 'iters' object below refers to the number of simulations per level of endemism; for the data in the paper, 'iters' was set to 1000, but for this supplementary data it is set to 3, as computation time for 1000 replicates on a quad core CPU workstation is approximately 10 hours
iters <- 1000
clust.comp.end<-foreach(j=seq(0.02, 0.6, 0.02), .combine='rbind') %dopar% {
  print(j)
  clust.mat<-matrix(0,iters,6)
    for (i in 1:iters) {
        #create a simulated matrix to cluster
        sm<-sim.occ(endemics=j)
        #the original groupings
        orig<-c(rep(1,30), rep(2,30), rep(3,30))
        #make a distance matrix
        sm.dist<-ecol.dist(sm, method = sorenson)
        #UPGMA custering - needs to be put in explicit groups using treecut()
        sm.up<-hclust(sm.dist, 'average')
        sm.up3<-cutree(sm.up, 3)
        #kmeans clustering
        smt<-t(sm)
        sm.cap.euc<-capscale(smt~1, dist="euclidean", metaMDS = TRUE)
        sm.cap.ca.euc<-sm.cap.euc$CA
        sm.k.euc <- kmeans(sm.cap.ca.euc$Xbar, 3, nstart=100)
        sm.cap<-capscale(smt~1, dist="bray", metaMDS = TRUE)
        sm.cap.ca<-sm.cap$CA
        sm.k <- kmeans(sm.cap.ca$Xbar, 3, nstart=100)
        #complete linkage
        sm.com<-hclust(sm.dist)
        sm.com3<-cutree(sm.com, 3)
        #single linkage
        sm.sin<-hclust(sm.dist, 'single')
        sm.sin3<-cutree(sm.sin, 3)
        #relational clustering
        sm.rc<-rclust(sm.dist, 3, 20)
        #sm.nm<-nmds(sm.dist, mindim=2, nits=1)
        clust.mat[i,1]<-adj.rand.index(orig, sm.up3)
        clust.mat[i,2]<-adj.rand.index(orig, sm.k.euc$cluster)
        clust.mat[i,3]<-adj.rand.index(orig, sm.k$cluster)
        clust.mat[i,4]<-adj.rand.index(orig, sm.com3)
        clust.mat[i,5]<-adj.rand.index(orig, sm.sin3)
        clust.mat[i,6]<-adj.rand.index(orig, sm.rc)
    }   
    clust.res<-matrix(0,2,6)
    clust.res[1,]<-apply(clust.mat, 2, function(x) mean(x))
    clust.res[2,]<-apply(clust.mat, 2, function(x) sd(x))
    clust.res<-round(clust.res, 3)
    clust.int<-c(j,clust.res)
}


###################################################
### chunk number 3: fig1
###################################################
#to create a multi tile figure that shows how the level of endemicity in the simulation affects the clstering of the different sites, at least in non-Euclidean space with an NMDS
pdf('figure1.pdf')
oldmar<-par()$mar
#the affects the layout of the figure
layout(matrix(25:1, 5, 5, byrow=TRUE))
#map with mst
par(mar=c(0,0,0,0))
orig<-c(rep(1,30), rep(2,30), rep(3,30))
for (k in seq(0.02, 0.5, 0.02)) {
    a<-nmds(ecol.dist(sim.occ(endemics=k)), mindim=2, nits=1)
    plot(a$conf[[1]], pch=orig, xaxt='n', yaxt='n', ylim=c(-0.4, 0.4), xlim=c(-0.4, 0.4), col=orig+1)
    text(-0.25, 0.37, labels=paste('e=', k, sep=''), cex=1.5)
}
dev.off()

###################################################
### chunk number 4: fig2
###################################################
pdf('figure2.pdf')
par(mar=oldmar)
plot(clust.comp.end[,1:2], type='n', xlim = c(0.6, 0), ylim = c(-0.1, 1.1), ylab = 'Adjusted Rand Index', xlab = 'Relative Endemicity')
lines(clust.comp.end[,1], clust.comp.end[,2], col=1)
points(clust.comp.end[,1],clust.comp.end[,2], pch=1, col=1)
lines(clust.comp.end[,1], clust.comp.end[,4], col=2)
points(clust.comp.end[,1],clust.comp.end[,4], pch=2, col=2)
lines(clust.comp.end[,1], clust.comp.end[,6], col=3)
points(clust.comp.end[,1],clust.comp.end[,6], pch=3, col=3)
lines(clust.comp.end[,1], clust.comp.end[,8], col=4)
points(clust.comp.end[,1],clust.comp.end[,8], pch=4, col=4)
lines(clust.comp.end[,1], clust.comp.end[,10], col='steelblue')
points(clust.comp.end[,1],clust.comp.end[,10], pch=5, col='steelblue')
lines(clust.comp.end[,1], clust.comp.end[,12], col='mediumvioletred')
points(clust.comp.end[,1],clust.comp.end[,12], pch=6, col='mediumvioletred')
legend(x='bottomleft', c('UPGMA', 'k-means/E', 'k-means/NE', 'Complete Linkage', 'Single Linkage', 'NERC'), pch=1:6, col=c(1:4,'steelblue','mediumvioletred'))
dev.off()

###################################################
### chunk number 4: sparse-data
###################################################

iters <- 1000
clust.comp.samp<-foreach(j=seq(0.2, 3, 0.1), .combine='rbind') %dopar% {
  print(j)
  clust.mat<-matrix(0,iters,6)
    for (i in 1:iters) {
        #create a simulated matrix to cluster
        sm<-sim.occ(endemics=0.2, avg.abund=j)
        #the original groupings
        orig<-c(rep(1,30), rep(2,30), rep(3,30))
        #make a distance matrix
        sm.dist<-ecol.dist(sm, method = sorenson)
        #UPGMA custering - needs to be put in explicit groups using treecut()
        sm.up<-hclust(sm.dist, 'average')
        sm.up3<-cutree(sm.up, 3)
        #kmeans clustering
        smt<-t(sm)
        sm.cap.euc<-capscale(smt~1, dist="euclidean", metaMDS = TRUE)
        sm.cap.ca.euc<-sm.cap.euc$CA
        sm.k.euc <- kmeans(sm.cap.ca.euc$Xbar, 3, nstart=100)
        sm.cap<-capscale(smt~1, dist="bray", metaMDS = TRUE)
        sm.cap.ca<-sm.cap$CA
        sm.k <- kmeans(sm.cap.ca$Xbar, 3, nstart=100)
        #complete linkage
        sm.com<-hclust(sm.dist)
        sm.com3<-cutree(sm.com, 3)
        #single linkage
        sm.sin<-hclust(sm.dist, 'single')
        sm.sin3<-cutree(sm.sin, 3)
        #relational clustering
        sm.rc<-rclust(sm.dist, 3, 20)
        #sm.nm<-nmds(sm.dist, mindim=2, nits=1)
        clust.mat[i,1]<-adj.rand.index(orig, sm.up3)
        clust.mat[i,2]<-adj.rand.index(orig, sm.k.euc$cluster)
        clust.mat[i,3]<-adj.rand.index(orig, sm.k$cluster)
        clust.mat[i,4]<-adj.rand.index(orig, sm.com3)
        clust.mat[i,5]<-adj.rand.index(orig, sm.sin3)
        clust.mat[i,6]<-adj.rand.index(orig, sm.rc)
    }   
    clust.res<-matrix(0,2,6)
    clust.res[1,]<-apply(clust.mat, 2, function(x) mean(x))
    clust.res[2,]<-apply(clust.mat, 2, function(x) sd(x))
    clust.res<-round(clust.res, 3)
    clust.int<-c(j,clust.res)
}
        
    
###################################################
### chunk number 4: fig2
###################################################
pdf('figure3.pdf')
plot(clust.comp.samp[,1:2], type='n', xlim = c(3.2,0), ylim = c(-0.1, 1.1), ylab = 'Adjusted Rand Index', xlab = 'Sampling Intensity', xaxt='n')
lines(clust.comp.samp[,1], clust.comp.samp[,2], col=1)
points(clust.comp.samp[,1],clust.comp.samp[,2], pch=1, col=1)
lines(clust.comp.samp[,1], clust.comp.samp[,4], col=2)
points(clust.comp.samp[,1],clust.comp.samp[,4], pch=2, col=2)
lines(clust.comp.samp[,1], clust.comp.samp[,6], col=3)
points(clust.comp.samp[,1],clust.comp.samp[,6], pch=3, col=3)
lines(clust.comp.samp[,1], clust.comp.samp[,8], col=4)
points(clust.comp.samp[,1],clust.comp.samp[,8], pch=4, col=4)
lines(clust.comp.samp[,1], clust.comp.samp[,10], col='steelblue')
points(clust.comp.samp[,1],clust.comp.samp[,10], pch=5, col='steelblue')
lines(clust.comp.samp[,1], clust.comp.samp[,12], col='mediumvioletred')
points(clust.comp.samp[,1],clust.comp.samp[,12], pch=6, col='mediumvioletred')
legend(x='bottomleft', c('UPGMA', 'k-means/E', 'k-means/NE', 'Complete Linkage', 'Single Linkage', 'NERC'), pch=1:6, col=c(1:4,'steelblue','mediumvioletred'))
axis(1, at=seq(3, 0, -0.5))
dev.off()

save.image('rc.RData')