#############CODE FOR CALCULATING PRIOR AGE FOR A CALIBRATED NODE#####################
### Special thanks to David Cannatella, The University of Texas at Austin for helping with this code###


library(MCMCpack)
library(ggplot2)

################################################
# Exponential Distribution for Calibration

# The single parameter of the exponential distribution is the scale (= mean) parameter.
# However, the R functions accept the rate, which is 1/mean
# Rate = 1/Mean
# Mean = 1/Rate = Beta = standard deviation
# Median = log(2)/Rate = log(2) * Mean
# log(2) = 0.6931472

# You may change the next two values as needed. But change nothing else!
myOffset <- 8.8  # e.g. Minimum age of node
myMedian <- 10.2  # This value includes the offset 

(myMean  <-  (myMedian - myOffset)/log(2))
myRate   <-  1/myMean

# Calculate percentiles and plot them
(pct95 <- qexp(0.95, rate = myRate) + myOffset)   
(pct975 <- qexp(0.975, rate = myRate) + myOffset)  
maxX <- pct975 + 20

df <- data.frame(x = c(0 + myOffset, maxX))
ggplot(df, aes(x)) +
    xlim(0, 100) +
    stat_function(fun = dexp, args = list(rate = 1/myMean), 
        color = "black", n = 100, xlim = (c(myOffset, maxX))) +
    xlab("millions of years") +
    scale_x_continuous(breaks = seq(-10, maxX, by = 10)) +
    geom_vline(aes(xintercept = pct975), col = "red", lty = "dashed") +
    geom_vline(aes(xintercept = pct95), col = "green", lty = "dashed") +
    geom_vline(aes(xintercept = myMean + myOffset), col = "blue", lty = "dashed") +
    geom_vline(aes(xintercept = myMedian), col = "orange", lty = "dashed") +
    geom_vline(aes(xintercept = myOffset), col = "darkgray", lty = "dashed") +
    geom_label(x = pct975, y = .015, label = "97.5%", size = 3) +
    geom_label(x = pct95, y = .015, label = "95%", size = 3) +
    geom_label(x = myMean + myOffset, y = .015, label = "mean", size = 3) +
    geom_label(x = myMedian, y = .01,  label = "median (50%)", size = 3) +
    geom_label(x = myOffset, y = .005,  label = "Offset", size = 3) +
    # geom_label(x = maxX/2, y = .01,  label = paste("offsetexp(", myOffset, ",", signif(myMean + myOffset, digits = 3), ")"), size = 4)
    ggtitle(label = paste("offsetexp(", myOffset, ",", signif(myMean + myOffset, digits = 3), ")"))


#################################################################################################
###############CODE FOR CALCULATING EVOLUTIONARY RATE OF A TREE FOR CALIBRATION##################

## If you use this code for another project, please cite the Gunnell et al. 2018 paper ##
##Gunnell, G.F., Boyer, D.M., Friscia, A.R., Heritage, S., Manthi, F.K., Miller, E.R., Sallam, H.M., Simmons, N.B., Stevens, N.J., Seiffert, E.R., 2018. Fossil lemurs from Egypt and Kenya suggest an African origin for Madagascar’s aye-aye. Nature communications, 9(1), p.3193. ####

# Load the package (after installation).
library(optimx)   # optimx seems better than R's default optim()
library(GenSA)    # GenSA seems better than optimx (but slower) on 5+ parameters, 
# seems to sometimes fail on simple problems (2-3 parameters)
library(FD)       # for FD::maxent() (make sure this is up-to-date)
library(snow)     # (if you want to use multicore functionality; some systems/R versions prefer library(parallel), try either)
library(parallel)
library(rexpokit)
library(cladoRcpp)


#############################################################

wd = np("~/Desktop/clockratepr")
setwd(wd)
getwd()
treefilename="mito_newick.nwk"
datesfilename="mito_dates.txt"
rootage=8.8

#############################################################

library(ape)
library(fitdistrplus)

#############################################################


# to extract path lengths, scale them, and write them to a csv file #

mytree=read.tree(treefilename)
mytips=as.data.frame(mytree$tip)
tipnumbers=as.numeric(rownames(mytips))
tipnames=as.matrix(mytips)
maxtipnumber=max(tipnumbers)
rootnumber=maxtipnumber+1
mymatrix=matrix(nrow=maxtipnumber,ncol=5)
colnames(mymatrix)=c("node #","taxon","path-length","tip-date","scaled-path")
mymatrix[,1]=tipnumbers
mymatrix[,2]=tipnames[,1]

mytable=dist.nodes(mytree)
torootdist_A=as.matrix(mytable[,rootnumber])
torootdist_B=as.matrix(torootdist_A[1:maxtipnumber,1])
mymatrix[,3]=torootdist_B[,1]

##
mydates=read.table(datesfilename)
mydates=as.matrix(mydates)
mydates=mydates[order(mydates[, "V1"]),]
mymatrix=mymatrix[order(mymatrix[, "taxon"]),]
mymatrix[,4]=mydates[,2]

pathlength=as.numeric(mymatrix[,3])
tipdate=as.numeric(mymatrix[,4])

count=1
repeat {
    mymatrix[count,5]=(pathlength[count])/((rootage)-(tipdate[count]))
    count=count+1
    if (count==maxtipnumber+1) break
}

filenameout_a="path.lengths."
filenameout_b=rootage
filenameout_c=".csv"
filenameout_d=paste(filenameout_a,filenameout_b,filenameout_c,sep="")
write.table(mymatrix,filenameout_d,row.names=F,na="",sep=",")


#############################################################


# to fit distributions and write-out a table with parameter values, likelihoods and BICs #

x=as.numeric(mymatrix[,5])


xfornormal=x*1000
fitted_normal=fitdist(xfornormal,"norm",method="mle")
a=as.matrix(unlist(summary(fitted_normal)))
names=rownames(a)
names=names[1:20]
a1=""
a2=a["estimate.mean",]
a3=a["estimate.sd",]
a4=a["method",]
a5=a["n",]
a6=a["loglik",]
a7=a["bic",]
names=c("NORMAL","estimate.mean","estimate.sd","method","n","loglik","bic")
normal_summary=matrix(,nrow=7,ncol=2)
normal_summary[1:7,1]=names
normal_summary[1,2]=a1
normal_summary[2,2]=a2
normal_summary[3,2]=a3
normal_summary[4,2]=a4
normal_summary[5,2]=a5
normal_summary[6,2]=a6
normal_summary[7,2]=a7

a22=as.numeric(a2)
a33=as.numeric(a3)
a2=a22/1000
a3=a33/1000
normal_summary[2,2]=a2
normal_summary[3,2]=a3


xforlognormal=x
fitted_lognormal=fitdist(xforlognormal,"lnorm",method=c("mle"))
b=as.matrix(unlist(summary(fitted_lognormal)))
names=rownames(b)
names=names[1:20]
b1=""
b2=b["estimate.meanlog",]
b3=b["estimate.sdlog",]
b4=b["method",]
b5=b["n",]
b6=b["loglik",]
b7=b["bic",]
names=c("LOGNORMAL","estimate.meanlog","estimate.sdlog","method","n","loglik","bic")
lognormal_summary=matrix(,nrow=7,ncol=2)
lognormal_summary[1:7,1]=names
lognormal_summary[1,2]=b1
lognormal_summary[2,2]=b2
lognormal_summary[3,2]=b3
lognormal_summary[4,2]=b4
lognormal_summary[5,2]=b5
lognormal_summary[6,2]=b6
lognormal_summary[7,2]=b7


xforgamma=x
# fitted_gamma=fitdist(xforgamma,"gamma",method="mle",lower=c(0,0)) #
fitted_gamma=fitdist(xforgamma,"gamma",method="mle")
c=as.matrix(unlist(summary(fitted_gamma)))
names=rownames(c)
names=names[1:20]
c1=""
c2=c["estimate.shape",]
c3=c["estimate.rate",]
c4=c["method",]
c5=c["n",]
c6=c["loglik",]
c7=c["bic",]
names=c("GAMMA","estimate.shape","estimate.rate","method","n","loglik","bic")
gamma_summary=matrix(,nrow=7,ncol=2)
gamma_summary[1:7,1]=names
gamma_summary[1,2]=c1
gamma_summary[2,2]=c2
gamma_summary[3,2]=c3
gamma_summary[4,2]=c4
gamma_summary[5,2]=c5
gamma_summary[6,2]=c6
gamma_summary[7,2]=c7

normal_mean=as.numeric(normal_summary[2,2])
normal_sd=as.numeric(normal_summary[3,2])
lognormal_mean=as.numeric(lognormal_summary[2,2])
lognormal_sd=as.numeric(lognormal_summary[3,2])
gamma_shape=as.numeric(gamma_summary[2,2])
gamma_rate=as.numeric(gamma_summary[3,2])

#####
normal_summary
lognormal_summary
gamma_summary
#####

outputmatrix=matrix(,nrow=25,ncol=2)
blankcell=""
outputmatrix[8,1]=blankcell
outputmatrix[8,2]=blankcell
outputmatrix[9,1]=blankcell
outputmatrix[9,2]=blankcell
outputmatrix[17,1]=blankcell
outputmatrix[17,2]=blankcell
outputmatrix[18,1]=blankcell
outputmatrix[18,2]=blankcell
outputmatrix[1:7,1:2]=normal_summary[1:7,1:2]
outputmatrix[10:16,1:2]=lognormal_summary[1:7,1:2]
outputmatrix[19:25,1:2]=gamma_summary[1:7,1:2]

filenameout_e="fit.distributions."
filenameout_f=rootage
filenameout_g=".csv"
filenameout_h=paste(filenameout_e,filenameout_f,filenameout_g,sep="")
write.table(outputmatrix,filenameout_h,row.names=F,col.names=F,na="",sep=",")


labelx="red=normal, blue=lognormal, green=gamma"
labely=""
titledensity="empirical density & fit distributions"
titlehistogram="empirical data & fit distributions"
densityofdata=density(x)


pdf("density.pdf",width=5,height=5)
plot(densityofdata,main=titledensity,xlab=labelx,ylab=labely)
curve(dnorm(x,normal_mean,normal_sd),col="red",lwd=1,add=T)
curve(dlnorm(x,lognormal_mean,lognormal_sd),col="blue",lwd=1,add=T)
curve(dgamma(x,gamma_shape,gamma_rate),col="green",lwd=1,add=T)
dev.off()

pdf("histogram.pdf",width=5,height=5)
hist(x,main=titlehistogram,xlab=labelx,ylab=labely,breaks=25)
curve(dnorm(x,normal_mean,normal_sd),col="red",lwd=1,add=T)
curve(dlnorm(x,lognormal_mean,lognormal_sd),col="blue",lwd=1,add=T)
curve(dgamma(x,gamma_shape,gamma_rate),col="green",lwd=1,add=T)
dev.off()

###

quit("no")