#This is the R code for "Enforced Symmetry: The Necessity of Symmetric Waxing and Waning"
#By Niklas Hohmann* and Emilia Jarochowska
#*Corresponding author, Email: Niklas.Hohmann@fau.de

#The raw data necessary for the examples 1 and 2 in the Appendix and figure 1 in the main text
#is available on the Open Science Framework (OSF) under osf.io/zw5ef/
#For these part of the code to work, please download it and place it in you working directory
#make sure to set your working directory (wd) right!!!
#setwd("insert your working directory with the raw data here") #if necessary uncomment and set the wd

###################################
#Figures from the main text
###################################
### generate fig. 1: Explaining  the procedure of data processing
par(mfrow=c(1,5))
par(mar=c(3,3,2,1))
par(mgp=c(1.4,0.5,0))
x=seq(0,1.5,length.out=1000)
x1=c(0.2,1,1.2,1.5)
y1=c(0,1,4,0)
m1=c(1,0,0,0)
f1=splinefunH(x1,y1,m1)
plot(NULL,xlim=c(0,1.5),ylim=c(0,4),ylab='MESs',xlab='Time',cex.lab=1.3)
lines(seq(0.2,1.5,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red')
x2=c(0,0.2,0.8)
y2=c(0,2,0)
m2=c(1,0,-0.1)
f2=splinefunH(x2,y2,m2)
lines(seq(0,0.8,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue')
legend('topleft',legend=c('Taxon 1','Taxon 2'),col=c('red',"blue"),lwd=6,bty='n')

###
plot(NULL,xlim=c(0,1.5),ylim=c(0,4),main='x-shift',ylab='MESs',xlab='Taxon Age',cex.lab=1.3)
lines(seq(0,1.3,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red')
lines(seq(0,0.8,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue')

###
plot(NULL,xlim=c(0,1.5),main='x-scale',ylim=c(0,4),ylab='MESs',xlab='Relative Taxon Age',cex.lab=1.3)
lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red')
lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue')

###
plot(NULL,xlim=c(0,1),main='y-scale',ylim=c(0,1.2),ylab='Relative MESs',xlab='Relative Taxon Age',cex.lab=1.3)
lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100))/4,lwd=6,col='red')
lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100))/2,lwd=6,col='blue')

###
plot(NULL,xlim=c(0,1),main='Averaging',ylim=c(0,1.2),ylab='Relative MESs',xlab='Relative Taxon Age',cex.lab=1.3)
c1=rgb(1,0,0,0.6)
c2=rgb(0,0,1,0.6)
lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100))/4,lwd=6,col=c1)
lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100))/2,lwd=6,col=c2)
lines(seq(0,1,length.out = 100),0.5*(f1(seq(0.2,1.5,length.out = 100))/4 + f2(seq(0,0.8,length.out = 100))/2),lwd=6)
legend('topleft',legend=c('Average'),col=c('black'),lwd=6,bty='n')


#### Figure 2: The decomposition of a function into symmetric and asymmetric part
x=c(-2,0.5,2)
y=c(1,4,0)
m=c(0,0,0)
f=splinefunH(x,y,m)
x1=seq(-2,2,length.out = 1000)
y1=f(x1)

par(mfrow=c(1,2))
par(mar=c(3,2.3,0.5,0.5))
par(mgp=c(1.2,0.5,0))
plot(x1,y1,xlab='x',ylab='f(x)',ylim=c(0,4.85),type='l',lwd=6)
ysym=pmin(y1,rev(y1))
plot(NULL,xlim=c(-2,2),ylim=c(0,4.85),xlab = 'x',ylab='f(x)')
polygon(c(-2,x1),c(0,y1),col='blue',border='blue')
polygon(c(-2,x1),c(0,ysym),col='red',border=TRUE,lwd=2)
legend('topleft',fill=c('red','blue'),border=c('black',NA),legend=c('Symmetric part','Asymmetric part'),bty='n')

############# Figure 3: Plot a number of trajectories including theor symmetric/asymmetric part
#requires binlist and/or binlist1 from the examples in the appendix (see below)
n=9
par(mfrow=c(3,3))
par(mar=c(3,2.1,1,1))
par(mgp=c(1.3,0.4,0))
l=sample.int(length(binlist),size=n,replace = FALSE)
for( i in 1:n){
  pp=binlist[[l[i]]]
  val=matrix(c(pmin(pp,rev(pp)),pp-pmin(pp,rev(pp))),nrow=2,byrow=TRUE)
  barplot(val,col=c('red','blue'),space=0,xlab='Relative Age',ylab='Rescaled Occ.')
  axis(1,at=c(0,10),labels=c(0,1))
}

############## Figure 4: asymmetry of radiolarians and forams
#requires binlist and/or binlist1 from example 1 and 2 in the example
#joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories
symnorm=function(v){  #QuaSy for bbinned data
  return(sum(abs(v-pmin(v,rev(v)))))
}

#determine distribution of asymmetry for all foraminifera
symval=rep(0,length(binlist)) #initialize storage
for (i in 1:length(binlist)){ #determine QuAsy for all forams
  symval[i]=0.1*symnorm(binlist[[i]])
}
hist(symval,xlab='Asymmetry',main='Foraminifera from example 1') #plot distribution of asymmetry

#determine distribution of asymmetry for all radiolaria
symval1=rep(0,length(binlist1)) #initialize storage
for (i in 1:length(binlist1)){ 
  symval1[i]=0.1*symnorm(binlist1[[i]]) #determine QuAsy
}
hist(symval1,xlab='Asymmetry',main='Radiolarians from example 2') #plot distribution of asymmetry

##### Nicer Figures #####
#asymmetry of radiolarians and formas
#joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories
#create figure 1 in the main text
par(mfrow=c(1,2))
symofaveragesforam=0.1*symnorm(Reduce("+",binlist)/length(binlist)) #asymmetry of the averaged trajectories
averageofsymforam=mean(symval) #average of the asymmetry of the trajectories
hist(symval,xlab='Asymmetry (QuaSy)',main='Foraminifera')
lines(c(symofaveragesforam,symofaveragesforam),c(0,75),col='red',lwd=4,lty=2)
lines(c(averageofsymforam,averageofsymforam),c(0,75),col='blue',lwd=4,lty=2)

hist(symval1,xlab='Asymmetry (QuaSy)',main='Radiolaria')
symofaveragesrad=0.1*symnorm(Reduce("+",binlist1)/length(binlist1)) #asymmetry of the averaged trajectories
averageofsymrad=mean(symval) #average of the asymmetry of the trajectories
lines(c(symofaveragesrad,symofaveragesrad),c(0,90),col='red',lwd=4,lty=2)
lines(c(averageofsymrad,averageofsymrad),c(0,90),col='blue',lwd=4,lty=2)
par(mfrow=c(1,1))

#get proportion of species that have asymmetry lowrer than the asymmetry of the average trajectory
rt=length(symval[symval <=symofaveragesforam])/length(symval)
rt1=length(symval1[symval1 <=symofaveragesrad])/length(symval1)

#### Figure 5 and 6 random walks

#################################
#Random Walk figures for the main text
################################
#This code generates the figures for the part "Symmetry by Conditioning" of the main text
#this requires the function "symnorm" defined in the section "Appendix 1: Measuring asymmetry"

#ordinary random walk
# starst at time 0 at point 0! just steps after that are recorded!
n=10 #no of steps
pup=0.5 #Probability to go UP
getpath=function(n,pup){ 
  l=rbinom(n,1,pup)
  l[l==0]=-1
  return(cumsum(l))
}

#random walk conditioned to be positive. here the starting point at zero is included in the output
getcondpospath=function(n,pup){
  repeat{
    path=c(getpath(n,pup))
    if (min(path)>0){
      break
    }
  }
  return(c(0,path))
}
getcondpospath(n,0.4)

#random walk conditioned to be positive and hit zero after n steps
getcondpath=function(n,pup){
  repeat{
    path=c(getpath(n,pup))
    if (min(path[1:(n-1)])>0 & path[n]==0){
      break
    }
  }
  return(c(0,path))
}
getcondpath(10,0.4)

########generate data for the plot with the distributin of asymmetry for unconditioned/conditioned random walks
noofpaths=10000 #number of simulations to approx the distribution
n=20 #conditioned to go extinct after 20 time steps
pup=0.6 #probability of the random walk to increase 
intval1=rep(0,noofpaths) #initialize storage
#condiitoned positive paths
for (i in 1:noofpaths){
  path=getcondpospath(n,pup)
  asym= path-pmin(path,rev(path))
  intval1[i]=integrate(approxfun(0:n,asym),0,n)[[1]]
}
hist(intval1) #distribution of asymmetry for paths conditoned to be positive

#condiitoned positive paths that go extinct after n steps
intval2=rep(0,noofpaths)
for (i in 1:noofpaths){
  path=getcondpath(n,pup)
  asym= path-pmin(path,rev(path))
  intval2[i]=integrate(approxfun(0:n,asym),0,n)[[1]]
  if(i %%100==0){
    print(i)
  }
}
hist(intval2) #distribution of asymmetry for paths conditoned to be positive AND go extinct after n steps

##get asymmetry of a path that is the average over many paths
opath=rep(0,n+1) #initialize storage
#get averaged path
for (i in 1:noofpaths){
  opath=getcondpath(n,pup)+opath
}
opath=opath/noofpaths
#get asymmetrical part of the averaged path
asymopath=opath-pmin(opath,rev(opath))
#determine 1- norm of the asymmetric part to determine the QuAsy of the averaged path
ll=integrate(approxfun(0:n,asymopath),0,n)[[1]]

###Generate figure no.  6 in the main text
col1=rgb(1,0,0,0.8)
col2=rgb(0,0,1,0.5)
hist(intval2,col=col1,freq=FALSE,xlim=c(0,max(intval1)),xlab='Asymmetry',main='Asymmetry of Random Walks')
hist(intval1,col=col2,freq=FALSE,add=TRUE)
points(ll,0,cex=1.5,pch=19)
legend('topright',col=c(col1,col2,'black'),pch=c(NA,NA,19),legend=c('Positive Random walk','Positive Random Walk + Extinction','Averaged Positive Random Walk + Extinction'),bty='n',fill=c(col2,col1,NA),border=c(NA,NA,NA))

###Generate figure no. 5 in the main text.
plot(NULL,xlim=c(0,20),ylim=c(0,12),xlab='Time steps',ylab='',main='Behaviour of Random Walks')
lines(0:20,getcondpospath(n,pup),col=rgb(0,0,1,1),lwd=4)
lines(0:20,getcondpath(n,pup),col=rgb(1,0,0,1),lwd=4)
lines(0:20,getcondpospath(n,pup),col=rgb(0,0,1,0.5),lwd=8)
lines(0:20,getcondpath(n,pup),col=rgb(1,0,0,0.5),lwd=8)
lines(0:20,opath,lwd=3)


###################################
#Appendix 3.1: Example 1: Testing the Age-Area Hypothesis
###################################
#This is the code for the example 1 in appendix 3.
#Make sure the raw data necessary is loaded (see comments at the very top)

#### get empirical data to test the age-area hypotheses against ####
noofbins=10 #number of bins in which occurrences will be binned 
binborders=seq(0,1,length.out = noofbins+1) #borders of the bins
te1=read.table("02sept2018foram.csv",sep="\t",header=TRUE) #import empirical data
#initialize storage
species=numeric()
age=numeric()
di=dim(te1)
#extract ages of occurrences and the corresponding species
for (k in 1:(dim(te1)[1])){ 
  ll=te1[k,]
  species=c(species,as.character(ll$Resolved.Species))
  age=c(age,ll$Age..Ma..Gradstein.et.al..2012)
  if (k%%1000==0){ #to demonstrate progress
    print(paste(as.character(round((k/di[1])*100)),"%"))
  }
}
rm(te1) #remove imported data
#create species list
uniquespecies=species[!duplicated(species)] 

#for each species, extract the agse of occurrences
agelist=list()
for (k in 1:length(uniquespecies)){ 
  l=which(species==uniquespecies[k])
  agelist[[k]]=sort(age[l])
}

#normalize occurrences for first occurrence to be at 0 and last at 1 (x-shift and x-scale)
normageslist=list()
occ=numeric()
i=1
for (k in 1:length(agelist)){ 
  if (length(agelist[[k]])>1){ #ignore species with only one occurrences
    occ=c(occ,length(agelist[[k]]))
    mi=min(agelist[[k]])
    ma=max(agelist[[k]])
    normageslist[[i]]=(agelist[[k]]-mi)/(ma-mi)
    i=i+1
  }
  else{
  }
}
#some info about the data
hist(occ) #histogram
length(occ) # total no of species
sum(occ)#overall number of occurernces
mean(occ) #average no of occurences
median(occ) # medianoccurences

#bin each species into the bins specified at the very beginning
binlist=list()
for (k in 1:length(normageslist)){
  binval=rep(0,noofbins)
  age=normageslist[[k]]
  for (i in 1:noofbins){
    binval[i]=length(age[age>= binborders[i] & age < binborders[i+1]])
  }
  binval[noofbins]=binval[noofbins]+1
  binlist[[k]]=binval/(sum(binval*diff(binborders)))
}

######### simulated trajectories under the age-are hypothesis #########
#subroutine to simulate path of a brownian motion between 0 and 1
#based on https://en.wikipedia.org/wiki/Wiener_process#Wiener_representation
brownianpath=function(x,approxqual){ 
  #x is where the position of the path will be determined, approxqual determines the quality of the approximation
  out=rep(0,length(x)) #initialize memory
  for (k in 1:approxqual){ 
    out=(sqrt(2)*rnorm(1)/((k-0.5)*pi))*(sin((k-0.5)*pi*x))+out
  }
  return(out)
}

#subroutine: monte carlo integration for highly irregular functions (standard implementation of integration in R tends to fail for paths of the brownian motion)
mcint=function(fun,leftlim,rightlim,runs,funmax){ 
  #fun is a positive function to integrate, leftlim is the left integration limit, rightlim is the right integration limit
  #runs is the number of points used for the monte carlo approach
  #funmax is a number that needs to be higher than the maximum value of fun in the integration interval for the method to work
  xvals=runif(runs,leftlim,rightlim) #determine uniformly distributed x values
  yvals=runif(runs,0,funmax) #determine uniformly distributed y values
  intval=length(xvals[fun(xvals)>yvals])*((rightlim-leftlim)*funmax)/runs #the value of the integral is proportional to the number of points under the function, divided by the overall number of points
  return(intval)
}

noofsamples=length(binlist) #number of artificial trajectories that will be simulated
m=5 #gradient, determines how strong the drift is
x=seq(0,1,length.out = 1000)#no of points where the brownian motion is determined
approxqual=1000 #quality of the approximation of the brownian motion
artificialsamplelist=list() #stroage for the simulated trajectories
#Simulate the trajectories and bin them
for (k in 1:noofsamples){
  repeat{
    h=brownianpath(x,approxqual)+m*x #get path of the brownian motion plus drift
    if (min(h[1:10])>=0){ #if the path hits zero too early, repeat. The number 10 is chosen arbitrarily
      break
    }
  }
  if (length(h[h<0])>0){ #if the species goes extinct before time 1:
    o=min(which(h<0)) #determine where it goes extinct
    h=approx(seq(0,1,length.out=o),c(h[1:(o-1)],0),xout=x)[[2]] #rescale the trajectory to go extinct at one
  }
  else{
  }
  #plot(x,h,type='l')
  #binning of the trajectories
  binvals=rep(0,noofbins) #initialize memory
  ma=max(h)
  fun=approxfun(x,h) #function to be integrated
  for (l in 1:noofbins){ #determine the values for the bins. they correspond to the area under the trajectory of the brownian motion
    binvals[l]=mcint(fun,binborders[l],binborders[l+1],10^6,ma) #integrate with the monte carlo method from above to determine the value of the l-th bin
  }
  binvals=binvals/(sum(binvals*diff(binborders))) #normate the bins to have area 1
  artificialsamplelist[[k]]=binvals
  if (k%%10==0){
    print(paste(as.character(round((k/noofsamples)*100)),"%"))
  }
}
#artificial sample/trajectory
str(artificialsamplelist)
length(artificialsamplelist) #number of artificial samples generaed

############# Perform the test ##########
###transform simulated and empirical data into format suitable for input into the cramer test
realdatavals=numeric()
artdatavals=numeric()
for(k in 1:length(binlist)){
  realdatavals=c(realdatavals,binlist[[k]])
  artdatavals=c(artdatavals,artificialsamplelist[[k]])
}
str(realdatavals)
realmatrix=matrix(realdatavals,ncol=noofbins,byrow=TRUE)
artmatrix=matrix(artdatavals,ncol=noofbins,byrow=TRUE)
str(realmatrix)

#finally: test
require(cramer) #load the package

results=cramer.test(realmatrix,artmatrix,conf.level=0.95,replicates=10^4) #run the test
str(results) #show results


##########################################
#Appendix 3.2 Example 2: Testing Reversibility
##########################################
#This is the code for the example 2 in appendix 3.
#Make sure the raw data necessary is loaded (see comments at the very top)

#import downloaded data
te1=read.table("02sept2018rad.csv",sep="\t",header=TRUE)
noofbins=10 #number of bins in which occurrences will be binned 
binborders=seq(0,1,length.out = noofbins+1) #borders of the bins
#initialize storage
species=numeric()
age=numeric()
di=dim(te1)
#extract ages of occurrences and the corresponding species
for (k in 1:(dim(te1)[1])){ 
  ll=te1[k,]
  species=c(species,as.character(ll$Resolved.Species))
  age=c(age,ll$Age..Ma..Gradstein.et.al..2012)
  if (k%%1000==0){ #to demonstrate progress
    print(paste(as.character(round((k/di[1])*100)),"%"))
  }
}
rm(te1) #remove imported data
#create species list
uniquespecies=species[!duplicated(species)] 

#for each species, extract the agse of occurrences
agelist=list()
for (k in 1:length(uniquespecies)){ 
  l=which(species==uniquespecies[k])
  agelist[[k]]=sort(age[l])
}

#normalize occurrences for first occurrence to be at 0 and last at 1 (x-shift and x-scale)
normageslist=list()
occ=numeric()
i=1
for (k in 1:length(agelist)){ 
  if (length(agelist[[k]])>1){ #ignore species with only one occurrences
    occ=c(occ,length(agelist[[k]]))
    mi=min(agelist[[k]])
    ma=max(agelist[[k]])
    normageslist[[i]]=(agelist[[k]]-mi)/(ma-mi)
    i=i+1
  }
  else{
  }
}
#some info about the data
hist(occ) #histogram
length(occ) # total no of species
sum(occ)#overall number of occurernces
mean(occ) #average no of occurences
median(occ) # medianoccurences

#bin each species into the bins specified at the very beginning
binlist1=list()
for (k in 1:length(normageslist)){
  binval=rep(0,noofbins)
  age=normageslist[[k]]
  for (i in 1:noofbins){
    binval[i]=length(age[age>= binborders[i] & age < binborders[i+1]])
  }
  binval[noofbins]=binval[noofbins]+1
  binlist1[[k]]=binval/(sum(binval*diff(binborders)))
}

###testing procedure ####
require(cramer)
#number of tests
nooftests=1000 
#initialize storage
res=rep(0,nooftests)
pvals=rep(0,nooftests)
resultlist=list()
#run tests
for (w in 1:nooftests){ 
  #split into two samples
  xvals=numeric()
  yvals=numeric()
  for (k in 1:length(binlist1)){ #assign trajectories to one of the samples for the test
    p=runif(1) #fair coin flip
    if (p>0.5){ 
      xvals=c(xvals,binlist1[[k]])
    }
    else{
      yvals=c(yvals,binlist1[[k]])
    }
  }
  #transform into input format for the test
  xmat=matrix(xvals,ncol=noofbins,byrow=TRUE) 
  ymat=matrix(rev(yvals),ncol=noofbins,byrow=TRUE) #reverse trajectories by using rev
  #run the test
  results=cramer.test(xmat,ymat,conf.level=0.95) 
  #store results
  resultlist[[w]]=results
  res[w]=results$result
  pvals[w]=results$p.value
  print(w)
}
#histogram of the results (figure 2 in the appepdix)
hist(pvals,main='p-values for testing reversibility',xlab='p-value')
#some descriptive parameters
mean(pvals)
median(pvals)
max(pvals)

##############################################
#Appendix 1: Measuring asymmetry
#############################################
#In this section, the QuAsy as described in appendix 1 is implemented
#It is then demonstrated by creating FIg. ??? in the main text.
#This requires to have
#(1) the examples from Appendix 3.1 and 3.2 executed to the point where the variables "binlist" and "binlist1" have been filled with data (no simulation of the artificial trajectories or testing procedure required)
#(2) this requires the raw data (see comments at the top)
#define QuAsy ||x ||_{asy} for binned data/vectors x as defined in appendix 1.
symnorm=function(v){ 
  return(sum(abs(v-pmin(v,rev(v)))))
}

#determine distribution of asymmetry for all foraminifera
symval=rep(0,length(binlist)) #initialize storage
for (i in 1:length(binlist)){ #determine QuAsy for all forams
  symval[i]=0.1*symnorm(binlist[[i]])
}
hist(symval,xlab='Asymmetry',main='Foraminifera from example 1') #plot distribution of asymmetry

#determine distribution of asymmetry for all radiolaria
symval1=rep(0,length(binlist1)) #initialize storage
for (i in 1:length(binlist1)){ 
  symval1[i]=0.1*symnorm(binlist1[[i]]) #determine QuAsy
}
hist(symval1,xlab='Asymmetry',main='Radiolarians from example 2') #plot distribution of asymmetry

##### Nicer Figures #####
#asymmetry of radiolarians and formas
#joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories
#create figure 1 in the main text
par(mfrow=c(1,2))
symofaveragesforam=0.1*symnorm(Reduce("+",binlist)/length(binlist)) #asymmetry of the averaged trajectories
averageofsymforam=mean(symval) #average of the asymmetry of the trajectories
hist(symval,xlab='Asymmetry',main='Foraminifera')
points(symofaveragesforam,0,col='red',lwd=15)
points(averageofsymforam,0,col='blue',lwd=15)

hist(symval1,xlab='Asymmetry',main='Radiolaria')
symofaveragesrad=0.1*symnorm(Reduce("+",binlist1)/length(binlist1)) #asymmetry of the averaged trajectories
averageofsymrad=mean(symval) #average of the asymmetry of the trajectories
points(symofaveragesrad,0,col='red',lwd=15)
points(averageofsymrad,0,col='blue',lwd=15)
par(mfrow=c(1,1))

#get proportion of species that have asymmetry lowrer than the asymmetry of the average trajectory
rt=length(symval[symval <=symofaveragesforam])/length(symval)
rt1=length(symval1[symval1 <=symofaveragesrad])/length(symval1)

### show decomposition of a function into symmetric and asymmetric part
#this creates fig. 1 in the appendix
x=seq(-2,2,length.out=100)
par(mfrow=c(1,2))
#plot function to be decomposed
plot(x,sin(x),main='Original function',type='l',lwd='6',ylab='y',ylim=c(-1,2))
legend('topleft',lwd=6,legend='sin(x)',bty='n')
#plot decomposed parts (asymmetric and symmetric part)
plot(NULL,xlim=c(-2,2),ylim=c(-1,2),xlab='x',ylab='y',main='Decomposition')
lines(x,pmin(sin(x),sin(-x)),col='red',lwd=6)
lines(x,sin(x)-pmin(sin(x),sin(-x)),col='blue',lwd=6)
legend('topleft',col=c('red','blue'),lwd=6,legend=c('symmetric part','asymmetric part'),cex=0.7,bty='n')
par(mfrow=c(1,1))