#This is the R code for "Enforced Symmetry: The Necessity of Symmetric Waxing and Waning" #By Niklas Hohmann* and Emilia Jarochowska #*Corresponding author, Email: Niklas.Hohmann@fau.de #The raw data necessary for the examples 1 and 2 in the Appendix and figure 1 in the main text #is available on the Open Science Framework (OSF) under osf.io/zw5ef/ #For these part of the code to work, please download it and place it in you working directory #make sure to set your working directory (wd) right!!! #setwd("insert your working directory with the raw data here") #if necessary uncomment and set the wd ################################### #Figures from the main text ################################### ### generate fig. 1: Explaining the procedure of data processing par(mfrow=c(1,5)) par(mar=c(3,3,2,1)) par(mgp=c(1.4,0.5,0)) x=seq(0,1.5,length.out=1000) x1=c(0.2,1,1.2,1.5) y1=c(0,1,4,0) m1=c(1,0,0,0) f1=splinefunH(x1,y1,m1) plot(NULL,xlim=c(0,1.5),ylim=c(0,4),ylab='MESs',xlab='Time',cex.lab=1.3) lines(seq(0.2,1.5,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red') x2=c(0,0.2,0.8) y2=c(0,2,0) m2=c(1,0,-0.1) f2=splinefunH(x2,y2,m2) lines(seq(0,0.8,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue') legend('topleft',legend=c('Taxon 1','Taxon 2'),col=c('red',"blue"),lwd=6,bty='n') ### plot(NULL,xlim=c(0,1.5),ylim=c(0,4),main='x-shift',ylab='MESs',xlab='Taxon Age',cex.lab=1.3) lines(seq(0,1.3,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red') lines(seq(0,0.8,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue') ### plot(NULL,xlim=c(0,1.5),main='x-scale',ylim=c(0,4),ylab='MESs',xlab='Relative Taxon Age',cex.lab=1.3) lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100)),lwd=6,col='red') lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100)),lwd=6,col='blue') ### plot(NULL,xlim=c(0,1),main='y-scale',ylim=c(0,1.2),ylab='Relative MESs',xlab='Relative Taxon Age',cex.lab=1.3) lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100))/4,lwd=6,col='red') lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100))/2,lwd=6,col='blue') ### plot(NULL,xlim=c(0,1),main='Averaging',ylim=c(0,1.2),ylab='Relative MESs',xlab='Relative Taxon Age',cex.lab=1.3) c1=rgb(1,0,0,0.6) c2=rgb(0,0,1,0.6) lines(seq(0,1,length.out = 100),f1(seq(0.2,1.5,length.out = 100))/4,lwd=6,col=c1) lines(seq(0,1,length.out = 100),f2(seq(0,0.8,length.out = 100))/2,lwd=6,col=c2) lines(seq(0,1,length.out = 100),0.5*(f1(seq(0.2,1.5,length.out = 100))/4 + f2(seq(0,0.8,length.out = 100))/2),lwd=6) legend('topleft',legend=c('Average'),col=c('black'),lwd=6,bty='n') #### Figure 2: The decomposition of a function into symmetric and asymmetric part x=c(-2,0.5,2) y=c(1,4,0) m=c(0,0,0) f=splinefunH(x,y,m) x1=seq(-2,2,length.out = 1000) y1=f(x1) par(mfrow=c(1,2)) par(mar=c(3,2.3,0.5,0.5)) par(mgp=c(1.2,0.5,0)) plot(x1,y1,xlab='x',ylab='f(x)',ylim=c(0,4.85),type='l',lwd=6) ysym=pmin(y1,rev(y1)) plot(NULL,xlim=c(-2,2),ylim=c(0,4.85),xlab = 'x',ylab='f(x)') polygon(c(-2,x1),c(0,y1),col='blue',border='blue') polygon(c(-2,x1),c(0,ysym),col='red',border=TRUE,lwd=2) legend('topleft',fill=c('red','blue'),border=c('black',NA),legend=c('Symmetric part','Asymmetric part'),bty='n') ############# Figure 3: Plot a number of trajectories including theor symmetric/asymmetric part #requires binlist and/or binlist1 from the examples in the appendix (see below) n=9 par(mfrow=c(3,3)) par(mar=c(3,2.1,1,1)) par(mgp=c(1.3,0.4,0)) l=sample.int(length(binlist),size=n,replace = FALSE) for( i in 1:n){ pp=binlist[[l[i]]] val=matrix(c(pmin(pp,rev(pp)),pp-pmin(pp,rev(pp))),nrow=2,byrow=TRUE) barplot(val,col=c('red','blue'),space=0,xlab='Relative Age',ylab='Rescaled Occ.') axis(1,at=c(0,10),labels=c(0,1)) } ############## Figure 4: asymmetry of radiolarians and forams #requires binlist and/or binlist1 from example 1 and 2 in the example #joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories symnorm=function(v){ #QuaSy for bbinned data return(sum(abs(v-pmin(v,rev(v))))) } #determine distribution of asymmetry for all foraminifera symval=rep(0,length(binlist)) #initialize storage for (i in 1:length(binlist)){ #determine QuAsy for all forams symval[i]=0.1*symnorm(binlist[[i]]) } hist(symval,xlab='Asymmetry',main='Foraminifera from example 1') #plot distribution of asymmetry #determine distribution of asymmetry for all radiolaria symval1=rep(0,length(binlist1)) #initialize storage for (i in 1:length(binlist1)){ symval1[i]=0.1*symnorm(binlist1[[i]]) #determine QuAsy } hist(symval1,xlab='Asymmetry',main='Radiolarians from example 2') #plot distribution of asymmetry ##### Nicer Figures ##### #asymmetry of radiolarians and formas #joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories #create figure 1 in the main text par(mfrow=c(1,2)) symofaveragesforam=0.1*symnorm(Reduce("+",binlist)/length(binlist)) #asymmetry of the averaged trajectories averageofsymforam=mean(symval) #average of the asymmetry of the trajectories hist(symval,xlab='Asymmetry (QuaSy)',main='Foraminifera') lines(c(symofaveragesforam,symofaveragesforam),c(0,75),col='red',lwd=4,lty=2) lines(c(averageofsymforam,averageofsymforam),c(0,75),col='blue',lwd=4,lty=2) hist(symval1,xlab='Asymmetry (QuaSy)',main='Radiolaria') symofaveragesrad=0.1*symnorm(Reduce("+",binlist1)/length(binlist1)) #asymmetry of the averaged trajectories averageofsymrad=mean(symval) #average of the asymmetry of the trajectories lines(c(symofaveragesrad,symofaveragesrad),c(0,90),col='red',lwd=4,lty=2) lines(c(averageofsymrad,averageofsymrad),c(0,90),col='blue',lwd=4,lty=2) par(mfrow=c(1,1)) #get proportion of species that have asymmetry lowrer than the asymmetry of the average trajectory rt=length(symval[symval <=symofaveragesforam])/length(symval) rt1=length(symval1[symval1 <=symofaveragesrad])/length(symval1) #### Figure 5 and 6 random walks ################################# #Random Walk figures for the main text ################################ #This code generates the figures for the part "Symmetry by Conditioning" of the main text #this requires the function "symnorm" defined in the section "Appendix 1: Measuring asymmetry" #ordinary random walk # starst at time 0 at point 0! just steps after that are recorded! n=10 #no of steps pup=0.5 #Probability to go UP getpath=function(n,pup){ l=rbinom(n,1,pup) l[l==0]=-1 return(cumsum(l)) } #random walk conditioned to be positive. here the starting point at zero is included in the output getcondpospath=function(n,pup){ repeat{ path=c(getpath(n,pup)) if (min(path)>0){ break } } return(c(0,path)) } getcondpospath(n,0.4) #random walk conditioned to be positive and hit zero after n steps getcondpath=function(n,pup){ repeat{ path=c(getpath(n,pup)) if (min(path[1:(n-1)])>0 & path[n]==0){ break } } return(c(0,path)) } getcondpath(10,0.4) ########generate data for the plot with the distributin of asymmetry for unconditioned/conditioned random walks noofpaths=10000 #number of simulations to approx the distribution n=20 #conditioned to go extinct after 20 time steps pup=0.6 #probability of the random walk to increase intval1=rep(0,noofpaths) #initialize storage #condiitoned positive paths for (i in 1:noofpaths){ path=getcondpospath(n,pup) asym= path-pmin(path,rev(path)) intval1[i]=integrate(approxfun(0:n,asym),0,n)[[1]] } hist(intval1) #distribution of asymmetry for paths conditoned to be positive #condiitoned positive paths that go extinct after n steps intval2=rep(0,noofpaths) for (i in 1:noofpaths){ path=getcondpath(n,pup) asym= path-pmin(path,rev(path)) intval2[i]=integrate(approxfun(0:n,asym),0,n)[[1]] if(i %%100==0){ print(i) } } hist(intval2) #distribution of asymmetry for paths conditoned to be positive AND go extinct after n steps ##get asymmetry of a path that is the average over many paths opath=rep(0,n+1) #initialize storage #get averaged path for (i in 1:noofpaths){ opath=getcondpath(n,pup)+opath } opath=opath/noofpaths #get asymmetrical part of the averaged path asymopath=opath-pmin(opath,rev(opath)) #determine 1- norm of the asymmetric part to determine the QuAsy of the averaged path ll=integrate(approxfun(0:n,asymopath),0,n)[[1]] ###Generate figure no. 6 in the main text col1=rgb(1,0,0,0.8) col2=rgb(0,0,1,0.5) hist(intval2,col=col1,freq=FALSE,xlim=c(0,max(intval1)),xlab='Asymmetry',main='Asymmetry of Random Walks') hist(intval1,col=col2,freq=FALSE,add=TRUE) points(ll,0,cex=1.5,pch=19) legend('topright',col=c(col1,col2,'black'),pch=c(NA,NA,19),legend=c('Positive Random walk','Positive Random Walk + Extinction','Averaged Positive Random Walk + Extinction'),bty='n',fill=c(col2,col1,NA),border=c(NA,NA,NA)) ###Generate figure no. 5 in the main text. plot(NULL,xlim=c(0,20),ylim=c(0,12),xlab='Time steps',ylab='',main='Behaviour of Random Walks') lines(0:20,getcondpospath(n,pup),col=rgb(0,0,1,1),lwd=4) lines(0:20,getcondpath(n,pup),col=rgb(1,0,0,1),lwd=4) lines(0:20,getcondpospath(n,pup),col=rgb(0,0,1,0.5),lwd=8) lines(0:20,getcondpath(n,pup),col=rgb(1,0,0,0.5),lwd=8) lines(0:20,opath,lwd=3) ################################### #Appendix 3.1: Example 1: Testing the Age-Area Hypothesis ################################### #This is the code for the example 1 in appendix 3. #Make sure the raw data necessary is loaded (see comments at the very top) #### get empirical data to test the age-area hypotheses against #### noofbins=10 #number of bins in which occurrences will be binned binborders=seq(0,1,length.out = noofbins+1) #borders of the bins te1=read.table("02sept2018foram.csv",sep="\t",header=TRUE) #import empirical data #initialize storage species=numeric() age=numeric() di=dim(te1) #extract ages of occurrences and the corresponding species for (k in 1:(dim(te1)[1])){ ll=te1[k,] species=c(species,as.character(ll$Resolved.Species)) age=c(age,ll$Age..Ma..Gradstein.et.al..2012) if (k%%1000==0){ #to demonstrate progress print(paste(as.character(round((k/di[1])*100)),"%")) } } rm(te1) #remove imported data #create species list uniquespecies=species[!duplicated(species)] #for each species, extract the agse of occurrences agelist=list() for (k in 1:length(uniquespecies)){ l=which(species==uniquespecies[k]) agelist[[k]]=sort(age[l]) } #normalize occurrences for first occurrence to be at 0 and last at 1 (x-shift and x-scale) normageslist=list() occ=numeric() i=1 for (k in 1:length(agelist)){ if (length(agelist[[k]])>1){ #ignore species with only one occurrences occ=c(occ,length(agelist[[k]])) mi=min(agelist[[k]]) ma=max(agelist[[k]]) normageslist[[i]]=(agelist[[k]]-mi)/(ma-mi) i=i+1 } else{ } } #some info about the data hist(occ) #histogram length(occ) # total no of species sum(occ)#overall number of occurernces mean(occ) #average no of occurences median(occ) # medianoccurences #bin each species into the bins specified at the very beginning binlist=list() for (k in 1:length(normageslist)){ binval=rep(0,noofbins) age=normageslist[[k]] for (i in 1:noofbins){ binval[i]=length(age[age>= binborders[i] & age < binborders[i+1]]) } binval[noofbins]=binval[noofbins]+1 binlist[[k]]=binval/(sum(binval*diff(binborders))) } ######### simulated trajectories under the age-are hypothesis ######### #subroutine to simulate path of a brownian motion between 0 and 1 #based on https://en.wikipedia.org/wiki/Wiener_process#Wiener_representation brownianpath=function(x,approxqual){ #x is where the position of the path will be determined, approxqual determines the quality of the approximation out=rep(0,length(x)) #initialize memory for (k in 1:approxqual){ out=(sqrt(2)*rnorm(1)/((k-0.5)*pi))*(sin((k-0.5)*pi*x))+out } return(out) } #subroutine: monte carlo integration for highly irregular functions (standard implementation of integration in R tends to fail for paths of the brownian motion) mcint=function(fun,leftlim,rightlim,runs,funmax){ #fun is a positive function to integrate, leftlim is the left integration limit, rightlim is the right integration limit #runs is the number of points used for the monte carlo approach #funmax is a number that needs to be higher than the maximum value of fun in the integration interval for the method to work xvals=runif(runs,leftlim,rightlim) #determine uniformly distributed x values yvals=runif(runs,0,funmax) #determine uniformly distributed y values intval=length(xvals[fun(xvals)>yvals])*((rightlim-leftlim)*funmax)/runs #the value of the integral is proportional to the number of points under the function, divided by the overall number of points return(intval) } noofsamples=length(binlist) #number of artificial trajectories that will be simulated m=5 #gradient, determines how strong the drift is x=seq(0,1,length.out = 1000)#no of points where the brownian motion is determined approxqual=1000 #quality of the approximation of the brownian motion artificialsamplelist=list() #stroage for the simulated trajectories #Simulate the trajectories and bin them for (k in 1:noofsamples){ repeat{ h=brownianpath(x,approxqual)+m*x #get path of the brownian motion plus drift if (min(h[1:10])>=0){ #if the path hits zero too early, repeat. The number 10 is chosen arbitrarily break } } if (length(h[h<0])>0){ #if the species goes extinct before time 1: o=min(which(h<0)) #determine where it goes extinct h=approx(seq(0,1,length.out=o),c(h[1:(o-1)],0),xout=x)[[2]] #rescale the trajectory to go extinct at one } else{ } #plot(x,h,type='l') #binning of the trajectories binvals=rep(0,noofbins) #initialize memory ma=max(h) fun=approxfun(x,h) #function to be integrated for (l in 1:noofbins){ #determine the values for the bins. they correspond to the area under the trajectory of the brownian motion binvals[l]=mcint(fun,binborders[l],binborders[l+1],10^6,ma) #integrate with the monte carlo method from above to determine the value of the l-th bin } binvals=binvals/(sum(binvals*diff(binborders))) #normate the bins to have area 1 artificialsamplelist[[k]]=binvals if (k%%10==0){ print(paste(as.character(round((k/noofsamples)*100)),"%")) } } #artificial sample/trajectory str(artificialsamplelist) length(artificialsamplelist) #number of artificial samples generaed ############# Perform the test ########## ###transform simulated and empirical data into format suitable for input into the cramer test realdatavals=numeric() artdatavals=numeric() for(k in 1:length(binlist)){ realdatavals=c(realdatavals,binlist[[k]]) artdatavals=c(artdatavals,artificialsamplelist[[k]]) } str(realdatavals) realmatrix=matrix(realdatavals,ncol=noofbins,byrow=TRUE) artmatrix=matrix(artdatavals,ncol=noofbins,byrow=TRUE) str(realmatrix) #finally: test require(cramer) #load the package results=cramer.test(realmatrix,artmatrix,conf.level=0.95,replicates=10^4) #run the test str(results) #show results ########################################## #Appendix 3.2 Example 2: Testing Reversibility ########################################## #This is the code for the example 2 in appendix 3. #Make sure the raw data necessary is loaded (see comments at the very top) #import downloaded data te1=read.table("02sept2018rad.csv",sep="\t",header=TRUE) noofbins=10 #number of bins in which occurrences will be binned binborders=seq(0,1,length.out = noofbins+1) #borders of the bins #initialize storage species=numeric() age=numeric() di=dim(te1) #extract ages of occurrences and the corresponding species for (k in 1:(dim(te1)[1])){ ll=te1[k,] species=c(species,as.character(ll$Resolved.Species)) age=c(age,ll$Age..Ma..Gradstein.et.al..2012) if (k%%1000==0){ #to demonstrate progress print(paste(as.character(round((k/di[1])*100)),"%")) } } rm(te1) #remove imported data #create species list uniquespecies=species[!duplicated(species)] #for each species, extract the agse of occurrences agelist=list() for (k in 1:length(uniquespecies)){ l=which(species==uniquespecies[k]) agelist[[k]]=sort(age[l]) } #normalize occurrences for first occurrence to be at 0 and last at 1 (x-shift and x-scale) normageslist=list() occ=numeric() i=1 for (k in 1:length(agelist)){ if (length(agelist[[k]])>1){ #ignore species with only one occurrences occ=c(occ,length(agelist[[k]])) mi=min(agelist[[k]]) ma=max(agelist[[k]]) normageslist[[i]]=(agelist[[k]]-mi)/(ma-mi) i=i+1 } else{ } } #some info about the data hist(occ) #histogram length(occ) # total no of species sum(occ)#overall number of occurernces mean(occ) #average no of occurences median(occ) # medianoccurences #bin each species into the bins specified at the very beginning binlist1=list() for (k in 1:length(normageslist)){ binval=rep(0,noofbins) age=normageslist[[k]] for (i in 1:noofbins){ binval[i]=length(age[age>= binborders[i] & age < binborders[i+1]]) } binval[noofbins]=binval[noofbins]+1 binlist1[[k]]=binval/(sum(binval*diff(binborders))) } ###testing procedure #### require(cramer) #number of tests nooftests=1000 #initialize storage res=rep(0,nooftests) pvals=rep(0,nooftests) resultlist=list() #run tests for (w in 1:nooftests){ #split into two samples xvals=numeric() yvals=numeric() for (k in 1:length(binlist1)){ #assign trajectories to one of the samples for the test p=runif(1) #fair coin flip if (p>0.5){ xvals=c(xvals,binlist1[[k]]) } else{ yvals=c(yvals,binlist1[[k]]) } } #transform into input format for the test xmat=matrix(xvals,ncol=noofbins,byrow=TRUE) ymat=matrix(rev(yvals),ncol=noofbins,byrow=TRUE) #reverse trajectories by using rev #run the test results=cramer.test(xmat,ymat,conf.level=0.95) #store results resultlist[[w]]=results res[w]=results$result pvals[w]=results$p.value print(w) } #histogram of the results (figure 2 in the appepdix) hist(pvals,main='p-values for testing reversibility',xlab='p-value') #some descriptive parameters mean(pvals) median(pvals) max(pvals) ############################################## #Appendix 1: Measuring asymmetry ############################################# #In this section, the QuAsy as described in appendix 1 is implemented #It is then demonstrated by creating FIg. ??? in the main text. #This requires to have #(1) the examples from Appendix 3.1 and 3.2 executed to the point where the variables "binlist" and "binlist1" have been filled with data (no simulation of the artificial trajectories or testing procedure required) #(2) this requires the raw data (see comments at the top) #define QuAsy ||x ||_{asy} for binned data/vectors x as defined in appendix 1. symnorm=function(v){ return(sum(abs(v-pmin(v,rev(v))))) } #determine distribution of asymmetry for all foraminifera symval=rep(0,length(binlist)) #initialize storage for (i in 1:length(binlist)){ #determine QuAsy for all forams symval[i]=0.1*symnorm(binlist[[i]]) } hist(symval,xlab='Asymmetry',main='Foraminifera from example 1') #plot distribution of asymmetry #determine distribution of asymmetry for all radiolaria symval1=rep(0,length(binlist1)) #initialize storage for (i in 1:length(binlist1)){ symval1[i]=0.1*symnorm(binlist1[[i]]) #determine QuAsy } hist(symval1,xlab='Asymmetry',main='Radiolarians from example 2') #plot distribution of asymmetry ##### Nicer Figures ##### #asymmetry of radiolarians and formas #joint plot, red=asymmetry of the averaged trajectories, blue=average asymmetry of the trajectories #create figure 1 in the main text par(mfrow=c(1,2)) symofaveragesforam=0.1*symnorm(Reduce("+",binlist)/length(binlist)) #asymmetry of the averaged trajectories averageofsymforam=mean(symval) #average of the asymmetry of the trajectories hist(symval,xlab='Asymmetry',main='Foraminifera') points(symofaveragesforam,0,col='red',lwd=15) points(averageofsymforam,0,col='blue',lwd=15) hist(symval1,xlab='Asymmetry',main='Radiolaria') symofaveragesrad=0.1*symnorm(Reduce("+",binlist1)/length(binlist1)) #asymmetry of the averaged trajectories averageofsymrad=mean(symval) #average of the asymmetry of the trajectories points(symofaveragesrad,0,col='red',lwd=15) points(averageofsymrad,0,col='blue',lwd=15) par(mfrow=c(1,1)) #get proportion of species that have asymmetry lowrer than the asymmetry of the average trajectory rt=length(symval[symval <=symofaveragesforam])/length(symval) rt1=length(symval1[symval1 <=symofaveragesrad])/length(symval1) ### show decomposition of a function into symmetric and asymmetric part #this creates fig. 1 in the appendix x=seq(-2,2,length.out=100) par(mfrow=c(1,2)) #plot function to be decomposed plot(x,sin(x),main='Original function',type='l',lwd='6',ylab='y',ylim=c(-1,2)) legend('topleft',lwd=6,legend='sin(x)',bty='n') #plot decomposed parts (asymmetric and symmetric part) plot(NULL,xlim=c(-2,2),ylim=c(-1,2),xlab='x',ylab='y',main='Decomposition') lines(x,pmin(sin(x),sin(-x)),col='red',lwd=6) lines(x,sin(x)-pmin(sin(x),sin(-x)),col='blue',lwd=6) legend('topleft',col=c('red','blue'),lwd=6,legend=c('symmetric part','asymmetric part'),cex=0.7,bty='n') par(mfrow=c(1,1))