# Script for 'Detecting telomere elongation in longitudinal datasets: Analysis of a proposal by Simons, Stulp and Nakagawa'
# Needs packages 'tidyr', and 'ggplot2'if you want to plot figure 1

###Function definitions#####

generate.true.attrition <- function(
  # This function tells you the proportion of true lengtheners for a specified set of parameter values
  max.years = 15, # the follow up interval 
  baselineTL.mean = 7000, # the mean of the TL distribution at baseline 
  baselineTL.sd = 700, # the sd of the TL distribution at baseline
  attrition.mean = 30, # mean annual attrition in bp 
  attrition.sd = 50, # sd of annual attrition in bp 
  error="sd", # whether the measurement error should be applied as a CV (proportional to TL) or a constant SD
  error.rate = 140, # Either the SDe, or a coefficient of variation expressed as a proportion not percentage, e.g. 0.02
  n = 10000, # the number of individuals in the simulation
  r = 0 # correlation between attrition in successive years
  )
  { 
  require(tidyr) 
  # generate the starting population of individuals: an array of n rows x max.years columns+1 (for the baseline)
  telomere.lengths = array(, dim = c(n, max.years+1)) # make an empty array for the TL data
  telomere.lengths[,1] = rnorm(n,baselineTL.mean,baselineTL.sd) # fill the first column with baseline values of TL
  
  # generate the TL values at yearly follow up points
  # calculate annual attrition
  # in the first year this is just drawn from the specified distribution
  attrition = rnorm(n,attrition.mean,attrition.sd)
  annual.attrition = attrition
  
  for(i in 2:(max.years+1)) {
    telomere.lengths[,i] = telomere.lengths[,i-1] - annual.attrition
    # Now calculate the annual attrition for the next year (this is equation 1 in the paper)
    if (r<1) {annual.attrition = r*attrition + sqrt(1-r^2)*rnorm(n,mean=attrition.mean*(1-r)/sqrt(1-r^2),sd=attrition.sd) }
  }
  
  # calculate true annual attrition from baseline: positive values = attrition; negative values = growth
  true.attrition = array(,dim = c(n,max.years)) # make an empty array for the attrition measures
  true.attrition = telomere.lengths[,1] - telomere.lengths[,2:(max.years+1)]
  true.elongation.annual=sum(true.attrition[ ,1] < 0)/length(true.attrition[ , 1])
  true.elongation.overall=sum(true.attrition[ , max.years]<0)/length(true.attrition[ , max.years])
  
  # measure TL with error
  measured.TL = array(, dim = c(n, max.years+1)) # make an empty array for the TL measurement data
  if (error=="cv"){
    for (i in 1:n){
    for (j in 1:(max.years+1)){
      measured.TL[i,j] = rnorm(1,mean=telomere.lengths[i,j],sd=abs(telomere.lengths[i,j]*error.rate)) # abs stops negative sds
    }
  }
}
  if (error=="sd"){
    for (i in 1:n){
      for (j in 1:(max.years+1)){
        measured.TL[i,j] = rnorm(1,mean=telomere.lengths[i,j],error.rate)
      }
    }
  }
return(true.elongation.overall)
}

generate.TL.data = function(
# This function generates the data for each simulated dataset
  max.years = 15, # the maximum follow up interval 
  baselineTL.mean = 7000, # the mean of the TL distribution at baseline 
  baselineTL.sd = 700, # the sd of the TL distribution at baseline 
  attrition.mean = 30, # mean annual attrition in bp 
  attrition.sd = 50, # sd of annual attrition in bp 
  error="sd", # Whether the measurement error is to be implemented as a CV (proportional to TL), or a constant SD
  error.rate = 140, # Either the SDe, or a coefficient of variation expressed as a proportion not percentage, e.g. 0.02
  n = 10000, # the number of individuals in the simulation
  r = 0 # correlation between attrition in successive years
  ) 
  {
  require(tidyr) 
  # generate the starting population of individuals: an array of n rows x max.years columns+1 (for the baseline)
  telomere.lengths = array(, dim = c(n, max.years+1)) # make an empty array for the TL data
  telomere.lengths[,1] = rnorm(n,baselineTL.mean,baselineTL.sd) # fill the first column with baseline values of TL
  # generate the TL values at yearly follow up points
  # calculate annual attrition
  # in the first year this is just drawn from the specified distribution
  attrition = rnorm(n,attrition.mean,attrition.sd)
  annual.attrition = attrition
  for(i in 2:(max.years+1)) {
    telomere.lengths[,i] = telomere.lengths[,i-1] - annual.attrition
    # Now calculate the annual attrition for the next year (this is equation 1 in the Bateson and Nettle paper)
    if (r<1) {annual.attrition = r*attrition + sqrt(1-r^2)*rnorm(n,mean=attrition.mean*(1-r)/sqrt(1-r^2),sd=attrition.sd) }
  }
  
  # calculate true annual attrition from baseline: positive values = attrition; negative values = growth
  true.attrition = array(,dim = c(n,max.years)) # make an empty array for the attrition measures
  true.attrition = telomere.lengths[,1] - telomere.lengths[,2:(max.years+1)]
  true.elongation.annual=sum(true.attrition[ ,1] < 0)/length(true.attrition[ , 1])
  true.elongation.overall=sum(true.attrition[ , max.years]<0)/length(true.attrition[ , max.years])

  # measure TL with a defined CV or a constant SD equal to 7000 * CV
  # note that CV = sd/mean therefore sd = CV*mean
  measured.TL = array(, dim = c(n, max.years+1)) # make an empty array for the TL measurement data
if (error=="cv"){
  for (i in 1:n){
    for (j in 1:(max.years+1)){
      measured.TL[i,j] = rnorm(1,mean=telomere.lengths[i,j],sd=abs(telomere.lengths[i,j]*error.rate)) # abs stops negative sds
    }
  }
}
  if (error=="sd"){
    for (i in 1:n){
      for (j in 1:(max.years+1)){
        measured.TL[i,j] = rnorm(1,mean=telomere.lengths[i,j],sd=error.rate) 
      }
    }
  }
  
  return(as.data.frame(measured.TL)) 
}

simons = function(data=data) {
  # This function applies the Simons et al statistic to a data frame, returning the F and p-value
  # NB This is based on Mirre Simons' code published with his paper
  # Estimating error using individual regressions, Equation 4 in main manuscript
  matrixresi<-matrix(0,dim(data)[1],1)  # a matrix to put the residuals of the individual regressions in in
  x=1:dim(data)[2] #number of columns is the number of timepoints
  j=1
  while(j<(dim(data)[1]+1)) #loop the individual regressions for the amount of individuals in the dataset
  {
    fit<-lm(t(data[j,])~x)  #individual linear regression
    matrixresi[j,]<-sum((residuals(fit))^2)/(length(x)-2) #residual sum of squares
    j<-j+1
  }
  sigma1<-mean(c(matrixresi)) #average residual sum of squares across the individuals
  
  
  #Estimating error under the assumption that TL cannot increase over time, Equation 5 in main manuscript
  
  #first we calculate the increases TL increases over time (between first(1) and last(3) timepoint)
  deltaTL=data[,3]-data[,1]
  
  #Next we determine which individuals increase in TL between the first and last timepoint
  indexTLincreases=which(deltaTL>0)
  
  #We create a new variable including only the data of individual increases
  TLincreases=deltaTL[indexTLincreases]
  #sigma2
  sigma2=0.5*sum(TLincreases^2)/(length(TLincreases))
  
  
  #compare both estimates of error variance (sigma1 and sigma2), Equation 6 in main manuscript
  vratio=sigma2/sigma1
  pvalue<-pf(vratio,length(TLincreases)-1,dim(data)[1]-1,lower.tail=F)
  return(c(vratio, pvalue))
}

simons.replicate<-function(reps=100, attrition.sd = 50, r = 0, error.rate = 140, max.years=2) {
# This function runs simons function repeatedly for different parameter values and returns the vector of p values
    ps=NA
  for (i in 1:reps) {
    print(paste("replicate ", i))
    data = generate.TL.data(max.years = max.years, long.thin = FALSE, stress.attrition.sd = stress.attrition.sd, CV = CV, r = r)
    ps[i]=simons(data)[2]
  }
  print(sum(ps<0.05)/length(ps))
  return(ps)
}


run.sims<-function(reps=100, attrition.sd=50, error="sd", error.rate=140, n=10000){
# This function runs the whole set of simulations for one column of figure 1
rs=NA
true.elongation=NA
years=NA
simons.sig=NA
ps=NA
current.true.elongations=NA
for (current.r in c(1, 0.5, 0)){
  print(paste("r = ", current.r))
  for (current.years in 2:11) {
    print(paste("Years = ", current.years))
    rs=append(rs, current.r)
    years=append(years,current.years)
# Calculate the proportion of true elongation at that parameter combination
  for (i in 1:reps)
    {current.true.elongations[i]=generate.true.attrition(max.years=current.years, attrition.sd=attrition.sd, error.rate=error.rate, r=current.r, n=n, error=error)}
    true.elongation=append(true.elongation, mean(current.true.elongations))
    print(paste("Proportion of true lengtheners: ", mean(current.true.elongations)))
  for (i in 1:reps) {
    data = generate.TL.data(max.years = current.years, attrition.sd = attrition.sd, error.rate=error.rate, r=current.r, n=n, error=error)
          ps[i]=simons(data)[2]}
    sig.prop=sum(ps<0.05)/length(ps)
    print(paste("Simons' test significant proportion: ", sig.prop))
    simons.sig=append(simons.sig, sig.prop)
    }}
d=data.frame(rs, years, true.elongation, simons.sig)
d=d[-1, ]
write.csv(d, file=paste("simons.simulations.data.",error, error.rate,".csv"))
return(d)
}

##### Replication code######
# Caution: these lines take a long time to run! (about 8 hours for each run.sims statement on my ordinary desktop PC)
# They save their output so you can leave them running
run.sims(reps=100, n=10000, error="sd", error.rate=0)
run.sims(reps=100, n=10000, error="sd", error.rate=140)
run.sims(reps=100, n=10000, error="sd", error.rate=560)

# Making figure 1
library(ggplot2)
d0=read.csv("simons.simulations.data. sd 0 .csv")
d2=read.csv("simons.simulations.data. sd 140 .csv")
d8=read.csv("simons.simulations.data. sd 560 .csv")
d0$rtext[d2$rs==1]="r = 1"
d0$rtext[d2$rs==0.5] = "r = 0.5"
d0$rtext[d2$rs==0] = "r = 0"
d2$rtext[d2$rs==1]="r = 1"
d2$rtext[d2$rs==0.5] = "r = 0.5"
d2$rtext[d2$rs==0] = "r = 0"
d8$rtext="r = 1"
d8$rtext[d8$rs==0.5] = "r = 0.5"
d8$rtext[d8$rs==0] = "r = 0"
d0$cv=rep("SDe = 0", times=length(d0$rtext))
d2$cv=rep("SDe = 140", times=length(d2$rtext))
d8$cv=rep("SDe = 560", times=length(d8$rtext))
d=rbind(d0, d2, d8)
d$fill<-"no"
d$fill[d$true.elongation>=0.05]<-"yes"
figsd=ggplot(d, aes(x=years, y=true.elongation)) + 
  coord_cartesian(ylim=c(0,1)) + 
  geom_ribbon(aes(ymax=1, ymin=0, fill=fill)) + 
  scale_fill_manual(values=c("white", "grey80")) + 
  geom_line() + 
  geom_point(aes(y=simons.sig), colour="red") + 
  geom_line(aes(y=simons.sig), colour="red", linetype=2) + 
  guides(fill=FALSE) + 
  theme_bw() + 
  theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) + 
  ylab("Proportion") + 
  xlab("Years of follow-up") +
  coord_cartesian(ylim=c(0, 1.05)) +
  facet_grid(rtext ~ cv)
figsd

png("figure1.png", res=300, units="in", width=8, height=8)
figsd
dev.off()