# Supplemental Information: R code for gapfilling algorithm, with an example applied to runoff in the Hubbard Brook record. 

data1<-read.csv("HBEF_flow.csv",header = TRUE)
##Read in datasheet with 3 coloumns: Date, Measured flow in L/s, and Predicted flow in L/s
#1L=0.035ft^3  watershed6=1423976.4ft^2 300sec/5min 12inches/ft 25.4mm/inch
data1$Actual_mm_5min<-data1$Actual_Q_L_s*0.0353147*(1/1423976.4)*300*12*25.4  #convert flow rate to watershed mm per 5 minutes 
data1$Predicted_mm_5min<-data1$Predicted_Q_L_s*0.0353147*(1/1423976.4)*300*12*25.4#convert flow rate to watershed mm per 5 minutes
date1 = as.POSIXct(data1[,1],origin = "1970-01-01",tz="EST") # convert the numeric date into a date object
date1 = strptime(date1,format="%Y-%m-%d %H:%M:%S",tz="EST")
date1. = as.numeric(date1) 

#########################################
# CREATE A FUNCTION FOR MAKING FAKE GAPS#
#########################################
#"gaplength" = the duration of the gap to simulate (i.e. how many 5 minute blocks to include)
#"iterations"= how many fake gaps to simulate
#"data"= the name of the dataframe to use (dataframe must have 3 columns: timestamp, observed Q (l/s), and modeled Q (l/s), observed q (mm), modeled q (mm))

mdprdct <- function(gaplength, iterations, data){
  output<-matrix(ncol=6,nrow=iterations)
  for (i in 1:iterations) {
    sample((gaplength+3527):(nrow(data)-(gaplength+3527)), 1)->a # identify the random beginning of the simulated gap
    diffy1 <- data[a-1,4]-data[a,5]
    diffy2 <- data[a+gaplength+1,4]-data[a+gaplength,5]
    diffy <- c(diffy1, diffy2)
    diffx <- c(data[a,1], data[a+gaplength,1])
    mm <- approx(diffx, diffy, data[a:(a+gaplength),1])
    gapobserved <- data[a:(a+gaplength),4] #grab real flux observations for W6 during the fake gap 
    gappredicted <- data[a:(a+gaplength),5]+mm$y #grab modeled (predicted) flux observations for W6 during the fake gap    
    gapflowrate <-  data[a:(a+gaplength),2] #grab flow rate (for averaging in figure 3)
    output[i,1] <- sum(gapobserved, na.rm=T) # sum the actual total mm in W6 during the gap
    output[i,2] <- sum(gappredicted, na.rm=T) # sum the predicted total mm in W6 during the gap
    output[i,3] <- max(gapobserved, na.rm=T) # calculate the max runoff in mm/5 min in W6 during the gap
    output[i,4] <- mean(gapobserved, na.rm=T) # calculate the mean runoff in mm/5 min in W6 during the gap
    output[i,5] <- output[i,2]-output[i,1] #calculate error in total flux during gap (predicted -observed, so that overestimations are positive?)
    output[i,6] <- mean(gapflowrate, na.rm=T) # calculate the mean flow rate l/s in W6 during the gap
  }
  colnames(output) <- c("W6 Real"," W6 Predicted", "Max W6", "Mean W6", "Error", "avgFlow_l_s")
  return(output)
}


#######################################################################################
#APPLY THE NEW FUNCTION TO QUANTIFY UNCERTAINTY IN RUNOFF DUE TO GAPS AT HUBBARD BROOK#
#######################################################################################

w6gaps = read.csv("realHBgapsW6_JLC.csv") # read in spreadsheet of real gaps in the watershed 6 record
w6gaps$begin = strptime(paste(w6gaps[,1], sprintf("%04d",w6gaps[,2])), format="%m/%d/%Y %H %M", tz="EST") # make the beginning of the gap a date object
w6gaps$end = strptime(paste(w6gaps[,3], sprintf("%04d",w6gaps[,4])), format="%m/%d/%Y %H %M",tz="EST") # make the end of the gap a date object

nogaps=data1
for(i in 1:length(w6gaps[,1])){ 
  nogaps=subset(nogaps,Date<as.POSIXlt(w6gaps[i,7])|Date>as.POSIXlt(w6gaps[i,8]))
} #remove the gaps
nogaps=subset(nogaps,Date>820479600)
nogaps=subset(nogaps,Date<1262329200)
withgaps=subset(data1,Date>820479600)
withgaps=subset(withgaps,Date<1262329200)

#Subset and quantify uncertiany in the gaps of interest

w6gaps.partial = subset(w6gaps, w6gaps$Cause=="TechErr") # subset by gap cause (currently technician error)

begin = strptime(paste(w6gaps.partial[,1], sprintf("%04d",w6gaps.partial[,2])), format="%m/%d/%Y %H %M",tz="EST") # make the beginning of the gap a date object
end = strptime(paste(w6gaps.partial[,3], sprintf("%04d",w6gaps.partial[,4])), format="%m/%d/%Y %H %M",tz="EST") # make the end of the gap a date object
duration = as.numeric(end) - as.numeric(begin) # calculate the gap duration, in seconds
nsteps = as.numeric(duration/300-1) # number of 5-min intervals in the gap

#calculate Runoff During the Gaps...(currently using predictions from ensemble regression model at HBEF)
ROs = matrix(NA, nrow(w6gaps.partial), 1)
for(i in 1:nrow(w6gaps.partial)){
  ROs[i] = sum(subset(data1$Predicted_mm_5min, date1.>=as.numeric(begin[i]) & date1.<=as.numeric(end[i])))
}

q = matrix(NA, length(w6gaps.partial[,1]), 1) # make a matrix to hold the runoff info during the W6 gap
gapAvs <- matrix(NA, length(w6gaps.partial[,1]), 1) # make a matrix to hold the average flowrate info during the W6 gap      

for(i in 1:length(q)){ # loop through the gaps
  dd = subset(data1, date1. >= as.numeric(begin[i]) & date1. <= as.numeric(end[i])) # get the W6 runoff data from the period of the gap
  q[i] = sum(dd[,5], na.rm=TRUE) # calculate the predicted runoff in W6 during the gap
  gapAvs[i] <- mean(dd[,5], na.rm=TRUE) #mean PREDICTED flow rate q (mm/5min) for each of the W6 gaps
}

# now estimate the uncertainty due to gaps for the annual runoff
dt = strptime(paste(w6gaps.partial[,3], sprintf("%04d",w6gaps.partial[,4])), format="%m/%d/%Y %H %M",tz="EST") # remake the date object for the gaps list

n = 10000 # set the number of iterations (x10)
best.errors = matrix(NA,ncol=as.integer(n/10),nrow = length(w6gaps.partial[,1]))
for (i in 1:length(q)){
  iterations = as.matrix(mdprdct(nsteps[i],n,data1))
  iterations <- iterations[order(abs(ROs[i]-iterations[,1])),]
  iterations <- head(iterations,(n/10))
  best<-(iterations[,5])
  best.errors[i,]= best
  print(i)
}
# this loop determines the water year for each gap
w.yr = matrix(0, 1,length(q))
for(i in 1:length(q)){
  w.yr[i] = dt$year[i]+1900
  if(dt$mon[i]+1<6){w.yr[i]=dt$year[i]+1899}
}
# this loop sums the random samples for each gap within a water year, so that we have a population of total errors for that water year
gap.uncert = matrix(NA,nrow=length(c(1996:2009)), ncol=(as.integer(n/10)))
yr = c(1996:2009)
for(i in 1:length(yr)){
  ww = subset(best.errors, w.yr==yr[i])
  for(j in 1:(as.integer(n/10))){
    gap.uncert[i,j] = sum(ww[,j])
  }
}
#Calculate 95% confidence interval based on gap.uncert
annquantiles=matrix(NA,nrow = 14,ncol=2)
for (i in 1:14){
  annquantiles[i,1]=quantile(gap.uncert[i,],0.025)  
  annquantiles[i,2]=quantile(gap.uncert[i,],0.975)  
}