Peril 7: Random effect categories used as a proxy for covariance structure



First we load the required R packages and set seed for reproducibility

N.B. This should be reproducible when run in R but the output in the knitted Rmarkdown file is different

set.seed(6)

library(nlme)
library(geoR)
library(lme4)
library(sp)
library(gstat)
library(RColorBrewer)
library(viridis)

Now we define some of the key parameters used in the simulations

#Number of regions
regions<-8
#Number of distinct sites per region
sites_per_region<-4
#Number of samples per site
groupsize<-6

#Number of groups (i.e. total number of sites as site is nested within region)
groups<-regions*sites_per_region

#Overall sample size
n<-regions*sites_per_region*groupsize

#Give each observation a group ID
group_ids<-rep(seq(1,groups,1),each=groupsize)

#Identify the regions that each site belongs in
site_regions<-rep(seq(1,regions,1),each=sites_per_region)
#Identify the regions that each individual observation belongs in
ind_regions<-rep(site_regions,each=groupsize)

#Define the different effect sizes we will simulate
slopes<-seq(0,0.4,0.05)

Now we assign the spatial coordinates of all of our sites and regions (used subsequently as random effects)

#Assign region coordinates
regions_centroid_x<-sample(seq(10,90,10),regions)
regions_centroid_y<-sample(seq(10,90,10),regions)

#Plot these coordinates
plot(regions_centroid_x,regions_centroid_y,pch=18,col="blue",cex=3,xlim=c(0,100),ylim=c(0,100),xaxt="n",yaxt="n")

#Assign site coordinates
site_centroids_x<-sample(seq(-10,10,0.5),groups)
site_centroids_y<-sample(seq(-10,10,0.5),groups)
site_x<-rep(NA,groups)
site_y<-rep(NA,groups)

for(i in 1:groups){
  site_x[i]<-regions_centroid_x[site_regions[i]]+site_centroids_x[i]
  site_y[i]<-regions_centroid_y[site_regions[i]]+site_centroids_y[i]
}

#Add site coordinates to the plot
points(site_x,site_y,pch=16,col="green")

#Add coordinates for individual observations
ind_centroids_x<-sample(seq(-1,1,0.0025),n)
ind_centroids_y<-sample(seq(-1,1,0.0025),n)
ind_sites<-rep(seq(1,groups,1),each=groupsize)
ind_x<-rep(NA,n)
ind_y<-rep(NA,n)

for(i in 1:n){
  ind_x[i]<-site_x[ind_sites[i]]+ind_centroids_x[i]
  ind_y[i]<-site_y[ind_sites[i]]+ind_centroids_y[i]
}

#Add these coordinates to the plot
points(ind_x,ind_y,pch=16,col="red",cex=0.1)


Now we create the spatial field used to generate spatially autocorrelated values of the response variable

Here we simulate a fine-scale spatial autocorrelation

To simulate the broader scale auto-correlation that is also in the paper then change range=10 to range=80 when defining g.dummy as demonstrated here:

g.dummy <- gstat(formula=z~1, locations=~field_sim, dummy=T, beta=1, model=vgm(psill=4,model="Exp",range=10,nugget=0.5), nmax=20)
g.dummy <- gstat(formula=z~1, locations=~field_sim, dummy=T, beta=1, model=vgm(psill=4,model="Exp",range=80,nugget=0.5), nmax=20)

Now to simulate the spatial field for real

#create coordinate sets with which to simulate spatial field
field_sim_x<-rep(seq(0,100,0.1),each=length(seq(0,100,0.1)))
field_sim_y<-rep(seq(0,100,0.1),length(seq(0,100,0.1)))
field_sim<-cbind(field_sim_x,field_sim_y)
d<-rep("d",length(field_sim_x))

#Simulate spatial field
field_sim<-SpatialPoints(field_sim)
  
# define the gstat object (spatial model)
g.dummy <- gstat(formula=z~1, locations=~field_sim, dummy=T, beta=1, model=vgm(psill=4,model="Exp",range=10,nugget=0.5), nmax=20)
yy <- predict(g.dummy, newdata=field_sim, nsim=1)
## [using unconditional Gaussian simulation]
yy$sim1<-scale(yy$sim1)

#Make colour set to plot the spatial field
col<- adjustcolor(inferno(21),alpha.f=0.6)
colz <- col[(yy$sim1 - min(yy$sim1))/diff(range(yy$sim1))*20 + 1] 

#plot map
plot(NULL,xlim=c(0,100),ylim=c(0,100),xaxt="n",yaxt="n",xlab="",ylab="",bty="n")

# show one realization
points(yy,pch=16,col=colz,cex=0.1)

#add the observations
points(regions_centroid_x,regions_centroid_y,pch=22,col="light grey",cex=10,lwd=3)
points(site_x,site_y,pch=18,col="black",cex=2)
points(ind_x,ind_y,pch=16,col="white",cex=0.6)


We now use this spatial field to generate values for the residual deviance around the effects of site and region

spat.ht <- as.data.frame(predict(g.dummy, newdata=SpatialPoints(cbind(ind_x,ind_y)), nsim=1))
## [using unconditional Gaussian simulation]

Now we set-up empty matrices to store the results of the simulations

int_estsR<-matrix(NA,nr=100,nc=length(slopes))
int_errsR<-matrix(NA,nr=100,nc=length(slopes))
slo_estsR<-matrix(NA,nr=100,nc=length(slopes))
slo_errsR<-matrix(NA,nr=100,nc=length(slopes))
aicsR<-matrix(NA,nr=100,nc=length(slopes))

int_estsS<-matrix(NA,nr=100,nc=length(slopes))
int_errsS<-matrix(NA,nr=100,nc=length(slopes))
slo_estsS<-matrix(NA,nr=100,nc=length(slopes))
slo_errsS<-matrix(NA,nr=100,nc=length(slopes))
aicsS<-matrix(NA,nr=100,nc=length(slopes))

int_ests<-matrix(NA,nr=100,nc=length(slopes))
int_errs<-matrix(NA,nr=100,nc=length(slopes))
slo_ests<-matrix(NA,nr=100,nc=length(slopes))
slo_errs<-matrix(NA,nr=100,nc=length(slopes))
aics<-matrix(NA,nr=100,nc=length(slopes))


Now we can conduct the simulations

NB. We are suppressing the warning messages from model fitting from being printed here for presentation purposes. We would recommend re-running after deleting warning=FALSE and message=FALSE from the line below this to see what warnings arise and when they do if you are interested.

#loop over repeat simulations (x100)
for(i in 1:100){

 #loop over different values of the effect size  
 for(j in 1:length(slopes)){
  
  #Simulate values for an explanatory variable (temperature - temp) 
  beta1<-rnorm(n)

  #Simulate values for a response variable (height - ht)
  slope<-slopes[j]
  resid.ht<-1

  ht<-rnorm(n,spat.ht[,3]+slope*(beta1-spat.ht[,3]),resid.ht)
 
  #Create dataframe
  data<-data.frame(ht,beta1,ind_x,ind_y,ind_sites,ind_regions)
  names(data)<-c("height","temp","x","y","site","region")
  
  #Fit models with site and region as random intercepts
  m1a<-lmer(height~temp+(1|site),data=data)
  m1b<-lmer(height~temp+(1|region),data=data)
 
  #Record model results in the results matrices
  int_estsR[i,j]<-summary(m1b)[[10]][1,1]
  int_errsR[i,j]<-summary(m1b)[[10]][1,2]
  slo_estsR[i,j]<-summary(m1b)[[10]][2,1]
  slo_errsR[i,j]<-summary(m1b)[[10]][2,2]
  aicsR[i,j]<-AIC(m1b)

  int_estsS[i,j]<-summary(m1a)[[10]][1,1]
  int_errsS[i,j]<-summary(m1a)[[10]][1,2]
  slo_estsS[i,j]<-summary(m1a)[[10]][2,1]
  slo_errsS[i,j]<-summary(m1a)[[10]][2,2]
  aicsS[i,j]<-AIC(m1a)

  #Fit a model that controls for sptail covariance in the residuals directly
  m0<-gls(height~temp,correlation=corExp(form=~x+y,nugget=TRUE),data=data)
  summary(m0)$tTable

  #Store results from this model in the output
  int_ests[i,j]<-summary(m0)$tTable[1,1]
  int_errs[i,j]<-summary(m0)$tTable[1,2]
  slo_ests[i,j]<-summary(m0)$tTable[2,1]
  slo_errs[i,j]<-summary(m0)$tTable[2,2]
  aics[i,j]<-AIC(m0)

 } #end loop over effect sizes

} #end loop over simulations


Here we do some dat processing before plotting to remove cases (set as NA) where the region model fitted better than the site model, indicating issues with model fit

rem<-which(aicsR-aics<0)
aicsS2<-aicsS
aicsS2[rem]<-NA
aicsR2<-aicsR
aicsR2[rem]<-NA
aics2<-aics
aics2[rem]<-NA

int_estsS2<-int_estsS
int_estsS2[rem]<-NA
int_estsR2<-int_estsR
int_estsR2[rem]<-NA
int_ests2<-int_ests
int_ests2[rem]<-NA

int_errsS2<-int_errsS
int_errsS2[rem]<-NA
int_errsR2<-int_errsR
int_errsR2[rem]<-NA
int_errs2<-int_errs
int_errs2[rem]<-NA

slo_estsS2<-slo_estsS
slo_estsS2[rem]<-NA
slo_estsR2<-slo_estsR
slo_estsR2[rem]<-NA
slo_ests2<-slo_ests
slo_ests2[rem]<-NA

slo_errsS2<-slo_errsS
slo_errsS2[rem]<-NA
slo_errsR2<-slo_errsR
slo_errsR2[rem]<-NA
slo_errs2<-slo_errs
slo_errs2[rem]<-NA

Now we can do some plotting of the results. To reproduce the rest of what is in the main text and other additional plots we don’t show their

#Create colour palette to be used
cols=c("#4477AA", "#DDCC77", "#CC6677")
c1<-col2rgb(cols[1])/255
c2<-col2rgb(cols[2])/255

#Histograms of AIC comparisons

hist(aicsS2[,1]-aics2[,1],xlim=c(-10,130),breaks=seq(-10,130,2),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(aicsR2[,1]-aics2[,1],xlim=c(-10,130),breaks=seq(-10,130,2),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

hist(aicsS2[,9]-aics2[,9],xlim=c(-10,100),breaks=seq(-10,100,2),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(aicsR2[,9]-aics2[,9],xlim=c(-10,100),breaks=seq(-10,100,2),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

#Histograms of comparisons of intercept estimates

hist(int_estsS2[,1]-int_ests2[,1],xlim=c(-0.66,0.6),breaks=seq(-0.66,0.6,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(int_estsR2[,1]-int_ests2[,1],xlim=c(-0.66,0.6),breaks=seq(-0.66,0.6,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

hist(int_estsS2[,9]-int_ests2[,9],xlim=c(-0.66,0.6),breaks=seq(-0.66,0.6,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(int_estsR2[,9]-int_ests2[,9],xlim=c(-0.66,0.6),breaks=seq(-0.66,0.6,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

#Histograms of comparisons of intercept standard errors

hist(int_errsS2[,1]-int_errs2[,1],xlim=c(-1.1,0.3),breaks=seq(-1.1,0.3,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(int_errsR2[,1]-int_errs2[,1],xlim=c(-1.1,0.3),breaks=seq(-1.1,0.3,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

hist(int_errsS2[,9]-int_errs2[,9],xlim=c(-1.1,0.3),breaks=seq(-1.1,0.3,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i")
hist(int_errsR2[,9]-int_errs2[,9],xlim=c(-1.1,0.3),breaks=seq(-1.1,0.3,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

#Histograms of comparisons of slope estimates

hist(slo_estsS2[,5]-slo_ests2[,5],xlim=c(-0.25,0.35),breaks=seq(-0.6,0.6,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i",cex.lab=1.5,cex.axis=1.5,xlab="Difference in model estimate for slope",main="",las=1)
hist(slo_estsR2[,5]-slo_ests2[,5],xlim=c(-0.25,0.35),breaks=seq(-0.6,0.6,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")
par(xpd=NA)
text(0.33,27,"Region - Autocorrelation",cex=1.5,adj=c(1,0.5))
text(0.33,25,"Site - Autocorrelation",cex=1.5,adj=c(1,0.5))
points(x=0.37,y=27,pch=22,col=cols[2],bg=rgb(c2[1],c2[2],c2[3],0.2),cex=4)
points(x=0.37,y=25,pch=22,col=cols[1],bg=rgb(c1[1],c1[2],c1[3],0.2),cex=4)
text(-0.32,34,cex=1.8,"b)")

par(xpd=FALSE)

hist(slo_estsS2[,9]-slo_ests2[,9],xlim=c(-0.3,0.3),breaks=seq(-0.6,0.6,0.02),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i",cex.lab=1.5,cex.axis=1.5,xlab="Difference in model estimate for slope",main="",las=1)
hist(slo_estsR2[,9]-slo_ests2[,9],xlim=c(-0.3,0.3),breaks=seq(-0.6,0.6,0.02),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE)
lines(x=c(-100,200),y=c(0,0),col="black")

#Histograms of comparisons of slope standard errors

hist(slo_errsS2[,1]-slo_errs2[,1],xlim=c(-0.05,0.05),breaks=seq(-0.6,0.6,0.001),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i",cex.lab=1.5,cex.axis=1.5,xlab="Difference in standard error around slope estimate",main="",las=1)
hist(slo_errsR2[,1]-slo_errs2[,1],xlim=c(-0.05,0.05),breaks=seq(-0.6,0.6,0.001),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE,cex.lab=1.5,cex.axis=1.5,xlab="Difference in standard error around slope estimate",main="",las=1)
lines(x=c(-100,200),y=c(0,0),col="black")

hist(slo_errsS2[,5]-slo_errs2[,5],xlim=c(-0.02,0.05),breaks=seq(-0.6,0.6,0.001),col=rgb(c1[1],c1[2],c1[3],0.2),border=cols[1],yaxs="i",cex.lab=1.5,cex.axis=1.5,xlab="Difference in standard error around slope estimate",main="",las=1)
hist(slo_errsR2[,5]-slo_errs2[,5],xlim=c(-0.02,0.05),breaks=seq(-0.6,0.6,0.001),col=rgb(c2[1],c2[2],c2[3],0.2),border=cols[2],add=TRUE,cex.lab=1.5,cex.axis=1.5,xlab="Difference in standard error around slope estimate",main="",las=1)
lines(x=c(-100,200),y=c(0,0),col="black")
par(xpd=NA)
text(0.045,25,"Region - Autocorrelation",cex=1.5,adj=c(1,0.5))
text(0.045,23,"Site - Autocorrelation",cex=1.5,adj=c(1,0.5))
points(x=0.05,y=25,pch=22,col=cols[2],bg=rgb(c2[1],c2[2],c2[3],0.2),cex=4)
points(x=0.05,y=23,pch=22,col=cols[1],bg=rgb(c1[1],c1[2],c1[3],0.2),cex=4)
text(-0.03,27,cex=1.8,"c)")

par(xpd=FALSE)