# Step 1 - collect, stack, and pre-process the spatial data used for building
# models and making spatial predictions. The code was written and run in
# RStudio and assumes the working directory was set manually via:
#
# 'RStudio --> Session --> Set Working Directory --> Choose Directory...'.


###-----------------------------------------------------------------------------
### Step 0: Set up working environment 
###-----------------------------------------------------------------------------


# Clean environment for fresh start. ls(all.names = TRUE) also lists objects
# whose names begin with a dot, so every object in the workspace is removed
# before the garbage collector is run:
# NOTE(review): this does not detach packages or reset options(); restart the R
# session if a truly clean state is required.
rm(list = ls(all.names = TRUE)) 
gc() 

# Load libraries: 
library(dplyr)
library(rnaturalearth)
library(sf)
library(terra)
library(tictoc)

# Read the GADM United States polygon file, convert it to an sf object, and
# keep only the rows whose NAME_1 is one of the two study-region states:
study_region <- filter(
  st_as_sf(readRDS("Geodata/gadm/gadm41_USA_1_pk.rds")),
  NAME_1 %in% c("Washington", "Oregon")
)


###-----------------------------------------------------------------------------
### Step 1: Collect records
###-----------------------------------------------------------------------------


###-------------------------------------
### Land cover data
###-------------------------------------

# List of land cover files to import (searched recursively). The dot in the
# pattern is escaped so that only a literal ".tif" extension is matched:
lc_type_list <- list.files(
  path = "Geodata/landuse",
  pattern = "\\.tif$",
  full.names = TRUE,
  recursive = TRUE
)

# Fail early with a clear message instead of an obscure error from rast():
if (length(lc_type_list) == 0) {
  stop("No land cover .tif files found in 'Geodata/landuse'.", call. = FALSE)
}

# Import all files as one multi-layer raster, then crop to the study region
# (mask = TRUE also masks cells by the study-region polygons):
lc_types <- rast(lc_type_list)
lc_types <- crop(lc_types, study_region, mask = TRUE)

# Plot to check for variability in data:
plot(lc_types)

# Remove 'mangroves' from stack - all values = 0, so the layer carries no
# information:
lc_types <- subset(lc_types, "mangroves", negate = TRUE)


###-------------------------------------
### Elevation data 
###-------------------------------------


# Import elevation raster. The path is relative to the working directory, in
# line with every other input in this script (no need to paste getwd()):
elevation <- rast("Geodata/elevation/wc2.1_30s/wc2.1_30s_elev.tif")
elevation <- crop(elevation, study_region, mask = TRUE)
names(elevation) <- "elevation"

# Compute slope (in radians) from elevation data using all 8 neighbors:
slope <- terrain(elevation, v = "slope", neighbors = 8, unit = "radians")
names(slope) <- "slope"

# Compute roughness values from elevation data:
roughness <- terrain(elevation, v = "roughness", neighbors = 8)
names(roughness) <- "roughness"


###-------------------------------------
### WorldClim bioclimatic variables
###-------------------------------------


# List all bioclimatic-variable files (searched recursively):
worldclim_list <- list.files(
  path = "Geodata/climate/wc2.1_30s",
  pattern = "\\.tif$",
  full.names = TRUE,
  recursive = TRUE
)

# list.files() returns paths in alphabetical order (..._1, ..._10, ..., ..._19,
# ..._2, ...), so labelling layers by position would attach the wrong name to
# most variables. Read the variable number from each file name instead:
bio_file_pattern <- "^.*_([0-9]+)\\.tif$"
bio_file_names <- basename(worldclim_list)
if (length(worldclim_list) == 0 || !all(grepl(bio_file_pattern, bio_file_names))) {
  stop(
    "Expected bioclimatic files named '<prefix>_<number>.tif' in ",
    "'Geodata/climate/wc2.1_30s'.",
    call. = FALSE
  )
}
bio_index <- as.integer(sub(bio_file_pattern, "\\1", bio_file_names))
if (anyDuplicated(bio_index) > 0) {
  stop("Duplicate bioclimatic variable numbers found in file names.",
       call. = FALSE)
}

# Put the files in numeric order so that layer i really is variable i:
worldclim_list <- worldclim_list[order(bio_index)]

# Import and crop bioclimatic-variables files:
bioclim_stack <- rast(worldclim_list)
bioclim_stack <- crop(bioclim_stack, study_region, mask = TRUE)
names(bioclim_stack) <- paste0("bio", sort(bio_index))


###-------------------------------------
### Distance-from-rivers raster 
###-------------------------------------


# Download one Natural Earth physical layer (scale = 10) as sf, clip it to the
# study region, and keep only the geometry column:
clip_ne_rivers <- function(ne_type) {
  ne_layer <- ne_download(
    scale = 10,
    type = ne_type,
    category = "physical",
    returnclass = "sf"
  )
  clipped <- st_as_sf(st_intersection(ne_layer, study_region))
  dplyr::select(clipped, geometry)
}

# Download and combine the Natural Earth river features (primary river/lake
# centerlines first, then the supplemental North America rivers):
nat_earth_water_polys <- rbind(
  clip_ne_rivers("rivers_lake_centerlines"),
  clip_ne_rivers("rivers_north_america")
)

# Calculate, for each cell of the elevation grid, the distance to the river
# features; the step is timed with tic()/toc(). Results will be in meters:
tic("He's going the distance!")
river_dist <- terra::distance(
  elevation,
  nat_earth_water_polys,
  haversine = TRUE
)
names(river_dist) <- "river_distance"
toc()


###-----------------------------------------------------------------------------
### Step 2: Clean data
###-----------------------------------------------------------------------------


###-------------------------------------
### Stack and normalize data
###-------------------------------------


# Combine all layers to single stack:
x_stack <- c(lc_types, elevation, slope, roughness, bioclim_stack, river_dist)

# Define min (row 1) and max (row 2) values of each layer, used to normalize
# the data. compute = TRUE makes sure the limits are calculated when they are
# not already stored with the raster:
x_stack_lims <- minmax(x_stack, compute = TRUE)
layer_min <- x_stack_lims[1, ]
layer_range <- x_stack_lims[2, ] - x_stack_lims[1, ]

# A constant layer has zero range and would be turned into NaN by the
# division below, so report it rather than failing silently:
constant_layers <- names(x_stack)[!is.na(layer_range) & layer_range == 0]
if (length(constant_layers) > 0) {
  warning(
    "Constant layer(s) cannot be normalized and will be NaN: ",
    paste(constant_layers, collapse = ", "),
    call. = FALSE
  )
}

# Min-max normalize each layer to [0, 1] to apply equal weight to variables
# (the vectors are applied layer by layer):
x_stack_norm <- (x_stack - layer_min) / layer_range


###-------------------------------------
### Split study region by state
###-------------------------------------


# Select each state's polygon by name rather than by row position, so the
# split does not depend on the row order of the GADM file:
washington_region <- study_region[study_region$NAME_1 == "Washington", ]
oregon_region <- study_region[study_region$NAME_1 == "Oregon", ]
stopifnot(nrow(washington_region) > 0, nrow(oregon_region) > 0)

# Since models are built using only Washington records, covariate reduction
# will be determined only by these data:
x_stack_WA <- terra::crop(x_stack_norm, washington_region, mask = TRUE)
plot(x_stack_WA$grassland)

# Collect Oregon data:
x_stack_OR <- terra::crop(x_stack_norm, oregon_region, mask = TRUE)
plot(x_stack_OR$grassland)


###-------------------------------------
### Reduce dimensionality 
###-------------------------------------


# Compute Pearson correlation matrix. The Washington stack is masked, so it
# contains NA cells; use = "complete.obs" restricts the calculation to cells
# with values in every layer instead of letting NA propagate into the matrix:
# NOTE(review): `use` is the NA-handling argument in terra releases whose
# layerCor() returns `$correlation` - confirm against the installed version.
corr_matrix <- terra::layerCor(
  x_stack_WA,
  "pearson",
  use = "complete.obs"
)$correlation

# Plot correlation matrix for visual inspection:
corrplot::corrplot(corr_matrix, order = "hclust")

# Determine which variables to remove (column indices of the matrix):
highly_correlated_covariates <- caret::findCorrelation(
  corr_matrix,
  cutoff = 0.8
)

# Identify redundant information (layer names matching those indices):
redundancies <- names(x_stack_WA)[highly_correlated_covariates]

# Remove highly correlated layers from both stacks. The same layers are dropped
# from Oregon so the two stacks keep identical covariates. Skipped when nothing
# exceeds the cutoff, to avoid subsetting with an empty selection:
if (length(redundancies) > 0) {
  x_stack_WA <- subset(x_stack_WA, redundancies, negate = TRUE)
  x_stack_OR <- subset(x_stack_OR, redundancies, negate = TRUE)
}

# Print the retained layer names for each state:
names(x_stack_WA)
names(x_stack_OR)


###-----------------------------------------------------------------------------
### Step 3: Save TIFF objects
###-----------------------------------------------------------------------------


# Save! Each stack is written to a GeoTIFF in the working directory, replacing
# any existing file of the same name:
output_stacks <- list(
  "x_stack_WA.tif" = x_stack_WA,
  "x_stack_OR.tif" = x_stack_OR
)
for (output_file in names(output_stacks)) {
  writeRaster(
    output_stacks[[output_file]],
    filename = output_file,
    overwrite = TRUE
  )
}


###-----------------------------------------------------------------------------
### EOC
###-----------------------------------------------------------------------------

