# Step 2 - collect species occurrence records, define pseudo-absences, and 
# compile/clean records for making spatial predictions. Code was written and 
# run using RStudio and assumes the environment was manually set up via: 
#
# 'RStudio --> Session --> Set Working Directory --> Choose Directory'.


###-----------------------------------------------------------------------------
### Environment setup
###-----------------------------------------------------------------------------


# Start from an empty workspace: remove every object (hidden ones included)
# and run the garbage collector. Attached packages and session options are
# not affected by this step:
rm(list = ls(all.names = TRUE))
gc()

# Load necessary packages:
library(CoordinateCleaner)
library(countrycode)
library(dplyr)
library(ggplot2)
library(raster)
library(rgbif)
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)
library(terra)

# Washington state boundary from the GADM level-1 polygons of the United
# States:
WA_border <- dplyr::filter(
  st_as_sf(readRDS("Geodata/gadm/gadm41_USA_1_pk.rds")),
  NAME_1 == "Washington"
)

# Oregon state boundary from the same GADM level-1 file:
OR_border <- dplyr::filter(
  st_as_sf(readRDS("Geodata/gadm/gadm41_USA_1_pk.rds")),
  NAME_1 == "Oregon"
)

# Backdrop polygons used in the plots: state-level polygons for Canada and
# for the United States (Alaska and Hawaii are dropped):
Canada_border <- ne_states(country = "canada")

USA_states <- dplyr::filter(
  ne_states(country = "united states of america"),
  !name %in% c("Alaska", "Hawaii")
)

### Literature notes:

# Kilometers per mile (conversion multiplier):
kilo_convert <- 1.609344

# Daily travel distance in meters, rounded to the nearest 100 m. Chukars may
# travel 3-5 miles a day to obtain resources (e.g., water).
# NOTE(review): the original note cites the 4-mile mean, but the multiplier
# used here is 3 miles (about 4800 m) - confirm which value is intended.
daily_travel <- round(x = 1000 * kilo_convert * 3, digits = -2)

# County-level records related to Galbreath & Moreland (1953):
records_WA <- read.csv(file = "Data/Galbreath_Moreland_1953_counties.csv")

# County-level Oregon hunting records:
records_OR <- read.csv(file = "Data/Oregon_DFW_harvest_counties.csv")


###-----------------------------------------------------------------------------
### Clean and combine shapefiles
###-----------------------------------------------------------------------------


# Washington counties (GADM level-2 polygons) used for model training, with
# the Galbreath & Moreland (1953) record columns appended. cbind() pairs rows
# by position, so the CSV is assumed to list counties in the same order as
# the GADM subset - TODO confirm:
WA_counties <- cbind(
  dplyr::filter(
    st_as_sf(readRDS("Geodata/gadm/gadm41_USA_2_pk.rds")),
    NAME_1 == "Washington"
  ),
  records_WA
)

# Oregon counties with the Oregon hunting record columns appended (same
# positional pairing assumption as above):
OR_counties <- cbind(
  dplyr::filter(
    st_as_sf(readRDS("Geodata/gadm/gadm41_USA_2_pk.rds")),
    NAME_1 == "Oregon"
  ),
  records_OR
)

# Split Washington counties on the `Recorded` flag (1 = recorded, 0 = not):
counties_recorded     <- WA_counties[WA_counties$Recorded == 1, ]
counties_recorded_not <- WA_counties[WA_counties$Recorded == 0, ]

# One layer per `Result` code: 1 = success, 0 = unsure, -1 = failure among the
# recorded counties; -2 = extra among the non-recorded counties:
recorded_success <- counties_recorded[counties_recorded$Result == 1, ]
recorded_unsure  <- counties_recorded[counties_recorded$Result == 0, ]
recorded_failure <- counties_recorded[counties_recorded$Result == -1, ]
recorded_extra   <- counties_recorded_not[counties_recorded_not$Result == -2, ]

# Quick visual check of the Washington classes, drawn over all counties:
plot(sf::st_geometry(WA_counties))
plot(sf::st_geometry(recorded_success), col = "#0072B2", add = TRUE)
plot(sf::st_geometry(recorded_failure), col = "#D55E00", add = TRUE)
plot(sf::st_geometry(recorded_unsure), col = "#999999", add = TRUE)
plot(sf::st_geometry(recorded_extra), col = "white", add = TRUE)

# Recorded counties (success, failure, unsure) carried into spatial
# processing:
modeling_counties <- rbind(
  recorded_success,
  recorded_failure,
  recorded_unsure
)


###-------------------------------------
# SUPPLEMENTARY PLOT 1:
###-------------------------------------


# Plot Oregon records for visual confirmation. Centroid coordinates (columns
# X and Y) are attached to each county so the labels can be positioned:
world_points <- cbind(OR_counties, st_coordinates(st_centroid(OR_counties$geometry)))

# Set colorblind-friendly palette, keyed by the value of `Unit`:
custom_colors <- c(
  "1" = "#E69F00",
  "2" = "#56B4E9",
  "3" = "#009E73",
  "4" = "#F0E442",
  "5" = "#0072B2",
  "6" = "#D55E00",
  "7" = "#CC79A7")

# Plot OR records: counties filled by `Unit`, labelled with the county name
# (NAME_2). Only the fill scale is set because no colour aesthetic is mapped:
ggplot(data = world_points) +
  geom_sf(aes(fill = factor(Unit)), color = "black") +
  geom_text(aes(x = X, y = Y, label = NAME_2),
            color = "black", check_overlap = FALSE) +
  theme_classic() +
  scale_fill_manual(values = custom_colors) +
  xlab("Longitude") +
  ylab("Latitude") +
  labs(fill = "Unit")


###-----------------------------------------------------------------------------
### STEP 2: GBIF occurrences
###-----------------------------------------------------------------------------


# NOTE: This process is used to pull data from GBIF. The code in this section
# provides the initial download procedure (commented out to prevent new
# queries) and the query download key for the example used in this study:

# ---- Uncomment the following to run query ----:

# # Download GBIF occurrence data for this species:
# gbif_download <- rgbif::occ_download(pred("taxonKey", 2474029),
#                                      pred("HAS_COORDINATE", TRUE),
#                                      pred_in("stateProvince", c("Washington", "Oregon")),
#                                      pred_gte("year", 2014),
#                                      pred_lte("year", 2023),
#                                      format = "SIMPLE_CSV"
# )
# 
# # Import occurrence records:
# occ_pts <- occ_download_get(gbif_download) %>%
#   occ_download_import()

# ---- The following is the product of the above query search ----:

# Fetch and import the archived GBIF download used in this study:
occ_pts <- occ_download_import(occ_download_get("0001719-250515123054153"))

# Citation for this download:
gbif_citation(occ_download_meta("0001719-250515123054153"))

# Clean the records, then convert them to sf points (EPSG:4326):
#   * keep records marked PRESENT; drop fossil and living specimens;
#   * keep coordinate uncertainty that is missing or no larger than
#     `daily_travel`, and drop the uncertainty values 301, 3036, 999 and 9999
#     (presumably placeholder/default values - TODO confirm);
#   * drop records located at exactly latitude 0 and longitude 0;
#   * run the CoordinateCleaner checks, using `daily_travel` as the buffer
#     where one applies;
#   * keep one record per unique coordinate pair.
occ_pts <- occ_pts %>%
  dplyr::filter(
    occurrenceStatus == "PRESENT",
    !basisOfRecord %in% c("FOSSIL_SPECIMEN", "LIVING_SPECIMEN"),
    is.na(coordinateUncertaintyInMeters) |
      coordinateUncertaintyInMeters <= daily_travel,
    !coordinateUncertaintyInMeters %in% c(301, 3036, 999, 9999),
    decimalLatitude != 0 | decimalLongitude != 0
  ) %>%
  # remove country centroids within the daily travel distance:
  CoordinateCleaner::cc_cen(buffer = daily_travel) %>%
  # remove capital centroids within the daily travel distance:
  CoordinateCleaner::cc_cap(buffer = daily_travel) %>%
  # remove zoos and herbaria within the daily travel distance:
  CoordinateCleaner::cc_inst(buffer = daily_travel) %>%
  # remove records falling in the ocean:
  CoordinateCleaner::cc_sea() %>%
  dplyr::distinct(decimalLongitude, decimalLatitude, .keep_all = TRUE) %>%
  dplyr::rename(long = decimalLongitude, lat = decimalLatitude) %>%
  sf::st_as_sf(coords = c("long", "lat"), crs = 4326)


###-------------------------------------
# Partition records by state:
###-------------------------------------


# Records whose stateProvince is Washington:
occ_pts_WA <- dplyr::filter(occ_pts, stateProvince == "Washington")

# Records whose stateProvince is Oregon:
occ_pts_OR <- dplyr::filter(occ_pts, stateProvince == "Oregon")


###-----------------------------------------------------------------------------
### STEP 3a: WASHINGTON data prep
###-----------------------------------------------------------------------------


# Determine which counties have recorded hunting bagging. The flag is read
# from the `Harvest` column that cbind() copied into WA_counties from
# records_WA, so the filter does not rely on a separate, positionally aligned
# vector:
harvest_counties_WA <- WA_counties %>%
  filter(Harvest == 1)

# Keep only occurrences that fall within harvest counties. Records are from
# the Washington Department of Fish & Wildlife 2013-24 harvest records
# <https://wdfw.wa.gov/hunting/management/game-harvest>:
occ_pts_WA <- occ_pts_WA[lengths(st_intersects(occ_pts_WA, harvest_counties_WA)) > 0, ]

# Plot study area (Figure 1). Layers, bottom to top: US states and Canada as
# backdrop, all Washington counties in white, recorded counties filled by
# `Result`, and the retained occurrence points as yellow diamonds:
ggplot(data = WA_counties) +
  geom_sf(data = USA_states, fill = "antiquewhite") +
  geom_sf(data = Canada_border, fill = "antiquewhite") +
  geom_sf(color = "black", fill = "white", linewidth = 0.75) +
  scale_fill_manual(breaks = c(1, -1, 0),
                    labels = c("Success", "Failure", "Inconclusive"),
                    values = c("#0072B2", "#D55E00", "#999999")) +
  geom_sf(data = counties_recorded, color = "black",
          aes(fill = factor(Result)), linewidth = 0.65) +
  geom_sf(data = occ_pts_WA, shape = 23, fill = "yellow", color = "black",
          size = 1.5) +
  xlim(-125, -117) +
  ylim(45.5, 49) +
  labs(fill = "Status") +
  xlab("Longitude") +
  ylab("Latitude") +
  # `linewidth` replaces the deprecated `size` argument of element_line(), and
  # the legend title is centred through element_text(hjust = ) instead of the
  # deprecated `legend.title.align`:
  theme(panel.grid.major = element_line(color = gray(0.5),
                                        linetype = "dashed",
                                        linewidth = 0.5),
        panel.background = element_rect(fill = "aliceblue"),
        legend.title = element_text(hjust = 0.5))


###-------------------------------------
###  Define range buffer
###-------------------------------------


# Create a daily buffer around occurrences.  Accounts for travel and error.
# s2 is switched on explicitly so that `daily_travel` is applied as a distance
# in meters on the lon/lat points. This script switches s2 off further down
# and the setting persists for the session, so without this call a re-run in
# the same session would buffer unprojected coordinates with GEOS instead.
# NOTE(review): with s2 enabled sf may ignore the GEOS-specific styling
# arguments (endCapStyle, joinStyle, nQuadSegs) - confirm in ?sf::st_buffer.
sf_use_s2(TRUE)
occ_pts_WA_buffer <- occ_pts_WA %>%
  st_buffer(daily_travel,
            endCapStyle = "flat",
            joinStyle = "mitre",
            nQuadSegs = 6) %>%
  st_union() %>%
  st_sf()

# Limit buffer to study range (s2 is switched off for the overlay):
sf_use_s2(FALSE)
occ_pts_WA_buffer <- st_intersection(occ_pts_WA_buffer, WA_border)


###-------------------------------------
### Partition training & testing spaces 
###-------------------------------------


# Training presence space: the occurrence buffer inside the success counties:
occ_training_space <- st_intersection(occ_pts_WA_buffer, recorded_success)

# Testing presence space: the occurrence buffer inside the unsure and extra
# counties:
occ_testing_space_WA <- st_intersection(
  occ_pts_WA_buffer,
  rbind(recorded_unsure, recorded_extra)
)

# Training pseudo-absence space (species assumed absent): success counties
# with the occurrence buffer removed, plus the failure counties. Only the
# geometry column is kept:
sf_use_s2(FALSE)
pseudo_abs_WA_training_space <- rbind(
  dplyr::select(st_difference(recorded_success, occ_pts_WA_buffer), geometry),
  dplyr::select(recorded_failure, geometry)
)

# Testing pseudo-absence space (species assumed absent): unsure and extra
# counties with the occurrence buffer removed:
pseudo_abs_WA_testing_space <- st_difference(
  rbind(recorded_unsure, recorded_extra),
  occ_pts_WA_buffer
)

# Visual confirmation of the four spaces over the state outline:
plot(WA_border$geometry)
plot(occ_training_space$geometry, col = "#0021A5", add = TRUE)
plot(occ_testing_space_WA$geometry, col = "lightblue", add = TRUE)
plot(pseudo_abs_WA_training_space$geometry, col = "yellow", add = TRUE)
plot(pseudo_abs_WA_testing_space$geometry, col = "#F2A900", add = TRUE)


###-------------------------------------
### Extract records
###-------------------------------------


# Predictor raster stack for Washington; print its layer names:
x_stack_WA <- terra::rast("x_stack_WA.tif")
names(x_stack_WA)

# Fix the random seed so the pseudo-absence samples below are reproducible:
set.seed(8)


###-------------------------------------
# Train data 
###-------------------------------------


# Occurrence points that fall inside the success (training) counties:
train_occ_pts_WA <- st_intersection(occ_pts_WA, recorded_success)

# Predictor values (plus x/y coordinates) at the training occurrences,
# labelled as presences:
train_occ <- terra::extract(
  x_stack_WA,
  terra::vect(train_occ_pts_WA),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 1)

# Random pseudo-absences from the training space, one per training
# occurrence, labelled as absences:
train_abs <- terra::extract(
  x_stack_WA,
  pseudo_abs_WA_training_space %>%
    st_sample(nrow(train_occ)) %>%
    terra::vect(),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 0)


###-------------------------------------
# Test data - all points 
###-------------------------------------


# Test presences: every Washington occurrence that was not used for training
# (matched on gbifID), with predictor values and x/y coordinates:
test_occ_all <- terra::extract(
  x_stack_WA,
  occ_pts_WA %>%
    dplyr::filter(!gbifID %in% c(train_occ_pts_WA$gbifID)) %>%
    terra::vect(),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 1)

# 10000 random pseudo-absence points from the testing space (a sample size
# equal to nrow(test_occ_all) was the alternative considered):
test_abs_all_sample <- st_sample(pseudo_abs_WA_testing_space, 10000)

# Predictor values and x/y coordinates for all sampled pseudo-absences:
test_abs_all <- terra::extract(
  x_stack_WA,
  terra::vect(test_abs_all_sample),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 0)


###-------------------------------------
# Test data - unsure locations
###-------------------------------------


# Test presences restricted to the unsure counties:
test_occ_unsure <- terra::extract(
  x_stack_WA,
  terra::vect(st_intersection(occ_pts_WA, recorded_unsure)),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 1)

# Pseudo-absence space restricted to the unsure counties, added in gray to
# the currently open plot:
pseudo_abs_WA_space_unsure <- st_intersection(
  recorded_unsure,
  pseudo_abs_WA_testing_space
)
plot(pseudo_abs_WA_space_unsure$geometry, add = TRUE, col = "gray")

# Reuse the pseudo-absence points already sampled for the full testing space,
# keeping only those that fall inside the unsure space:
test_abs_unsure <- terra::extract(
  x_stack_WA,
  terra::vect(
    st_intersection(test_abs_all_sample, pseudo_abs_WA_space_unsure)
  ),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 0)


###-------------------------------------
# Combine data frames 
###-------------------------------------


# Training set: presences stacked on pseudo-absences; rows with any missing
# value are dropped:
train_data <- na.omit(rbind(train_occ, train_abs))
summary(train_data)

# Test set for counties outside the training range:
test_data_all <- na.omit(rbind(test_occ_all, test_abs_all))
summary(test_data_all)

# Test set for the unsure counties only:
test_data_unsure <- na.omit(rbind(test_occ_unsure, test_abs_unsure))
summary(test_data_unsure)


###-----------------------------------------------------------------------------
### STEP 3b: OREGON data prep
###-----------------------------------------------------------------------------


# Determine which counties have recorded hunting bagging. The flag is read
# from the `Harvest` column that cbind() copied into OR_counties from
# records_OR, so the filter does not rely on a separate, positionally aligned
# vector:
harvest_counties_OR <- OR_counties %>%
  filter(Harvest == 1)

# Retain points in confirmed harvest counties:
occ_pts_OR <- occ_pts_OR[lengths(st_intersects(occ_pts_OR, harvest_counties_OR)) > 0, ]

# Buffer every Oregon occurrence by the daily travel distance (covers travel
# and positional error), then dissolve the buffers into a single feature.
# s2 is switched on for the buffering step:
sf_use_s2(TRUE)
occ_pts_OR_buffer <- st_union(
  st_buffer(
    occ_pts_OR,
    daily_travel,
    endCapStyle = "flat",
    joinStyle = "mitre",
    nQuadSegs = 6
  )
) %>%
  st_sf()

# Clip the buffer to the Oregon boundary, with s2 switched off for the
# overlay:
sf_use_s2(FALSE)
occ_pts_OR_buffer <- st_intersection(occ_pts_OR_buffer, OR_border)


###-------------------------------------
### Partition training and testing spaces 
###-------------------------------------


# Pseudo-absence space for Oregon (species assumed absent): the state with
# the occurrence buffer removed; only the geometry column is kept:
pseudo_abs_OR_space <- dplyr::select(
  st_difference(OR_border, occ_pts_OR_buffer),
  geometry
)

# Predictor raster stack for Oregon; print its layer names:
x_stack_OR <- terra::rast("x_stack_OR.tif")
names(x_stack_OR)

# Test presences: predictor values and x/y coordinates at the retained Oregon
# occurrences:
test_occ_OR <- terra::extract(
  x_stack_OR,
  terra::vect(occ_pts_OR),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 1)

# Fix the random seed so the pseudo-absence sample is reproducible:
set.seed(8)

# 10000 random pseudo-absences from the Oregon pseudo-absence space:
test_abs_OR <- terra::extract(
  x_stack_OR,
  pseudo_abs_OR_space %>%
    st_sample(10000) %>%
    terra::vect(),
  ID = FALSE, xy = TRUE
) %>%
  dplyr::mutate(Status = 0)


###-------------------------------------
# Combine data frames 
###-------------------------------------


# Oregon test set: presences stacked on pseudo-absences; rows with any
# missing value are dropped:
test_data_OR <- na.omit(rbind(test_occ_OR, test_abs_OR))

# Review records:
summary(test_data_OR)


#-------------------------------------------------------------------------------
### STEP 4: Save RDS objects 
#-------------------------------------------------------------------------------


# Write the training set and the three test sets to the RDS_objects folder:
saveRDS(train_data, "RDS_objects/train_data.rds")
saveRDS(test_data_all, "RDS_objects/test_data_all.rds")
saveRDS(test_data_unsure, "RDS_objects/test_data_unsure.rds")
saveRDS(test_data_OR, "RDS_objects/test_data_OR.rds")


###-----------------------------------------------------------------------------
### EOC
###-----------------------------------------------------------------------------
