###Importing and cleaning the data, and running the Bayesian model using Stan

# Loading necessary packages ----------------------------------------------

library(tidyverse)
library(cmdstanr) #The user must have CmdStan installed

# Importing data ----------------------------------------------------------

##Note: The data required for this analysis can be downloaded from Dryad
##digital repository (https://doi.org/10.5061/dryad.76hdr7stg). The Stan
##model file is included in the PeerJ manuscript supplementary material. The
##Stan model will be compiled in a later section of the script below.

##Importing the raw data (in this case, from a subdirectory folder named "Data")
raw_data <- read_csv("Data/JAV_-_parasite_and_host_data.csv")

# Initial data cleaning and restructuring ---------------------------------

##Initial cleaning
ptarm_data <- raw_data %>% 
  mutate(across(where(is.character), str_trim), #Removing starting and trailing 
                                                #whitespace from character
                                                #strings
         Year = factor(CollYear)) %>% #Converting collection year to a factor
  rename("E. muta" = em_n, #Converting species codes to explicit names
         "E. rjupa" = er_n,
         "C. caudinflata" = cc_n, 
         "T. tenuis" = tt_n, 
         "P. serpentulus" = ps_n, 
         "M. c. tetrathyridia" = mctet, 
         "T. lagopi (presence)" = tlinf,
         "T. lagopi" = tltot, 
         "S. holoaspis" = shtot, 
         "M. islandicus" = mitot, 
         "M. borealis" = mbyof, 
         "M. lagopus (score)" = ml_14, 
         "M. lagopus" = ml_fi, #From filter
         "G. lagopi" = gltot, 
         "L. affinis" = latot, 
         "A. lagopi" = altot, 
         "O. chloropus" = oc_n, 
         "O. chloropus (pupae)" = ocpup,
         "C. garei (flea)" = Flea) %>% 
  filter(Phase == "early") %>%  #Removing late-sampled birds, as that different
                                #sampling protocol does not ensure accurate
                                #quantification of the ectoparasites
  dplyr::select(-c( #Removing columns for parasites whose prevalences were too
                    #low, or quantification methods were only proxies (see
                    #explanation in manuscript at doi: 10.1111/jav.02472)
    "P. serpentulus", "M. c. tetrathyridia", 
    "T. lagopi (presence)", "M. lagopus (score)",
    "O. chloropus (pupae)", "C. garei (flea)"
      ), -CollYear, -Phase) #Removing other unnecessary columns

##Converting main data to long format (each observation/row becomes one 
##parasite abundance in one host)
long_data <- ptarm_data %>% 
  pivot_longer(`E. muta`:`O. chloropus`, names_to = "Parasite",
               values_to = "Abundance") %>% 
  drop_na() %>% #Removing empty parasite abundance numbers
  mutate(Endo_ecto = ifelse(Parasite %in% c("E. muta", "E. rjupa", 
                                            "C. caudinflata", "T. tenuis"),
                            "Endoparasite",
                            "Ectoparasite"), #Specifying endoparasite vs.
                                             #ectoparasite
         Par_group = case_when( #Broad parasite grouping
           Parasite %in% c("E. muta", "E. rjupa") ~ "Coccidian", 
           Parasite %in% c("C. caudinflata", 
                           "T. tenuis") ~ "Nematode", 
           Parasite %in% c("A. lagopi", "L. affinis", 
                           "G. lagopi", "O. chloropus") ~ "Insect",
           TRUE ~ "Mite"
         )) %>% 
  mutate(across(where(is.character), factor)) #Converting character strings to
                                              #factors

# Preparing data for modeling ---------------------------------------------

##Function for Poulin's D calculation
d_calc <- function(x) {
  x_sorted <- sort(x)
  1 - (2 * sum(cumsum(x_sorted))) / 
    (mean(x) * length(x) * (length(x) + 1))
}

##Summarizing sample data for modeling
d_summ_data <- long_data %>% 
  group_by(Parasite, Endo_ecto, Par_group, Year) %>% 
  summarize(D = d_calc(Abundance), #Quantifying aggregation
            Mean_abund = mean(Abundance)) #Quantifying mean abundances

##Conversion of factors to numeric (for Stan), and transforming/
##standardizing mean abundances
d_prep_data <- d_summ_data %>% 
  ungroup() %>% 
  filter(across(where(is.numeric), ~ is.finite(.x))) %>% 
  mutate(across(where(is.factor), ~ as.numeric(.x)),
         Mean_abund = (log2(Mean_abund) - mean(log2(Mean_abund))) / 
           sd(log2(Mean_abund)))

##Final conversion to list, and providing vector lengths as variables for Stan
d_mod_data <- d_prep_data %>% 
  as.list() %>% 
  c(n = length(.$Mean_abund),
    n_Year = max(.$Year),
    n_Parasite = max(.$Parasite),
    n_Endo_ecto = max(.$Endo_ecto),
    n_Par_group = max(.$Par_group))

# Fitting Stan model ------------------------------------------------------

##Compiling the Stan model (here, saved in a subdirectory folder named 
##"Stan model"), and loading in R
stan_mod <- cmdstan_model("Stan model/Beta regression - best-fitting.stan")

##Running the MCMC chains in Stan and saving the model output
model_fit <- stan_mod$sample(data = d_mod_data, seed = 3, 
                             chains = 3, parallel_chains = 3,
                             iter_warmup = 2500, iter_sampling = 1500, 
                             max_treedepth = 10, adapt_delta = 0.99)

##Fitted model summary
##(Note that the parameters may not be listed in the exact same order as they
##appear in Table 1 of the manuscript, and that here phi is equivalent 
##to what is listed as nu in the manuscript.)
model_fit$summary(variables = c("theta", "beta", "phi"))

#-------------------------------------------------------------------------