---
title: "HOBO"
author: "Caroline Palmer"
date: "2025-01-25"
output: html_document
---

```{r preprocess data}

# Read and preprocess data
prevsub <- read_csv("prevsub.csv", col_types = cols(
  bleached = col_double(),
  diseased = col_double(),
  coral = col_double(),
  mean_temp = col_double(),
  max_temp = col_double(),
  min_temp = col_double(),
  macroalgae = col_double(),
  turf = col_double(),
  cyanobacteria = col_double(),
  caulerpa = col_double(),
  deadcoral = col_double(),
  cca = col_double(),
  other = col_double(), 
  rubble = col_double(),
  sand = col_double(),
  rock = col_double(),
  proppoc = col_double(),
  proppor = col_double(),
  proppav = col_double(),
  proppsam = col_double(),
  propoth = col_double(),
  poc_mbb = col_double(),
  por_mbb = col_double(),
  pav_mbb = col_double(),
  psam_mbb = col_double(),
  other_mbb = col_double(),
  poc_bleached = col_double(),
  por_bleached = col_double(),
  pav_bleached = col_double(),
  psamm_bleached = col_double(),
  other_corals_bleaching = col_double(),
  totalpoc = col_double(),
  totalpor = col_double(),
  totalpav = col_double(),
  totalpsamm = col_double(),
  totalother = col_double(),
  site = col_character(),
  date = col_character()  # Treat 'date' as character initially
))

# Parse 'date' into a proper Date format
prevsub <- prevsub %>% mutate(date = dmy(date))

# Remove rows where all values are NA
prevsub <- prevsub %>%
    filter(rowSums(is.na(.)) != ncol(.))


# Ensure 'date' format is consistent (redundant if using dmy, but kept as a check)
prevsub <- prevsub %>%
  mutate(date = as.Date(date, format = "%d/%m/%Y"))

# Convert site to factor
prevsub <- prevsub %>%
  mutate(site = as.factor(site))

# Add scaled date
prevsub <- prevsub %>%
  mutate(
    z_date = scale(as.numeric(date))
  )

# Ensure rows with missing 'bleached' but valid 'coral' are retained
prevsub <- prevsub %>%
  filter(!is.na(z_date) & is.finite(z_date) & (!is.na(bleached) | !is.na(coral)))


# Check for invalid dates
invalid_dates <- prevsub %>%
  filter(is.na(date))

if (nrow(invalid_dates) > 0) {
  print("Invalid dates found:")
  print(invalid_dates)
  # Remove invalid dates
  prevsub <- prevsub %>% filter(!is.na(date))
} else {
  print("All dates are valid!")
}


# Step 1: Convert NA to structural zeros for total bleaching columns
bleaching_cols <- c("poc_bleached", "por_bleached", "pav_bleached", "psamm_bleached", "other_corals_bleaching")
total_cols <- c("totalpoc", "totalpor", "totalpav", "totalpsamm", "totalother")

for (i in seq_along(bleaching_cols)) {
  prevsub[[bleaching_cols[i]]] <- ifelse(
    is.na(prevsub[[bleaching_cols[i]]]) & prevsub[[total_cols[i]]] == 0,
    0,
    prevsub[[bleaching_cols[i]]]
  )
}

# Step 2: Convert NA to structural zeros for mostly bleached and bleached data
species_cols <- c("poc_mbb", "por_mbb", "pav_mbb", "psam_mbb", "other_mbb")

for (i in seq_along(species_cols)) {
  prevsub[[species_cols[i]]] <- ifelse(
    is.na(prevsub[[species_cols[i]]]) & prevsub[[total_cols[i]]] == 0,
    0,
    prevsub[[species_cols[i]]]
  )
}

# Step 3: Constrain proportions for beta regression for additional columns
additional_proportion_cols <- c(
  "coral", "bleached", "diseased", "macroalgae", "turf", "cyanobacteria", "caulerpa", "cca", "rubble", "rock", "sand", "deadcoral", "other",
  "proppoc", "proppor", "proppav", "proppsam", "propoth",
  "poc_mbb", "por_mbb", "pav_mbb", "psam_mbb", "other_mbb",
  "poc_bleached", "por_bleached", "pav_bleached", "psamm_bleached", "other_corals_bleaching"
)

prevsub <- prevsub %>%
  mutate(across(all_of(additional_proportion_cols), ~ pmax(pmin(., 0.9999), 0.0001)))

# Step 4: Create structural zero indicators for all relevant columns
for (i in seq_along(total_cols)) {
  prevsub[[paste0(species_cols[i], "_structural_zero")]] <- ifelse(
    prevsub[[total_cols[i]]] == 0,
    1,
    0
  )
  prevsub[[paste0(bleaching_cols[i], "_structural_zero")]] <- ifelse(
    prevsub[[total_cols[i]]] == 0,
    1,
    0
  )
}

# Constrain response variables for zero_inflated_beta family
prevsub <- prevsub %>%
  mutate(
    macroalgae = pmax(pmin(macroalgae, 0.9999), 0.0001),
    caulerpa = pmax(pmin(caulerpa, 0.9999), 0.0001),
    turf = pmax(pmin(turf, 0.9999), 0.0001),
    cyanobacteria = pmax(pmin(cyanobacteria, 0.9999), 0.0001),
    cca = pmax(pmin(cca, 0.9999), 0.0001)
  )


# Filter for selected sites
selected_sites <- c("Tina", "Chorro", "San Josecito", "Gallos", "Cueva", "Barco Somero", "Esquina", "Este Intermedio", "Barco Profundo", "Jardin")
filtered_prevsub <- prevsub %>%
  filter(site %in% selected_sites) %>%
  mutate(site = droplevels(site)) 

# Save removed data for later use
removed_data <- filtered_prevsub %>%
    filter(site == "Barco Somero" & format(date, "%Y-%m") == "2024-03")

# Proceed with filtered data for analysis
filtered_prevsub <- filtered_prevsub %>%
    filter(!(site == "Barco Somero" & format(date, "%Y-%m") == "2024-03"))


# Preprocess temperature predictors
prevsub_clean <- prevsub %>%
    select(bleached, mean_temp, max_temp, min_temp) %>%
    drop_na()  # Remove rows with any missing values

if (nrow(prevsub_clean) == 0) {
    stop("Filtered dataset prevsub_clean is empty. Check mean_temp, max_temp, min_temp, or bleached columns.")
}
```


```{r Load required libraries}
library(dplyr)
library(tidyr)
library(stringr)

# Load data
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Clean column names
# Remove duplicate suffixes like ".1", ".2" from site names
names(hobo_data) <- names(hobo_data) %>%
  str_replace_all("\\.\\d+$", "")  # Remove suffixes like ".1", ".2"

# Pivot longer for site-wise temperature readings
hobo_long <- hobo_data %>%
  pivot_longer(
    cols = -c(Date, Time),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  mutate(
    temperature = as.numeric(temperature),  # Ensure temperature is numeric
    date = as.Date(Date, format = "%d/%m/%Y")  # Convert Date to Date class
  )

# Remove rows with completely missing temperature
hobo_long <- hobo_long %>%
  filter(!is.na(temperature))

# Aggregate daily statistics (mean, max, min) by site
daily_stats <- hobo_long %>%
  group_by(date, site) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"
  )

# Inspect the resulting dataset
head(daily_stats)

# Save cleaned data for future use
write.csv(daily_stats, "HOBO_daily_stats.csv", row.names = FALSE)

```

```{r calculate-monthly-stats-cleaned, echo=TRUE, message=FALSE, warning=FALSE}

# Load required libraries
library(dplyr)
library(lubridate)
library(stringr)
library(tidyr)

# Read the HOBO dataset
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Convert Date and Time to datetime format
hobo_data <- hobo_data %>%
  mutate(
    datetime = dmy_hms(paste(Date, Time)), # Combine Date and Time into a single datetime
    date = as.Date(datetime),              # Extract date
    month = floor_date(date, "month")      # Extract the month for aggregation
  )


# Pivot the data longer to gather temperature columns
# Assuming site names are columns (excluding Line#, Date, Time, and metadata)
temperature_columns <- setdiff(names(hobo_data), c("Line#", "Date", "Time", "datetime", "date", "month"))

hobo_long <- hobo_data %>%
  pivot_longer(
    cols = all_of(temperature_columns),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  mutate(
    site = str_trim(site),  # Trim whitespace from site names
    temperature = as.numeric(temperature)  # Convert temperature to numeric
  )


# Group by month and site to calculate mean, max, and min temperatures
monthly_stats <- hobo_long %>%
  group_by(month = floor_date(date, "month"), site) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"  # Ungroup the data after summarization
  )

# Inspect the resulting dataset
head(monthly_stats)

# Save the monthly statistics to a CSV file (optional)
write.csv(monthly_stats, "HOBO_monthly_stats_cleaned.csv", row.names = FALSE)


```


```{r compare-temperatures-fixed, echo=TRUE, message=FALSE, warning=FALSE}
# Load required libraries
library(dplyr)
library(ggplot2)

hobo_data <- read.csv("trimmed_hobo_data.csv", stringsAsFactors = FALSE)

# Step 1: Filter for the selected sites only
filtered_sites <- c("Barco.Somero", "Barco.Profundo", "Tina", "Ancla")
filtered_data <- hobo_data %>%
  filter(site %in% filtered_sites)


# Step 2: Calculate Monthly Statistics for the Selected Sites
filtered_data <- filtered_data %>%
  mutate(date = as.Date(date),  # ensure proper date format
         month = as.Date(format(date, "%Y-%m-01")))  # extract month

monthly_stats <- filtered_data %>%
  group_by(site, month) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"
  )

# Step 3: Remove rows with NA, NaN, or Inf in temperature columns
monthly_stats_clean <- monthly_stats %>%
  filter(
    !is.na(mean_temp) & !is.nan(mean_temp) & is.finite(mean_temp),
    !is.na(max_temp) & !is.nan(max_temp) & is.finite(max_temp),
    !is.na(min_temp) & !is.nan(min_temp) & is.finite(min_temp)
  )

# Step 4: Visualize Mean, Max, and Min Temperatures Among Sites
ggplot(monthly_stats_clean, aes(x = site, y = mean_temp)) +
  geom_boxplot(fill = "lightblue") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Mean Temperatures Among Selected Sites",
       x = "Site", y = "Mean Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(monthly_stats_clean, aes(x = site, y = max_temp)) +
  geom_boxplot(fill = "lightcoral") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Max Temperatures Among Selected Sites",
       x = "Site", y = "Max Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(monthly_stats_clean, aes(x = site, y = min_temp)) +
  geom_boxplot(fill = "lightgreen") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Min Temperatures Among Selected Sites",
       x = "Site", y = "Min Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Step 5: Visualize Trends Over Time
ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = mean_temp, color = site), size = 1) +
  labs(title = "Mean Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Mean Temperature (°C)") +
  theme_minimal()

ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = max_temp, color = site), size = 1) +
  labs(title = "Max Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Max Temperature (°C)") +
  theme_minimal()

ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = min_temp, color = site), size = 1) +
  labs(title = "Min Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Min Temperature (°C)") +
  theme_minimal()

# Step 6: Statistical Tests to Compare Temperatures Among Selected Sites
# ANOVA for mean temperature comparison
anova_mean <- aov(mean_temp ~ site, data = monthly_stats_clean)
summary(anova_mean)

# ANOVA for max temperature comparison
anova_max <- aov(max_temp ~ site, data = monthly_stats_clean)
summary(anova_max)

# ANOVA for min temperature comparison
anova_min <- aov(min_temp ~ site, data = monthly_stats_clean)
summary(anova_min)

# Post-hoc Tukey's Test for mean temperature
tukey_mean <- TukeyHSD(anova_mean)
print(tukey_mean)

# Post-hoc Tukey's Test for max temperature
tukey_max <- TukeyHSD(anova_max)
print(tukey_max)

# Post-hoc Tukey's Test for min temperature
tukey_min <- TukeyHSD(anova_min)
print(tukey_min)

# Save Tukey results to a file (optional)
capture.output(tukey_mean, file = "tukey_mean_results.txt")
capture.output(tukey_max, file = "tukey_max_results.txt")
capture.output(tukey_min, file = "tukey_min_results.txt")


```


```{r trim-hobo-data, echo=TRUE, message=FALSE, warning=FALSE}

# Load necessary libraries
library(dplyr)
library(lubridate)
library(tidyr)

# Step 1: Load the HOBO dataset
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Step 2: Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Step 3: Parse Date column only (ignore Time column)
hobo_data <- hobo_data %>%
  mutate(
    date = dmy(Date, quiet = TRUE)  # Parse the Date column safely
  )

# Check for issues with date parsing
if (any(is.na(hobo_data$date))) {
  cat("Warning: Some Date values could not be parsed. Check the Date format.\n")
}

# Step 4: Remove rows with NA values in critical columns (date and temperature columns)
hobo_data <- hobo_data %>%
  drop_na(date)  # Remove rows where date is NA

# Step 5: Pivot data to long format
temperature_columns <- setdiff(names(hobo_data), c("Line#", "Date", "Time", "date"))
hobo_long <- hobo_data %>%
  pivot_longer(
    cols = all_of(temperature_columns),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  drop_na(temperature)  # Remove rows where temperature is NA

# Step 6: Identify dates with maximum site overlap
common_dates <- hobo_long %>%
  group_by(date) %>%                      # Group by date
  summarize(
    sites_with_data = n_distinct(site),   # Count distinct sites with data for each date
    site_list = list(unique(site))        # Store the list of sites with data for each date
  ) %>%
  arrange(desc(sites_with_data))          # Sort by the number of sites with data

# Step 7: Filter the dataset to retain only dates with maximum overlap
# Choose the maximum number of sites shared on any given date
max_sites <- max(common_dates$sites_with_data, na.rm = TRUE)
dates_with_max_sites <- common_dates %>%
  filter(sites_with_data == max_sites) %>%
  pull(date)

# Filter the dataset for these dates
trimmed_hobo_data <- hobo_long %>%
  filter(date %in% dates_with_max_sites)

# Debugging: Print out which dates and sites were retained
cat("Dates retained with maximum overlap:\n")
print(common_dates %>%
        filter(date %in% dates_with_max_sites))

# Step 8: Save the trimmed dataset to a CSV file
write.csv(trimmed_hobo_data, "trimmed_HOBO_data.csv", row.names = FALSE)

# Optional: Print a preview of the trimmed dataset
head(trimmed_hobo_data)

```


## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r cars}
summary(cars)
```

## Including Plots

You can also embed plots, for example:

```{r pressure, echo=FALSE}
plot(pressure)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.