--- title: "HOBO" author: "Caroline Palmer" date: "2025-01-25" output: html_document --- ```{r preprocess data} # Read and preprocess data prevsub <- read_csv("prevsub.csv", col_types = cols( bleached = col_double(), diseased = col_double(), coral = col_double(), mean_temp = col_double(), max_temp = col_double(), min_temp = col_double(), macroalgae = col_double(), turf = col_double(), cyanobacteria = col_double(), caulerpa = col_double(), deadcoral = col_double(), cca = col_double(), other = col_double(), rubble = col_double(), sand = col_double(), rock = col_double(), proppoc = col_double(), proppor = col_double(), proppav = col_double(), proppsam = col_double(), propoth = col_double(), poc_mbb = col_double(), por_mbb = col_double(), pav_mbb = col_double(), psam_mbb = col_double(), other_mbb = col_double(), poc_bleached = col_double(), por_bleached = col_double(), pav_bleached = col_double(), psamm_bleached = col_double(), other_corals_bleaching = col_double(), totalpoc = col_double(), totalpor = col_double(), totalpav = col_double(), totalpsamm = col_double(), totalother = col_double(), site = col_character(), date = col_character() # Treat 'date' as character initially )) # Parse 'date' into a proper Date format prevsub <- prevsub %>% mutate(date = dmy(date)) # Remove rows where all values are NA prevsub <- prevsub %>% filter(rowSums(is.na(.)) != ncol(.)) # Ensure 'date' format is consistent (redundant if using dmy, but kept as a check) prevsub <- prevsub %>% mutate(date = as.Date(date, format = "%d/%m/%Y")) # Convert site to factor prevsub <- prevsub %>% mutate(site = as.factor(site)) # Add scaled date prevsub <- prevsub %>% mutate( z_date = scale(as.numeric(date)) ) # Ensure rows with missing 'bleached' but valid 'coral' are retained prevsub <- prevsub %>% filter(!is.na(z_date) & is.finite(z_date) & (!is.na(bleached) | !is.na(coral))) # Check for invalid dates invalid_dates <- prevsub %>% filter(is.na(date)) if (nrow(invalid_dates) > 0) { print("Invalid dates found:") print(invalid_dates) # Remove invalid dates prevsub <- prevsub %>% filter(!is.na(date)) } else { print("All dates are valid!") } # Step 1: Convert NA to structural zeros for total bleaching columns bleaching_cols <- c("poc_bleached", "por_bleached", "pav_bleached", "psamm_bleached", "other_corals_bleaching") total_cols <- c("totalpoc", "totalpor", "totalpav", "totalpsamm", "totalother") for (i in seq_along(bleaching_cols)) { prevsub[[bleaching_cols[i]]] <- ifelse( is.na(prevsub[[bleaching_cols[i]]]) & prevsub[[total_cols[i]]] == 0, 0, prevsub[[bleaching_cols[i]]] ) } # Step 2: Convert NA to structural zeros for mostly bleached and bleached data species_cols <- c("poc_mbb", "por_mbb", "pav_mbb", "psam_mbb", "other_mbb") for (i in seq_along(species_cols)) { prevsub[[species_cols[i]]] <- ifelse( is.na(prevsub[[species_cols[i]]]) & prevsub[[total_cols[i]]] == 0, 0, prevsub[[species_cols[i]]] ) } # Step 3: Constrain proportions for beta regression for additional columns additional_proportion_cols <- c( "coral", "bleached", "diseased", "macroalgae", "turf", "cyanobacteria", "caulerpa", "cca", "rubble", "rock", "sand", "deadcoral", "other", "proppoc", "proppor", "proppav", "proppsam", "propoth", "poc_mbb", "por_mbb", "pav_mbb", "psam_mbb", "other_mbb", "poc_bleached", "por_bleached", "pav_bleached", "psamm_bleached", "other_corals_bleaching" ) prevsub <- prevsub %>% mutate(across(all_of(additional_proportion_cols), ~ pmax(pmin(., 0.9999), 0.0001))) # Step 4: Create structural zero indicators for all relevant columns for (i in 

```{r hobo-daily-stats}
# Load required libraries
library(dplyr)
library(tidyr)
library(stringr)

# Load data
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Clean column names: remove duplicate suffixes like ".1", ".2" from site names
names(hobo_data) <- names(hobo_data) %>%
  str_replace_all("\\.\\d+$", "")

# Pivot longer for site-wise temperature readings
hobo_long <- hobo_data %>%
  pivot_longer(
    cols = -c(Date, Time),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  mutate(
    temperature = as.numeric(temperature),       # Ensure temperature is numeric
    date = as.Date(Date, format = "%d/%m/%Y")    # Convert Date to Date class
  )

# Remove rows with completely missing temperature
hobo_long <- hobo_long %>%
  filter(!is.na(temperature))

# Aggregate daily statistics (mean, max, min) by site
daily_stats <- hobo_long %>%
  group_by(date, site) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"
  )

# Inspect the resulting dataset
head(daily_stats)

# Save cleaned data for future use
write.csv(daily_stats, "HOBO_daily_stats.csv", row.names = FALSE)
```
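
A quick, optional coverage check of the daily aggregation can help flag loggers with short or patchy records before the site comparisons later in the document. The sketch below uses only columns already present in `daily_stats`.

```{r hobo-daily-coverage, echo=TRUE}
# Optional coverage check (illustrative): number of daily records and the date
# span available per site in the aggregated logger data.
daily_stats %>%
  group_by(site) %>%
  summarize(
    n_days = n(),
    first_day = min(date),
    last_day = max(date),
    .groups = "drop"
  ) %>%
  arrange(desc(n_days))
```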

```{r calculate-monthly-stats-cleaned, echo=TRUE, message=FALSE, warning=FALSE}
# Load required libraries
library(dplyr)
library(lubridate)
library(stringr)
library(tidyr)

# Read the HOBO dataset
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Convert Date and Time to datetime format
hobo_data <- hobo_data %>%
  mutate(
    datetime = dmy_hms(paste(Date, Time)),  # Combine Date and Time into a single datetime
    date = as.Date(datetime),               # Extract date
    month = floor_date(date, "month")       # Extract the month for aggregation
  )

# Pivot the data longer to gather temperature columns
# Assuming site names are columns (excluding Line#, Date, Time, and derived metadata)
temperature_columns <- setdiff(names(hobo_data),
                               c("Line#", "Date", "Time", "datetime", "date", "month"))

hobo_long <- hobo_data %>%
  pivot_longer(
    cols = all_of(temperature_columns),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  mutate(
    site = str_trim(site),                 # Trim whitespace from site names
    temperature = as.numeric(temperature)  # Convert temperature to numeric
  )

# Group by month and site to calculate mean, max, and min temperatures
monthly_stats <- hobo_long %>%
  group_by(month, site) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"  # Ungroup the data after summarization
  )

# Inspect the resulting dataset
head(monthly_stats)

# Save the monthly statistics to a CSV file (optional)
write.csv(monthly_stats, "HOBO_monthly_stats_cleaned.csv", row.names = FALSE)
```
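
Monthly maxima and minima can be distorted by site-months in which a logger recorded for only part of the month, so an optional check of reading counts per site and month may be worth running before interpreting the summaries. The sketch below reuses `hobo_long` from the chunk above; the flagging threshold is arbitrary and purely illustrative.

```{r monthly-reading-counts, echo=TRUE}
# Optional check (illustrative): number of logger readings contributing to each
# site-month summary; sparsely covered months can give misleading max/min values.
reading_counts <- hobo_long %>%
  filter(!is.na(temperature)) %>%
  group_by(month, site) %>%
  summarize(n_readings = n(), .groups = "drop")

# Flag site-months with few readings (threshold of 100 is arbitrary/illustrative)
reading_counts %>% filter(n_readings < 100)
```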

```{r compare-temperatures-fixed, echo=TRUE, message=FALSE, warning=FALSE}
# Load required libraries
library(dplyr)
library(ggplot2)

# Note: trimmed_HOBO_data.csv is written by the trim-hobo-data chunk below, so run that chunk first
hobo_data <- read.csv("trimmed_HOBO_data.csv", stringsAsFactors = FALSE)

# Step 1: Filter for the selected sites only
filtered_sites <- c("Barco.Somero", "Barco.Profundo", "Tina", "Ancla")
filtered_data <- hobo_data %>%
  filter(site %in% filtered_sites)

# Step 2: Calculate monthly statistics for the selected sites
filtered_data <- filtered_data %>%
  mutate(
    date = as.Date(date),                      # Ensure proper date format
    month = as.Date(format(date, "%Y-%m-01"))  # Extract month
  )

monthly_stats <- filtered_data %>%
  group_by(site, month) %>%
  summarize(
    mean_temp = mean(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    min_temp = min(temperature, na.rm = TRUE),
    .groups = "drop"
  )

# Step 3: Remove rows with NA, NaN, or Inf in temperature columns
monthly_stats_clean <- monthly_stats %>%
  filter(
    is.finite(mean_temp),  # is.finite() is FALSE for NA, NaN, and Inf
    is.finite(max_temp),
    is.finite(min_temp)
  )

# Step 4: Visualize mean, max, and min temperatures among sites
ggplot(monthly_stats_clean, aes(x = site, y = mean_temp)) +
  geom_boxplot(fill = "lightblue") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Mean Temperatures Among Selected Sites",
       x = "Site", y = "Mean Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(monthly_stats_clean, aes(x = site, y = max_temp)) +
  geom_boxplot(fill = "lightcoral") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Max Temperatures Among Selected Sites",
       x = "Site", y = "Max Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(monthly_stats_clean, aes(x = site, y = min_temp)) +
  geom_boxplot(fill = "lightgreen") +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Comparison of Min Temperatures Among Selected Sites",
       x = "Site", y = "Min Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Step 5: Visualize trends over time
ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = mean_temp, color = site), linewidth = 1) +
  labs(title = "Mean Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Mean Temperature (°C)") +
  theme_minimal()

ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = max_temp, color = site), linewidth = 1) +
  labs(title = "Max Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Max Temperature (°C)") +
  theme_minimal()

ggplot(monthly_stats_clean, aes(x = month)) +
  geom_line(aes(y = min_temp, color = site), linewidth = 1) +
  labs(title = "Min Temperature Trends Over Time (Selected Sites)",
       x = "Month", y = "Min Temperature (°C)") +
  theme_minimal()

# Step 6: Statistical tests to compare temperatures among selected sites
# ANOVA for mean temperature comparison
anova_mean <- aov(mean_temp ~ site, data = monthly_stats_clean)
summary(anova_mean)

# ANOVA for max temperature comparison
anova_max <- aov(max_temp ~ site, data = monthly_stats_clean)
summary(anova_max)

# ANOVA for min temperature comparison
anova_min <- aov(min_temp ~ site, data = monthly_stats_clean)
summary(anova_min)

# Post-hoc Tukey's test for mean temperature
tukey_mean <- TukeyHSD(anova_mean)
print(tukey_mean)

# Post-hoc Tukey's test for max temperature
tukey_max <- TukeyHSD(anova_max)
print(tukey_max)

# Post-hoc Tukey's test for min temperature
tukey_min <- TukeyHSD(anova_min)
print(tukey_min)

# Save Tukey results to a file (optional)
capture.output(tukey_mean, file = "tukey_mean_results.txt")
capture.output(tukey_max, file = "tukey_max_results.txt")
capture.output(tukey_min, file = "tukey_min_results.txt")
```
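
The ANOVAs above assume approximately normal, equal-variance residuals. An optional diagnostic sketch for the mean-temperature model is shown below; it uses base R's residual plots and a Shapiro-Wilk test purely for illustration, and the same checks could be repeated for `anova_max` and `anova_min`.

```{r anova-diagnostics, echo=TRUE}
# Optional diagnostics (illustrative): residual plots and a normality test for
# the mean-temperature ANOVA fitted above.
par(mfrow = c(1, 2))
plot(anova_mean, which = 1)  # Residuals vs fitted values (spread should look even)
plot(anova_mean, which = 2)  # Normal Q-Q plot of residuals
par(mfrow = c(1, 1))

shapiro.test(residuals(anova_mean))  # Shapiro-Wilk test of residual normality
```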

```{r trim-hobo-data, echo=TRUE, message=FALSE, warning=FALSE}
# Load necessary libraries
library(dplyr)
library(lubridate)
library(tidyr)

# Step 1: Load the HOBO dataset
hobo_data <- read.csv("HOBO.csv", stringsAsFactors = FALSE)

# Step 2: Replace empty cells with NA
hobo_data <- hobo_data %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, .)))

# Step 3: Parse Date column only (ignore Time column)
hobo_data <- hobo_data %>%
  mutate(date = dmy(Date, quiet = TRUE))  # Parse the Date column safely

# Check for issues with date parsing
if (any(is.na(hobo_data$date))) {
  cat("Warning: Some Date values could not be parsed. Check the Date format.\n")
}

# Step 4: Remove rows with NA values in critical columns (date and temperature columns)
hobo_data <- hobo_data %>%
  drop_na(date)  # Remove rows where date is NA

# Step 5: Pivot data to long format
temperature_columns <- setdiff(names(hobo_data), c("Line#", "Date", "Time", "date"))

hobo_long <- hobo_data %>%
  pivot_longer(
    cols = all_of(temperature_columns),
    names_to = "site",
    values_to = "temperature"
  ) %>%
  drop_na(temperature)  # Remove rows where temperature is NA

# Step 6: Identify dates with maximum site overlap
common_dates <- hobo_long %>%
  group_by(date) %>%                      # Group by date
  summarize(
    sites_with_data = n_distinct(site),   # Count distinct sites with data for each date
    site_list = list(unique(site))        # Store the list of sites with data for each date
  ) %>%
  arrange(desc(sites_with_data))          # Sort by the number of sites with data

# Step 7: Filter the dataset to retain only dates with maximum overlap
# Choose the maximum number of sites shared on any given date
max_sites <- max(common_dates$sites_with_data, na.rm = TRUE)
dates_with_max_sites <- common_dates %>%
  filter(sites_with_data == max_sites) %>%
  pull(date)

# Filter the dataset for these dates
trimmed_hobo_data <- hobo_long %>%
  filter(date %in% dates_with_max_sites)

# Debugging: print out which dates and sites were retained
cat("Dates retained with maximum overlap:\n")
print(common_dates %>% filter(date %in% dates_with_max_sites))

# Step 8: Save the trimmed dataset to a CSV file
write.csv(trimmed_hobo_data, "trimmed_HOBO_data.csv", row.names = FALSE)

# Optional: print a preview of the trimmed dataset
head(trimmed_hobo_data)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r cars}
summary(cars)
```

## Including Plots

You can also embed plots, for example:

```{r pressure, echo=FALSE}
plot(pressure)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.