# Setup: attach the packages used by this script.
library(tidyverse)
# Lists the contents of the input directory; the result is only printed,
# it is not stored or used by the rest of the script.
list.files(path = "../input")
library('tensorflow')
library('keras')
library('cowplot')
# NOTE(review): tidyverse is already attached by the first line of the file,
# so this second call has no further effect.
library('tidyverse')
library('ggplot2')
library('gridExtra')
library('imager')
library('caret')
# Fix the RNG seed so every random shuffle/sample drawn below is reproducible.
set.seed(42)

# Dataset root; each immediate sub-directory is treated as one class.
root_dir <- "/kaggle/input/brain-mri-images-for-brain-tumor-detection/brain_tumor_dataset"
class_names <- list.dirs(root_dir, recursive = FALSE, full.names = FALSE)
class_names

# Print the number of files found in each class directory.
for (class_name in class_names) {
  class_path <- file.path(root_dir, class_name)
  num_files <- length(list.files(class_path))
  cat("Class", class_name, ":", num_files, "\n")
}

# Split every class directory into train / validation / test subsets
# (80% / 10% / remainder) and copy the files into
# train_set/<class>, valid_set/<class> and test_set/<class>.
for (class_name in class_names) {
  class_path <- file.path(root_dir, class_name)
  files <- list.files(class_path, full.names = TRUE)
  n_files <- length(files)

  # Random permutation of the file indices. sample.int() also behaves for an
  # empty directory, where 1:length(files) would yield c(1, 0).
  indices <- sample.int(n_files)

  # Cut points of the shuffled index vector. The validation cut keeps the
  # original (0.8 + 0.10) arithmetic so the split sizes stay the same.
  n_train <- floor(0.8 * n_files)
  n_train_valid <- floor((0.8 + 0.10) * n_files)

  # seq_len() yields an empty slice when a subset has no members, whereas
  # a:b with a > b counts backwards and would re-use training/test indices.
  train_indices <- indices[seq_len(n_train)]
  valid_indices <- indices[n_train + seq_len(n_train_valid - n_train)]
  test_indices <- indices[n_train_valid + seq_len(n_files - n_train_valid)]

  class_train_path <- file.path("train_set", class_name)
  class_valid_path <- file.path("valid_set", class_name)
  class_test_path <- file.path("test_set", class_name)

  dir.create(class_train_path, recursive = TRUE, showWarnings = FALSE)
  dir.create(class_valid_path, recursive = TRUE, showWarnings = FALSE)
  dir.create(class_test_path, recursive = TRUE, showWarnings = FALSE)

  # file.copy() is vectorised over `from` when `to` is an existing directory.
  file.copy(files[train_indices], class_train_path)
  file.copy(files[valid_indices], class_valid_path)
  file.copy(files[test_indices], class_test_path)
}


# Directories holding the three subsets.
train_path <- "train_set"
valid_path <- "valid_set"
test_path <- "test_set"

# Every file below each subset directory, across all class sub-directories.
train_files <- list.files(train_path, full.names = TRUE, recursive = TRUE)
valid_files <- list.files(valid_path, full.names = TRUE, recursive = TRUE)
test_files <- list.files(test_path, full.names = TRUE, recursive = TRUE)

# Subset sizes and their total.
num_train_samples <- length(train_files)
num_valid_samples <- length(valid_files)
num_test_samples <- length(test_files)
num_total_samples <- num_train_samples + num_valid_samples + num_test_samples

# Print the sizes.
cat("Num Total Samples:", num_total_samples, "\n")
cat("Num Train Samples:", num_train_samples, "\n")
cat("Num Valid Samples:", num_valid_samples, "\n")
cat("Num Test Samples:", num_test_samples, "\n")

# Bar chart of the subset sizes, with the count written above each bar.
set_counts <- c(num_train_samples, num_valid_samples, num_test_samples)
set_labels <- c("Train Set", "Valid Set", "Test Set")
# Only the green channel differs between the three semi-transparent colours.
set_colors <- rgb(0.3, c(0.1, 0.5, 0.9), 0.4, 0.6)

my_bar <- barplot(
  set_counts,
  names.arg = set_labels,
  col = set_colors,
  border = "black",
  horiz = FALSE,
  cex.axis = 0.8,
  cex.names = 0.8,
  axis.lty = 1,
  ylim = c(0, num_total_samples),
  main = "Distribution of Sets",
  xlab = "Set name",
  ylab = "Num. of samples in set"
)

# barplot() returns the bar midpoints, used here as x positions for the counts.
text(
  x = my_bar,
  y = set_counts,
  labels = set_counts,
  pos = 3,
  cex = 0.8,
  col = "black"
)

legend(
  "topright",
  legend = set_labels,
  col = set_colors,
  bty = "n",
  pch = 20,
  pt.cex = 2,
  cex = 0.8,
  horiz = FALSE,
  inset = c(0.05, 0.05)
)

# Preview: draw 16 random images from the whole dataset in a 4 x 4 grid.
all_images <- list.files(root_dir, recursive = TRUE, full.names = TRUE)
sample_paths <- sample(all_images, 16)

# Load each sampled file with imager.
images <- lapply(sample_paths, load.image)

# The class label is the name of the directory that contains the file.
labels <- vapply(
  sample_paths,
  function(path) basename(dirname(path)),
  character(1)
)

par(mfrow = c(4, 4), mar = c(2, 2, 2, 2))

for (i in seq_len(16)) {
  plot(images[[i]], main = paste("Tumor:", labels[[i]]), axes = FALSE)
}

# Pixel values are multiplied by this factor in all three generators.
pixel_scale <- 1/255

# Training generator: rescaling plus random augmentation
# (rotation, shifts, shear, zoom, horizontal flip).
train_datagen <- image_data_generator(
  rescale = pixel_scale,
  horizontal_flip = TRUE,
  rotation_range = 20,
  width_shift_range = 0.1,
  height_shift_range = 0.1,
  shear_range = 0.1,
  zoom_range = 0.1,
  fill_mode = "nearest"
)

# Validation and test generators: rescaling only, no augmentation.
valid_datagen <- image_data_generator(rescale = pixel_scale)
test_datagen <- image_data_generator(rescale = pixel_scale)

# Directory iterators for training and validation: images are resized to
# 299 x 299 and served in batches of 32 with one-hot ("categorical") labels.
train_generator <- flow_images_from_directory(
  directory = train_path,
  generator = train_datagen,
  class_mode = "categorical",
  batch_size = 32,
  target_size = c(299, 299)
)

valid_generator <- flow_images_from_directory(
  directory = valid_path,
  generator = valid_datagen,
  class_mode = "categorical",
  batch_size = 32,
  target_size = c(299, 299)
)

# Directory iterator for the test set.
# shuffle = FALSE is required: the evaluation code compares the rows returned
# by predict(test_generator) with test_generator$classes, which is only valid
# when the generator yields the files in their fixed directory order. With the
# default (shuffle = TRUE) predictions and true labels would not line up.
test_generator <- flow_images_from_directory(
  test_path,
  test_datagen,
  target_size = c(299, 299),
  batch_size = 32,
  class_mode = "categorical",
  shuffle = FALSE
)

# Xception convolutional base with ImageNet weights and without its top
# classifier, for 299 x 299 RGB inputs.
base_model <- application_xception(
  include_top = FALSE,
  weights = "imagenet",
  input_shape = c(299, 299, 3)
)

# Classification head on top of the base: global average pooling, batch norm,
# an L2-regularised 256-unit dense layer, batch norm, dropout, and a softmax
# output with one unit per class.
model <- keras_model_sequential() %>%
  base_model %>%
  layer_global_average_pooling_2d() %>%
  layer_batch_normalization() %>%
  layer_dense(
    units = 256,
    activation = "relu",
    kernel_regularizer = regularizer_l2(0.01)
  ) %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = length(class_names), activation = "softmax")

# Freeze the base before compiling so that only the new head is trained.
freeze_weights(base_model)

model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(),
  metrics = "accuracy"
)

# Train the model, validating on valid_generator after every epoch.
history <- model %>% fit(
  train_generator,
  epochs = 50,
  validation_data = valid_generator
)

# Per-epoch metrics as a data frame. The epoch index is derived from the
# recorded history instead of repeating the `epochs` value, so it stays correct
# if the number of epochs changes or training stops early.
history_df <- data.frame(
  epoch = seq_along(history$metrics$loss),
  accuracy = history$metrics$accuracy,
  val_accuracy = history$metrics$val_accuracy,
  loss = history$metrics$loss,
  val_loss = history$metrics$val_loss
)

# Show the last few epochs.
tail(history_df)

# Training vs. validation accuracy per epoch.
# The epoch axis is taken from the length of the recorded history rather than
# a hard-coded 1:50, so the plot still works if the epoch count changes.
accuracy_df <- data.frame(
  epoch = seq_along(history$metrics$accuracy),
  train_acc = history$metrics$accuracy,
  val_acc = history$metrics$val_accuracy
)

ggplot(accuracy_df, aes(x = epoch)) +
  geom_line(aes(y = train_acc, color = "Train Accuracy")) +
  geom_line(aes(y = val_acc, color = "Validation Accuracy")) +
  labs(title = "Accuracy Curve", x = "Epoch", y = "Accuracy") +
  scale_color_manual("",
                     breaks = c("Train Accuracy", "Validation Accuracy"),
                     values = c("blue", "red")) +
  theme_minimal()



# Training vs. validation loss per epoch.
# Columns are named after what they hold (losses, not accuracies), and the
# epoch axis is derived from the recorded history instead of a fixed 1:50.
loss_df <- data.frame(
  epoch = seq_along(history$metrics$loss),
  train_loss = history$metrics$loss,
  val_loss = history$metrics$val_loss
)

ggplot(loss_df, aes(x = epoch)) +
  geom_line(aes(y = train_loss, color = "Train Loss")) +
  geom_line(aes(y = val_loss, color = "Validation Loss")) +
  labs(title = "Loss Curve", x = "Epoch", y = "Loss") +
  scale_color_manual("",
                     breaks = c("Train Loss", "Validation Loss"),
                     values = c("blue", "red")) +
  theme_minimal()


# Evaluate the model on the test set.
#
# Row i of the predictions is compared with test_generator$classes[i]. That is
# only meaningful when the generator yields the files in a fixed order, so warn
# loudly if it is configured to shuffle.
if (isTRUE(test_generator$shuffle)) {
  warning(
    "test_generator shuffles its samples; predictions will not line up with ",
    "test_generator$classes, so the metrics below are unreliable.",
    call. = FALSE
  )
}

# Start from the first batch in case the generator was consumed before.
invisible(test_generator$reset())

pred_probs <- model %>% predict(test_generator)
# Column index of the highest probability, shifted to 0-based class indices.
pred_labels <- apply(pred_probs, 1, which.max) - 1

true_labels <- test_generator$classes

# Use the same explicit levels for both factors; otherwise confusionMatrix()
# fails when the model never predicts one of the classes.
class_levels <- seq_along(class_names) - 1
conf_matrix <- confusionMatrix(
  factor(pred_labels, levels = class_levels),
  factor(true_labels, levels = class_levels)
)
print(conf_matrix)

# NOTE(review): indexing byClass by name like this assumes a two-class problem;
# caret then reports the per-class statistics for the first factor level
# (class index 0) as the "positive" class -- confirm that is the intended one.
accuracy <- conf_matrix$overall["Accuracy"]
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Accuracy:", accuracy, "\n")
cat("Precision:", precision, "\n")
cat("Recall:", recall, "\n")
cat("F1 Score:", f1_score, "\n")

# Gallery of test images annotated with their true and predicted class.
#
# Take the file order from the generator itself: true_labels comes from
# test_generator$classes, so the i-th image must be the generator's i-th file.
# list.files() may sort names differently (e.g. mixed-case file names).
test_image_paths <- file.path(test_path, test_generator$filenames)

images <- lapply(test_image_paths, load.image)

labels <- vapply(
  test_image_paths,
  function(path) basename(dirname(path)),
  character(1)
)

# Show at most 25 images; fewer if the test set is smaller than the grid.
n_shown <- min(25, length(images))

old_par <- par(mfrow = c(5, 5), mar = c(3, 3, 3, 3))

for (i in seq_len(n_shown)) {
  plot(images[[i]], axes = FALSE)
  # Blue title for a correct prediction, red for a wrong one.
  color <- if (true_labels[i] == pred_labels[i]) "blue" else "red"
  # Class indices are 0-based, R vectors are 1-based.
  title(
    main = paste(
      "True:", class_names[true_labels[i] + 1],
      "\nPredicted:", class_names[pred_labels[i] + 1]
    ),
    col.main = color
  )
}

# Restore the previous graphics parameters.
par(old_par)