import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib
import os

def load_data(train_path='./data/train.csv', test_path='./data/test.csv', target='loan_status'):
    """Load the train and test CSV files and split each into features and target.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.
        target: Name of the label column; it must be present in both files.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` where the ``X_*``
        frames hold every column except ``target`` and the ``y_*`` series
        hold the ``target`` column.

    Raises:
        KeyError: If ``target`` is missing from either file.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    X_train, y_train = train.drop(columns=[target]), train[target]
    X_test, y_test = test.drop(columns=[target]), test[target]

    return X_train, y_train, X_test, y_test

def train_models(X_train, y_train):
    """Fit every candidate classifier on the training data.

    Returns a dict mapping each model's display name to its fitted
    estimator, in the order the candidates are listed below.
    """
    # All estimators are constructed up front, then fitted one at a time.
    candidates = (
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('Naive Bayes', GaussianNB()),
        ('LightGBM', LGBMClassifier(random_state=42)),
        ('KNN', KNeighborsClassifier()),
        ('XGBoost', XGBClassifier(random_state=42)),
    )

    fitted = {}
    for label, estimator in candidates:
        print(f"Training {label}...")
        estimator.fit(X_train, y_train)
        fitted[label] = estimator

    return fitted

def _positive_class_scores(model, X, y_pred):
    """Return continuous positive-class scores for ranking metrics such as ROC AUC."""
    if hasattr(model, 'predict_proba'):
        # Column 1 is the probability of the class with the greater label,
        # which is what roc_auc_score expects for a binary target.
        return np.asarray(model.predict_proba(X))[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    # Last resort: hard labels. The AUC then reflects a single operating point.
    return y_pred

def _classification_metrics(y_true, y_pred, y_score):
    """Compute the reported metric set from hard predictions and continuous scores."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_score)
    }

def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Evaluate a fitted binary classifier on the train and test sets.

    Accuracy, precision, recall and F1 are computed from the hard class
    predictions. ROC AUC is computed from continuous scores
    (``predict_proba`` or ``decision_function`` when the model offers
    them), because AUC measures ranking quality and hard labels reduce
    the ROC curve to a single point.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        model_name: Display name used in the printed report.

    Returns:
        A tuple ``(train_metrics, test_metrics)`` of dicts keyed by
        'Accuracy', 'Precision', 'Recall', 'F1 Score' and 'ROC AUC'.
    """
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_metrics = _classification_metrics(
        y_train, y_train_pred, _positive_class_scores(model, X_train, y_train_pred)
    )
    test_metrics = _classification_metrics(
        y_test, y_test_pred, _positive_class_scores(model, X_test, y_test_pred)
    )

    # Print results
    print(f"\n{model_name} Performance:")
    for heading, metrics in (("\nTrain Set Metrics:", train_metrics),
                             ("\nTest Set Metrics:", test_metrics)):
        print(heading)
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")

    # Print confusion matrix for test set
    print("\nTest Set Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))

    return train_metrics, test_metrics

def save_best_model(models, X_test, y_test, save_dir='./models'):
    """Persist the model with the highest test-set F1 score.

    Args:
        models: Mapping of display name to fitted estimator.
        X_test: Test features passed to each model's ``predict``.
        y_test: True test labels.
        save_dir: Directory the model file is written to; created if missing.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        # Fail before touching the filesystem rather than with an opaque KeyError later.
        raise ValueError("No models provided; cannot select a best model")

    # Start below any attainable F1 so a model is always selected,
    # even when every candidate scores exactly 0.
    best_score = float('-inf')
    best_model_name = None

    for name, model in models.items():
        f1 = f1_score(y_test, model.predict(X_test))
        # Strict comparison keeps the first model on ties.
        if f1 > best_score:
            best_score = f1
            best_model_name = name

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    best_model = models[best_model_name]
    model_path = os.path.join(save_dir, f'best_model_{best_model_name.lower().replace(" ", "_")}.joblib')
    joblib.dump(best_model, model_path)
    print(f"\nBest model ({best_model_name}) saved to {model_path}")
    print(f"Best F1 Score: {best_score:.4f}")

def main():
    """Run the pipeline end to end: load data, train, evaluate, save the best model."""
    print("Loading data...")
    features_train, labels_train, features_test, labels_test = load_data()

    print("\nTraining models...")
    fitted = train_models(features_train, labels_train)

    # Report train/test metrics for each fitted model, keyed by model name.
    print("\nEvaluating models...")
    all_metrics = {}
    for label in fitted:
        on_train, on_test = evaluate_model(
            fitted[label], features_train, labels_train, features_test, labels_test, label
        )
        all_metrics[label] = {'train': on_train, 'test': on_test}

    # Persist whichever model scores best on the test set.
    save_best_model(fitted, features_test, labels_test)

# Run the pipeline only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
