# Hyperparameter search ----

# The range of alpha values to search:
alphas <- seq(0, 1, by = .01)

# Search for optimal values of the lambda and alpha hyperparameters through
# cross-validation. The results will slightly differ from run to run, so use
# set.seed() for reproducibility.
head(training)
set.seed(2018)
PoissonCV <- cva.glmnet(Accidents ~ ., data = training, alpha = alphas,
                        nfolds = 10, family = "poisson")
plot(PoissonCV)

# Perform grid search on best alpha + lambda.
# modlist contains one cv.glmnet object per value of alpha.
gridSearchResult <- list()
for (i in seq_along(PoissonCV$modlist)) {
  cvfit <- PoissonCV$modlist[[i]]
  lambda <- cvfit$lambda          # vector of lambda values on the path
  cvm <- cvfit$cvm                # cross-validation errors, here: deviance
  nz <- cvfit$nzero               # number of predictors with NON-zero betas
  lambdamin <- cvfit$lambda.min   # lambda minimising the CV error
  gridSearchResult[[i]] <- cbind(lambda, cvm, nz, lambdamin)
  colnames(gridSearchResult[[i]]) <- c("lambda", "cvm", "nonzero", "lambdamin")
}

# Check the number of variables included in each cell of the grid.
# Match lambda.min against the lambda path by minimal absolute distance
# rather than exact floating-point equality (== on doubles is fragile).
nvars <- vapply(gridSearchResult, function(res) {
  res[which.min(abs(res[, "lambda"] - res[, "lambdamin"])), "nonzero"]
}, numeric(1))
nvars
# Ninth cell has 16 vars, good compromise.

# The choice of optimal alpha is to a degree subjective. From the
# documentation: "Because crossvalidation is often a statistically noisy
# procedure, it doesn't try to automatically choose alpha and lambda for you.
# Instead you can plot the output, to see how the results depend on the values
# of these parameters. Using this information, you can choose appropriate
# values for your data."

### Fit model in the training sample ###
alphas[9] # 0.08 --> More of a ridge regression than lasso
# Check the results when alpha = 0.08:
PoissonCV$modlist[[9]]

# Save values of lambdas:
best.lambda <- PoissonCV$modlist[[9]]$lambda.min    # Picks 16 variables
optimal.lambda <- PoissonCV$modlist[[9]]$lambda.1se # Picks 0 variables (!)
gridSearchResult[9]

### Elastic-net fit (alpha = 0.08) on the whole training sample ###
wholeTrainingFit <- glmnet(Accidents ~ ., data = training,
                           alpha = alphas[9], family = "poisson")

# Show the solution path, with a vertical line at the best lambda from CV.
par(mfrow = c(1, 1))
plot(wholeTrainingFit, xvar = "lambda", label = TRUE)
abline(v = log(best.lambda))

# Coefficient vector from the elastic-net model at the best lambda.
b <- coef(wholeTrainingFit, s = best.lambda)
# Which variables were not regularised to zero?
which(b != 0)

# Final scale score from the best model, plus its z-score.
final_score <- predict(wholeTrainingFit, x1, type = "response",
                       s = best.lambda)
final_scoreZ <- scale(final_score)

### Fit model in the training sample, alpha = 0, i.e. ridge regression ###
alphas[1] # 0 --> Ridge regression
PoissonCV$modlist[[1]] # This is the result with alpha == 0
gridSearchResult[1]
best.lambdaAlpha0 <- PoissonCV$modlist[[1]]$lambda.min    # Picks 42 variables (of course)
optimal.lambdaAlpha0 <- PoissonCV$modlist[[1]]$lambda.1se # Ditto

wholeTrainingFitAlpha0 <- glmnet(Accidents ~ ., data = training,
                                 alpha = alphas[1], family = "poisson")

# Solution path, with a line at the best value of lambda from CV.
par(mfrow = c(1, 1))
plot(wholeTrainingFitAlpha0, xvar = "lambda", label = TRUE)
abline(v = log(best.lambdaAlpha0))

# Coefficient vector from the ridge model at its best lambda.
bAlpha0 <- coef(wholeTrainingFitAlpha0, s = best.lambdaAlpha0)

# Final scale score from the ridge model, plus its z-score.
final_scoreAlpha0 <- predict(wholeTrainingFitAlpha0, x1,
                             type = "response", s = best.lambdaAlpha0)
final_scoreZAlpha0 <- scale(final_scoreAlpha0)

### Fit model in whole training sample, naive Poisson regression ###
wholeTrainingFitNaivePoisson <- glm(Accidents ~ ., data = training,
                                    family = "poisson")
bNaivePoisson <- coef(wholeTrainingFitNaivePoisson)
final_scoreNaivePoisson <- predict(wholeTrainingFitNaivePoisson,
                                   as.data.frame(x1), type = "response")
final_scoreZNaivePoisson <- scale(final_scoreNaivePoisson)

#########
# Generate the scale scores in the hold-out sample.
# Make a matrix of items (predictors)...
# ...from the new data frame of the hold-out sample.
x2 <- model.matrix(Accidents ~ ., data = testing)[, -1]
yTest <- testing$Accidents

# Add scores based on the final models to the hold-out data.
finalScorePred1 <- predict(wholeTrainingFit, x2, type = "response",
                           s = best.lambda)
finalScorePred1Z <- scale(finalScorePred1)
finalScorePredAlpha0 <- predict(wholeTrainingFitAlpha0, x2,
                                type = "response", s = best.lambdaAlpha0)
finalScorePredAlpha0Z <- scale(finalScorePredAlpha0)
finalScorePredNaive <- predict(wholeTrainingFitNaivePoisson,
                               as.data.frame(x2), type = "response")
finalScorePredNaiveZ <- scale(finalScorePredNaive)

# Functions for assessing model fit ----

# Mean squared error of predictions `pred` against observations `y`.
mse <- function(y, pred) {
  mean((y - pred)^2)
}

# Min-max index: for each of the `nres` prediction columns of `df`
# (columns 2..nres+1; column 1 holds the observed outcome), the mean of the
# row-wise min/max ratio between observation and prediction. Values closer
# to 1 indicate better agreement. Vectorized with pmin/pmax instead of
# apply() over data-frame rows.
min_max <- function(df, nres) {
  obs <- df[[1]]
  vapply(seq_len(nres), function(i) {
    pred <- df[[i + 1]]
    mean(pmin(obs, pred) / pmax(obs, pred))
  }, numeric(1))
}

# Poisson log-likelihood of observations `y` under mean `lambda`
# (needed in various other calculations). Uses lgamma(y + 1) rather than
# log(factorial(y)): the two are mathematically identical, but factorial(y)
# overflows to Inf for y > 170, which would poison the log-likelihood.
poissonloglik <- function(lambda, y) {
  y * log(lambda) - lambda - lgamma(y + 1)
}

# McFadden's pseudo R2: 1 - (model log-lik / null (intercept-only) log-lik).
pseudor2 <- function(yobs, ypred) {
  ynull <- mean(yobs)
  lnull <- sum(poissonloglik(ynull, yobs))
  lpred <- sum(poissonloglik(ypred, yobs))
  1 - (lpred / lnull)
}

# Deviance:
# Poisson deviance of predictions `ypred` against observations `yobs`:
# 2 * (saturated log-likelihood - model log-likelihood), summed over cases.
devianceFun <- function(yobs, ypred) {
  saturated <- poissonloglik(yobs, yobs)
  saturated[yobs == 0] <- 0 # limit of y*log(y) as y -> 0 is 0
  modeled <- poissonloglik(ypred, yobs)
  sum(2 * (saturated - modeled))
}

############## Model fit indices, training data ##############

# Get accidents for training data:
Accs <- training$Accidents

# Create a data frame for the model fit calculations.
# NOTE: alphas[9] is 0.08 (see the alpha-search section); the original labels
# said "0.07", which was wrong.
DataFitTraining <- cbind(Accs, final_score, final_scoreAlpha0,
                         final_scoreNaivePoisson)
DataFitTraining <- as.data.frame(DataFitTraining)
names(DataFitTraining) <- c("Accidents", "Alpha0.08", "Alpha0", "NaivePoisson")

# Calculate mse:
mseTrain <- with(DataFitTraining,
                 c(mse(Accidents, Alpha0.08),
                   mse(Accidents, Alpha0),
                   mse(Accidents, NaivePoisson)))

# Calculate deviance:
devianceTrain <- with(DataFitTraining,
                      c(devianceFun(Accidents, Alpha0.08),
                        devianceFun(Accidents, Alpha0),
                        devianceFun(Accidents, NaivePoisson)))

# Calculate min-max index:
minmaxTrain <- min_max(DataFitTraining, 3)

# Calculate McFadden pseudo-R2:
McFadsTrain <- c(pseudor2(Accs, final_score),
                 pseudor2(Accs, final_scoreAlpha0),
                 pseudor2(Accs, final_scoreNaivePoisson))

# Store results in a data frame. cor(...)[2:4] uses linear indexing into the
# 4x4 correlation matrix: elements 2-4 are the correlations of Accidents
# (column 1) with each of the three model scores.
FitTrain <- data.frame()
FitTrain <- rbind(FitTrain, cor(DataFitTraining)[2:4])
names(FitTrain) <- c("Lasso, a = 0.08", "Lasso, a = 0", "Naive regression")
FitTrain <- rbind(FitTrain, mseTrain, minmaxTrain, McFadsTrain, devianceTrain)
row.names(FitTrain) <- c("corPearson", "mse", "minmax", "McFaddenPseudoR2",
                         "deviance")

# Add null deviance (intercept-only model):
FitTrain <- rbind(FitTrain, devianceFun(Accs, rep(mean(Accs), length(Accs))))
rownames(FitTrain)[6] <- "nullDeviance"
FitTrain <- round(FitTrain, 3)

# Save results as an HTML table. The original piped kable() output into
# paste() together with a misplaced, empty as.character() call; collapse the
# kable lines directly instead.
FitTraincode <- kable(round(FitTrain, 3), format = "html") %>%
  as.character() %>%
  paste(collapse = "\n")
write.table(FitTraincode, "FitTrain.html", quote = FALSE,
            col.names = FALSE, row.names = FALSE)

############## Model fit indices, test data ##############

AccsTest <- testing$Accidents

# Create a data frame for the model fit calculations:
DataFitTesting <-
cbind(AccsTest,finalScorePred1,finalScorePredAlpha0,finalScorePredNaive) DataFitTesting <- as.data.frame(DataFitTesting) names(DataFitTesting) <- c("Accidents", "Alpha0.07", "Alpha0", "NaivePoisson") # Calculate mse: mseTest <- with(DataFitTesting,c(mse(Accidents,Alpha0.07),mse(Accidents,Alpha0),mse(Accidents,NaivePoisson))) # Calculate deviance: devianceTest <- with(DataFitTesting,c(devianceFun(Accidents,Alpha0.07), devianceFun(Accidents,Alpha0), devianceFun(Accidents,NaivePoisson))) # Calculate minmax: minmaxTest <- min_max(DataFitTesting,3) McFadsTest <- c(pseudor2(AccsTest,finalScorePred1), pseudor2(AccsTest,finalScorePredAlpha0), pseudor2(AccsTest,finalScorePredNaive)) # Store results in a dataframe: FitTest <- data.frame() FitTest <- rbind(FitTest,cor(DataFitTesting)[2:4]) names(FitTest) <- c("Lasso, a = 0.07", "Lasso, a = 0", "Naive regression") FitTest <- rbind(FitTest,mseTest, minmaxTest, McFadsTest, devianceTest) row.names(FitTest) <- c("corPearson","mse", "minmax", "McFaddenPseudoR2", "deviance") # Add null deviance: FitTest <- rbind(FitTest, devianceFun(AccsTest,rep(mean(AccsTest), length(AccsTest)))) rownames(FitTest)[6] <- "nullDeviance" FitTest <- round(FitTest,3) FitTestcode <- kable(round(FitTest,3), format = "html") %>% paste(as.character(), collapse = "\n") write.table(FitTestcode, "FitTest.html", quote = FALSE,col.names = FALSE, row.names = FALSE) ######################################################################################################