source("packages.R") # load four surveys & the ltdq data deq1 <- read.csv("deq1.csv",header = TRUE,na.strings = c("-8","-9","-99")) deq2 <- read.csv("deq2.csv",header = TRUE,na.strings = c("-8","-9","-99")) deq3 <- read.csv("deq3.csv",header = TRUE,na.strings = c("-8","-9","-99")) deq4 <- read.csv("deq4.csv",header = TRUE,na.strings = c("-8","-9","-99")) ltdq <- read.csv("ltdq.csv",header = TRUE,na.strings = c("-8","-9","-99")) #Rename id-variable names(deq1)[1] <- names(deq2)[1] <- names(deq3)[1] <- names(deq4)[1] <- "ID" names(ltdq)[1] <- "ID" #Find the number of unique IDs in the data AllIDS <- unique(c(deq1[,1], deq2[,1], deq3[,1], deq4[,1])) length(AllIDS) # 12272 AllIDS <- sort(AllIDS) ltdq <- ltdq[ltdq$ID %in% AllIDS,] dim(ltdq) #12250 200, so 22 questionnaires missing # Create data frames of the same size for all four time points & ltdq: temp2 <- temp3 <- temp4 <- temp1 <- rep(NA,length(AllIDS)) temp.ltdq <- rep(NA,length(AllIDS)) # Create index variables for the four time points & the ltdq to show which cases are to be included: for (i in 1:length(AllIDS)) { t <- which(deq1[,1] == AllIDS[i]) # Find the ID-number in deq1 that corresponds to AllIDS-number i if (!!length(t)) temp1[i] <- t # Check if t has a length, i.e. if it is found and if so, include in temp1 t <- which(deq2[,1] == AllIDS[i]) if (!!length(t)) temp2[i] <- t t <- which(deq3[,1] == AllIDS[i]) if (!!length(t)) temp3[i] <- t t <- which(deq4[,1] == AllIDS[i]) if (!!length(t)) temp4[i] <- t } for (i in 1:length(AllIDS)) { t <- which(ltdq[,1] == AllIDS[i]) # Find the ID-number in deq1 that corresponds to AllIDS-number i if (!!length(t)) temp.ltdq[i] <- t # Check if t has a length, i.e. if it is found and if so, include in temp1 } # How much non-missing data is there? length(na.omit(temp1)) # 10064 length(which(is.na(temp1))) # 2208 length(na.omit(temp2)) # 7450 length(which(is.na(temp2))) # 4822 length(na.omit(temp3)) # 4187 length(which(is.na(temp3))) # 8085 length(na.omit(temp4)) # 2760 length(which(is.na(temp4))) # 9512 length(na.omit(temp.ltdq)) # 12250 length(which(is.na(temp.ltdq))) # 22 # R can't handle indexing with NA so replace NAs with zeros: temp1[is.na(temp1)] <- 0 temp2[is.na(temp2)] <- 0 temp3[is.na(temp3)] <- 0 temp4[is.na(temp4)] <- 0 temp.ltdq[is.na(temp.ltdq)] <- 0 # Create new data frames of equal dimensions DEQ1 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq1))) DEQ2 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq2))) DEQ3 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq3))) DEQ4 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq4))) LTDQ <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(ltdq))) # Fill in the new data frames for the non-missing observations: DEQ1[temp1>0,] <- deq1[temp1[temp1>0],] DEQ2[temp2>0,] <- deq2[temp2[temp2>0],] DEQ3[temp3>0,] <- deq3[temp3[temp3>0],] DEQ4[temp4>0,] <- deq4[temp4[temp4>0],] LTDQ[temp.ltdq > 0,] <- ltdq[temp.ltdq[temp.ltdq > 0], ] names(DEQ1) <- names(deq1) names(DEQ2) <- names(deq2) names(DEQ3) <- names(deq3) names(DEQ4) <- names(deq4) names(LTDQ) <- names(ltdq) # Create dataframes of the DBQ variables at the four time points: DBQ1 <- DEQ1[,c(52:87, 40, 88, 107)] # 52 = too high a gear; 40 = slippery road, 88 = drugs, 107 = swerve to avoid accident DBQ2 <- DEQ2[,c(140:177, 184)] # 140 = too high a gear; 176 = slippery road, 177 = drugs, 184 = swerve to avoid accident DBQ3 <- DEQ3[,c(181:218, 140)] # 181 = too high a gear; 217 = slippery road, 218 = drugs, 140 = swerve to avoid accident DBQ4 <- DEQ4[,c(181:218, 140)] # 181 = too high a gear; 217 = slippery road, 218 = drugs, 140 = swerve to avoid accident # Pick background variables from among the LTDQ variables # See the R-file for Supplementary materials for the procedure used in choosing the variables BGVarsInclude <- as.character(expression(V00185,V00174,V00186,V00288,V00302,V00304,V00245,V00251,V00264,V00099,V00101)) varnamesBG <- c( "compare driving to others drivers in general", "Kind of driver - irritable/placid", "Kind of driver - nervous/confident", "Kind of driver - patient/impatient", "Kind of driver - safe/risky", "Kind of driver - slow/fast", "improve car controls", "improve on spotting hazards", "improve on knowing what speed is safe", "Cars should never overtake on inside lane", "Penalties for speeding should be more severe") BGVarGroups <- c(rep("Self-image",6), rep("Improvement needs",3), rep("Attitudes", 2)) # Pick variables for regression analysis: # Accidents to be predicted: Acc.2to4 <- cbind(DEQ2[,231], DEQ3[,229], DEQ4[,229]) # Age, gender & mileage at first time point: Mileage1 <- DEQ1[,7] Gender <- DEQ1[,6] Age<- DEQ1[,3] # indices without any missing values for regression nomissings.regression <- which(!is.na(rowSums( cbind(Gender, Age, Mileage1, DBQ1, rowSums(Acc.2to4))))) length(nomissings.regression) # 1152 ## Create preliminary data frame for regression ### Accidents <- rowSums(Acc.2to4[nomissings.regression,]) Demographics <- cbind(Mileage1[nomissings.regression],Gender[nomissings.regression],Age[nomissings.regression]) names(Demographics) <- c("Mileage", "Sex", "Age") DBQ1temp <- DEQ1[nomissings.regression,c(52:87, 40, 88, 107)] # 52 = too high a gear; 40 = slippery road, 88 = drugs, 107 = swerve to avoid accident names(DBQ1temp) <- paste("v",seq(1:39),sep="") Predictors <- cbind(Demographics,DBQ1temp) Allvars <- cbind(Accidents,Predictors) ############# # Split dataframe into training data and hold-out data (test data): set.seed(2018) Train <- createDataPartition(Allvars$Accidents, p = 0.75, list = FALSE) training <- Allvars[Train, ] testing <- Allvars[ -Train, ] dim(training) # 864 43 dim(testing) # 288 43 names(training)[2:4] <- c("Mileage", "Gender", "Age") training$Mileage <- log(training$Mileage) x1 <- model.matrix(Accidents ~ ., data=training)[,-1] y1 <- training$Accidents names(testing)[2:4] <- c("Mileage", "Gender", "Age") testing$Mileage <- log(testing$Mileage)