source("packages.R")
# load four surveys & the ltdq data

deq1 <- read.csv("deq1.csv",header = TRUE,na.strings = c("-8","-9","-99"))
deq2 <- read.csv("deq2.csv",header = TRUE,na.strings = c("-8","-9","-99"))
deq3 <- read.csv("deq3.csv",header = TRUE,na.strings = c("-8","-9","-99"))
deq4 <- read.csv("deq4.csv",header = TRUE,na.strings = c("-8","-9","-99"))
ltdq <- read.csv("ltdq.csv",header = TRUE,na.strings = c("-8","-9","-99"))

#Rename id-variable
names(deq1)[1] <- names(deq2)[1] <- names(deq3)[1] <- names(deq4)[1] <- "ID"
names(ltdq)[1] <- "ID"

#Find the number of unique IDs in the data
AllIDS <- unique(c(deq1[,1], deq2[,1], deq3[,1], deq4[,1]))
length(AllIDS) # 12272
AllIDS <- sort(AllIDS)
ltdq <- ltdq[ltdq$ID %in% AllIDS,]
dim(ltdq) #12250 200, so 22 questionnaires missing

# Create data frames of the same size for all four time points & ltdq:

temp2 <- temp3 <- temp4 <- temp1 <- rep(NA,length(AllIDS))
temp.ltdq <- rep(NA,length(AllIDS))

# Create index variables for the four time points & the ltdq to show which cases are to be included:

for (i in 1:length(AllIDS)) {
  t <- which(deq1[,1] == AllIDS[i]) # Find the ID-number in deq1 that corresponds to AllIDS-number i
  if (!!length(t)) temp1[i] <- t # Check if t has a length, i.e. if it is found and if so, include in temp1
  t <- which(deq2[,1] == AllIDS[i]) 
  if (!!length(t)) temp2[i] <- t
  t <- which(deq3[,1] == AllIDS[i])
  if (!!length(t)) temp3[i] <- t
  t <- which(deq4[,1] == AllIDS[i])
  if (!!length(t)) temp4[i] <- t
}

for (i in 1:length(AllIDS)) {
  t <- which(ltdq[,1] == AllIDS[i]) # Find the ID-number in deq1 that corresponds to AllIDS-number i
  if (!!length(t)) temp.ltdq[i] <- t # Check if t has a length, i.e. if it is found and if so, include in temp1
}  

# How much non-missing data is there?
length(na.omit(temp1)) # 10064
length(which(is.na(temp1))) # 2208
length(na.omit(temp2)) # 7450
length(which(is.na(temp2))) # 4822
length(na.omit(temp3)) # 4187
length(which(is.na(temp3))) # 8085
length(na.omit(temp4)) # 2760
length(which(is.na(temp4))) # 9512
length(na.omit(temp.ltdq)) # 12250
length(which(is.na(temp.ltdq))) # 22

# R can't handle indexing with NA so replace NAs with zeros:
temp1[is.na(temp1)] <- 0
temp2[is.na(temp2)] <- 0
temp3[is.na(temp3)] <- 0
temp4[is.na(temp4)] <- 0
temp.ltdq[is.na(temp.ltdq)] <- 0

# Create new data frames of equal dimensions 
DEQ1 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq1)))
DEQ2 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq2)))
DEQ3 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq3)))
DEQ4 <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(deq4)))
LTDQ <- data.frame(matrix(NA, nrow = length(AllIDS), ncol = ncol(ltdq)))

# Fill in the new data frames for the non-missing observations:
DEQ1[temp1>0,] <- deq1[temp1[temp1>0],]
DEQ2[temp2>0,] <- deq2[temp2[temp2>0],]
DEQ3[temp3>0,] <- deq3[temp3[temp3>0],]
DEQ4[temp4>0,] <- deq4[temp4[temp4>0],]
LTDQ[temp.ltdq > 0,] <- ltdq[temp.ltdq[temp.ltdq > 0], ]

names(DEQ1) <- names(deq1)
names(DEQ2) <- names(deq2)
names(DEQ3) <- names(deq3)
names(DEQ4) <- names(deq4)
names(LTDQ) <- names(ltdq)

# Create dataframes of the DBQ variables at the four time points:

DBQ1 <- DEQ1[,c(52:87, 40, 88, 107)] # 52 = too high a gear; 40 = slippery road, 88 = drugs, 107 = swerve to avoid accident
DBQ2 <- DEQ2[,c(140:177, 184)] # 140 = too high a gear; 176 = slippery road, 177 = drugs, 184 = swerve to avoid accident
DBQ3 <- DEQ3[,c(181:218, 140)] # 181 = too high a gear; 217 = slippery road, 218 = drugs, 140 = swerve to avoid accident
DBQ4 <- DEQ4[,c(181:218, 140)] # 181 = too high a gear; 217 = slippery road, 218 = drugs, 140 = swerve to avoid accident

# Pick background variables from among the LTDQ variables
# See the R-file for Supplementary materials for the procedure used in choosing the variables 

BGVarsInclude <- as.character(expression(V00185,V00174,V00186,V00288,V00302,V00304,V00245,V00251,V00264,V00099,V00101))
varnamesBG <- c(
  "compare driving to others drivers in general",
  "Kind of driver - irritable/placid",
  "Kind of driver - nervous/confident",
  "Kind of driver - patient/impatient",
  "Kind of driver - safe/risky",
  "Kind of driver - slow/fast",
  "improve car controls",
  "improve on spotting hazards",
  "improve on knowing what speed is safe",
  "Cars should never overtake on inside lane",
  "Penalties for speeding should be more severe")
BGVarGroups <- c(rep("Self-image",6), rep("Improvement needs",3), rep("Attitudes", 2))


# Pick variables for regression analysis:
# Accidents to be predicted:

Acc.2to4 <- cbind(DEQ2[,231], DEQ3[,229], DEQ4[,229])

# Age, gender & mileage at first time point:
Mileage1 <- DEQ1[,7]
Gender <- DEQ1[,6]
Age<- DEQ1[,3]

# indices without any missing values for regression
nomissings.regression <- which(!is.na(rowSums( cbind(Gender, Age, Mileage1, DBQ1, rowSums(Acc.2to4))))) 
length(nomissings.regression) # 1152 

## Create preliminary data frame for regression ###
Accidents <- rowSums(Acc.2to4[nomissings.regression,])
Demographics <- cbind(Mileage1[nomissings.regression],Gender[nomissings.regression],Age[nomissings.regression])
names(Demographics) <- c("Mileage", "Sex", "Age")
DBQ1temp <- DEQ1[nomissings.regression,c(52:87, 40, 88, 107)] # 52 = too high a gear; 40 = slippery road, 88 = drugs, 107 = swerve to avoid accident
names(DBQ1temp) <- paste("v",seq(1:39),sep="")
Predictors <- cbind(Demographics,DBQ1temp)
Allvars <- cbind(Accidents,Predictors)
#############

# Split dataframe into training data and hold-out data (test data):

set.seed(2018)
Train <- createDataPartition(Allvars$Accidents, p = 0.75, list = FALSE)
training <- Allvars[Train, ]
testing <- Allvars[ -Train, ]

dim(training) # 864 43
dim(testing) # 288 43

names(training)[2:4] <- c("Mileage", "Gender", "Age")
training$Mileage <- log(training$Mileage)
x1 <- model.matrix(Accidents ~ ., data=training)[,-1]
y1 <- training$Accidents

names(testing)[2:4] <- c("Mileage", "Gender", "Age")
testing$Mileage <- log(testing$Mileage)