#
# R code for "A machine learning approach for identification of gastrointestinal 
# predictors for the risk of COVID-19 related hospitalization" by Liptak et al 
#
# marian.grendar@jfmed.uniba.sk
#
#


#+ a, include=F  
knitr::opts_chunk$set(comment = NA,   # no hash is included into R 
                      echo=FALSE, warning=FALSE, message=FALSE, 
                      cache=FALSE)
options(warn = -1)
options(scipen = 999)  # suppress the scientific notation
options(digits = 4)
options(gtsummary.print_engine = "gt")


################################################################################
#
# libs
library('gtsummary')
library('rstatix')
library('ggpubr')
library('DescTools')
library('randomForestSRC')
library('ggRandomForests')
library('emmeans')
library('PCAmixdata')
library('mccr')  # Matthews correlation correl; for RF; performance measure


################################################################################
#
# fnc
#
# Oup_not_hospitalized=> Discharged home
# Oup_hospitalized=>Admitted to hospital
# OUP_hospitalized_ICU=> Required intensive care unit
#
#
anv = function(da, what, ylab_txt, step.increase = 0, ...) {
  #
  # KW ANOVA + Dunn post-hoc
  #
  levels(da$group) = c('Discharged home', 
                       'Admitted to hospital',
                       'Required intensive care unit')
  #
  fml = as.formula(paste0(what, ' ~ group'))
  res.aov <- da %>% kruskal_test(fml)
  print(knitr::kable(res.aov))
  #
  # pairwise comp
  pwc <- da %>% dunn_test(fml, p.adjust.method = "BH")
  print(knitr::kable(pwc))
  #
  pwc <- pwc %>% add_xy_position(x = "group")
  bp = ggboxplot(da, x = "group", y = what, add = c("mean", "jitter"),
                 shape = 'group',
                 color = 'group', palette = 'lancet', 
                 outlier.shape = NA, ...) +
    stat_pvalue_manual(pwc, hide.ns = TRUE, step.increase = step.increase) +
    ylab(ylab_txt) + 
    xlab('') + 
    labs(
      subtitle = get_test_label(res.aov, detailed = FALSE),
      caption = get_pwc_label(pwc)
    )
  print(bp)
  #
  #
  # save it
  ggsave(filename = paste(what, '.tiff', sep = ''), plot = bp, 
         width = 20,
         height = 20,
         units = 'cm',
         device = 'tiff')
  #
}
#


################################################################################
#
# data
#
# OUP
da_oup = read.csv('data_OUP.csv')  
names(da_oup)[4] = 'No_days'  # shorten No_of_days_from_onset_of_the_symptoms 
names(da_oup)[18] = 'XrayCTpneumo'
names(da_oup)[19] = 'pO2'
names(da_oup)[33] = 'ExitusInHospital'
names(da_oup)[34] = 'ChrLiverD'
#
# MOM 
# controls
da_mom_co = read.csv('data_MOM_controls.csv') 
da_mom_co = da_mom_co[,-c(1)] # exclude id
#
# cases
da_mom_ca = read.csv('data_MOM_cases.csv') 
da_mom_ca = da_mom_ca[,-c(1)] # exclude id
#


#+ br1, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Q1: gastrointestinal symptoms vs group
#' 
#' Symptom is present is ((New_or_Worsened = 1) & (symptom = 1))
#
################################################################################
#
# prepare data
im = match(names(da_mom_ca), names(da_oup))
#[1]  3  2  7  6 NA NA NA NA  8  9 16 NA 15 12 14 NA
im = im[is.na(im) == F]
im2 = match(names(da_oup), names(da_mom_ca))
im2 = im2[is.na(im2) == F]
#im2
#
da = rbind(da_oup[,im],
           da_mom_ca[,im2],
           da_mom_co[,im2])
#names(da)
#[1] "age"             "gender"          "Fever"           "Cough"          
#[5] "New_or_Worsened" "Diarrhea"        "Constipation"    "Abdominal_pain" 
#[9] "Bloating"        "Nausea"          "Pyrosis" 
#
#
# group
n_oup = nrow(da_oup)
n_mom_ca = nrow(da_mom_ca)
n_mom_co = nrow(da_mom_co)
da$group = c(rep('oup', n_oup),
             rep('mom_ca', n_mom_ca),
             rep('mom_co', n_mom_co))
#
# now, in oup specify 
# - admitted (== hospitalized)
# - ICU
#
inothospit = which(da_oup$Admitted == 0)
ihospit = which(da_oup$Admitted == 1)
iICU = which(da_oup$ICU == 1)
ihospitICU = intersect(ihospit, iICU)
#
#
da$group[inothospit] = 'oup_not_hospitalized'    # since do_oup is the first in da; the index of iadmit/iICU is ok
da$group[ihospit] = 'oup_hospitalized'
da$group[ihospitICU] = 'oup_hospitalized_ICU'
#
################################################################################
#
# create symptoms (Fever, Cough, ..., Pyrosis)
# in such a way that 1 == ((New_or_Worsened == 1) & (Symptom == 1))
nr = nrow(da)
#
da$Feverx = numeric(nr) 
da$Feverx[(da$Fever == 1) & (da$New_or_Worsened) == 1] = 1
#
da$Coughx = numeric(nr)
da$Coughx[(da$Cough == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Diarrheax = numeric(nr)
da$Diarrheax[(da$Diarrhea == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Constipationx = numeric(nr)
da$Constipationx[(da$Constipation == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Bloatingx = numeric(nr)
da$Bloatingx[(da$Bloating == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Nauseax = numeric(nr)
da$Nauseax[(da$Nausea == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Pyrosisx = numeric(nr)
da$Pyrosisx[(da$Pyrosis == 1) & (da$New_or_Worsened) == 1] = 1 
#
da$Abdominal_painx = numeric(nr)
da$Abdominal_painx[(da$Abdominal_pain == 1) & (da$New_or_Worsened) == 1] = 1 
#
# names(da)
# => 
da = da[,c(1,2,13:20,12)]  # age, gender, the x vars, and group
# da
#
#
# factors
da$group = factor(da$group)
da$gender = factor(da$gender)
da$Feverx = factor(da$Feverx)
da$Coughx = factor(da$Coughx)
da$Diarrheax = factor(da$Diarrheax)
da$Constipationx = factor(da$Constipationx)
da$Bloatingx = factor(da$Bloatingx)
da$Nauseax = factor(da$Nauseax)
da$Pyrosisx = factor(da$Pyrosisx)
da$Abdominal_painx = factor(da$Abdominal_painx)
#
# reorder group levels
da$group = reorder.factor(da$group, new.order = c(2, 1, 5, 3, 4))
#
# work with it
#

############################################
#
#' ## EDA + Tests
#
## define custom test
fisher.test.simulate.p.values <- function(data, variable, by, ...) {
  result <- list()
  test_results <- stats::fisher.test(data[[variable]], data[[by]], simulate.p.value = TRUE)
  result$p <- test_results$p.value
  result$test <- test_results$method
  result
}

## add p-values to your gtsummary table, using custom test defined above
tb = tbl_summary(da, by = group) %>% add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values")) %>% add_q()
tb
# save it
xlsx::write.xlsx(tb$table_body, 'Q1_table.xlsx')
#

#' ### Fisher test: multiple comparisons
# for: fever cough diarrhea nausea
#
#' ### fever
ta = table(da$Feverx, da$group, dnn = c('Fever', 'group'))
tt = t(as.matrix(ta))
# knitr::kable(tt)
#' #### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q1_table_Fever.xlsx')
#
#' #### mosaicplot
#+ figM1, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')

  
#' ### cough
ta = table(da$Coughx, da$group, dnn = c('Cough', 'group'))
tt = t(as.matrix(ta))
# knitr::kable(tt)
#' #### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q1_table_Cough.xlsx')
#
#' #### mosaicplot
#+ figM2, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' ### diarrhea
#
ta = table(da$Diarrheax, da$group, dnn = c('Diarrhea', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
# knitr::kable(tt)
#' #### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q1_table_Diarrhea.xlsx')
#
#' #### mosaicplot
#+ figM3, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' ### nausea
#
ta = table(da$Nauseax, da$group, dnn = c('Nausea', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
# knitr::kable(tt)
#' #### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q1_table_Nausea.xlsx')
#
#' #### mosaicplot
#+ figM4, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#+ br2, results='asis'
cat('\\pagebreak')

  
################################################################################
#
#' # Q2: OUP data; gender, age, etc vs group (OUP nothosp, hosp, ICU, exitus)
#
################################################################################
#
# names(da_oup)
# [1] "id"                                    "gender"                               
# [3] "age"                                   "No_of_days_from_onset_of_the_symptoms"
# [5] "Dyspnoe"                               "Cough"                                
# [7] "Fever"                                 "New_or_Worsened"                      
# [9] "Diarrhea"                              "Abdominal_pain"                       
# [11] "Vomitus"                               "Nausea"                               
# [13] "Anorexia"                              "Pyrosis"                              
# [15] "Bloating"                              "Constipation"                         
# [17] "O2_saturation"                         "Xray_or_CT_confirmed_pneumonia"       
# [19] "pO2_from_arterial_or_peripheal_blood"  "CRP"                                  
# [21] "D_DIM"                                 "GMT"                                  
# [23] "AST"                                   "ALT"                                  
# [25] "ALP"                                   "Bilirubin"                            
# [27] "Recent_ATB_usage"                      "Diabetes_Mellitus"                    
# [29] "Arterial_Hypertension"                 "Admitted"                             
# [31] "ICU"                                   "Invasive_ventilation"                 
# [33] "Exitus_during_hospitalisation"  
# [34] ChrLiverD
#
#
#
dao = da_oup[,-c(1)] # exclude id
#
# create group var:
inothospit = which(dao$Admitted == 0)
# inothospit
ihospit = which(dao$Admitted == 1)
# ihospit
iICU = which(dao$ICU == 1)
# iICU
ihospitICU = intersect(ihospit, iICU)
# 7 12 39 58 62 73 80 81 90 92
#
#
nr = nrow(dao)
dao$group = character(nr)
dao$group[inothospit] = 'oup_not_hospitalized'    
dao$group[ihospit] = 'oup_hospitalized'
dao$group[ihospitICU] = 'oup_hospitalized_ICU'
#
#
dao = dao[,-c(29:32)] # + excl Admitted, ICU
#
# turn categorical vars into factor
ii = c(1, 4:15, 17, 26:30) 
for (i in ii) {
  #
  dao[,i] = factor(dao[,i])
  #
}
# reorder group levels
dao$group = reorder.factor(dao$group, new.order = c(3, 1, 2))
#
#
#
############################################
#
#' ## EDA + Tests
#
ta = table(dao$group)
knitr::kable(ta)
#
tb = tbl_summary(dao,  by = group) %>% add_p() %>% add_q()
tb
# save it
xlsx::write.xlsx(tb$table_body, 'Q2_table.xlsx')
#
#
#
#' ### post-hoc, where appropriate:
#' ### continuous:  => KW and Dunn post-hoc
#' #### age
anv(dao, 'age', 'Age [years]', step.increase = 0.05, ylim = c(18, 100))
#' #### O2_saturation
anv(dao[-c(39,61,58),], 'O2_saturation', 'O2 saturation [%]', step.increase = 0.06) 
#' #### pO2
anv(dao, 'pO2', 'pO2 [?]')  # identify outlier
#' #### CRP
anv(dao, 'CRP', 'CRP [mg/l]')
#' #### D_DIM
anv(dao[-c(75,61,147,47,152),], 'D_DIM', 'D-DIM [mg/l]') # outliers  # sort.int(dao$D_DIM, index.return = T, na.last = F)
#' #### AST
anv(dao[-c(62),], 'AST', expression('AST ['*mu*'kat/l]')) # outlier 
#' #### ALP
anv(dao, 'ALP', expression('ALP ['*mu*'kat/l]'))
#' #### ALT
anv(dao, 'ALT', expression('ALT ['*mu*'kat/l]'))


#
#' ### factor:
#' #### dispnoe
ta = table(dao$Dyspnoe, dao$group, dnn = c('Dispnoe', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_Dispnoe.xlsx')
#
#' #### mosaicplot
#+ figM4x, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' #### diarrhea
ta = table(dao$Diarrhea, dao$group, dnn = c('Diarrhea', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_Diarrhea.xlsx')
#
#' #### mosaicplot
#+ figM5, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' #### bloating
ta = table(dao$Bloating, dao$group, dnn = c('Bloating', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_Bloating.xlsx')
#
#' #### mosaicplot
#+ figM6, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' #### diabetes mellitus
ta = table(dao$Diabetes_Mellitus, dao$group, dnn = c('Diabetes mellitus', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_Diabetes_Mellitus.xlsx')
#
#' #### mosaicplot
#+ figM7, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' #### arterial hypertension
ta = table(dao$Arterial_Hypertension, dao$group, dnn = c('Arterial hypertension', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_Arterial_Hypertension.xlsx')
#
#' #### mosaicplot
#+ figM8, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#' #### ChrLiverD
ta = table(dao$ChrLiverD, dao$group, dnn = c('Chronic Liver Disease', 'group'))
#mosaicplot(t(ta), main = '')
tt = t(as.matrix(ta))
#knitr::kable(tt)
#' ### multiple comparisons
pwft = pairwise_fisher_test(tt, p.adjust.method = "BH")
knitr::kable(pwft)
# save it
xlsx::write.xlsx(pwft, 'Q2_table_ChrLiverD.xlsx')
#
#' #### mosaicplot
#+ figM9, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold'
mosaicplot(t(ta), main = '')
#' #### Cocharan Armitage test of trend
CochranArmitageTest(tt, alternative = 'increasing')


#+ br3, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Q3: OUP data; RF prediction of group (non-hosp, hosp)
#
################################################################################
#
# lump together oup_hospit * oup_hospit_ICU
levels(dao$group) = c('oup_not_hospitalized', 'oup_hospitalized', 'oup_hospitalized')
#
nr = nrow(dao)
# make the x vars, here
dao$Feverx = numeric(nr) 
dao$Feverx[(dao$Fever == 1) & (dao$New_or_Worsened) == 1] = 1
#
dao$Coughx = numeric(nr)
dao$Coughx[(dao$Cough == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Diarrheax = numeric(nr)
dao$Diarrheax[(dao$Diarrhea == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Constipationx = numeric(nr)
dao$Constipationx[(dao$Constipation == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Bloatingx = numeric(nr)
dao$Bloatingx[(dao$Bloating == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Nauseax = numeric(nr)
dao$Nauseax[(dao$Nausea == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Pyrosisx = numeric(nr)
dao$Pyrosisx[(dao$Pyrosis == 1) & (dao$New_or_Worsened) == 1] = 1 
#
dao$Abdominal_painx = numeric(nr)
dao$Abdominal_painx[(dao$Abdominal_pain == 1) & (dao$New_or_Worsened) == 1] = 1 
#


# factors
dao$group = factor(dao$group)
dao$gender = factor(dao$gender)
dao$Feverx = factor(dao$Feverx)
dao$Coughx = factor(dao$Coughx)
dao$Diarrheax = factor(dao$Diarrheax)
dao$Constipationx = factor(dao$Constipationx)
dao$Bloatingx = factor(dao$Bloatingx)
dao$Nauseax = factor(dao$Nauseax)
dao$Pyrosisx = factor(dao$Pyrosisx)
dao$Abdominal_painx = factor(dao$Abdominal_painx)


################################################################################
#
# RF
# excl vars w many NAs 
# + vars that are used to decide ICU: CRP, O2_satur ?!: 16,17,19
# PL: exclude also XrayCT, Cough, Dyspnoe, Fever
#
set.seed(123)
rf = rfsrc(group ~ ., data = dao[,-c(4:21,24)], importance = T) # excl var w many NAs
#' ## Confusion table
rf
#
#' ## Matthews correl coeff (performance measure)
mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1))
mcc


#' ## Importance plot
#+ fig1, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold'
plot(rf, verbose = F)
# save it
tiff('RF_importance_plot.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rf)
dev.off()

#' ## ROC
rroc = gg_roc(rf, 1, oob = T)  # ok, checked
plot(rroc)
# save it
tiff('RF_ROC.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rroc)
dev.off()


#' ## AST vs antibiotics

#' ### two-way ANOVA: log(AST) ~ group*Recent_ATB_usage
#
dao2 = dao
dao2$logAST = log(dao2$AST)
levels(dao2$group) = c('Discharged home', 'Admitted to hospital')
#
bxp = ggboxplot(dao2, x = "group", y = "logAST", add = c("mean", "jitter"),
                shape = 'Recent_ATB_usage',
                color = "Recent_ATB_usage", palette = "lancet",
                outlier.shape = NA) +
                ylab(expression('log(AST) ['*mu*'kat/l]')) + 
                xlab('')
bxp = ggpar(bxp, legend.title = "Recent ATB usage")
bxp
# save it
ggsave(filename = 'logAST_vs_group_RecentATB.tiff', plot = bxp, 
       width = 20,
       height = 20,
       units = 'cm',
       device = 'tiff')


# normality
# model  <- lm(logAST ~ group*Recent_ATB_usage,
#             data = dao2)
# summary(model)
# Create a QQ plot of residuals
# ggqqplot(residuals(model))
# logAST close to normality
#

# homogeneity of variance
# lt = dao2 %>% levene_test(logAST ~ group*Recent_ATB_usage)
# knitr::kable(lt)
# pval 0.2642 # ok

#' ## aov
ao = dao2 %>% anova_test(logAST ~ group*Recent_ATB_usage)
ao

#' ## => group, only; t test
#
tt = dao2 %>% t_test(logAST ~ group, detailed = T)
knitr::kable(tt)
ttt <- tt %>% add_y_position() #x = "Recent_ATB_usage")
#
bp = ggboxplot(dao2, x = "group", y = 'logAST', add = c("mean", "jitter"),
               shape = 'group',
               color = 'group', palette = 'lancet', 
               outlier.shape = NA) + 
  ylab(expression('AST ['*mu*'kat/l]')) + 
  xlab('') +
  stat_pvalue_manual(ttt, hide.ns = TRUE)
bp
#
# save it
ggsave(filename = 'logAST_vs_group.tiff', plot = bp, 
       width = 20,
       height = 20,
       units = 'cm',
       device = 'tiff')


################################################################################
#
#' ## AST vs chronic liver disease
#
################################################################################
#
#
#' ### two-way ANOVA: log(AST) ~ group*ChrLiverD
#
# dao2 = dao
# dao2$logAST = log(dao2$AST)
# levels(dao2$group) = c('Discharged home', 'Admitted to hospital')
#
bxp = ggboxplot(dao2, x = "group", y = "logAST", add = c("mean", "jitter"),
                shape = 'ChrLiverD',
                color = "ChrLiverD", palette = "lancet",
                outlier.shape = NA) +
  ylab(expression('log(AST) ['*mu*'kat/l]')) + 
  xlab('')
bxp = ggpar(bxp, legend.title = "Chronic liver disease")
# bxp
#' => exclude outliers in hospitalized & ChrLiverD==0, etc
# sort.int(dao2$logAST, decreasing = T, index.return = T)
# => 
bxp = ggboxplot(dao2[-c(62,104,29,51,32), ], x = "group", y = "logAST", add = c("mean", "jitter"),
                shape = 'ChrLiverD',
                color = "ChrLiverD", palette = "lancet",
                outlier.shape = NA) +
  ylab(expression('log(AST) ['*mu*'kat/l]')) + 
  xlab('')
bxp = ggpar(bxp, legend.title = "Chronic liver disease")
bxp


# normality
# model  <- lm(logAST ~ group*ChrLiverD,
#             data = dao2)
# summary(model)
# Create a QQ plot of residuals
# ggqqplot(residuals(model))
# logAST non-gaussian right tale
#

# homogeneity of variance
# lt = dao2 %>% levene_test(logAST ~ group*ChrLiverD)
# knitr::kable(lt)
# pval 0.24 # ok

#' ### aov (wo 4 outliers)
ao = dao2[-c(62,104,29,51,32), ] %>% anova_test(logAST ~ group*ChrLiverD)
ao

#' ### Interaction plot
mo = lm(logAST ~ group*ChrLiverD, data = dao2[-c(62,104,29,51,32),])
#anova(mo)
#summary(mo)
emmip(mo, ChrLiverD ~ group, CIs = T) #, col = c('blue', 'red'))


#' ### pairwise comparisons: group (4 outliers excluded)
pwc1 <- dao2[-c(62,104,29,51,32), ] %>% 
 group_by(ChrLiverD) %>%
 emmeans_test(logAST ~ group, p.adjust.method = "BH") 
knitr::kable(pwc1)

#' ### pairwise comparisons: ChrLiverD (4 outliers excluded)
pwc2 <- dao2[-c(62,104,29,51,32), ] %>% 
  group_by(group) %>%
  emmeans_test(logAST ~ ChrLiverD, p.adjust.method = "BH") 
knitr::kable(pwc2)


# add to boxplot
pwc <- pwc1 %>% add_xy_position(x = "ChrLiverD")
bxp +
 stat_pvalue_manual(pwc) +
 labs(
   subtitle = get_test_label(ao, detailed = TRUE),
  caption = get_pwc_label(pwc)
)

# save it
ggsave(filename = 'logAST_vs_group_ChrLiverD_wo_outliers.tiff', plot = bxp, 
       width = 20,
       height = 20,
       units = 'cm',
       device = 'tiff')

#' ### interaction (via regression model) & pairwise
em = emmeans(mo,  specs = c('group', 'ChrLiverD'),
             adjust = "sidak")
em
cem = contrast(em, "pairwise") # to get pairwise pvals
cem


#+ br3a, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Q3a: OUP data; RF prediction of group (non-hosp, hosp) based on AST, ALT, Bilirubin, ChrLiverD 
#
################################################################################
#
#' NOTE: D_DIM has 58 missing values, GMT has 89 missing values
#' hence they were ignored
#' 
set.seed(123)
rf = rfsrc(group ~ ., data = dao[,c(22:23,25,29,30)], importance = T) # excl var w many NAs
#' ## Confusion table
rf
#
#' ## Matthews correl coeff (performance measure)
mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1))
mcc


#' ## Importance plot
#+ fig1x, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold'
plot(rf, verbose = F)
# save it
tiff('RF_importance_plot_for_AST_ALT_Bilirubin_ChrLiverD.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rf)
dev.off()

#' ## ROC
rroc = gg_roc(rf, 1, oob = T)  # ok, checked
plot(rroc)
# save it
tiff('RF_ROC_for_AST_ALT_Bilirubin_ChrLiverD.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rroc)
dev.off()


#+ br3b, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Q3b: OUP data; RF prediction of group (non-hosp, hosp) based on 
#' AST, ALT, DM, Age, diarrheax, bloatingx, ChrLiverD
#
################################################################################
#
#' NOTE: D_DIM has 58 missing values, GMT has 89 missing values
#' hence they were ignored
#' 
set.seed(123)
rf = rfsrc(group ~ ., data = dao[,c(22:23, 27, 33,35,  2,  29, 30)], importance = T) # excl var w many NAs
#' ## Confusion table
rf
#

#' ## Matthews correl coeff (performance measure)
mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1))
mcc

#' ## Importance plot
#+ fig1xx, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold'
plot(rf, verbose = F)
# save it
tiff('RF_importance_plot_for_AST_ALT_DM_Bloating_Diarrhea_Age_ChrLiverD.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rf)
dev.off()
#
#


#' ## ROC
rroc = gg_roc(rf, 1, oob = T)  # ok, checked
plot(rroc)
# save it
tiff('RF_ROC_for_AST_ALT_DM_Bloating_Diarrhea_Age_ChrLiverD.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
plot(rroc)
dev.off()


#+ br4, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Q4: OUP data; PCA
#
################################################################################
#
# PCA is eigen analysis of covariance matrix
# - problem: covar matrix of cont&factor variables 
# PCAmixdata R lib
# Implements principal component analysis, orthogonal rotation and multiple factor analysis 
# for a mixture of quantitative and qualitative variables.
#
# https://stats.stackexchange.com/questions/5774/can-principal-component-analysis-be-applied-to-datasets-containing-a-mix-of-cont
#
#
pc = PCAmix(X.quanti = dao[,c(2,3, 16, 18:25)], 
            X.quali = dao[,c(1, 30:37, 26:29)], 
            rename.level = T,
            graph = F)
#
plot(pc, choice="ind",
     coloring.ind=dao$group,
     cex=1,
     label = F, 
     leg = F,
     posleg="topright",
     main="")
#
# save it
tiff('PCAmixdata_2D_plot.tiff',
     units='cm', 
     height=20, width=20, res=300, pointsize=10, compression = 'lzw')
#
plot(pc, choice="ind",
     coloring.ind=dao$group,
     cex=1,
     label = F, 
     leg = F,
     posleg="topright",
     main="")
#
dev.off()


#+ brX, results='asis'
cat('\\pagebreak')


################################################################################
#
#' # Extra: reviewer No 3 -- AST, ALT

# variability in ALT is (much) higher than in AST
# log-transformed (close to normality)
#
#' ## logALT
boxplot(log(dao$ALT) ~ dao$group, ylim = c(-2,2), ylab = 'logALT')
#
#' ### variance oup_hospitalized
var(log(dao$ALT[dao$group == 'oup_hospitalized']))
#[1] 0.3895
#' ### variance oup_not_hospitalized
var(log(dao$ALT[dao$group == 'oup_not_hospitalized']))
#[1] 0.5445

#' ## logAST
boxplot(log(dao$AST) ~ dao$group, ylim = c(-2,2), ylab = 'logAST')
#
#' ### variance oup_hospitalized
var(log(dao$AST[dao$group == 'oup_hospitalized']))
#[1] 0.2932
#' ### variance oup_not_hospitalized
var(log(dao$AST[dao$group == 'oup_not_hospitalized']))
#[1] 0.2642
#

#' ## logALT: t test
t.test(log(dao$ALT)~dao$group)
# p-value = 0.2
#  mean in group oup_not_hospitalized     mean in group oup_hospitalized 
# -0.6051                            -0.4655 
#' => difference in mean: -0.14
#'


#' ## logAST: t test
t.test(log(dao$AST)~dao$group)
# p-value = 0.00002
# mean in group oup_not_hospitalized     mean in group oup_hospitalized 
# -0.5218                            -0.1582 
#' => difference in mean: -0.36
#'


#+ br5a, results='asis'
cat('\\pagebreak')

################################################################################
#
#' # Data Analysis (for manuscript)
#
################################################################################

#' The data were visualized and analyzed in R [1], ver. 4.0.5, with the aid of libraries
#' gtsummary [2], rstatix [3], DescTools [4], randomForestSRC [5], PCAmixdata [6] and ggpubr [7].
#' The sample median and the lower and upper quartiles were used to summarize
#' the data on continuous variables (e.g., age); counts and percentages were used to summarize
#' factors (e.g., gender). The Chi-squared or Fisher test were applied to test the
#' null hypothesis of independence between factors, followed, where appropriate, 
#' by the multiple comparisons with the Benjamini Hochberg adjustment.
#' Using a contingency table, an absence of trend was tested by Cochran Armitage test.
#' The null hypothesis of the equality of the population medians of a continuos variable
#' was tested by the Kruskal Wallis test, followed by the Dunn multiple comparisons test with
#' the Benjamini Hochberg correction of p-values. Two-way ANOVA was used to model
#' the assocication between AST and group (Discharged home, Admitted to hospital)
#' in interaction with Recent ATB use (yes, no). The AST values were log-transformed
#' to bring data to normality. Normality of residuals was assessed by the quantile-quantile
#' plot with the 95% confidence band constructed by bootstrap. Assumption of homogeneity of
#' variance was testes by the Levene test.     
#' In order to assess predictive power of the gastrointestinal parameters and
#' other measured variables for predicting the patient group 
#' (i.e., Discharged home, Admitted to hospital), the RandomForest Machine Learning
#' algorithm was trained on the data. The predictive ability was quantified
#' by the ROC curve, constructed from the Out-of-Bag data. Importance
#' of the predictors was measured by the Variable Importance. A 2D representation
#' of the data was obtained by Principal Component Analysis for mixed type of data. 
#' Findings with the p-value below 0.05 were considered statistically significant.
#' 
#' 
#' [1]  R Core Team (2021). R: A language and environment for statistical
#' computing. R Foundation for Statistical Computing, Vienna, Austria.
#' URL https://www.R-project.org/.    
#'
#' [2] Daniel D. Sjoberg, Michael Curry, Margie Hannum, Karissa Whiting and 
#' Emily C. Zabor (2020). gtsummary: Presentation-Ready Data Summary and 
#' Analytic Result Tables. R package version 1.3.5. 
#' https://CRAN.R-project.org/package=gtsummary
#'
#' [3] Alboukadel Kassambara (2021). rstatix: Pipe-Friendly Framework for 
#' Basic Statistical Tests. R package version 0.7.0. 
#' https://CRAN.R-project.org/package=rstatix
#'
#' [4] Andri Signorell et mult. al. (2021). DescTools: Tools for descriptive 
#' statistics. R package version 0.99.41.
#'
#' [5] Ishwaran H. and Kogalur U.B. (2021). Fast Unified Random Forests for 
#' Survival, Regression, and Classification (RF-SRC), R package version 
#' 2.11.0.
#'
#' [6] Marie Chavent, Vanessa Kuentz, Amaury Labenne, Benoit Liquet and 
#' Jerome Saracco (2017). PCAmixdata: Multivariate Analysis of Mixed 
#' Data. R package version 3.1. 
#' https://CRAN.R-project.org/package=PCAmixdata
#'
#' [7] Alboukadel Kassambara (2020). ggpubr: 'ggplot2' Based Publication 
#' Ready Plots. R package version 0.4.0. 
#' https://CRAN.R-project.org/package=ggpubr
#' 
#' 


#+ br6, results='asis'
cat('\\pagebreak')

################################################################################
#
#' # Session info
#
################################################################################
#
si = sessionInfo()
si