# # R code for "A machine learning approach for identification of gastrointestinal # predictors for the risk of COVID-19 related hospitalization" by Liptak et al # # marian.grendar@jfmed.uniba.sk # # #+ a, include=F knitr::opts_chunk$set(comment = NA, # no hash is included into R echo=FALSE, warning=FALSE, message=FALSE, cache=FALSE) options(warn = -1) options(scipen = 999) # suppress the scientific notation options(digits = 4) options(gtsummary.print_engine = "gt") ################################################################################ # # libs library('gtsummary') library('rstatix') library('ggpubr') library('DescTools') library('randomForestSRC') library('ggRandomForests') library('emmeans') library('PCAmixdata') library('mccr') # Matthews correlation correl; for RF; performance measure ################################################################################ # # fnc # # Oup_not_hospitalized=> Discharged home # Oup_hospitalized=>Admitted to hospital # OUP_hospitalized_ICU=> Required intensive care unit # # anv = function(da, what, ylab_txt, step.increase = 0, ...) { # # KW ANOVA + Dunn post-hoc # levels(da$group) = c('Discharged home', 'Admitted to hospital', 'Required intensive care unit') # fml = as.formula(paste0(what, ' ~ group')) res.aov <- da %>% kruskal_test(fml) print(knitr::kable(res.aov)) # # pairwise comp pwc <- da %>% dunn_test(fml, p.adjust.method = "BH") print(knitr::kable(pwc)) # pwc <- pwc %>% add_xy_position(x = "group") bp = ggboxplot(da, x = "group", y = what, add = c("mean", "jitter"), shape = 'group', color = 'group', palette = 'lancet', outlier.shape = NA, ...) + stat_pvalue_manual(pwc, hide.ns = TRUE, step.increase = step.increase) + ylab(ylab_txt) + xlab('') + labs( subtitle = get_test_label(res.aov, detailed = FALSE), caption = get_pwc_label(pwc) ) print(bp) # # # save it ggsave(filename = paste(what, '.tiff', sep = ''), plot = bp, width = 20, height = 20, units = 'cm', device = 'tiff') # } # ################################################################################ # # data # # OUP da_oup = read.csv('data_OUP.csv') names(da_oup)[4] = 'No_days' # shorten No_of_days_from_onset_of_the_symptoms names(da_oup)[18] = 'XrayCTpneumo' names(da_oup)[19] = 'pO2' names(da_oup)[33] = 'ExitusInHospital' names(da_oup)[34] = 'ChrLiverD' # # MOM # controls da_mom_co = read.csv('data_MOM_controls.csv') da_mom_co = da_mom_co[,-c(1)] # exclude id # # cases da_mom_ca = read.csv('data_MOM_cases.csv') da_mom_ca = da_mom_ca[,-c(1)] # exclude id # #+ br1, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q1: gastrointestinal symptoms vs group #' #' Symptom is present is ((New_or_Worsened = 1) & (symptom = 1)) # ################################################################################ # # prepare data im = match(names(da_mom_ca), names(da_oup)) #[1] 3 2 7 6 NA NA NA NA 8 9 16 NA 15 12 14 NA im = im[is.na(im) == F] im2 = match(names(da_oup), names(da_mom_ca)) im2 = im2[is.na(im2) == F] #im2 # da = rbind(da_oup[,im], da_mom_ca[,im2], da_mom_co[,im2]) #names(da) #[1] "age" "gender" "Fever" "Cough" #[5] "New_or_Worsened" "Diarrhea" "Constipation" "Abdominal_pain" #[9] "Bloating" "Nausea" "Pyrosis" # # # group n_oup = nrow(da_oup) n_mom_ca = nrow(da_mom_ca) n_mom_co = nrow(da_mom_co) da$group = c(rep('oup', n_oup), rep('mom_ca', n_mom_ca), rep('mom_co', n_mom_co)) # # now, in oup specify # - admitted (== hospitalized) # - ICU # inothospit = which(da_oup$Admitted == 0) ihospit = which(da_oup$Admitted == 1) iICU = which(da_oup$ICU == 1) ihospitICU = intersect(ihospit, iICU) # # da$group[inothospit] = 'oup_not_hospitalized' # since do_oup is the first in da; the index of iadmit/iICU is ok da$group[ihospit] = 'oup_hospitalized' da$group[ihospitICU] = 'oup_hospitalized_ICU' # ################################################################################ # # create symptoms (Fever, Cough, ..., Pyrosis) # in such a way that 1 == ((New_or_Worsened == 1) & (Symptom == 1)) nr = nrow(da) # da$Feverx = numeric(nr) da$Feverx[(da$Fever == 1) & (da$New_or_Worsened) == 1] = 1 # da$Coughx = numeric(nr) da$Coughx[(da$Cough == 1) & (da$New_or_Worsened) == 1] = 1 # da$Diarrheax = numeric(nr) da$Diarrheax[(da$Diarrhea == 1) & (da$New_or_Worsened) == 1] = 1 # da$Constipationx = numeric(nr) da$Constipationx[(da$Constipation == 1) & (da$New_or_Worsened) == 1] = 1 # da$Bloatingx = numeric(nr) da$Bloatingx[(da$Bloating == 1) & (da$New_or_Worsened) == 1] = 1 # da$Nauseax = numeric(nr) da$Nauseax[(da$Nausea == 1) & (da$New_or_Worsened) == 1] = 1 # da$Pyrosisx = numeric(nr) da$Pyrosisx[(da$Pyrosis == 1) & (da$New_or_Worsened) == 1] = 1 # da$Abdominal_painx = numeric(nr) da$Abdominal_painx[(da$Abdominal_pain == 1) & (da$New_or_Worsened) == 1] = 1 # # names(da) # => da = da[,c(1,2,13:20,12)] # age, gender, the x vars, and group # da # # # factors da$group = factor(da$group) da$gender = factor(da$gender) da$Feverx = factor(da$Feverx) da$Coughx = factor(da$Coughx) da$Diarrheax = factor(da$Diarrheax) da$Constipationx = factor(da$Constipationx) da$Bloatingx = factor(da$Bloatingx) da$Nauseax = factor(da$Nauseax) da$Pyrosisx = factor(da$Pyrosisx) da$Abdominal_painx = factor(da$Abdominal_painx) # # reorder group levels da$group = reorder.factor(da$group, new.order = c(2, 1, 5, 3, 4)) # # work with it # ############################################ # #' ## EDA + Tests # ## define custom test fisher.test.simulate.p.values <- function(data, variable, by, ...) { result <- list() test_results <- stats::fisher.test(data[[variable]], data[[by]], simulate.p.value = TRUE) result$p <- test_results$p.value result$test <- test_results$method result } ## add p-values to your gtsummary table, using custom test defined above tb = tbl_summary(da, by = group) %>% add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values")) %>% add_q() tb # save it xlsx::write.xlsx(tb$table_body, 'Q1_table.xlsx') # #' ### Fisher test: multiple comparisons # for: fever cough diarrhea nausea # #' ### fever ta = table(da$Feverx, da$group, dnn = c('Fever', 'group')) tt = t(as.matrix(ta)) # knitr::kable(tt) #' #### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q1_table_Fever.xlsx') # #' #### mosaicplot #+ figM1, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' ### cough ta = table(da$Coughx, da$group, dnn = c('Cough', 'group')) tt = t(as.matrix(ta)) # knitr::kable(tt) #' #### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q1_table_Cough.xlsx') # #' #### mosaicplot #+ figM2, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' ### diarrhea # ta = table(da$Diarrheax, da$group, dnn = c('Diarrhea', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) # knitr::kable(tt) #' #### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q1_table_Diarrhea.xlsx') # #' #### mosaicplot #+ figM3, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' ### nausea # ta = table(da$Nauseax, da$group, dnn = c('Nausea', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) # knitr::kable(tt) #' #### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q1_table_Nausea.xlsx') # #' #### mosaicplot #+ figM4, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #+ br2, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q2: OUP data; gender, age, etc vs group (OUP nothosp, hosp, ICU, exitus) # ################################################################################ # # names(da_oup) # [1] "id" "gender" # [3] "age" "No_of_days_from_onset_of_the_symptoms" # [5] "Dyspnoe" "Cough" # [7] "Fever" "New_or_Worsened" # [9] "Diarrhea" "Abdominal_pain" # [11] "Vomitus" "Nausea" # [13] "Anorexia" "Pyrosis" # [15] "Bloating" "Constipation" # [17] "O2_saturation" "Xray_or_CT_confirmed_pneumonia" # [19] "pO2_from_arterial_or_peripheal_blood" "CRP" # [21] "D_DIM" "GMT" # [23] "AST" "ALT" # [25] "ALP" "Bilirubin" # [27] "Recent_ATB_usage" "Diabetes_Mellitus" # [29] "Arterial_Hypertension" "Admitted" # [31] "ICU" "Invasive_ventilation" # [33] "Exitus_during_hospitalisation" # [34] ChrLiverD # # # dao = da_oup[,-c(1)] # exclude id # # create group var: inothospit = which(dao$Admitted == 0) # inothospit ihospit = which(dao$Admitted == 1) # ihospit iICU = which(dao$ICU == 1) # iICU ihospitICU = intersect(ihospit, iICU) # 7 12 39 58 62 73 80 81 90 92 # # nr = nrow(dao) dao$group = character(nr) dao$group[inothospit] = 'oup_not_hospitalized' dao$group[ihospit] = 'oup_hospitalized' dao$group[ihospitICU] = 'oup_hospitalized_ICU' # # dao = dao[,-c(29:32)] # + excl Admitted, ICU # # turn categorical vars into factor ii = c(1, 4:15, 17, 26:30) for (i in ii) { # dao[,i] = factor(dao[,i]) # } # reorder group levels dao$group = reorder.factor(dao$group, new.order = c(3, 1, 2)) # # # ############################################ # #' ## EDA + Tests # ta = table(dao$group) knitr::kable(ta) # tb = tbl_summary(dao, by = group) %>% add_p() %>% add_q() tb # save it xlsx::write.xlsx(tb$table_body, 'Q2_table.xlsx') # # # #' ### post-hoc, where appropriate: #' ### continuous: => KW and Dunn post-hoc #' #### age anv(dao, 'age', 'Age [years]', step.increase = 0.05, ylim = c(18, 100)) #' #### O2_saturation anv(dao[-c(39,61,58),], 'O2_saturation', 'O2 saturation [%]', step.increase = 0.06) #' #### pO2 anv(dao, 'pO2', 'pO2 [?]') # identify outlier #' #### CRP anv(dao, 'CRP', 'CRP [mg/l]') #' #### D_DIM anv(dao[-c(75,61,147,47,152),], 'D_DIM', 'D-DIM [mg/l]') # outliers # sort.int(dao$D_DIM, index.return = T, na.last = F) #' #### AST anv(dao[-c(62),], 'AST', expression('AST ['*mu*'kat/l]')) # outlier #' #### ALP anv(dao, 'ALP', expression('ALP ['*mu*'kat/l]')) #' #### ALT anv(dao, 'ALT', expression('ALT ['*mu*'kat/l]')) # #' ### factor: #' #### dispnoe ta = table(dao$Dyspnoe, dao$group, dnn = c('Dispnoe', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_Dispnoe.xlsx') # #' #### mosaicplot #+ figM4x, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' #### diarrhea ta = table(dao$Diarrhea, dao$group, dnn = c('Diarrhea', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_Diarrhea.xlsx') # #' #### mosaicplot #+ figM5, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' #### bloating ta = table(dao$Bloating, dao$group, dnn = c('Bloating', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_Bloating.xlsx') # #' #### mosaicplot #+ figM6, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' #### diabetes mellitus ta = table(dao$Diabetes_Mellitus, dao$group, dnn = c('Diabetes mellitus', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_Diabetes_Mellitus.xlsx') # #' #### mosaicplot #+ figM7, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' #### arterial hypertension ta = table(dao$Arterial_Hypertension, dao$group, dnn = c('Arterial hypertension', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_Arterial_Hypertension.xlsx') # #' #### mosaicplot #+ figM8, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #' #### ChrLiverD ta = table(dao$ChrLiverD, dao$group, dnn = c('Chronic Liver Disease', 'group')) #mosaicplot(t(ta), main = '') tt = t(as.matrix(ta)) #knitr::kable(tt) #' ### multiple comparisons pwft = pairwise_fisher_test(tt, p.adjust.method = "BH") knitr::kable(pwft) # save it xlsx::write.xlsx(pwft, 'Q2_table_ChrLiverD.xlsx') # #' #### mosaicplot #+ figM9, fig.width=10, fig.height=7, include=T, fig.align='center', fig.show='hold' mosaicplot(t(ta), main = '') #' #### Cocharan Armitage test of trend CochranArmitageTest(tt, alternative = 'increasing') #+ br3, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q3: OUP data; RF prediction of group (non-hosp, hosp) # ################################################################################ # # lump together oup_hospit * oup_hospit_ICU levels(dao$group) = c('oup_not_hospitalized', 'oup_hospitalized', 'oup_hospitalized') # nr = nrow(dao) # make the x vars, here dao$Feverx = numeric(nr) dao$Feverx[(dao$Fever == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Coughx = numeric(nr) dao$Coughx[(dao$Cough == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Diarrheax = numeric(nr) dao$Diarrheax[(dao$Diarrhea == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Constipationx = numeric(nr) dao$Constipationx[(dao$Constipation == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Bloatingx = numeric(nr) dao$Bloatingx[(dao$Bloating == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Nauseax = numeric(nr) dao$Nauseax[(dao$Nausea == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Pyrosisx = numeric(nr) dao$Pyrosisx[(dao$Pyrosis == 1) & (dao$New_or_Worsened) == 1] = 1 # dao$Abdominal_painx = numeric(nr) dao$Abdominal_painx[(dao$Abdominal_pain == 1) & (dao$New_or_Worsened) == 1] = 1 # # factors dao$group = factor(dao$group) dao$gender = factor(dao$gender) dao$Feverx = factor(dao$Feverx) dao$Coughx = factor(dao$Coughx) dao$Diarrheax = factor(dao$Diarrheax) dao$Constipationx = factor(dao$Constipationx) dao$Bloatingx = factor(dao$Bloatingx) dao$Nauseax = factor(dao$Nauseax) dao$Pyrosisx = factor(dao$Pyrosisx) dao$Abdominal_painx = factor(dao$Abdominal_painx) ################################################################################ # # RF # excl vars w many NAs # + vars that are used to decide ICU: CRP, O2_satur ?!: 16,17,19 # PL: exclude also XrayCT, Cough, Dyspnoe, Fever # set.seed(123) rf = rfsrc(group ~ ., data = dao[,-c(4:21,24)], importance = T) # excl var w many NAs #' ## Confusion table rf # #' ## Matthews correl coeff (performance measure) mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1)) mcc #' ## Importance plot #+ fig1, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold' plot(rf, verbose = F) # save it tiff('RF_importance_plot.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rf) dev.off() #' ## ROC rroc = gg_roc(rf, 1, oob = T) # ok, checked plot(rroc) # save it tiff('RF_ROC.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rroc) dev.off() #' ## AST vs antibiotics #' ### two-way ANOVA: log(AST) ~ group*Recent_ATB_usage # dao2 = dao dao2$logAST = log(dao2$AST) levels(dao2$group) = c('Discharged home', 'Admitted to hospital') # bxp = ggboxplot(dao2, x = "group", y = "logAST", add = c("mean", "jitter"), shape = 'Recent_ATB_usage', color = "Recent_ATB_usage", palette = "lancet", outlier.shape = NA) + ylab(expression('log(AST) ['*mu*'kat/l]')) + xlab('') bxp = ggpar(bxp, legend.title = "Recent ATB usage") bxp # save it ggsave(filename = 'logAST_vs_group_RecentATB.tiff', plot = bxp, width = 20, height = 20, units = 'cm', device = 'tiff') # normality # model <- lm(logAST ~ group*Recent_ATB_usage, # data = dao2) # summary(model) # Create a QQ plot of residuals # ggqqplot(residuals(model)) # logAST close to normality # # homogeneity of variance # lt = dao2 %>% levene_test(logAST ~ group*Recent_ATB_usage) # knitr::kable(lt) # pval 0.2642 # ok #' ## aov ao = dao2 %>% anova_test(logAST ~ group*Recent_ATB_usage) ao #' ## => group, only; t test # tt = dao2 %>% t_test(logAST ~ group, detailed = T) knitr::kable(tt) ttt <- tt %>% add_y_position() #x = "Recent_ATB_usage") # bp = ggboxplot(dao2, x = "group", y = 'logAST', add = c("mean", "jitter"), shape = 'group', color = 'group', palette = 'lancet', outlier.shape = NA) + ylab(expression('AST ['*mu*'kat/l]')) + xlab('') + stat_pvalue_manual(ttt, hide.ns = TRUE) bp # # save it ggsave(filename = 'logAST_vs_group.tiff', plot = bp, width = 20, height = 20, units = 'cm', device = 'tiff') ################################################################################ # #' ## AST vs chronic liver disease # ################################################################################ # # #' ### two-way ANOVA: log(AST) ~ group*ChrLiverD # # dao2 = dao # dao2$logAST = log(dao2$AST) # levels(dao2$group) = c('Discharged home', 'Admitted to hospital') # bxp = ggboxplot(dao2, x = "group", y = "logAST", add = c("mean", "jitter"), shape = 'ChrLiverD', color = "ChrLiverD", palette = "lancet", outlier.shape = NA) + ylab(expression('log(AST) ['*mu*'kat/l]')) + xlab('') bxp = ggpar(bxp, legend.title = "Chronic liver disease") # bxp #' => exclude outliers in hospitalized & ChrLiverD==0, etc # sort.int(dao2$logAST, decreasing = T, index.return = T) # => bxp = ggboxplot(dao2[-c(62,104,29,51,32), ], x = "group", y = "logAST", add = c("mean", "jitter"), shape = 'ChrLiverD', color = "ChrLiverD", palette = "lancet", outlier.shape = NA) + ylab(expression('log(AST) ['*mu*'kat/l]')) + xlab('') bxp = ggpar(bxp, legend.title = "Chronic liver disease") bxp # normality # model <- lm(logAST ~ group*ChrLiverD, # data = dao2) # summary(model) # Create a QQ plot of residuals # ggqqplot(residuals(model)) # logAST non-gaussian right tale # # homogeneity of variance # lt = dao2 %>% levene_test(logAST ~ group*ChrLiverD) # knitr::kable(lt) # pval 0.24 # ok #' ### aov (wo 4 outliers) ao = dao2[-c(62,104,29,51,32), ] %>% anova_test(logAST ~ group*ChrLiverD) ao #' ### Interaction plot mo = lm(logAST ~ group*ChrLiverD, data = dao2[-c(62,104,29,51,32),]) #anova(mo) #summary(mo) emmip(mo, ChrLiverD ~ group, CIs = T) #, col = c('blue', 'red')) #' ### pairwise comparisons: group (4 outliers excluded) pwc1 <- dao2[-c(62,104,29,51,32), ] %>% group_by(ChrLiverD) %>% emmeans_test(logAST ~ group, p.adjust.method = "BH") knitr::kable(pwc1) #' ### pairwise comparisons: ChrLiverD (4 outliers excluded) pwc2 <- dao2[-c(62,104,29,51,32), ] %>% group_by(group) %>% emmeans_test(logAST ~ ChrLiverD, p.adjust.method = "BH") knitr::kable(pwc2) # add to boxplot pwc <- pwc1 %>% add_xy_position(x = "ChrLiverD") bxp + stat_pvalue_manual(pwc) + labs( subtitle = get_test_label(ao, detailed = TRUE), caption = get_pwc_label(pwc) ) # save it ggsave(filename = 'logAST_vs_group_ChrLiverD_wo_outliers.tiff', plot = bxp, width = 20, height = 20, units = 'cm', device = 'tiff') #' ### interaction (via regression model) & pairwise em = emmeans(mo, specs = c('group', 'ChrLiverD'), adjust = "sidak") em cem = contrast(em, "pairwise") # to get pairwise pvals cem #+ br3a, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q3a: OUP data; RF prediction of group (non-hosp, hosp) based on AST, ALT, Bilirubin, ChrLiverD # ################################################################################ # #' NOTE: D_DIM has 58 missing values, GMT has 89 missing values #' hence they were ignored #' set.seed(123) rf = rfsrc(group ~ ., data = dao[,c(22:23,25,29,30)], importance = T) # excl var w many NAs #' ## Confusion table rf # #' ## Matthews correl coeff (performance measure) mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1)) mcc #' ## Importance plot #+ fig1x, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold' plot(rf, verbose = F) # save it tiff('RF_importance_plot_for_AST_ALT_Bilirubin_ChrLiverD.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rf) dev.off() #' ## ROC rroc = gg_roc(rf, 1, oob = T) # ok, checked plot(rroc) # save it tiff('RF_ROC_for_AST_ALT_Bilirubin_ChrLiverD.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rroc) dev.off() #+ br3b, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q3b: OUP data; RF prediction of group (non-hosp, hosp) based on #' AST, ALT, DM, Age, diarrheax, bloatingx, ChrLiverD # ################################################################################ # #' NOTE: D_DIM has 58 missing values, GMT has 89 missing values #' hence they were ignored #' set.seed(123) rf = rfsrc(group ~ ., data = dao[,c(22:23, 27, 33,35, 2, 29, 30)], importance = T) # excl var w many NAs #' ## Confusion table rf # #' ## Matthews correl coeff (performance measure) mcc = mccr(act = as.vector(as.numeric(dao$group)-1), pred = as.vector(as.numeric(rf$class.oob)-1)) mcc #' ## Importance plot #+ fig1xx, fig.width=15, fig.height=15, include=T, fig.align='center', fig.show='hold' plot(rf, verbose = F) # save it tiff('RF_importance_plot_for_AST_ALT_DM_Bloating_Diarrhea_Age_ChrLiverD.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rf) dev.off() # # #' ## ROC rroc = gg_roc(rf, 1, oob = T) # ok, checked plot(rroc) # save it tiff('RF_ROC_for_AST_ALT_DM_Bloating_Diarrhea_Age_ChrLiverD.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') plot(rroc) dev.off() #+ br4, results='asis' cat('\\pagebreak') ################################################################################ # #' # Q4: OUP data; PCA # ################################################################################ # # PCA is eigen analysis of covariance matrix # - problem: covar matrix of cont&factor variables # PCAmixdata R lib # Implements principal component analysis, orthogonal rotation and multiple factor analysis # for a mixture of quantitative and qualitative variables. # # https://stats.stackexchange.com/questions/5774/can-principal-component-analysis-be-applied-to-datasets-containing-a-mix-of-cont # # pc = PCAmix(X.quanti = dao[,c(2,3, 16, 18:25)], X.quali = dao[,c(1, 30:37, 26:29)], rename.level = T, graph = F) # plot(pc, choice="ind", coloring.ind=dao$group, cex=1, label = F, leg = F, posleg="topright", main="") # # save it tiff('PCAmixdata_2D_plot.tiff', units='cm', height=20, width=20, res=300, pointsize=10, compression = 'lzw') # plot(pc, choice="ind", coloring.ind=dao$group, cex=1, label = F, leg = F, posleg="topright", main="") # dev.off() #+ brX, results='asis' cat('\\pagebreak') ################################################################################ # #' # Extra: reviewer No 3 -- AST, ALT # variability in ALT is (much) higher than in AST # log-transformed (close to normality) # #' ## logALT boxplot(log(dao$ALT) ~ dao$group, ylim = c(-2,2), ylab = 'logALT') # #' ### variance oup_hospitalized var(log(dao$ALT[dao$group == 'oup_hospitalized'])) #[1] 0.3895 #' ### variance oup_not_hospitalized var(log(dao$ALT[dao$group == 'oup_not_hospitalized'])) #[1] 0.5445 #' ## logAST boxplot(log(dao$AST) ~ dao$group, ylim = c(-2,2), ylab = 'logAST') # #' ### variance oup_hospitalized var(log(dao$AST[dao$group == 'oup_hospitalized'])) #[1] 0.2932 #' ### variance oup_not_hospitalized var(log(dao$AST[dao$group == 'oup_not_hospitalized'])) #[1] 0.2642 # #' ## logALT: t test t.test(log(dao$ALT)~dao$group) # p-value = 0.2 # mean in group oup_not_hospitalized mean in group oup_hospitalized # -0.6051 -0.4655 #' => difference in mean: -0.14 #' #' ## logAST: t test t.test(log(dao$AST)~dao$group) # p-value = 0.00002 # mean in group oup_not_hospitalized mean in group oup_hospitalized # -0.5218 -0.1582 #' => difference in mean: -0.36 #' #+ br5a, results='asis' cat('\\pagebreak') ################################################################################ # #' # Data Analysis (for manuscript) # ################################################################################ #' The data were visualized and analyzed in R [1], ver. 4.0.5, with the aid of libraries #' gtsummary [2], rstatix [3], DescTools [4], randomForestSRC [5], PCAmixdata [6] and ggpubr [7]. #' The sample median and the lower and upper quartiles were used to summarize #' the data on continuous variables (e.g., age); counts and percentages were used to summarize #' factors (e.g., gender). The Chi-squared or Fisher test were applied to test the #' null hypothesis of independence between factors, followed, where appropriate, #' by the multiple comparisons with the Benjamini Hochberg adjustment. #' Using a contingency table, an absence of trend was tested by Cochran Armitage test. #' The null hypothesis of the equality of the population medians of a continuos variable #' was tested by the Kruskal Wallis test, followed by the Dunn multiple comparisons test with #' the Benjamini Hochberg correction of p-values. Two-way ANOVA was used to model #' the assocication between AST and group (Discharged home, Admitted to hospital) #' in interaction with Recent ATB use (yes, no). The AST values were log-transformed #' to bring data to normality. Normality of residuals was assessed by the quantile-quantile #' plot with the 95% confidence band constructed by bootstrap. Assumption of homogeneity of #' variance was testes by the Levene test. #' In order to assess predictive power of the gastrointestinal parameters and #' other measured variables for predicting the patient group #' (i.e., Discharged home, Admitted to hospital), the RandomForest Machine Learning #' algorithm was trained on the data. The predictive ability was quantified #' by the ROC curve, constructed from the Out-of-Bag data. Importance #' of the predictors was measured by the Variable Importance. A 2D representation #' of the data was obtained by Principal Component Analysis for mixed type of data. #' Findings with the p-value below 0.05 were considered statistically significant. #' #' #' [1] R Core Team (2021). R: A language and environment for statistical #' computing. R Foundation for Statistical Computing, Vienna, Austria. #' URL https://www.R-project.org/. #' #' [2] Daniel D. Sjoberg, Michael Curry, Margie Hannum, Karissa Whiting and #' Emily C. Zabor (2020). gtsummary: Presentation-Ready Data Summary and #' Analytic Result Tables. R package version 1.3.5. #' https://CRAN.R-project.org/package=gtsummary #' #' [3] Alboukadel Kassambara (2021). rstatix: Pipe-Friendly Framework for #' Basic Statistical Tests. R package version 0.7.0. #' https://CRAN.R-project.org/package=rstatix #' #' [4] Andri Signorell et mult. al. (2021). DescTools: Tools for descriptive #' statistics. R package version 0.99.41. #' #' [5] Ishwaran H. and Kogalur U.B. (2021). Fast Unified Random Forests for #' Survival, Regression, and Classification (RF-SRC), R package version #' 2.11.0. #' #' [6] Marie Chavent, Vanessa Kuentz, Amaury Labenne, Benoit Liquet and #' Jerome Saracco (2017). PCAmixdata: Multivariate Analysis of Mixed #' Data. R package version 3.1. #' https://CRAN.R-project.org/package=PCAmixdata #' #' [7] Alboukadel Kassambara (2020). ggpubr: 'ggplot2' Based Publication #' Ready Plots. R package version 0.4.0. #' https://CRAN.R-project.org/package=ggpubr #' #' #+ br6, results='asis' cat('\\pagebreak') ################################################################################ # #' # Session info # ################################################################################ # si = sessionInfo() si