## The raw code to analyze the TCGA raw data and GEO datasets are list below ## The expression analysis using TCGA raw data ## data preprocessing require(dplyr) require(tidyr) LUAD_nopoint <- LUAD %>% tidyr::separate(gene_id,into = c("gene_id"),sep="\\.") exprSet <- gtf_df %>% dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>% dplyr::select(c(gene_name,gene_id,gene_biotype)) %>% dplyr::inner_join(LUAD_nopoint,by ="gene_id") %>% tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ") write.csv(exprSet, file = "exprSet1.csv") exprSet1_ = as.matrix(exprSet1_) row.names(exprSet1_) <- exprSet1_[,1] exprSet1_ = exprSet1_[,-1] show(exprSet1_[1:10,1:10]) exprSet1_ = as.data.frame(exprSet1_) metadata1 <- data.frame(colnames(exprSet1)) for (i in 1:length(metadata1[,1])) { num <- as.numeric(substring(metadata1[i,1],14,15)) if (num %in% seq(1,9)) {metadata1[i,2] <- "Tumor"} if (num %in% seq(10,29)) {metadata1[i,2] <- "Normal"} } names(metadata1) <- c("TCGA_id","Group") exprSet1_ <- t(exprSet1_) exprSet1_ <- as.data.frame(exprSet1_) exprSet1_ <- exprSet1_[,-1] exprSet1_ = cbind(TCGA_id = row.names(exprSet1_),exprSet1_) row.names(exprSet1_) = NULL exprSet1_ <- merge(metadata1, exprSet1_, by="TCGA_id", all = FALSE) save(metadata1, file = "metadata1.Rda") save(exprSet1_, file = "exprSet1_.Rda") ## analyze and make plots library(ggpubr) library(ggstatsplot) exprSet1_$sample<- as.factor(exprSet1_$sample) ggbetweenstats(data = exprSet1_, x = Group, y = ITGB8) ## or y = ITGA11/ITGB4 ggbetweenstats(data = exprSet1_, x = Group, y = ITGB8, results.subtitle = FALSE, subtitle = "p <0.001") ## The Kaplan-Meier survival analysis using TCGA raw data library(RTCGA) library(dplyr) library(survival) library(survminer) library(ggplot2) group <- ifelse(exprSet1$ITGA11>median(exprSet1$ITGA11),'high','low') sfit <- survfit(Surv(times, patient.vital_status)~group, data=exprSet1) sfit summary(sfit) ggsurvplot(sfit, conf.int=F, pval=TRUE, legend.title = "ITGA11", risk.table = TRUE, tables.height = 0.2, tables.theme = theme_cleantable(), # Color palettes. Use custom color: c("#E7B800", "#2E9FDF"), # or brewer color (e.g.: "Dark2"), or ggsci color (e.g.: "jco") palette = c("#E7B800", "#2E9FDF"), ggtheme = theme_bw() # Change ggplot2 theme ) ## The association between the expression level and cinical stages ## data preprocessing exprSet1$stage[exprSet1$stage == 1] <- 'I/II' exprSet1$stage[exprSet1$stage == 2] <- 'I/II' exprSet1$stage[exprSet1$stage == 3] <- 'III/IV' exprSet1$stage[exprSet1$stage == 4] <- 'III/IV' boxplot(`ITGB8 expression`~stage, data = exprSet1, col = topo.colors(2,alpha = 0.4) ) table(exprSet1$stage) t.test(`ITGB8 expression`~stage, data = exprSet1) ## Univariate and mulitvariate analysis ## data preprocessing new_exprSet$age[new_exprSet$age_at_initial_pathologic_diagnosis > 65] <- '1' new_exprSet$age[new_exprSet$age_at_initial_pathologic_diagnosis <= 65] <- '0' new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 1] <- '0' new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 2] <- '1' new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 3] <- '1' new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 4] <- '1' new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 5] <- '1' new_exprSet$Stage[new_exprSet$stage == 1] <- '0' new_exprSet$Stage[new_exprSet$stage == 2] <- '0' new_exprSet$Stage[new_exprSet$stage == 3] <- '1' new_exprSet$Stage[new_exprSet$stage == 4] <- '1' ## Univariate analysis Basurv <- Surv(time = new_Baseline$times, event = new_Baseline$patient.vital_status) new_Baseline$Basurv <- with(new_Baseline,Basurv) GCox <- coxph(Basurv~gender, data = new_Baseline) GSum <- summary(GCox) GSum$coefficients CI <- paste0(round(GSum$conf.int[,3:4],3),collapse = '-') HR <- round(GSum$coefficients[,2],3) Pvalue <- round(GSum$coefficients[,5],3) Ucox <- function(x){ FML <- as.formula(paste0('Basurv~',x)) GCox <- coxph(FML,data = new_Baseline) GSum <- summary(GCox) CI <- paste0(round(GSum$conf.int[,3:4],3),collapse = '-') HR <- round(GSum$coefficients[,2],3) Pvalue <- round(GSum$coefficients[,5],3) Unicox <- data.frame('Chracteristics' = x, 'HR' = HR, '95%CI' = CI, 'p' = Pvalue) return(Unicox) } varNames <- colnames(new_Baseline)[c(19,22:27)] Univar <- lapply(varNames, Ucox) Univar <- ldply(Univar,data.frame) Univar$Chracteristics[Univar$p < 0.2] ## Multivariate analysis fml <-as.formula(paste0('Basurv~',paste0(Univar$Chracteristics[Univar$p < 0.2],collapse = '+'))) MultiCox <- coxph(fml, data = new_Baseline,ties = "breslow") MultiSum <- summary(MultiCox) MultiName <- as.character(Univar$Chracteristics[Univar$p < 0.2]) MHR <- round(MultiSum$coefficients[,2],3) MPV <- round(MultiSum$coefficients[,5],3) MCIL <- round(MultiSum$conf.int[,3],3) MCIU <- round(MultiSum$conf.int[,4],3) MCI <- paste0(MCIL,'-',MCIU) Mulcox <- data.frame('Chracteristics' = MultiName, 'HR' = MHR, '95%CI' = MCI, 'P' = MPV) ## Output results Final <- merge.data.frame(Univar,Mulcox, by = 'Chracteristics', all = T, sort = T) ## GEO datasets meta-analysis data <- read.csv('Table1.csv',header=T) colnames(table1) library(ggplot2) library(meta) metawsd1 = metacont(Tumor_count,Tumor_mean,Tumor_sd, Normal_count,Normal_mean,Normal_sd, data=table1 ,sm="SMD", comb.fixed = FALSE, comb.random = TRUE, studlab = paste(Study)) metawsd1 forest(metawsd1,test.overall.random = TRUE, comb.fixed = FALSE, comb.random = TRUE) metainf(metawsd1, pooled = "random") forest(metainf(metawsd1, pooled = "random")) forest(metainf(metawsd1),comb.fixed = TRUE, comb.random = FALSE) metabias(metawsd1, method.bias = "linreg", plotit = TRUE) funnel(metawsd1,comb.fixed = F,comb.random = T) ## If there is publication bias,the'trim and fill'will performed to adjust the bias new_metawsd2_ <- trimfill(metawsd2_, comb.random =TRUE) ## The expression analysis in GEO datasets ## library packages library(GEOquery) library(stringr) library(limma) library(dplyr) library('Biobase') library(ggstatsplot) library(tidyr) library(ggpubr) ## data preprocessing gset = getGEO("GSE21933", destdir = ".", getGPL = F) gset <- gset[[1]] pdata = pData(gset) pdata <- pdata[,c(2,1,11)] save(pdata, file = "pdata.Rda") write.csv(pdata, file = "pdata.csv") colnames(pdata) <- c("GSM", "group") group_list = ifelse(str_detect(pdata2$group,"Tumor")==TRUE,"Tumor","Normal") group_list <- as.factor(group_list) exprSet = exprs(gset) exprSet2 <- exprSet[,c(2:79)] ## select LUAD or LUSC data exprSet <- as.data.frame(exprSet) boxplot(exprSet2,outline=FALSE, notch=T,col=group_list, las=2) exprSet2 = normalizeBetweenArrays(exprSet2) exprSet2 <- as.data.frame(exprSet2) write.csv(pdata, file = "pdata.csv") pdata <- pdata[,-1] GPL6254 <- getGEO('GPL6254',destdir =".") GPL6254_ann <- Table(GPL6254) probe2symbol <- GPL6254_ann[,c("ID","Symbol")] colnames(probe2symbol) <- c("probe_id" , "symbol") save(probe2symbol, file = "probe2symbol.Rda") exprSet2 = cbind(probe_id = row.names(exprSet2), exprSet2) row.names(exprSet2) = NULL exprSet2$probe_id <- as.numeric(exprSet2$probe_id) exprSet <- exprSet2 %>% inner_join(probe2symbol,by="probe_id") %>% select(-probe_id) %>% select(symbol, everything()) %>% mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean) %>% tibble::column_to_rownames(colnames(.)[1]) save(exprSet, file = "exprSet_.Rda") exprSet <- new_exprSet exprSet <- t(exprSet) exprSet <- as.data.frame(exprSet) exprSet <- exprSet[,-1] exprSet = cbind(GSM = row.names(exprSet), exprSet) row.names(exprSet) <- NULL exprSet1_ <- merge(exprSet, pdata, by="GSM", all = FALSE) exprSet1_$group <- as.factor(exprSet1_$group) ggbetweenstats(data = exprSet1_, x = group, y = ITGA11) ## y = ITGB4/ITGB8 ggbetweenstats(data = exprSet1_, x = group, y = ITGB8, results.subtitle = FALSE, subtitle = "p = 0.356") aggregate(ITGA11~group, data = exprSet1_, mean) aggregate(ITGA11~group, data = exprSet1_, sd) t.test(ITGA11~group,data = exprSet1_)