## The raw code to analyze the TCGA raw data and GEO datasets are list below

## The expression analysis using TCGA raw data
## data preprocessing
require(dplyr)
require(tidyr)
LUAD_nopoint <- LUAD %>% 
  tidyr::separate(gene_id,into = c("gene_id"),sep="\\.") 
exprSet <- gtf_df %>% 
  dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>% 
  dplyr::select(c(gene_name,gene_id,gene_biotype)) %>% 
  dplyr::inner_join(LUAD_nopoint,by ="gene_id") %>% 
  tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ")
write.csv(exprSet, file = "exprSet1.csv") 
exprSet1_ = as.matrix(exprSet1_)
row.names(exprSet1_) <- exprSet1_[,1] 
exprSet1_ = exprSet1_[,-1] 
show(exprSet1_[1:10,1:10])
exprSet1_ = as.data.frame(exprSet1_)
metadata1 <- data.frame(colnames(exprSet1))
for (i in 1:length(metadata1[,1])) {
  num <- as.numeric(substring(metadata1[i,1],14,15))
  if (num %in% seq(1,9)) {metadata1[i,2] <- "Tumor"}
  if (num %in% seq(10,29)) {metadata1[i,2] <- "Normal"} 
}
names(metadata1) <- c("TCGA_id","Group")
exprSet1_ <- t(exprSet1_)
exprSet1_ <- as.data.frame(exprSet1_)
exprSet1_ <- exprSet1_[,-1] 
exprSet1_ = cbind(TCGA_id = row.names(exprSet1_),exprSet1_) 
row.names(exprSet1_) = NULL 
exprSet1_ <- merge(metadata1, exprSet1_, by="TCGA_id", all = FALSE)
save(metadata1, file = "metadata1.Rda") 
save(exprSet1_, file = "exprSet1_.Rda")

## analyze and make plots
library(ggpubr)
library(ggstatsplot)
exprSet1_$sample<- as.factor(exprSet1_$sample)
ggbetweenstats(data = exprSet1_,
               x = Group,
               y = ITGB8) ## or y = ITGA11/ITGB4
ggbetweenstats(data = exprSet1_,
               x = Group,
               y = ITGB8,
               results.subtitle = FALSE,
               subtitle = "p <0.001")

## The Kaplan-Meier survival analysis using TCGA raw data
library(RTCGA)
library(dplyr)
library(survival)
library(survminer)
library(ggplot2)
group <- ifelse(exprSet1$ITGA11>median(exprSet1$ITGA11),'high','low') 
sfit <- survfit(Surv(times, patient.vital_status)~group, data=exprSet1) 
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE,
           legend.title = "ITGA11",
           risk.table = TRUE,
           tables.height = 0.2,
           tables.theme = theme_cleantable(),
           # Color palettes. Use custom color: c("#E7B800", "#2E9FDF"),
           # or brewer color (e.g.: "Dark2"), or ggsci color (e.g.: "jco")
           palette = c("#E7B800", "#2E9FDF"),
           ggtheme = theme_bw() # Change ggplot2 theme
)
## The association between the expression level and cinical stages 
## data preprocessing 
exprSet1$stage[exprSet1$stage == 1] <- 'I/II'
exprSet1$stage[exprSet1$stage == 2] <- 'I/II'
exprSet1$stage[exprSet1$stage == 3] <- 'III/IV'
exprSet1$stage[exprSet1$stage == 4] <- 'III/IV'
boxplot(`ITGB8 expression`~stage, data = exprSet1, col = topo.colors(2,alpha = 0.4) ) 
table(exprSet1$stage)
t.test(`ITGB8 expression`~stage, data = exprSet1)

## Univariate and mulitvariate analysis
## data preprocessing 
new_exprSet$age[new_exprSet$age_at_initial_pathologic_diagnosis > 65] <- '1'
new_exprSet$age[new_exprSet$age_at_initial_pathologic_diagnosis <= 65] <- '0'

new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 1] <- '0'
new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 2] <- '1'
new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 3] <- '1'
new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 4] <- '1'
new_exprSet$smoke[new_exprSet$tobacco_smoking_history == 5] <- '1'

new_exprSet$Stage[new_exprSet$stage == 1] <- '0'
new_exprSet$Stage[new_exprSet$stage == 2] <- '0'
new_exprSet$Stage[new_exprSet$stage == 3] <- '1'
new_exprSet$Stage[new_exprSet$stage == 4] <- '1'

## Univariate analysis

Basurv <- Surv(time = new_Baseline$times, event = new_Baseline$patient.vital_status)
new_Baseline$Basurv <- with(new_Baseline,Basurv)
GCox <- coxph(Basurv~gender, data = new_Baseline)
GSum <- summary(GCox)
GSum$coefficients
CI <- paste0(round(GSum$conf.int[,3:4],3),collapse = '-')
HR <- round(GSum$coefficients[,2],3)
Pvalue <- round(GSum$coefficients[,5],3)
Ucox <- function(x){
  FML <- as.formula(paste0('Basurv~',x))
  GCox <- coxph(FML,data = new_Baseline)
  GSum <- summary(GCox)
  CI <- paste0(round(GSum$conf.int[,3:4],3),collapse = '-')
  HR <- round(GSum$coefficients[,2],3)
  Pvalue <- round(GSum$coefficients[,5],3)
  Unicox <- data.frame('Chracteristics' = x,
                       'HR' = HR,
                       '95%CI' = CI,
                       'p' = Pvalue)
  return(Unicox)
}
varNames <- colnames(new_Baseline)[c(19,22:27)]
Univar <- lapply(varNames, Ucox)
Univar <- ldply(Univar,data.frame)
Univar$Chracteristics[Univar$p < 0.2]

##  Multivariate analysis

fml <-as.formula(paste0('Basurv~',paste0(Univar$Chracteristics[Univar$p < 0.2],collapse = '+')))
MultiCox <- coxph(fml, data = new_Baseline,ties = "breslow")
MultiSum <- summary(MultiCox)
MultiName <- as.character(Univar$Chracteristics[Univar$p < 0.2])
MHR <- round(MultiSum$coefficients[,2],3)
MPV <- round(MultiSum$coefficients[,5],3)
MCIL <- round(MultiSum$conf.int[,3],3)
MCIU <- round(MultiSum$conf.int[,4],3)
MCI <- paste0(MCIL,'-',MCIU)
Mulcox <- data.frame('Chracteristics' = MultiName,
                     'HR' = MHR,
                     '95%CI' = MCI,
                     'P' = MPV)
## Output results
Final <- merge.data.frame(Univar,Mulcox, by = 'Chracteristics',
                          all = T, sort = T)


## GEO datasets meta-analysis 
data <- read.csv('Table1.csv',header=T)
colnames(table1)
library(ggplot2)
library(meta)
metawsd1 = metacont(Tumor_count,Tumor_mean,Tumor_sd,
                     Normal_count,Normal_mean,Normal_sd,
                     data=table1 ,sm="SMD",
                     
                     comb.fixed = FALSE,
                     comb.random = TRUE,
                     
                     studlab = paste(Study))
metawsd1
forest(metawsd1,test.overall.random = TRUE, comb.fixed = FALSE, comb.random = TRUE)
metainf(metawsd1, pooled = "random")
forest(metainf(metawsd1, pooled = "random"))
forest(metainf(metawsd1),comb.fixed = TRUE, comb.random = FALSE)
metabias(metawsd1, method.bias = "linreg", plotit = TRUE)
funnel(metawsd1,comb.fixed = F,comb.random = T)
## If there is publication bias,the'trim and fill'will performed to adjust the bias
new_metawsd2_ <- trimfill(metawsd2_, comb.random =TRUE) 

## The expression analysis in GEO datasets
## library packages
library(GEOquery)
library(stringr)
library(limma)
library(dplyr)
library('Biobase')
library(ggstatsplot)
library(tidyr)
library(ggpubr)

## data preprocessing 
gset = getGEO("GSE21933", destdir = ".", getGPL = F)
gset <- gset[[1]]
pdata = pData(gset)
pdata <- pdata[,c(2,1,11)]
save(pdata, file = "pdata.Rda")
write.csv(pdata, file = "pdata.csv")
colnames(pdata) <- c("GSM", "group")
group_list = ifelse(str_detect(pdata2$group,"Tumor")==TRUE,"Tumor","Normal")
group_list <- as.factor(group_list) 
exprSet = exprs(gset) 
exprSet2 <- exprSet[,c(2:79)] ## select LUAD or LUSC data
exprSet <- as.data.frame(exprSet)
boxplot(exprSet2,outline=FALSE, notch=T,col=group_list, las=2) 
exprSet2 = normalizeBetweenArrays(exprSet2) 
exprSet2 <- as.data.frame(exprSet2)
write.csv(pdata, file = "pdata.csv")
pdata <- pdata[,-1]
GPL6254 <- getGEO('GPL6254',destdir =".") 
GPL6254_ann <- Table(GPL6254)
probe2symbol <- GPL6254_ann[,c("ID","Symbol")]
colnames(probe2symbol) <- c("probe_id" , "symbol")
save(probe2symbol, file = "probe2symbol.Rda")
exprSet2 = cbind(probe_id = row.names(exprSet2), exprSet2)
row.names(exprSet2) = NULL
exprSet2$probe_id <- as.numeric(exprSet2$probe_id)
exprSet <- exprSet2 %>%
  inner_join(probe2symbol,by="probe_id") %>% 
  select(-probe_id) %>% 
  select(symbol, everything()) %>% 
  mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% 
  arrange(desc(rowMean)) %>% 
  distinct(symbol,.keep_all = T) %>% 
  select(-rowMean) %>% 
  tibble::column_to_rownames(colnames(.)[1])
save(exprSet, file = "exprSet_.Rda")
exprSet <- new_exprSet
exprSet <- t(exprSet)
exprSet <- as.data.frame(exprSet)
exprSet <- exprSet[,-1] 
exprSet = cbind(GSM = row.names(exprSet), exprSet)
row.names(exprSet) <- NULL
exprSet1_ <- merge(exprSet, pdata, by="GSM", all = FALSE)
exprSet1_$group <- as.factor(exprSet1_$group)
ggbetweenstats(data = exprSet1_,
               x = group,
               y = ITGA11) ## y = ITGB4/ITGB8
ggbetweenstats(data = exprSet1_,
               x = group,
               y = ITGB8,
               results.subtitle = FALSE,
               subtitle = "p = 0.356")
aggregate(ITGA11~group, data = exprSet1_, mean) 
aggregate(ITGA11~group, data = exprSet1_, sd) 
t.test(ITGA11~group,data = exprSet1_)