#Copy it to each folder together with the code, because different data sets have different processing parameters
rm(list = ls())
library(Seurat)
library(ggplot2)
library(cowplot)
library(Matrix)
library(dplyr)
library(ggsci)
library(scater)
library(SingleR)
library(hdf5r)
library(tidyverse)
library(RColorBrewer)
library(dplyr)
library(Seurat)
library(ComplexHeatmap)
library(GSVA)
library(GSEABase)
library(limma)
library(ggplot2)
load("CRC_EMTAB8107.RData")


# Import the gmt file. Here we take Hallmark in MsigDB as an example.
# MsigDB link (https://www.gsea-msigdb.org/gsea/msigdb/)
# When downloading files, please pay attention to selecting the corresponding gene format. Generally, the results of 10X are in gene symbol format.
genesets <- getGmt("h.all.v7.5.1.symbols.gmt")
str(genesets)


#Get the matrix for GSVA analysis (normalized)
#df.data <- GetAssayData(object = pbmc, slot = "data")
expr=as.matrix(pbmc@assays$RNA@data)
expr=expr[rowSums(expr)>0,]
# Save the group information to which the cell belongs
df.group <- data.frame(umi = names(Idents(pbmc)), 
                       cluster = pbmc@active.ident, 
                       stringsAsFactors = F)

# See what the results look like
head(df.group)
str(expr)
dim(expr)
gsvascore <- gsva(expr, genesets, parallel.sz = 4,verbose=T,method = "zscore")

# Because there are many gene expression values of 0 in the single-cell data, a red warning message will appear, which is not a big deal.

# Check out the results
gsvascore[1, 1:5]
# First, use a heat map to show the results of all 50 genesets in hallmarker.
ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
aaa=as.data.frame(gsvascore)
load("hallmarkname.RDATA")
aaa=aaa[hallmarkname,]

aaa=as.matrix(aaa)


pdf(file = "7.hallmark heatmap.pdf",width = 10,height = 8)
Heatmap(aaa,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(gsvascore),
                                             gp = gpar(fontsize = 8)))
dev.off()



aaa=as.data.frame(t(gsvascore))
write.csv(aaa,"GSVAscore.csv",row.names = F)
phe=aaa
pbmc=AddMetaData(pbmc, phe, col.name = colnames(phe))




genesetname="TLS"

genesets=getGmt(paste0(genesetname,".gmt"))


str(genesets)
genename=genesets@.Data[[1]]@geneIds
pbmc <- SetIdent(pbmc, value = "celltype")
table(pbmc@active.ident) 
genes_to_check = genename
# Improved mapping method, gene names vertically


p1 <-DotPlot(pbmc, features = genes_to_check, assay='RNA' )+coord_flip()+
        theme_bw()+
        theme(panel.grid = element_blank(), axis.text.x=element_text(hjust = 0.5,vjust=0.5))+
        labs(x=NULL,y=NULL)+guides(size=guide_legend(order=3))+
        scale_color_gradientn(values = seq(0,1,0.2),colours = c('#330066','#336699','#66CC66','#FFCC33'))

ggsave(print(p1),filename = "8.Specifies the gene set expressed in cells.pdf",height = 10,width =8)
dev.off()




# Select the first pathway here to draw the picture
count <- gsvascore[genesetname, , drop = FALSE]
count <- as.data.frame(t(count))
colnames(count) <- "geneset"
count$cluster <- as.character(Idents(pbmc))

index<-vector()
name<-vector()
for (i in 1:length(unique(count$cluster))) {
        aaa<-dplyr::filter(count, cluster == unique(count$cluster)[i])
        index<-c(mean(aaa[,"geneset"]),index)
        name<-c(unique(count$cluster)[i],name)
}
dat<-data.frame(name,index)
value<-as.character(arrange(dat,index)$name)
aaaa<-count
aaaa$cluster<-factor(aaaa$cluster,levels =value )
title.name="TLS"
p1<-ggplot(data=aaaa,aes(x=cluster,y=geneset,fill=cluster))+
        geom_boxplot()+
        ylab(label =title.name)+
        theme_classic()+
        theme(legend.position = 'none' ,axis.text.x =element_text(angle=90,size=8,hjust =1,vjust =0.5,colour = "black" ),
              axis.text.y =element_text(colour = "black" ))

ggsave(p1,filename = paste0("9.",title.name,"Single cell expression.pdf"),height = 6,width = 8)


pdf(file = "10.umap map.pdf",width=8,height=6)
DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE)
dev.off()

colnames(count)[1]=genesetname
pbmc=AddMetaData(pbmc, count, col.name = colnames(count))

pdf(file = "11.Gene set expression UMAP plots.pdf",width=6,height=5.5)
FeaturePlot(pbmc, 
            features = c(title.name), 
            reduction = "umap",
            #label = TRUE,
            #ncol = 3, # Draw three columns
            cols = rev(rainbow(2)))
dev.off()


p1=DimPlot(pbmc, reduction = "umap", label = TRUE,group.by = "celltype", repel = TRUE)
p3=FeaturePlot(pbmc, 
               features = c(title.name), 
               reduction = "umap",
               #label = TRUE,
               #ncol = 3, # Draw three columns
               cols = rev(rainbow(2)))


pbmc@meta.data$group=ifelse(pbmc@meta.data[,title.name] > median(pbmc@meta.data[,title.name]),"High.score.group","Low.score.group")
pdf(file = "12.High and low score group umap chart.pdf",width=6,height=5.5)
DimPlot(pbmc, 
        reduction = "umap", 
        group.by = "group", 
        cols = c("yellow2", "royalblue3"),
        pt.size = 1.5)
dev.off()
p4=DimPlot(pbmc, 
           reduction = "umap", 
           group.by = "group", 
           cols = c("yellow2", "royalblue3"),
           pt.size = 1.5)
p1+p3+p4
ggsave('13.umap Combination chart.pdf',width = 16,height=8)


# pbmc@meta.data$celltype <- pbmc@active.ident
data_plotC <- table(pbmc@meta.data$group, pbmc@meta.data$celltype) %>% reshape::melt()
colnames(data_plotC) <- c("Sample", "CellType","Number")
colourCount = length(unique(pbmc@meta.data$celltype))
getPalette = colorRampPalette(brewer.pal(8, "Dark2"))
celltype_colors <- getPalette(colourCount)
dev.off()
pC1 <- ggplot(data = data_plotC, aes(x = Sample, y = Number, fill = CellType)) +
        geom_bar(stat = "identity", width=0.8,aes(group=CellType),position="stack")+
        scale_fill_manual(values=celltype_colors) +
        theme_bw()+
        theme(panel.grid =element_blank()) +
        labs(x="",y="Average cell number")+
        theme(axis.text = element_text(size=12, colour = "black"))+
        theme(axis.title.y = element_text(size=12, colour = "black"))+
        theme(panel.border = element_rect(size = 1, linetype = "solid", colour = "black"))+
        theme(axis.text.x = element_text(angle = 45,hjust = 0.8,  vjust = 0.6)) 

pC2 <- ggplot(data = data_plotC, aes(x = Sample, y = Number, fill = CellType)) +
        geom_bar(stat = "identity", width=0.8,aes(group=CellType),position="fill")+
        scale_fill_manual(values=celltype_colors) +
        theme_bw()+
        theme(panel.grid =element_blank()) +
        labs(x="",y="Cell proportion")+
        scale_y_continuous(labels = scales::percent)+  
        theme(axis.text = element_text(size=12, colour = "black"))+
        theme(axis.title.y = element_text(size=12, colour = "black"))+
        theme(panel.border = element_rect(size = 1, linetype = "solid", colour = "black"))+
        theme(axis.text.x = element_text(angle = 45,hjust = 0.8,  vjust = 0.6))      
library(patchwork)
pC <- pC1 + pC2 + plot_layout(ncol = 2, widths = c(1,1),guides = 'collect')
pdf(file = "14.Ratio of high- and low-score immune cells.pdf",height=6,width=8)
pC
dev.off()


top10 <- pbmc.markers %>%
        group_by(cluster) %>%
        top_n(n = 10, wt = avg_log2FC)
p4=DoHeatmap(pbmc, features = top10$gene) + NoLegend()
ggsave(print(p4),filename = "15.Top 10 gene heat map.pdf",height = 6,width =8)




head(df.group)

gsvascore <- pbmc@meta.data[,hallmarkname]


gsvascore[1, 1:5]


a=colnames(pbmc)
b=pbmc@meta.data$celltype
c=pbmc@meta.data[,genesetname]
drawdata=data.frame(id=a,celltype=b,score=c)
drawdata$celltype=str_replace_all(drawdata$celltype,"/",".")
d=as.data.frame(gsvascore)
d$id=rownames(d)
drawdata=inner_join(drawdata,d,by="id")
write.csv(drawdata,"GSVAscore.csv",row.names = F)
library(ggpubr)
Index<-unique(drawdata$celltype)
Index
length(Index)
gene="score"
for (i in 1:length(Index)) {
        print(i)
        aaa<-drawdata%>%
                filter(celltype ==Index[[i]] ) 
        y <- as.numeric(aaa[,gene])
        bbb=aaa[,4:53]
        cor_data_df <- data.frame(colnames(bbb))
        for (a in 1:length(colnames(bbb))){
                test <- cor.test(as.numeric(bbb[,a]),y,method = "pearson")
                cor_data_df[a,2] <- test$estimate
                cor_data_df[a,3] <- test$p.value
        }
        names(cor_data_df) <- c("symbol","correlation","pvalue")
        write.csv (cor_data_df, file =paste("16.",Index[[i]], gene,'GSVA correlation analysis.csv',sep=' '), row.names =FALSE)
        
        cor_data_df<-na.omit(cor_data_df)
        posneg<-cor_data_df %>%
                filter(symbol != gene)
        posneg$group<-NA
        posneg$group[ posneg$pvalue<0.05& posneg$correlation<0]<-"negative"
        posneg$group[ posneg$pvalue>=0.05]<-"pvalue>0.05"
        posneg$group[ posneg$pvalue<0.05& posneg$correlation>0]<-"positive"
        posneg$symbol<-str_replace_all(posneg$symbol,'_',' ')
        posneg$symbol<-str_replace_all(posneg$symbol,'HALLMARK ','')
        p<-ggbarplot(posneg, x = "symbol", y = "correlation",
                     fill = "group",           # change fill color by mpg_level
                     color = "white",            # Set bar border colors to white
                     palette = "jco",            # jco journal color palett. see ?ggpar
                     sort.val = "asc",           # Sort the value in ascending order
                     sort.by.groups = FALSE,     # Don't sort inside each group
                     #x.text.angle = 90,          # Rotate vertically x axis texts
                     ylab = "GSVA correlation",
                     xlab = "Term",
                     legend.title = "group",
                     lab.col = "black",
                     rotate = TRUE,
                     ggtheme = theme_minimal()
        )
        ggsave(p,filename = paste("16.",Index[[i]], gene,'GSVA Correlation Plot.pdf',sep=' '),width = 10,height = 10)
}


mergedata<-data.frame(symbol=as.character(),correlation=as.numeric(),pvalue=as.numeric())
for (i in 1:length(Index)) {
        assign(Index[i],read.csv(paste0("16. ",Index[i]," ",gene," GSVA correlation analysis.csv")))
        aaa<-get(Index[i])
        aaa$type<-Index[i]
        mergedata<-rbind(aaa,mergedata)
}

mergedata<-filter(mergedata,symbol!=gene)
mergedata$symbol<-str_replace_all(mergedata$symbol,'_',' ')
mergedata$symbol<-str_replace_all(mergedata$symbol,'HALLMARK ','')
mergedata[mergedata[,"pvalue"]<=0.0001,"label"]<-"****"
mergedata[mergedata[,"pvalue"]<= 0.001 & mergedata[,"pvalue"]>0.0001,"label"]<-"***"
mergedata[mergedata[,"pvalue"]<= 0.01 & mergedata[,"pvalue"]>0.001,"label"]<-"**"
mergedata[mergedata[,"pvalue"]<= 0.05 & mergedata[,"pvalue"]>0.01,"label"]<-"*"
meandata<-data.frame(symbol= character(),meanvalue= numeric())
for (i in 1:50) {
        meandata1<-dplyr::filter(mergedata,symbol == mergedata$symbol[[i]] )
        meandata[i,1]<-mergedata$symbol[[i]]
        meandata[i,2]<-mean(meandata1$correlation)
}
bbbbb<-as.character(arrange(meandata,meanvalue)$symbol)
mergedata$symbol<-factor(mergedata$symbol,levels = bbbbb)
p<-ggplot(mergedata,aes(x=type,y=symbol))+geom_tile(aes(fill=correlation))+
        scale_fill_gradientn(colors=c("blue","white","red"),guide="colorbar")+
        theme_void()+
        theme(axis.text.x=element_text(angle=90,vjust=0.3,hjust = 1))+
        theme(axis.text.y=element_text(angle=0,hjust=1))+
        theme(axis.title.x=element_blank(),axis.title.y=element_blank())+
        geom_text(aes(label = label),col='black',cex=2.5)
ggsave(p,filename = '17.GSVA enrichment analysis correlation analysis heat map.pdf',width =6,height = 11)


aaa<-drawdata
y <- as.numeric(aaa[,gene])
bbb=aaa[,4:53]
cor_data_df <- data.frame(colnames(bbb))
for (a in 1:length(colnames(bbb))){
        test <- cor.test(as.numeric(bbb[,a]),y,method = "pearson")
        cor_data_df[a,2] <- test$estimate
        cor_data_df[a,3] <- test$p.value
}
names(cor_data_df) <- c("symbol","correlation","pvalue")
write.csv (cor_data_df, file =paste( "18.ALL cell",gene,'GSVA correlation analysis.csv',sep=' '), row.names =FALSE)

cor_data_df<-na.omit(cor_data_df)
posneg<-cor_data_df %>%
        filter(symbol != gene)
posneg$group<-NA
posneg$group[ posneg$pvalue<0.05& posneg$correlation<0]<-"negative"
posneg$group[ posneg$pvalue>=0.05]<-"pvalue>0.05"
posneg$group[ posneg$pvalue<0.05& posneg$correlation>0]<-"positive"
posneg$symbol<-str_replace_all(posneg$symbol,'_',' ')
posneg$symbol<-str_replace_all(posneg$symbol,'HALLMARK ','')
p<-ggbarplot(posneg, x = "symbol", y = "correlation",
             fill = "group",           # change fill color by mpg_level
             color = "white",            # Set bar border colors to white
             palette = "jco",            # jco journal color palett. see ?ggpar
             sort.val = "asc",           # Sort the value in ascending order
             sort.by.groups = FALSE,     # Don't sort inside each group
             #x.text.angle = 90,          # Rotate vertically x axis texts
             ylab = "GSVA correlation",
             xlab = "Term",
             legend.title = "group",
             lab.col = "black",
             rotate = TRUE,
             ggtheme = theme_minimal()
)
ggsave(p,filename = paste("18.ALL cell", gene,'GSVA Correlation Plot.pdf',sep=' '),width = 10,height = 10)





aaa=read.csv("GSVAscore.csv",header = T,sep = ",")
aaa$group=NA
aaa$group[aaa$score >= median(aaa$score)]="High"
aaa$group[aaa$score < median(aaa$score)]="Low"
table(aaa$group)
rownames(aaa)=aaa$id
pbmc=AddMetaData(pbmc, aaa, col.name = colnames(aaa))
pbmc <- SetIdent(pbmc, value = "group")




df.group <- data.frame(umi = names(Idents(pbmc)), 
                       cluster = pbmc@active.ident, 
                       stringsAsFactors = F)


head(df.group)


aaa[1, 1:5]

ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
bbb=aaa[,4:54]
bbb=bbb[,-51]
bbb=as.matrix(bbb)
bbb=t(bbb)

pdf(file = "19.Group hallmark heat map.pdf",width = 10,height = 8)
Heatmap(bbb,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(bbb),
                                             gp = gpar(fontsize = 8)))
dev.off()




num=floor(length(pbmc@meta.data$score)/10)
ccc=arrange(pbmc@meta.data,score)
ccc=ccc[1:num,]
ccc=rownames(ccc)
ddd=arrange(pbmc@meta.data,desc(score))
ddd=ddd[1:num,]
ddd=rownames(ddd)
colnames(pbmc)
pbmc1=pbmc[,c(ccc,ddd)]
table(pbmc1@meta.data$group)
df.group <- data.frame(umi = names(Idents(pbmc1)), 
                       cluster = pbmc1@active.ident, 
                       stringsAsFactors = F)

ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
bbb=bbb[,c(ccc,ddd)]

pdf(file = "19.group.top10 group hallmark heat map.pdf",width = 10,height = 8)
Heatmap(bbb,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(bbb),
                                             gp = gpar(fontsize = 8)))
dev.off()