# Copy this code to each folder as different datasets have different processing parameters
rm(list = ls())
library(Seurat)
library(ggplot2)
library(cowplot)
library(Matrix)
library(dplyr)
library(ggsci)
library(scater)
library(SingleR)
library(hdf5r)
library(tidyverse)
library(RColorBrewer)
library(dplyr)
library(Seurat)
library(ComplexHeatmap)
library(GSVA)
library(GSEABase)
library(limma)
library(ggplot2)
load("CRC_GSE166555.RData")

# Import gmt file, here using MsigDB's Hallmark as an example
# MsigDB link (https://www.gsea-msigdb.org/gsea/msigdb/)
# When downloading files, pay attention to choosing the corresponding gene format. Generally, the results from 10X are
# gene symbol format.
genesets <- getGmt("h.all.v7.5.1.symbols.gmt")
str(genesets)

# Get the matrix for GSVA analysis (after normalization)
# df.data <- GetAssayData(object = pbmc, slot = "data")
expr = as.matrix(pbmc@assays$RNA@data)
expr = expr[rowSums(expr) > 0,]
# Save the group information of the cells
df.group <- data.frame(umi = names(Idents(pbmc)), 
                       cluster = pbmc@active.ident, 
                       stringsAsFactors = F)

# Check what the result looks like
head(df.group)
str(expr)
dim(expr)
gsvascore <- gsva(expr, genesets, parallel.sz = 8, verbose = T, method = "zscore")

# Because there are many genes with expression value of 0 in single-cell data, red warning messages may appear, but it's okay.

# Check the result
gsvascore[1, 1:5]
# First use a heatmap to display the results of all 50 genesets in the hallmark
ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
aaa = as.data.frame(gsvascore)
load("hallmarkname.RDATA")
aaa = aaa[hallmarkname,]

aaa = as.matrix(aaa)

Heatmap(aaa,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(gsvascore),
                                             gp = gpar(fontsize = 8)))
pdf(file = "7.hallmark_heatmap.pdf", width = 10, height = 8)
Heatmap(aaa,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(gsvascore),
                                             gp = gpar(fontsize = 8)))
dev.off()

aaa = as.data.frame(t(gsvascore))
write.csv(aaa, "GSVAscore.csv", row.names = F)
phe = aaa
pbmc = AddMetaData(pbmc, phe, col.name = colnames(phe))

genesetname = "ICD"

genesets = getGmt(paste0(genesetname, ".gmt"))

str(genesets)
genename = genesets@.Data[[1]]@geneIds
pbmc <- SetIdent(pbmc, value = "celltype")
table(pbmc@active.ident)
genes_to_check = genename
# Improved plotting method, gene names vertical

p1 <- DotPlot(pbmc, features = genes_to_check, assay = 'RNA') + coord_flip() +
  theme_bw() +
  theme(panel.grid = element_blank(), axis.text.x = element_text(hjust = 0.5, vjust = 0.5)) +
  labs(x = NULL, y = NULL) + guides(size = guide_legend(order = 3)) +
  scale_color_gradientn(values = seq(0, 1, 0.2), colours = c('#330066', '#336699', '#66CC66', '#FFCC33'))

ggsave(print(p1), filename = "8.specified_geneset_expression_in_cells.pdf", height = 10, width = 8)
dev.off()

# Here, choose the first pathway for plotting
count <- gsvascore[genesetname, , drop = FALSE]
count <- as.data.frame(t(count))
colnames(count) <- "geneset"
count$cluster <- as.character(Idents(pbmc))

index <- vector()
name <- vector()
for (i in 1:length(unique(count$cluster))) {
  aaa <- dplyr::filter(count, cluster == unique(count$cluster)[i])
  index <- c(mean(aaa[, "geneset"]), index)
  name <- c(unique(count$cluster)[i], name)
}
dat <- data.frame(name, index)
value <- as.character(arrange(dat, index)$name)
aaaa <- count
aaaa$cluster <- factor(aaaa$cluster, levels = value)
title.name = "ICD"
p1 <- ggplot(data = aaaa, aes(x = cluster, y = geneset, fill = cluster)) +
  geom_boxplot() +
  ylab(label = title.name) +
  theme_classic() +
  theme(legend.position = 'none', axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5, colour = "black"),
        axis.text.y = element_text(colour = "black"))

ggsave(p1, filename = paste0("9.", title.name, "_single_cell_expression.pdf"), height = 6, width = 8)

pdf(file = "10.umap_plot.pdf", width = 8, height = 6)
DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE)
dev.off()

colnames(count)[1] = genesetname
pbmc = AddMetaData(pbmc, count, col.name = colnames(count))

pdf(file = "11.geneset_expression_umap_plot.pdf", width = 6, height = 5.5)
FeaturePlot(pbmc, 
            features = c(title.name), 
            reduction = "umap",
            # label = TRUE,
            # ncol = 3, # Draw in three columns
            cols = rev(rainbow(2)))
dev.off()

p1 = DimPlot(pbmc, reduction = "umap", label = TRUE, group.by = "celltype", repel = TRUE)
p3 = FeaturePlot(pbmc, 
                 features = c(title.name), 
                 reduction = "umap",
                 # label
                 = TRUE,
                 # ncol = 3, # Draw in three columns
                 cols = rev(rainbow(2)))

pbmc@meta.data$group = ifelse(pbmc@meta.data[, title.name] > median(pbmc@meta.data[, title.name]), "High.score.group", "Low.score.group")
pdf(file = "12.high_low_score_group_umap_plot.pdf", width = 6, height = 5.5)
DimPlot(pbmc, 
        reduction = "umap", 
        group.by = "group", # column 'Diagnosis' in metadata
        cols = c("yellow2", "royalblue3"),
        pt.size = 1.5)
dev.off()
p4 = DimPlot(pbmc, 
             reduction = "umap", 
             group.by = "group", # column 'Diagnosis' in metadata
             cols = c("yellow2", "royalblue3"),
             pt.size = 1.5)
p1 + p3 + p4
ggsave('13.umap_combined_plot.pdf', width = 16, height = 8)

# pbmc@meta.data$celltype <- pbmc@active.ident
data_plotC <- table(pbmc@meta.data$group, pbmc@meta.data$celltype) %>% reshape::melt()
colnames(data_plotC) <- c("Sample", "CellType", "Number")
colourCount = length(unique(pbmc@meta.data$celltype))
getPalette = colorRampPalette(brewer.pal(8, "Dark2"))
celltype_colors <- getPalette(colourCount)
dev.off()
pC1 <- ggplot(data = data_plotC, aes(x = Sample, y = Number, fill = CellType)) +
  geom_bar(stat = "identity", width = 0.8, aes(group = CellType), position = "stack") +
  scale_fill_manual(values = celltype_colors) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(x = "", y = "Average cell number") +
  theme(axis.text = element_text(size = 12, colour = "black")) +
  theme(axis.title.y = element_text(size = 12, colour = "black")) +
  theme(panel.border = element_rect(size = 1, linetype = "solid", colour = "black")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.8, vjust = 0.6))

pC2 <- ggplot(data = data_plotC, aes(x = Sample, y = Number, fill = CellType)) +
  geom_bar(stat = "identity", width = 0.8, aes(group = CellType), position = "fill") +
  scale_fill_manual(values = celltype_colors) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(x = "", y = "Cell proportion") +
  theme(panel.grid = element_blank()) +
  labs(x = "", y = "Cell proportion") +
  scale_fill_manual(values = celltype_colors) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  theme(axis.text = element_text(size = 12, colour = "black")) +
  theme(axis.title.y = element_text(size = 12, colour = "black")) +
  theme(panel.border = element_rect(size = 1, linetype = "solid", colour = "black")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.8, vjust = 0.6))

aaa$group[aaa$score >= median(aaa$score)] = "High"
aaa$group[aaa$score < median(aaa$score)] = "Low"
table(aaa$group)
rownames(aaa) = aaa$id
pbmc = AddMetaData(pbmc, aaa, col.name = colnames(aaa))
pbmc <- SetIdent(pbmc, value = "group")

# Save the group information of the cells
df.group <- data.frame(umi = names(Idents(pbmc)), 
                       cluster = pbmc@active.ident, 
                       stringsAsFactors = F)

# Check what the result looks like
head(df.group)

# Because there are many genes with expression value of 0 in single-cell data, red warning messages may appear, but it's okay.

# Check the result
aaa[1, 1:5]
# First use a heatmap to display the results of all 50 genesets in the hallmark
ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
bbb = aaa[, 4:54]
bbb = bbb[, -51]
bbb = as.matrix(bbb)
bbb = t(bbb)

pdf(file = "19.group_segmented_hallmark_heatmap.pdf", width = 10, height = 8)
Heatmap(bbb,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(bbb),
                                             gp = gpar(fontsize = 8)))
dev.off()

# Select integers
num = floor(length(pbmc@meta.data$score) / 10)
ccc = arrange(pbmc@meta.data, score)
ccc = ccc[1:num, ]
ccc = rownames(ccc)
ddd = arrange(pbmc@meta.data, desc(score))
ddd = ddd[1:num, ]
ddd = rownames(ddd)
colnames(pbmc)
pbmc1 = pbmc[, c(ccc, ddd)]
table(pbmc1@meta.data$group)
df.group <- data.frame(umi = names(Idents(pbmc1)), 
                       cluster = pbmc1@active.ident, 
                       stringsAsFactors = F)

ha.t <- HeatmapAnnotation(Cluster = df.group$cluster)
bbb = bbb[, c(ccc, ddd)]

pdf(file = "19.group_top10_segmented_hallmark_heatmap.pdf", width = 10, height = 8)
Heatmap(bbb,
        show_column_names = F,
        cluster_rows = T,
        cluster_columns = T,
        top_annotation = ha.t,
        column_split = df.group$cluster,
        row_names_gp = gpar(fontsize = 8),
        row_names_max_width = max_text_width(rownames(bbb),
                                             gp = gpar(fontsize = 8)))
dev.off()