# Copy this code to each folder as different datasets have different processing parameters
rm(list = ls())
library(Seurat)
library(ggplot2)
library(cowplot)
library(Matrix)
library(dplyr)
library(ggsci)
library(scater)
library(SingleR)
library(hdf5r)
library(tidyverse)
library(RColorBrewer)
mydata = Read10X_h5("CRC_GSE166555_expression.h5")
# During data import, cells with fewer than 200 genes detected (min.features = 200),
# and genes covered by fewer than 3 cells (min.cells = 3), are already filtered out.
pbmc <- CreateSeuratObject(counts = mydata, project = "GSE166555", min.cells = 3, min.features = 200)
pbmc <- SetIdent(pbmc, value = "GSE166555")
dim(pbmc)

# Modify column names before reading, special characters like parentheses are not allowed
phe = data.table::fread("CRC_GSE166555_CellMetainfo_table.tsv")
phe = as.data.frame(phe)
rownames(phe) = phe$Cell
pbmc = AddMetaData(pbmc, phe, col.name = colnames(phe))

# View data
dim(mydata)

# Data Preprocessing
# 2.1 QC
# Generally use the following three standards, also refer to commonly used
# Number of unique genes detected in each cell
# Low-quality cells or empty droplets usually contain few genes
# Cell doublets or multiplets may show abnormally high gene counts
# Total number of molecules detected in each cell
# Mitochondrial gene content proportion
# Low-quality or dying cells have a high mitochondrial gene content
# Use PercentageFeatureSet() to calculate mitochondrial QC ratio
# Genes starting with MT- are mitochondrial genes

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Show QC metrics for the first 5 cells
head(pbmc@meta.data, 5)

# Note: PercentageFeatureSet function can calculate the percentage of a specified gene subset count in each CELL.

# Violin plot: View number of genes, number of UMIs, mitochondrial gene proportion
# Visualize QC metrics as a violin plot

p = VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
p
ggsave(print(p), filename = "1.nFeature_nCount_percent.mt.pdf", height = 5, width = 6)
# nFeature_RNA represents the number of genes detected in each cell, nCount represents the sum of expression levels of all genes detected in each cell,
# percent.mt represents the proportion of detected mitochondrial genes.
# Correlation between gene count, mitochondrial gene proportion, and UMI count
# Ratio of gene number detected per UMI (read count = molecular id)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(pbmc, feature1 = "percent.mt", feature2 = "nFeature_RNA")
p = plot_grid(plot1, plot2, plot3, ncol = 3)
ggsave(filename = "2.scatter_plot.pdf", plot = p, width = 10, height = 4)

# Filtering
# Filter out cells that have unique gene counts over 2500 or below 200
# Filter out cells that have more than 5% mitochondrial genes
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

# Normalization
# Normalize data using the LogNormalize method
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

# Identification of highly variable genes
# FindVariableFeatures() function identifies genes that are highly variable in the dataset
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Visualization of variable features
# Top 10 genes are plotted
top10 <- head(VariableFeatures(pbmc), 10)
p <- DimPlot(pbmc, reduction = "pca")
ggsave(print(p), filename = "3.top10_variable_genes.pdf", height = 6, width = 8)

# Scaling data
# ScaleData function scales and centers the data for each gene
pbmc <- ScaleData(pbmc, features = rownames(pbmc))

# Run PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# Visualization of PCA
# Examine and visualize PCA results
p <- DimPlot(pbmc, reduction = "pca")
ggsave(print(p), filename = "4.PCA.pdf", height = 6, width = 8)

# Find neighbors
# This step builds a KNN graph based on the PCA results
pbmc <- FindNeighbors(pbmc, dims = 1:10)

# Find clusters
# The FindClusters function clusters cells based on their PCA scores
pbmc <- FindClusters(pbmc, resolution = 0.5)

# Visualization of clusters
# tSNE plot to visualize the clusters
p <- DimPlot(pbmc, reduction = "tsne")
ggsave(print(p), filename = "5.tSNE_clusters.pdf", height = 6, width = 8)

# Differential expression analysis
# The FindAllMarkers function performs differential expression analysis between clusters
# It is important to check the current active plan
plan()
plan("multiprocess", workers = 8)
plan()
start = Sys.time()
pbmc.markers1 <- FindAllMarkers(pbmc, only.pos = FALSE, min.pct = 0.25, logfc.threshold = 0.25)
end = Sys.time()
dur = end - start
dur

pbmc.markers = dplyr::filter(pbmc.markers1, avg_log2FC > 0.25)

library(scRNAtoolVis)

# Adjust cluster orders
p = jjVolcano(diffData = pbmc.markers1,
              size = 3.5,
              fontface = 'italic',
              legend.position = c(0.8, 0.2),
              flip = T)
ggsave(filename = "6.Marker_gene_volcano_plot.pdf", plot = p, width = 10, height = 6)

p = markerVolcano(markers = pbmc.markers1,
                  topn = 5, log2FC = 0.25,
                  labelCol = ggsci::pal_npg()(10))
ggsave(filename = "7.Marker_gene_volcano_plot2.pdf", plot = p, width = 12, height = 10)

# Save to file
write.csv(pbmc.markers1, "output_pbmc.markers.csv", quote = F)

# For more parameters, see ?FindAllMarkers.
# cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25,
#                                 test.use = "roc",  # Test method
#                                 only.pos = TRUE) # Only output positive genes

# Optional parameters include: wilcox (default), bimod, roc, t, negbinom, poisson, LR, MAST, DESeq2.

# Visualization
# 
# Seurat's built-in functions to view key gene basics:
# 
# 1) VlnPlot: Gene expression probability distribution
# VlnPlot(pbmc, features = c("GZMK", "CRTAM"))
# # You can also plot raw counts
# VlnPlot(pbmc, features = c("GZMK", "CRTAM"), slot =
# "counts", log = TRUE)

# 2) FeaturePlot: Display expression of key genes on tSNE or PCA plots
# FeaturePlot(pbmc, features = c("CCR7", "GPR183", "MAL", "NOSIP", "GZMK", "CRTAM", "FGFBP2", "GNLY", 
#                                "GZMK", "CXCR3"))

# 3) DoHeatmap: View heatmap of key gene expression in cells and clusters
# top10 <- pbmc.markers %>%
#   group_by(cluster) %>%
#   top_n(n = 10, wt = avg_log2FC)
# p4 = DoHeatmap(pbmc, features = top10$gene) + NoLegend()
# ggsave(print(p4), filename = "8.top10_genes_heatmap.pdf", height = 6, width = 8)

# Save the final object
save(pbmc, file = "CRC_GSE166555.RData")

# The end of the script