#Copy the code to each folder together, because different data sets have different processing parameters
rm(list = ls())
library(Seurat)
library(ggplot2)
library(cowplot)
library(Matrix)
library(dplyr)
library(ggsci)
library(scater)
library(SingleR)
library(hdf5r)
library(tidyverse)
library(RColorBrewer)
mydata= Read10X_h5("CRC_EMTAB8107_expression.h5")
#When the data was imported, cells with less than 200 genes measured in the dataset (min.features = 200) were included.
#Genes covered by less than 3 cells (min.cells = 3) have been filtered out.
pbmc <- CreateSeuratObject(counts = mydata, project = "EMTAB8107", min.cells = 3, min.features = 200)
pbmc <- SetIdent(pbmc, value = "EMTAB8107")
dim(pbmc)

#Modify the list before reading. Do not include special characters such as brackets.
phe=data.table::fread("CRC_EMTAB8107_CellMetainfo_table.tsv")
phe=as.data.frame(phe)
rownames(phe)=phe$Cell
pbmc=AddMetaData(pbmc, phe, col.name = colnames(phe))

#View data
dim(mydata)


#Data Preprocessing

sce.all=pbmc
#View data
dim(mydata)
#rm(list=ls())
###### step2:QCQuality Control ######

# sce.all=readRDS("../sce.all_raw.rds")
#Calculation of mitochondrial gene ratio
# The names of genes in humans and mice are slightly different
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13 mitochondrial genes
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#Calculation of ribosomal gene ratio
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#Calculate red blood cell gene ratio
rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
#Visualize the above ratios of cells
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, features = feats, pt.size = 0.01, ncol = 2) + 
  NoLegend()
p1
library(ggplot2) 
ggsave(filename="Vlnplot1.pdf",plot=p1)
feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all,  features = feats, pt.size = 0.01, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2	
ggsave(filename="Vlnplot2.pdf",plot=p2)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", pt.size = 0.5)
ggsave(filename="Scatterplot.pdf",plot=p3)
#Filter low-quality cells/genes based on the above indicators
#Filter indicator 1: cells with the least number of expressed genes & genes with the least number of expressed cells
selected_c <- WhichCells(sce.all, expression =  nFeature_RNA < 6000)
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 5]

sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
dim(sce.all) 
dim(sce.all.filt) 
#  As you can see, genes are mainly filtered, followed by cells

# par(mar = c(4, 8, 2, 1))
C=sce.all.filt@assays$RNA@counts
dim(C)
C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
# The matrix C here is a bit large, so you can consider sampling it.
C=C[,sample(1:ncol(C),1000)]
most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
pdf("TOP50_most_expressed_gene.pdf",width=14)
boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
        cex = 0.1, las = 1, 
        xlab = "% total count per cell", 
        col = (scales::hue_pal())(50)[50:1], 
        horizontal = TRUE)
dev.off()
rm(C)

#Filter indicator 2: Mitochondrial/ribosomal gene ratio (based on the violin graph above)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 5)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1)
length(selected_hb)
length(selected_ribo)
length(selected_mito)


sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)

table(sce.all.filt$Patient) 

#Visualize the filtered situation
feats <- c("nFeature_RNA", "nCount_RNA")
p1_filtered=VlnPlot(sce.all.filt, features = feats, pt.size = 0.1, ncol = 2) + 
  NoLegend()
ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2_filtered=VlnPlot(sce.all.filt,  features = feats, pt.size = 0.1, ncol = 3) + 
  NoLegend()
ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered)

#Filter indicator 3: Filter specific genes
# Filter MALAT1 Housekeeping Gene
sce.all.filt <- sce.all.filt[!grepl("MALAT1", rownames(sce.all.filt),ignore.case = T), ]
# Filter Mitocondrial Mitochondrial genes
sce.all.filt <- sce.all.filt[!grepl("^MT-", rownames(sce.all.filt),ignore.case = T), ]
# Of course, you can filter more

dim(sce.all.filt) 


pbmc=sce.all.filt
###harmony Remove batch effects
pbmc <- NormalizeData(pbmc, 
                      normalization.method = "LogNormalize",
                      scale.factor = 1e4) 
pbmc <- FindVariableFeatures(pbmc)
pbmc <- ScaleData(pbmc)
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

library(harmony)
seuratObj <- RunHarmony(pbmc, "Patient")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 
DimPlot(seuratObj,reduction = "umap",label=T ,group.by = "Patient") 
pbmc=seuratObj
pbmc <- FindNeighbors(pbmc, reduction = "harmony",
                      dims = 1:15) 

#Set different resolutions and observe the grouping effect (which one to choose?)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  pbmc=FindClusters(pbmc, #graph.name = "CCA_snn", 
                    resolution = res, algorithm = 1)
}
colnames(pbmc@meta.data)
apply(pbmc@meta.data[,grep("RNA_snn",colnames(pbmc@meta.data))],2,table)

p1_dim=plot_grid(ncol = 3, DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.01") + 
                   ggtitle("louvain_0.01"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.1") + 
                   ggtitle("louvain_0.1"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + 
                   ggtitle("louvain_0.2"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14)

p1_dim=plot_grid(ncol = 3, DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + 
                   ggtitle("louvain_0.8"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.1") + 
                   ggtitle("louvain_1"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + 
                   ggtitle("louvain_0.3"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18)

library(clustree)
p2_tree=clustree(pbmc@meta.data, prefix = "RNA_snn_res.")
p2_tree
ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")


#"SNHG6" %in% pbmc@assays$RNA@counts@Dimnames[[1]]
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot1

#TOP10 genes marked
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
p2=plot2
ggsave(print(p2),filename = "3.Top 10 highly variable genes.pdf",height = 6,width =8)

#Next, the analysis is performed at a resolution of 0.8
sel.clust = "RNA_snn_res.0.5"
pbmc <- SetIdent(pbmc, value = sel.clust)
table(pbmc@active.ident) 

DimPlot(pbmc, reduction = "umap",group.by = sel.clust)

#6.2 tSNE
pbmc <- RunTSNE(pbmc, dims = 1:15, check_duplicates = FALSE)
head(pbmc@reductions$tsne@cell.embeddings)
DimPlot(pbmc, reduction = "tsne")

plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE)
plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE)

p3=plot1 + plot2
ggsave(print(p3),filename = "4.umap_tsne Dimensionality reduction clustering.pdf",height = 6,width =9)
#It can be seen that the structures of the dimensionality reduction visualization of the two are consistent, and the UMAP method is relatively more compact.

saveRDS(pbmc, "pbmc_qc.rds")


####Cell identification
rm(list = ls())
pbmc=readRDS("pbmc_qc.rds")

library(ggplot2)
genes_to_check = c('CD3D', 'CD3E', 'CD8A',                          #T Cells
                   'CD19', 'CD79A', 'MS4A1' ,                       #B cells
                   'IGHG1', 'MZB1', 'SDC1',                         #Plasma cells 
                   'CD163', 'CD14',                                 #Mono.Macr
                   "FGFBP2", "FCG3RA", "CX3CR1",                    #NK Cells
                   'FGF7','MME',                                    #Fibroblasts
                   'PECAM1', 'VWF',                                 #Endothelial cells
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',"CD24" ,   #epi or tumor
                   "THBD","IL3RA" ,"ITGAX" ,"CD1C"  ,                #DC cells                      
                   "CPA3"
                   )
library(stringr)
p <- DotPlot(pbmc, features = genes_to_check,
             assay='RNA'  )  + coord_flip()

p
# ggsave(plot=p, filename="check_marker_by_seurat_cluster.pdf")
# In addition, it is recommended to use RidgePlot(), CellScatter() and DotPlot() to view the data set.
# 
# This can also serve as some reference for subsequent manual annotation.


#Cell annotation using singleR
library(SingleR)
library(celldex)

DefaultAssay(pbmc) <- "RNA"
pbmc
DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE)

immune_singler <- GetAssayData(pbmc,slot = "data") # Extracting expression matrix

clusters <- pbmc$RNA_snn_res.0.5 # Get the current cell group name
table(pbmc$RNA_snn_res.0.5)
load("../../HumanPrimaryCellAtlasData.Rdata")
pred.immune <- SingleR(test = immune_singler, #Your expression matrix
                       ref = HumanPrimaryCellAtlasData, # Your annotation file
                       labels = HumanPrimaryCellAtlasData$label.fine,
                       #Because the samples are mainly immune cells (rather than all cells), they are set to label.fine
                       #If you want a more detailed classification, or all of them are immune cells, you can set it to label.fine
                       #Too detailed may not be accurate
                       method = "cluster", 
                       clusters = clusters)
table(pred.immune$labels)
plotScoreHeatmap(pred.immune, clusters = pred.immune$labels)

new.clusterID <- pred.immune$labels
names(new.clusterID) <- levels(pbmc)
new.clusterID



new.clusterID[new.clusterID=="Epithelial_cells:bladder"]="Malignant"
new.clusterID[new.clusterID=="T_cell:CD4+_central_memory"]="CD4_T"
new.clusterID[new.clusterID=="B_cell:Plasma_cell"]="Plasma"
new.clusterID[new.clusterID=="Smooth_muscle_cells:bronchial"]="Fibroblast"
new.clusterID[new.clusterID=="T_cell:CD8+"]="CD8_T"
new.clusterID[new.clusterID=="Monocyte:leukotriene_D4"]="Mono.Macro"
new.clusterID[new.clusterID=="B_cell:Memory"]="B_cell"
new.clusterID[new.clusterID=="Endothelial_cells:lymphatic:TNFa_48h"]="Endothelial_cell"
new.clusterID[new.clusterID=="Tissue_stem_cells:BM_MSC:BMP2"]="Fibroblast"
new.clusterID[new.clusterID=="Fibroblasts:breast"]="Fibroblast"
new.clusterID[new.clusterID=="CMP"]="Mast"
new.clusterID[new.clusterID=="DC:monocyte-derived:mature"]="DC_cell"
new.clusterID[new.clusterID=="Smooth_muscle_cells:bronchial"]="Fibroblast"
new.clusterID[new.clusterID=="T_cell:CD4+_effector_memory"]="CD4_T"
new.clusterID[new.clusterID=="Neurons:Schwann_cell"]="Malignant"


new.clusterID

#This data is directly used singleR
pbmc <- RenameIdents(pbmc,new.clusterID)

DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE)

pbmc@meta.data$celltype=pbmc@active.ident

table(pbmc@meta.data$celltype)



# ###### step6: Cell type annotation (based on marker gene) ######
# celltype=data.frame(ClusterID=0:27,
#                     celltype='na')
# celltype[celltype$ClusterID %in% c( 0 , 8, 23, 24 ,26, 27 ),2]='B cells'
# celltype[celltype$ClusterID %in% c(1, 3, 4 ,5 ,9 ,11 ,12, 19,22 ,25,20),2]='Malignant'
# celltype[celltype$ClusterID %in% c(2 ,6 ,7 ,17),2]='T cells'
# celltype[celltype$ClusterID %in% c( 10,14),2]='Monocyte.Macrophage'
# celltype[celltype$ClusterID %in% c( 13,15),2]='Fibroblasts'
# celltype[celltype$ClusterID %in% c(16),2]='Endothelial cells'
# celltype[celltype$ClusterID %in% c(18 ),2]='NK cells'
# celltype[celltype$ClusterID %in% c( 21),2]='DC'
# # 
# # 
# head(celltype)
# celltype
# table(celltype$celltype)
# pbmc@meta.data$celltype = "NA"
# for(i in 1:nrow(celltype)){
#   pbmc@meta.data[which(pbmc@meta.data$RNA_snn_res.0.5 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
# table(pbmc@meta.data$celltype)


# genes_to_check = c('CD3D', 'CD3E', 'CD8A',                   #T Cells
#                    'CD19', 'CD79A', 'MS4A1' ,                #B cells
#                    'IGHG1', 'MZB1', 'SDC1',                  #Plasma cells
#                    'CD163', 'CD14',                          #Monocytes and macrophages
#                    "FGFBP2", "FCG3RA", "CX3CR1",             #NK Cells
#                    "THBD","IL3RA"                            #DC cells
# )
# library(stringr)
# p_all_markers <- DotPlot(pbmc, features = unique(genes_to_check),
#                          group.by = "RNA_snn_res.0.5",assay='RNA'  )  + coord_flip()
# 
# p_all_markers
# ggsave(plot=p_all_markers,
#        filename="check_all_marker_by_RNA_snn_res.0.5.pdf")
# DimPlot(pbmc, reduction = "umap", label = T)
# DimPlot(pbmc, reduction = "umap", group.by = "celltype",label = T)
# ggsave('umap_by_celltype.pdf')


tab.1=table(pbmc@meta.data$celltype,pbmc@meta.data$RNA_snn_res.0.5) 
library(gplots)
tab.1
pro='cluster'
pdf(file = paste0("4." , pro,'  celltype VS  res.0.5.pdf'),width = 14)
balloonplot(tab.1, main =" celltype VS  res.0.5 ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off()


library(patchwork)
th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5))
p_all_markers=DotPlot(pbmc, features = unique(genes_to_check),
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+ th
p_umap=DimPlot(pbmc, reduction = "umap", group.by = "celltype",label = T) 
p_all_markers+p_umap


DimPlot(pbmc, reduction = "umap",
        group.by = "celltype",
        label = T) 
ggsave('4.umap_by_celltype_split_orig.ident.pdf',width = 8)

phe=pbmc@meta.data
save(phe,file = 'phe_by_markers.Rdata')


library(dplyr)
library(Seurat)
library(ComplexHeatmap)
library(GSVA)
library(GSEABase)
library(limma)
library(ggplot2)
Sys.setenv(LANGUAGE = "en") #Display English error message
options(stringsAsFactors = FALSE) #Disable conversion of chr to factor

# Calculate differentially expressed genes in each cell
library(magrittr)
#Time consuming steps
#Adjust both up and down

table(pbmc@meta.data$celltype)
table(pbmc@active.ident)
length(pbmc@meta.data$celltype)
length(pbmc@active.ident)

#celltype Double quotes are required!!!
pbmc <- SetIdent(pbmc, value = "celltype")
table(pbmc@active.ident)
library(future)
# check the current active plan
plan()
plan("multiprocess", workers = 4)
plan()
start = Sys.time()
pbmc.markers1 <- FindAllMarkers(pbmc,only.pos = FALSE,  min.pct = 0.25, logfc.threshold = 0.25)
end = Sys.time() 
dur = end-start 
dur

pbmc.markers=dplyr::filter(pbmc.markers1,avg_log2FC > 0.25)


library(scRNAtoolVis)


#cluster.order Set the subgroup order:
# ajust cluster orders

p=jjVolcano(diffData = pbmc.markers1,
            size  = 3.5,
            fontface = 'italic',
            legend.position = c(0.8,0.2),
            flip = T)
ggsave(filename = "5.Marker_gene_pointplot.pdf", plot = p, width = 10, height = 6)


p=markerVocalno(markers = pbmc.markers1,
                topn = 5,log2FC=0.25,
                labelCol = ggsci::pal_npg()(10))

ggsave(filename = "5.Marker_gene_pointplot2.pdf", plot = p, width = 12, height = 10)


# Save to file
write.csv(pbmc.markers1, "output_pbmc.markers.csv", quote = F)


#?FindAllMarkers See more parameters.
# cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, 
#                                 test.use = "roc",  # Inspection method
#                                 only.pos = TRUE) #Only output genes with pos


#The optional parameters of test.use are: wilcox (default), bimod, roc, t, negbinom, poisson, LR, MAST, DESeq2.
# 7.4 Visualization
# 
# You can use the built-in functions of seurat to view the basic information of key genes:
# 
# 1）VlnPlot: Gene expression probability distribution
# VlnPlot(pbmc, features = c("GZMK", "CRTAM"))
# # you can plot raw counts as well
# VlnPlot(pbmc, features = c("GZMK", "CRTAM"), slot = "counts", log = TRUE)
# 
# 
# #2）FeaturePlot：Display the expression of key genes in tSNE or PCA plots
# FeaturePlot(pbmc, features = c("CCR7", "GPR183", "MAL", "NOSIP", "GZMK", "CRTAM", "FGFBP2", "GNLY", 
#                                "GZMK","CXCR3"))

#3）DoHeatmap()View expression heatmaps of key gene cells and clusters
top10 <- pbmc.markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC)
p4=DoHeatmap(pbmc, features = top10$gene) + NoLegend()
ggsave(print(p4),filename = "6.top10Gene heat map.pdf",height = 6,width =8)



save(pbmc,file = "CRC_EMTAB8107.RData")