#
# 
# #step1:
# rm(list=ls())
# options(stringsAsFactors=F)
# getwd()
# setwd("~/1")
# library(Seurat)
# library(ggplot2)
# library(clustree)
# library(cowplot)
# library(data.table)
# library(dplyr)
# 


library(tidyverse)
library(Seurat)
library(harmony)
library(scDblFinder)
library(SingleR)
library(HGNChelper)
library(AUCell)
library(monocle)
library(igraph)
library(CellChat)
library(BiocParallel)
library(SingleCellExperiment)






## Seurat Quality Control #####
# # 
# output_dir <- "GSE270667_RAW"
# if (!dir.exists(output_dir)) {
#   dir.create(output_dir)
# }
# # 
# untar("GSE270667_RAW.tar", exdir = output_dir)
# 
# 
# rm(list = ls())
# 
# dir='GSE270667_RAW/'
# fs=list.files('GSE270667_RAW/','^GSM')
# fs
# library(tidyverse)
# samples=str_split(fs,'_',simplify=T)[,1]
# 
# if(F){
#   lapply(unique(samples),function(x){
#     #x=unique(samples)[1]
#     y=fs[grepl(x,fs)]
#     folder=paste0("GSE270667_RAW/",paste(str_split(y[1],'_',simplify=T)[,1:2],collapse="_"))
#     dir.create(folder,recursive=T)
#     #
#     file.rename(paste0("GSE270667_RAW/",y[1]),file.path(folder,"barcodes.tsv.gz"))
#     #
#     file.rename(paste0("GSE270667_RAW/",y[2]),file.path(folder,"features.tsv.gz"))
#     file.rename(paste0("GSE270667_RAW/",y[3]),file.path(folder,"matrix.mtx.gz"))
#   })
# }
# 
# dir='~/1/GSE270667_RAW/GSE270667_RAW'
# samples=list.files(dir)
# samples
# sceList=lapply(samples,function(pro){
#   #pro=samples[1]
#   print(pro)
#   tmp=Read10X(file.path(dir,pro))
#   if(length(tmp)==2){
#     ct=tmp[[1]]
#   }else{ct=tmp}
#   sce=CreateSeuratObject(counts=ct,
#                          project=pro,
#                          min.cells=5,
#                          min.features=300)
#   return(sce)
# })
# 
# 
# do.call(rbind,lapply(sceList,dim))
# sce.all=merge(x=sceList[[1]],
#               y=sceList[-1],
#               add.cell.ids=samples)
# names(sce.all@assays$RNA@layers)
# #>[1] "counts.GSM5699777_TD1" "counts.GSM5699778_TD2" "counts.GSM5699779_TD3"
# #>[4] "counts.GSM5699780_TD4" "counts.GSM5699781_TD5" "counts.GSM5699782_TD6"
# #>[7] "counts.GSM5699783_TD7" "counts.GSM5699784_TD8" "counts.GSM5699785_TD9"
# 
# sce.all[["RNA"]]$counts
# # Alternate accessor function with the same result
# LayerData(sce.all,assay="RNA",layer="counts")
# 
# 
# sce.all
# sce.all<-JoinLayers(sce.all)
# sce.all
# dim(sce.all[["RNA"]]$counts)
# 
# 
# as.data.frame(sce.all@assays$RNA$counts[1:10,1:2])
# head(sce.all@meta.data,10)
# table(sce.all$orig.ident)
# length(sce.all$orig.ident)
# # fivenum(sce.all$nFeature_RNA)
# # table(sce.all$nFeature_RNA>800) 
# # sce.all=sce.all[,sce.all$nFeature_RNA>800]
# # sce.all
# 
# library(stringr)
# phe=sce.all@meta.data
# table(phe$orig.ident)
# phe$patient=str_split(phe$orig.ident,'[_]',simplify=T)[,2]
# table(phe$patient)
# 
# phe$sample=phe$patient
# table(phe$sample)
# phe$sample=gsub("TD5|TD7|TD8","AIS",phe$sample)
# phe$sample=gsub("TD3|TD4|TD6","MIA",phe$sample)
# phe$sample=gsub("TD1|TD2|TD9","IAC",phe$sample)
# table(phe$sample)
# sce.all@meta.data=phe
# 
# #step2: QC
# sp='human'
# dir.create("~/1/GSE270667_RAW/GSE270667_RAW/1-QC/1-QC")
# setwd("~/1/GSE270667_RAW/GSE270667_RAW/1-QC/1-QC")
# #
# source('../GSE270667_RAW/1-QC/1-QC/qc.R')
# sce.all.filt=basic_qc(sce.all)
# print(dim(sce.all))
# print(dim(sce.all.filt))
# ##
# setwd('../')
# getwd()
# fivenum(sce.all.filt$percent_ribo)
# table(sce.all.filt$nFeature_RNA>5)

########## Data quality control and doublet cell processing ##########
rm(list = ls())
sc_filtered = readRDS('sce.all_int.rds')
sc_filtered$log10GenesPerUMI = log10(sc_filtered$nFeature_RNA) / log10(sc_filtered$nCount_RNA) # Calculate complexity
sc_filtered$mitoRatio = PercentageFeatureSet( 
  object = sc_filtered,
  pattern = '^MT-'
) / 100
sc_filtered = subset( 
  sc_filtered,
  subset = nCount_RNA > 500 &
    nCount_RNA < 6000 &
    nFeature_RNA > 200 &
    log10GenesPerUMI > 0.8 &
    mitoRatio < 0.2
)


counts = GetAssayData(object = sc_filtered,
                      slot = 'counts')

filtered_counts = counts[rowSums((counts) > 0) >= 10,] 
sc_filtered = CreateSeuratObject(
  filtered_counts,
  meta.data = sc_filtered@meta.data
)
VlnPlot( 
  sc_filtered,
  features = c('nCount_RNA','nFeature_RNA','log10GenesPerUMI','mitoRatio'),
  pt.size = 0,
  ncol = 4,
  group.by = 'sample'
)
sc_filtered = NormalizeData(sc_filtered) 
sc_filtered = ScaleData(
  sc_filtered,
  features = rownames(sc_filtered)
)

# Save data
saveRDS(
  sc_filtered,
  file = 'sc_Rds/sc_filtered.Rds'
)

## Cell cycle processing ####
rm(list = ls())
sc_dblf = readRDS('sc_Rds/sc_filtered.Rds')
sce = as.SingleCellExperiment(sc_dblf) 
sce = scDblFinder( 
  sce,
  samples = 'sample'
)
table(sce$scDblFinder.class)
sc_dblf = as.Seurat(sce) 
sc_dblf = subset( 
  sc_dblf,
  subset = scDblFinder.class == 'singlet'
)
saveRDS(
  sc_dblf,
  file = 'sc_Rds/sc_dblf.Rds'
)


rm(list = ls())
sc_cc = readRDS('sc_Rds/sc_dblf.Rds')
s.genes = cc.genes$s.genes 
s.genes = CaseMatch(
  search = s.genes,
  match = rownames(sc_cc)
)
g2m.genes = cc.genes$g2m.genes 
g2m.genes = CaseMatch(
  search = g2m.genes,
  match = rownames(sc_cc)
)
sc_cc = CellCycleScoring( 
  sc_cc,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = T
)
sc_cc$CC.Difference = sc_cc$S.Score - sc_cc$G2M.Score 
sc_cc = SCTransform( 
  sc_cc,
  vars.to.regress = c('mitoRatio','S.Score','G2M.Score','CC.Difference'),
  verbose = T
)
sc_cc = RunPCA(
  sc_cc,
  seed.use = 2024,
  features = rownames(sc_cc)
)
sc_cc = RunTSNE(
  sc_cc,
  seed.use = 2024,
  features = rownames(sc_cc)
)
DimPlot(
  sc_cc,
  reduction = 'tsne',
  group.by = 'Phase'
)
saveRDS(
  sc_cc,
  file = 'sc_Rds/sc_cc.Rds'
)

# Cell clustering and annotation ########
rm(list = ls())
sc_data = readRDS('sc_Rds/sc_cc.Rds')
LabelPoints(
  VariableFeaturePlot(sc_data), 
  points = head(VariableFeatures(sc_data),10), 
  repel = T
)
Idents(sc_data) = 'sample'
FeatureScatter( 
  sc_data,
  feature1 = 'nCount_RNA',
  feature2 = 'nFeature_RNA',
  pt.size = 0.1
)
#library(harmony)
sc_data = RunHarmony( 
  sc_data,
  group.by.vars = 'sample',
  plot_convergence = T
)
sc_data = RunPCA(
  sc_data,
  seed.use = 2024,
  features = VariableFeatures(sc_data)
)
ElbowPlot( 
  sc_data,
  ndims = 50
)
sc_data = FindNeighbors( 
  sc_data,
  reduction = 'pca',
  dims = 1:30,
  verbose = T
)
sc_data = FindClusters(
  sc_data,
  verbose = T,
  resolution = c(0.6,0.8,1.0,1.2,1.4) 
)
sc_data = RunTSNE(
  sc_data,
  seed.use = 2024,
  features = VariableFeatures(sc_data)
)
DimPlot(
  sc_data,
  reduction = 'tsne',
  group.by = 'SCT_snn_res.0.8',
  label = T,
  repel = T
)
DimPlot(
  sc_data,
  reduction = 'tsne',
  group.by = 'sample'
)
# hpca = celldex::HumanPrimaryCellAtlasData()
# testdata = GetAssayData(
#   sc_data,
#   slot = 'data'
# )
# cluster = sc_data$SCT_snn_res.0.8
# cellpred = SingleR(
#   test = testdata,
#   ref = hpca,
#   labels = hpca$label.main,
#   de.method = 'classic',
#   clusters = cluster
# )
# celltype = data.frame(
#   clusterID = rownames(cellpred),
#   cell = cellpred$labels
# )
# celltype$cell = gsub('_',' ',celltype$cell)
# sc_data = subset(
#   sc_data,
#   idents = celltype$clusterID
# )
# sc_data$cell_anno = left_join(
#   sc_data@meta.data,
#   celltype,
#   by = c('SCT_snn_res.0.8' = 'clusterID')
# )$cell
# Load gene set preparation function
source('input/gene_sets_prepare.R') 
# Load cell annotation function
source('input/sctype_score_.R') 
# Prepare gene set using default marker database
db_ = 'input/ScTypeDB_full.xlsx' 
tissue = c('Lung','Immune system') 

gs_list = gene_sets_prepare(db_,tissue) 
es_max = sctype_score( 
  scRNAseqData = sc_data[['SCT']]@scale.data,
  scaled = T, 
  gs = gs_list$gs_positive, 
  gs2 = gs_list$gs_negative 
) 
cL_results = do.call(
  rbind, 
  lapply(
    unique(sc_data$SCT_snn_res.1.4), 
    function(cl){ 
      es_max_cl = sort( 
        rowSums(es_max[,rownames(sc_data@meta.data[sc_data$SCT_snn_res.1.4 == cl,])]), # Calculate row sums for each cluster
        decreasing = T 
      )
      head( 
        data.frame( 
          cluster = cl,
          type = names(es_max_cl),
          scores = es_max_cl,
          ncells = sum(sc_data$SCT_snn_res.1.4 == cl)
        ),
        10 
      )
    }
  )
)
sctype_scores = cL_results %>% 
  group_by(cluster) %>% 
  top_n(n = 1,wt = scores)
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells / 4] = 'Unknown' # Set low scoring cell types to 'Unknown'
sc_data$cell_anno = left_join(
  sc_data@meta.data,
  sctype_scores[,c('cluster','type')],
  by = c('SCT_snn_res.0.8' = 'cluster')
)$type
DimPlot(
  sc_data,
  reduction = 'tsne',
  group.by = 'cell_anno',
  label = T,
  repel = T
)
# sc_data = RunUMAP(
#   sc_data,
#   reduction = 'pca',
#   dims = 1:30
# )
# DimPlot(
#   sc_data,
#   reduction = 'umap',
#   group.by = 'cell_anno',
#   label = T,
#   repel = T
# )
ggplot(
  sc_data@meta.data,
  aes(
    factor(
      cell_anno,
      levels = table(cell_anno) %>% sort(decreasing = T) %>% names()
    ),
    fill = cell_anno
  )
) +
  geom_bar(
    stat = 'count'
  ) +
  geom_text(
    data.frame(
      cell_anno = table(sc_data$cell_anno) %>% sort(decreasing = T) %>% names(),
      n = table(sc_data$cell_anno) %>% sort(decreasing = T) %>% as.numeric()
    ),
    mapping = aes(
      cell_anno,
      n,
      label = n
    )
  ) +
  coord_flip() +
  theme_bw() +
  labs(
    x = element_blank(),
    y = element_blank()
  ) +
  guides(
    fill = 'none'
  )
ggplot(
  sc_data@meta.data,
  aes(
    sample,
    fill = cell_anno
  )
) +
  geom_bar(
    position = 'fill'
  ) +
  coord_flip() +
  theme_bw() +
  labs(
    x = element_blank(),
    y = element_blank()
  )
saveRDS(
  sc_data,
  file = 'sc_Rds/sc_data.Rds'
)

# Single cell differential analysis ##########
rm(list = ls())
sc_data = readRDS('sc_Rds/sc_data.Rds')
Idents(sc_data) = 'cell_anno'
DefaultAssay(sc_data) = 'RNA'
sc_marker = FindAllMarkers( 
  sc_data,
  min.pct = 0.1,
  logfc.threshold = 0.5
)
sc_marker = sc_marker %>% 
  filter(
    p_val_adj < 0.05,
    avg_log2FC > 0.5
  )
marker1 = sc_marker %>% 
  group_by(cluster) %>% 
  top_n(n = 1,wt = avg_log2FC)
marker5 = sc_marker %>% 
  group_by(cluster) %>% 
  top_n(n = 5,wt = avg_log2FC)
VlnPlot( 
  sc_data,
  features = marker1$gene,
  pt.size = 0
)
FeaturePlot( 
  features = marker1$gene
)
DotPlot( 
  sc_data,
  features = marker5$gene %>% unique
) +
  theme(
    axis.text.x = element_text(
      angle = 45,
      hjust = 1
    )
  )
saveRDS(
  sc_marker,
  file = 'sc_Rds/sc_marker.Rds'
)


rm(list = ls())
sc_marker = readRDS('sc_Rds/sc_marker.Rds')
sc_data = readRDS('sc_Rds/sc_data.Rds')
PRGs = read.table('input/PRGs.txt',header = T)$PRGs 
PR_marker = intersect(sc_marker$gene,PRGs) 

heat = FetchData( 
  sc_data,
  vars = PR_marker
) %>% 
  t() %>% 
  as.data.frame()
annotation_col = data.frame( 
  cell_anno = sc_data$cell_anno
)
annotation_col = annotation_col %>% 
  arrange(cell_anno)
heat = heat[,rownames(annotation_col)]
pheatmap::pheatmap( 
  heat,
  scale = 'row',
  color = colorRampPalette(c('blue','white','red'))(50),
  cluster_cols = F,
  annotation_col = annotation_col,
  labels_col = '',
  border_color = NA
)

saveRDS(
  PR_marker,
  file = 'sc_Rds/PR_marker.Rds'
)

##### AUCell #####
rm(list = ls())
PR_marker = readRDS('sc_Rds/PR_marker.Rds')
sc_data_AUcell = readRDS('sc_Rds/sc_data.Rds')
Idents(sc_data_AUcell) = 'cell_anno'
PR_marker = data.frame( 
  gs = 'PR_marker',
  gene = PR_marker
)
PR_marker = split( 
  PR_marker$gene,
  PR_marker$gs
)
expr = FetchData( 
  sc_data_AUcell,
  vars = rownames(sc_data_AUcell)
) %>% 
  t()
cell_ranking = AUCell_buildRankings(expr) 
cell_AUC = AUCell_calcAUC( 
  PR_marker,
  cell_ranking
)
aucs = getAUC(cell_AUC) %>% 
  t() %>% 
  as.data.frame()
sc_data_AUcell$PR_aucs = aucs$PR_marker
FeaturePlot( 
  sc_data_AUcell,
  features = 'PR_aucs'
)
VlnPlot( 
  sc_data_AUcell,
  features = 'PR_aucs',
  pt.size = 0
)
sc_data_AUcell$PR_group = ifelse( 
  sc_data_AUcell$PR_aucs >= median(sc_data_AUcell$PR_aucs),
  'high_pyroptosis',
  'low_pyroptosis'
)
DimPlot(
  sc_data_AUcell,
  reduction = 'tsne',
  group.by = 'PR_group'
)
Idents(sc_data_AUcell) = 'PR_group'
DefaultAssay(sc_data_AUcell) = 'RNA'
sc_pyroptosis = FindAllMarkers( 
  sc_data_AUcell,
  min.pct = 0.1,
  logfc.threshold = 0.1
)
sc_pyroptosis_marker = sc_pyroptosis$gene %>% unique 
DEGs = read.table('input/DEGs.txt',header = T)$DEGs 
PRDEG_marker = intersect(sc_pyroptosis_marker,DEGs) 
saveRDS(
  sc_data_AUcell,
  file = 'sc_Rds/sc_data_AUCell.Rds'
)
saveRDS(
  sc_pyroptosis,
  file = 'sc_Rds/sc_pyroptosis.Rds'
)




########### Pseudotime trajectory analysis ##########
rm(list = ls())
sc_data_pseudo = readRDS('sc_Rds/sc_data.Rds')
mat_expr = as(
  FetchData( 
    sc_data_pseudo,
    vars = rownames(sc_data_pseudo)
  ) %>% 
    t(),
  'sparseMatrix'
)
pd = new( 
  'AnnotatedDataFrame',
  data = sc_data_pseudo@meta.data
)
fd = new( 
  'AnnotatedDataFrame',
  data = data.frame(
    gene_short_name = rownames(sc_data_pseudo),
    row.names = rownames(sc_data_pseudo)
  )
)
cds = newCellDataSet( 
  mat_expr,
  phenoData = pd,
  featureData = fd,
  lowerDetectionLimit = 0.5,
  expressionFamily = negbinomial.size()
)

cds = estimateSizeFactors(cds) 
cds = estimateDispersions(cds) 
cds = detectGenes( 
  cds,
  min_expr = 0.1
)
head(fData(cds))
disp_table = dispersionTable(cds) 
unsup_clustering_genes = subset( 
  disp_table,
  mean_expression >= 0.1 & dispersion_empirical >= 0.5 * dispersion_fit
)
cds = setOrderingFilter( 
  cds,
  unsup_clustering_genes$gene_id
)



library(monocle)

set.seed(123)

n <- ncol(cds)

sample_size <- ceiling(n / 5)

cells_to_sample <- sample(n, sample_size)

cds_sample <- cds[, cells_to_sample]

print(head(cds_sample))



diff_test_res = differentialGeneTest(
  cds_sample,
  fullModelFormulaStr = '~cell_anno'
)
load('input/diff_test_res.Rda')
ordering_genes = row.names( 
  subset(
    diff_test_res,
    qval < 0.01
  )
)
cds = setOrderingFilter( 
  cds,
  ordering_genes
)
cds = reduceDimension( 
  cds,
  max_components = 2,
  reduction_method = 'DDRTree'
)
source('input/orderCells.R')
cds = orderCells(cds) 
#cds = readRDS('sc_Rds/cds.Rds')
saveRDS(
  cds,
  file = 'sc_Rds/cds.Rds'
)
plot_cell_trajectory(
  cds,
  color_by = 'cell_anno'
)
plot_cell_trajectory(
  cds,
  color_by = 'Pseudotime'
)
plot_cell_trajectory(
  cds,
  color_by = 'State'
)
plot_cell_trajectory(
  cds,
  color_by = 'cell_anno'
) +
  facet_wrap(
    ~ cell_anno,
    nrow = 4
  )
PR_marker = readRDS('sc_Rds/PR_marker.Rds')
plot_pseudotime_heatmap(
  cds[PR_marker,],
  num_clusters = 3,
  show_rownames = T,
  return_heatmap = T,
  cores = 1
)

########### Cell communication analysis ######
rm(list = ls())
sc_data_cellchat = readRDS('sc_Rds/sc_data.Rds')
data_input = GetAssayData( 
  sc_data_cellchat,
  assay = 'RNA',
  slot = 'data'
)
meta = data.frame( 
  row.names = rownames(sc_data_cellchat@meta.data),
  cell_anno = sc_data_cellchat$cell_anno
)
cellchat = createCellChat( 
  object = data_input,
  meta = meta,
  group.by = 'cell_anno'
)
cellchat = addMeta( 
  cellchat,
  meta = meta
)
cellchat = setIdent( 
  cellchat,
  ident.use = 'cell_anno'
)
cells = levels(cellchat@idents)
cellchat_db = CellChatDB.human
cellchat_db_use = cellchat_db
cellchat@DB = cellchat_db_use
cellchat = subsetData(cellchat)
future::plan('default')
cellchat = identifyOverExpressedGenes(cellchat) 
cellchat = identifyOverExpressedInteractions(cellchat) 
cellchat = projectData(cellchat,PPI.human) 
cellchat = computeCommunProb( 
  cellchat,
  raw.use = T
)
cellchat = filterCommunication( 
  cellchat,
  min.cells = 10
)
cellchat = computeCommunProbPathway(cellchat) 
cellchat = aggregateNet(cellchat) 
group_size = as.numeric(table(cellchat@idents))
# netVisual_circle( 
#   cellchat@net$count,
#   vertex.weight = group_size,
#   weight.scale = T,
#   label.edge = F,
#   title.name = 'Number of Interactions'
# )
netVisual_circle( 
  cellchat@net$weight,
  vertex.weight = group_size,
  weight.scale = T,
  label.edge = F
)
mat = matrix(
  0,
  nrow = nrow(cellchat@net$weight),
  ncol = ncol(cellchat@net$weight),
  dimnames = dimnames(cellchat@net$weight)
)
mat[1,] = cellchat@net$weight[1,]
netVisual_circle( 
  mat,
  vertex.weight = group_size,
  weight.scale = T,
  label.edge = F
)
netVisual_heatmap( 
  cellchat
)
netVisual_bubble( 
  cellchat,
  #sources.use = c(1:2),
  #targets.use = c(1:2),
  remove.isolate = F
)
saveRDS(
  cellchat,
  file = 'sc_Rds/cellchat.Rds'
)