# # # #step1: # rm(list=ls()) # options(stringsAsFactors=F) # getwd() # setwd("~/1") # library(Seurat) # library(ggplot2) # library(clustree) # library(cowplot) # library(data.table) # library(dplyr) # library(tidyverse) library(Seurat) library(harmony) library(scDblFinder) library(SingleR) library(HGNChelper) library(AUCell) library(monocle) library(igraph) library(CellChat) library(BiocParallel) library(SingleCellExperiment) ## Seurat Quality Control ##### # # # output_dir <- "GSE270667_RAW" # if (!dir.exists(output_dir)) { # dir.create(output_dir) # } # # # untar("GSE270667_RAW.tar", exdir = output_dir) # # # rm(list = ls()) # # dir='GSE270667_RAW/' # fs=list.files('GSE270667_RAW/','^GSM') # fs # library(tidyverse) # samples=str_split(fs,'_',simplify=T)[,1] # # if(F){ # lapply(unique(samples),function(x){ # #x=unique(samples)[1] # y=fs[grepl(x,fs)] # folder=paste0("GSE270667_RAW/",paste(str_split(y[1],'_',simplify=T)[,1:2],collapse="_")) # dir.create(folder,recursive=T) # # # file.rename(paste0("GSE270667_RAW/",y[1]),file.path(folder,"barcodes.tsv.gz")) # # # file.rename(paste0("GSE270667_RAW/",y[2]),file.path(folder,"features.tsv.gz")) # file.rename(paste0("GSE270667_RAW/",y[3]),file.path(folder,"matrix.mtx.gz")) # }) # } # # dir='~/1/GSE270667_RAW/GSE270667_RAW' # samples=list.files(dir) # samples # sceList=lapply(samples,function(pro){ # #pro=samples[1] # print(pro) # tmp=Read10X(file.path(dir,pro)) # if(length(tmp)==2){ # ct=tmp[[1]] # }else{ct=tmp} # sce=CreateSeuratObject(counts=ct, # project=pro, # min.cells=5, # min.features=300) # return(sce) # }) # # # do.call(rbind,lapply(sceList,dim)) # sce.all=merge(x=sceList[[1]], # y=sceList[-1], # add.cell.ids=samples) # names(sce.all@assays$RNA@layers) # #>[1] "counts.GSM5699777_TD1" "counts.GSM5699778_TD2" "counts.GSM5699779_TD3" # #>[4] "counts.GSM5699780_TD4" "counts.GSM5699781_TD5" "counts.GSM5699782_TD6" # #>[7] "counts.GSM5699783_TD7" "counts.GSM5699784_TD8" "counts.GSM5699785_TD9" # # sce.all[["RNA"]]$counts # # Alternate accessor function with the same result # LayerData(sce.all,assay="RNA",layer="counts") # # # sce.all # sce.all<-JoinLayers(sce.all) # sce.all # dim(sce.all[["RNA"]]$counts) # # # as.data.frame(sce.all@assays$RNA$counts[1:10,1:2]) # head(sce.all@meta.data,10) # table(sce.all$orig.ident) # length(sce.all$orig.ident) # # fivenum(sce.all$nFeature_RNA) # # table(sce.all$nFeature_RNA>800) # # sce.all=sce.all[,sce.all$nFeature_RNA>800] # # sce.all # # library(stringr) # phe=sce.all@meta.data # table(phe$orig.ident) # phe$patient=str_split(phe$orig.ident,'[_]',simplify=T)[,2] # table(phe$patient) # # phe$sample=phe$patient # table(phe$sample) # phe$sample=gsub("TD5|TD7|TD8","AIS",phe$sample) # phe$sample=gsub("TD3|TD4|TD6","MIA",phe$sample) # phe$sample=gsub("TD1|TD2|TD9","IAC",phe$sample) # table(phe$sample) # sce.all@meta.data=phe # # #step2: QC # sp='human' # dir.create("~/1/GSE270667_RAW/GSE270667_RAW/1-QC/1-QC") # setwd("~/1/GSE270667_RAW/GSE270667_RAW/1-QC/1-QC") # # # source('../GSE270667_RAW/1-QC/1-QC/qc.R') # sce.all.filt=basic_qc(sce.all) # print(dim(sce.all)) # print(dim(sce.all.filt)) # ## # setwd('../') # getwd() # fivenum(sce.all.filt$percent_ribo) # table(sce.all.filt$nFeature_RNA>5) ########## Data quality control and doublet cell processing ########## rm(list = ls()) sc_filtered = readRDS('sce.all_int.rds') sc_filtered$log10GenesPerUMI = log10(sc_filtered$nFeature_RNA) / log10(sc_filtered$nCount_RNA) # Calculate complexity sc_filtered$mitoRatio = PercentageFeatureSet( object = sc_filtered, pattern = '^MT-' ) / 100 sc_filtered = subset( sc_filtered, subset = nCount_RNA > 500 & nCount_RNA < 6000 & nFeature_RNA > 200 & log10GenesPerUMI > 0.8 & mitoRatio < 0.2 ) counts = GetAssayData(object = sc_filtered, slot = 'counts') filtered_counts = counts[rowSums((counts) > 0) >= 10,] sc_filtered = CreateSeuratObject( filtered_counts, meta.data = sc_filtered@meta.data ) VlnPlot( sc_filtered, features = c('nCount_RNA','nFeature_RNA','log10GenesPerUMI','mitoRatio'), pt.size = 0, ncol = 4, group.by = 'sample' ) sc_filtered = NormalizeData(sc_filtered) sc_filtered = ScaleData( sc_filtered, features = rownames(sc_filtered) ) # Save data saveRDS( sc_filtered, file = 'sc_Rds/sc_filtered.Rds' ) ## Cell cycle processing #### rm(list = ls()) sc_dblf = readRDS('sc_Rds/sc_filtered.Rds') sce = as.SingleCellExperiment(sc_dblf) sce = scDblFinder( sce, samples = 'sample' ) table(sce$scDblFinder.class) sc_dblf = as.Seurat(sce) sc_dblf = subset( sc_dblf, subset = scDblFinder.class == 'singlet' ) saveRDS( sc_dblf, file = 'sc_Rds/sc_dblf.Rds' ) rm(list = ls()) sc_cc = readRDS('sc_Rds/sc_dblf.Rds') s.genes = cc.genes$s.genes s.genes = CaseMatch( search = s.genes, match = rownames(sc_cc) ) g2m.genes = cc.genes$g2m.genes g2m.genes = CaseMatch( search = g2m.genes, match = rownames(sc_cc) ) sc_cc = CellCycleScoring( sc_cc, s.features = s.genes, g2m.features = g2m.genes, set.ident = T ) sc_cc$CC.Difference = sc_cc$S.Score - sc_cc$G2M.Score sc_cc = SCTransform( sc_cc, vars.to.regress = c('mitoRatio','S.Score','G2M.Score','CC.Difference'), verbose = T ) sc_cc = RunPCA( sc_cc, seed.use = 2024, features = rownames(sc_cc) ) sc_cc = RunTSNE( sc_cc, seed.use = 2024, features = rownames(sc_cc) ) DimPlot( sc_cc, reduction = 'tsne', group.by = 'Phase' ) saveRDS( sc_cc, file = 'sc_Rds/sc_cc.Rds' ) # Cell clustering and annotation ######## rm(list = ls()) sc_data = readRDS('sc_Rds/sc_cc.Rds') LabelPoints( VariableFeaturePlot(sc_data), points = head(VariableFeatures(sc_data),10), repel = T ) Idents(sc_data) = 'sample' FeatureScatter( sc_data, feature1 = 'nCount_RNA', feature2 = 'nFeature_RNA', pt.size = 0.1 ) #library(harmony) sc_data = RunHarmony( sc_data, group.by.vars = 'sample', plot_convergence = T ) sc_data = RunPCA( sc_data, seed.use = 2024, features = VariableFeatures(sc_data) ) ElbowPlot( sc_data, ndims = 50 ) sc_data = FindNeighbors( sc_data, reduction = 'pca', dims = 1:30, verbose = T ) sc_data = FindClusters( sc_data, verbose = T, resolution = c(0.6,0.8,1.0,1.2,1.4) ) sc_data = RunTSNE( sc_data, seed.use = 2024, features = VariableFeatures(sc_data) ) DimPlot( sc_data, reduction = 'tsne', group.by = 'SCT_snn_res.0.8', label = T, repel = T ) DimPlot( sc_data, reduction = 'tsne', group.by = 'sample' ) # hpca = celldex::HumanPrimaryCellAtlasData() # testdata = GetAssayData( # sc_data, # slot = 'data' # ) # cluster = sc_data$SCT_snn_res.0.8 # cellpred = SingleR( # test = testdata, # ref = hpca, # labels = hpca$label.main, # de.method = 'classic', # clusters = cluster # ) # celltype = data.frame( # clusterID = rownames(cellpred), # cell = cellpred$labels # ) # celltype$cell = gsub('_',' ',celltype$cell) # sc_data = subset( # sc_data, # idents = celltype$clusterID # ) # sc_data$cell_anno = left_join( # sc_data@meta.data, # celltype, # by = c('SCT_snn_res.0.8' = 'clusterID') # )$cell # Load gene set preparation function source('input/gene_sets_prepare.R') # Load cell annotation function source('input/sctype_score_.R') # Prepare gene set using default marker database db_ = 'input/ScTypeDB_full.xlsx' tissue = c('Lung','Immune system') gs_list = gene_sets_prepare(db_,tissue) es_max = sctype_score( scRNAseqData = sc_data[['SCT']]@scale.data, scaled = T, gs = gs_list$gs_positive, gs2 = gs_list$gs_negative ) cL_results = do.call( rbind, lapply( unique(sc_data$SCT_snn_res.1.4), function(cl){ es_max_cl = sort( rowSums(es_max[,rownames(sc_data@meta.data[sc_data$SCT_snn_res.1.4 == cl,])]), # Calculate row sums for each cluster decreasing = T ) head( data.frame( cluster = cl, type = names(es_max_cl), scores = es_max_cl, ncells = sum(sc_data$SCT_snn_res.1.4 == cl) ), 10 ) } ) ) sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 1,wt = scores) sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells / 4] = 'Unknown' # Set low scoring cell types to 'Unknown' sc_data$cell_anno = left_join( sc_data@meta.data, sctype_scores[,c('cluster','type')], by = c('SCT_snn_res.0.8' = 'cluster') )$type DimPlot( sc_data, reduction = 'tsne', group.by = 'cell_anno', label = T, repel = T ) # sc_data = RunUMAP( # sc_data, # reduction = 'pca', # dims = 1:30 # ) # DimPlot( # sc_data, # reduction = 'umap', # group.by = 'cell_anno', # label = T, # repel = T # ) ggplot( sc_data@meta.data, aes( factor( cell_anno, levels = table(cell_anno) %>% sort(decreasing = T) %>% names() ), fill = cell_anno ) ) + geom_bar( stat = 'count' ) + geom_text( data.frame( cell_anno = table(sc_data$cell_anno) %>% sort(decreasing = T) %>% names(), n = table(sc_data$cell_anno) %>% sort(decreasing = T) %>% as.numeric() ), mapping = aes( cell_anno, n, label = n ) ) + coord_flip() + theme_bw() + labs( x = element_blank(), y = element_blank() ) + guides( fill = 'none' ) ggplot( sc_data@meta.data, aes( sample, fill = cell_anno ) ) + geom_bar( position = 'fill' ) + coord_flip() + theme_bw() + labs( x = element_blank(), y = element_blank() ) saveRDS( sc_data, file = 'sc_Rds/sc_data.Rds' ) # Single cell differential analysis ########## rm(list = ls()) sc_data = readRDS('sc_Rds/sc_data.Rds') Idents(sc_data) = 'cell_anno' DefaultAssay(sc_data) = 'RNA' sc_marker = FindAllMarkers( sc_data, min.pct = 0.1, logfc.threshold = 0.5 ) sc_marker = sc_marker %>% filter( p_val_adj < 0.05, avg_log2FC > 0.5 ) marker1 = sc_marker %>% group_by(cluster) %>% top_n(n = 1,wt = avg_log2FC) marker5 = sc_marker %>% group_by(cluster) %>% top_n(n = 5,wt = avg_log2FC) VlnPlot( sc_data, features = marker1$gene, pt.size = 0 ) FeaturePlot( features = marker1$gene ) DotPlot( sc_data, features = marker5$gene %>% unique ) + theme( axis.text.x = element_text( angle = 45, hjust = 1 ) ) saveRDS( sc_marker, file = 'sc_Rds/sc_marker.Rds' ) rm(list = ls()) sc_marker = readRDS('sc_Rds/sc_marker.Rds') sc_data = readRDS('sc_Rds/sc_data.Rds') PRGs = read.table('input/PRGs.txt',header = T)$PRGs PR_marker = intersect(sc_marker$gene,PRGs) heat = FetchData( sc_data, vars = PR_marker ) %>% t() %>% as.data.frame() annotation_col = data.frame( cell_anno = sc_data$cell_anno ) annotation_col = annotation_col %>% arrange(cell_anno) heat = heat[,rownames(annotation_col)] pheatmap::pheatmap( heat, scale = 'row', color = colorRampPalette(c('blue','white','red'))(50), cluster_cols = F, annotation_col = annotation_col, labels_col = '', border_color = NA ) saveRDS( PR_marker, file = 'sc_Rds/PR_marker.Rds' ) ##### AUCell ##### rm(list = ls()) PR_marker = readRDS('sc_Rds/PR_marker.Rds') sc_data_AUcell = readRDS('sc_Rds/sc_data.Rds') Idents(sc_data_AUcell) = 'cell_anno' PR_marker = data.frame( gs = 'PR_marker', gene = PR_marker ) PR_marker = split( PR_marker$gene, PR_marker$gs ) expr = FetchData( sc_data_AUcell, vars = rownames(sc_data_AUcell) ) %>% t() cell_ranking = AUCell_buildRankings(expr) cell_AUC = AUCell_calcAUC( PR_marker, cell_ranking ) aucs = getAUC(cell_AUC) %>% t() %>% as.data.frame() sc_data_AUcell$PR_aucs = aucs$PR_marker FeaturePlot( sc_data_AUcell, features = 'PR_aucs' ) VlnPlot( sc_data_AUcell, features = 'PR_aucs', pt.size = 0 ) sc_data_AUcell$PR_group = ifelse( sc_data_AUcell$PR_aucs >= median(sc_data_AUcell$PR_aucs), 'high_pyroptosis', 'low_pyroptosis' ) DimPlot( sc_data_AUcell, reduction = 'tsne', group.by = 'PR_group' ) Idents(sc_data_AUcell) = 'PR_group' DefaultAssay(sc_data_AUcell) = 'RNA' sc_pyroptosis = FindAllMarkers( sc_data_AUcell, min.pct = 0.1, logfc.threshold = 0.1 ) sc_pyroptosis_marker = sc_pyroptosis$gene %>% unique DEGs = read.table('input/DEGs.txt',header = T)$DEGs PRDEG_marker = intersect(sc_pyroptosis_marker,DEGs) saveRDS( sc_data_AUcell, file = 'sc_Rds/sc_data_AUCell.Rds' ) saveRDS( sc_pyroptosis, file = 'sc_Rds/sc_pyroptosis.Rds' ) ########### Pseudotime trajectory analysis ########## rm(list = ls()) sc_data_pseudo = readRDS('sc_Rds/sc_data.Rds') mat_expr = as( FetchData( sc_data_pseudo, vars = rownames(sc_data_pseudo) ) %>% t(), 'sparseMatrix' ) pd = new( 'AnnotatedDataFrame', data = sc_data_pseudo@meta.data ) fd = new( 'AnnotatedDataFrame', data = data.frame( gene_short_name = rownames(sc_data_pseudo), row.names = rownames(sc_data_pseudo) ) ) cds = newCellDataSet( mat_expr, phenoData = pd, featureData = fd, lowerDetectionLimit = 0.5, expressionFamily = negbinomial.size() ) cds = estimateSizeFactors(cds) cds = estimateDispersions(cds) cds = detectGenes( cds, min_expr = 0.1 ) head(fData(cds)) disp_table = dispersionTable(cds) unsup_clustering_genes = subset( disp_table, mean_expression >= 0.1 & dispersion_empirical >= 0.5 * dispersion_fit ) cds = setOrderingFilter( cds, unsup_clustering_genes$gene_id ) library(monocle) set.seed(123) n <- ncol(cds) sample_size <- ceiling(n / 5) cells_to_sample <- sample(n, sample_size) cds_sample <- cds[, cells_to_sample] print(head(cds_sample)) diff_test_res = differentialGeneTest( cds_sample, fullModelFormulaStr = '~cell_anno' ) load('input/diff_test_res.Rda') ordering_genes = row.names( subset( diff_test_res, qval < 0.01 ) ) cds = setOrderingFilter( cds, ordering_genes ) cds = reduceDimension( cds, max_components = 2, reduction_method = 'DDRTree' ) source('input/orderCells.R') cds = orderCells(cds) #cds = readRDS('sc_Rds/cds.Rds') saveRDS( cds, file = 'sc_Rds/cds.Rds' ) plot_cell_trajectory( cds, color_by = 'cell_anno' ) plot_cell_trajectory( cds, color_by = 'Pseudotime' ) plot_cell_trajectory( cds, color_by = 'State' ) plot_cell_trajectory( cds, color_by = 'cell_anno' ) + facet_wrap( ~ cell_anno, nrow = 4 ) PR_marker = readRDS('sc_Rds/PR_marker.Rds') plot_pseudotime_heatmap( cds[PR_marker,], num_clusters = 3, show_rownames = T, return_heatmap = T, cores = 1 ) ########### Cell communication analysis ###### rm(list = ls()) sc_data_cellchat = readRDS('sc_Rds/sc_data.Rds') data_input = GetAssayData( sc_data_cellchat, assay = 'RNA', slot = 'data' ) meta = data.frame( row.names = rownames(sc_data_cellchat@meta.data), cell_anno = sc_data_cellchat$cell_anno ) cellchat = createCellChat( object = data_input, meta = meta, group.by = 'cell_anno' ) cellchat = addMeta( cellchat, meta = meta ) cellchat = setIdent( cellchat, ident.use = 'cell_anno' ) cells = levels(cellchat@idents) cellchat_db = CellChatDB.human cellchat_db_use = cellchat_db cellchat@DB = cellchat_db_use cellchat = subsetData(cellchat) future::plan('default') cellchat = identifyOverExpressedGenes(cellchat) cellchat = identifyOverExpressedInteractions(cellchat) cellchat = projectData(cellchat,PPI.human) cellchat = computeCommunProb( cellchat, raw.use = T ) cellchat = filterCommunication( cellchat, min.cells = 10 ) cellchat = computeCommunProbPathway(cellchat) cellchat = aggregateNet(cellchat) group_size = as.numeric(table(cellchat@idents)) # netVisual_circle( # cellchat@net$count, # vertex.weight = group_size, # weight.scale = T, # label.edge = F, # title.name = 'Number of Interactions' # ) netVisual_circle( cellchat@net$weight, vertex.weight = group_size, weight.scale = T, label.edge = F ) mat = matrix( 0, nrow = nrow(cellchat@net$weight), ncol = ncol(cellchat@net$weight), dimnames = dimnames(cellchat@net$weight) ) mat[1,] = cellchat@net$weight[1,] netVisual_circle( mat, vertex.weight = group_size, weight.scale = T, label.edge = F ) netVisual_heatmap( cellchat ) netVisual_bubble( cellchat, #sources.use = c(1:2), #targets.use = c(1:2), remove.isolate = F ) saveRDS( cellchat, file = 'sc_Rds/cellchat.Rds' )