#Copy the code to each folder together, because different data sets have different processing parameters rm(list = ls()) library(Seurat) library(ggplot2) library(cowplot) library(Matrix) library(dplyr) library(ggsci) library(scater) library(SingleR) library(hdf5r) library(tidyverse) library(RColorBrewer) mydata= Read10X_h5("CRC_EMTAB8107_expression.h5") #When the data was imported, cells with less than 200 genes measured in the dataset (min.features = 200) were included. #Genes covered by less than 3 cells (min.cells = 3) have been filtered out. pbmc <- CreateSeuratObject(counts = mydata, project = "EMTAB8107", min.cells = 3, min.features = 200) pbmc <- SetIdent(pbmc, value = "EMTAB8107") dim(pbmc) #Modify the list before reading. Do not include special characters such as brackets. phe=data.table::fread("CRC_EMTAB8107_CellMetainfo_table.tsv") phe=as.data.frame(phe) rownames(phe)=phe$Cell pbmc=AddMetaData(pbmc, phe, col.name = colnames(phe)) #View data dim(mydata) #Data Preprocessing sce.all=pbmc #View data dim(mydata) #rm(list=ls()) ###### step2:QCQuality Control ###### # sce.all=readRDS("../sce.all_raw.rds") #Calculation of mitochondrial gene ratio # The names of genes in humans and mice are slightly different mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] mito_genes #13 mitochondrial genes sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito") fivenum(sce.all@meta.data$percent_mito) #Calculation of ribosomal gene ratio ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)] ribo_genes sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo") fivenum(sce.all@meta.data$percent_ribo) #Calculate red blood cell gene ratio rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)] sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb") fivenum(sce.all@meta.data$percent_hb) #Visualize the above ratios of cells feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb") feats <- c("nFeature_RNA", "nCount_RNA") p1=VlnPlot(sce.all, features = feats, pt.size = 0.01, ncol = 2) + NoLegend() p1 library(ggplot2) ggsave(filename="Vlnplot1.pdf",plot=p1) feats <- c("percent_mito", "percent_ribo", "percent_hb") p2=VlnPlot(sce.all, features = feats, pt.size = 0.01, ncol = 3, same.y.lims=T) + scale_y_continuous(breaks=seq(0, 100, 5)) + NoLegend() p2 ggsave(filename="Vlnplot2.pdf",plot=p2) p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", pt.size = 0.5) ggsave(filename="Scatterplot.pdf",plot=p3) #Filter low-quality cells/genes based on the above indicators #Filter indicator 1: cells with the least number of expressed genes & genes with the least number of expressed cells selected_c <- WhichCells(sce.all, expression = nFeature_RNA < 6000) selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 5] sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c) dim(sce.all) dim(sce.all.filt) # As you can see, genes are mainly filtered, followed by cells # par(mar = c(4, 8, 2, 1)) C=sce.all.filt@assays$RNA@counts dim(C) C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100 # The matrix C here is a bit large, so you can consider sampling it. C=C[,sample(1:ncol(C),1000)] most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1] pdf("TOP50_most_expressed_gene.pdf",width=14) boxplot(as.matrix(Matrix::t(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell", col = (scales::hue_pal())(50)[50:1], horizontal = TRUE) dev.off() rm(C) #Filter indicator 2: Mitochondrial/ribosomal gene ratio (based on the violin graph above) selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 5) selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3) selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1) length(selected_hb) length(selected_ribo) length(selected_mito) sce.all.filt <- subset(sce.all.filt, cells = selected_mito) sce.all.filt <- subset(sce.all.filt, cells = selected_ribo) sce.all.filt <- subset(sce.all.filt, cells = selected_hb) dim(sce.all.filt) table(sce.all.filt$Patient) #Visualize the filtered situation feats <- c("nFeature_RNA", "nCount_RNA") p1_filtered=VlnPlot(sce.all.filt, features = feats, pt.size = 0.1, ncol = 2) + NoLegend() ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered) feats <- c("percent_mito", "percent_ribo", "percent_hb") p2_filtered=VlnPlot(sce.all.filt, features = feats, pt.size = 0.1, ncol = 3) + NoLegend() ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered) #Filter indicator 3: Filter specific genes # Filter MALAT1 Housekeeping Gene sce.all.filt <- sce.all.filt[!grepl("MALAT1", rownames(sce.all.filt),ignore.case = T), ] # Filter Mitocondrial Mitochondrial genes sce.all.filt <- sce.all.filt[!grepl("^MT-", rownames(sce.all.filt),ignore.case = T), ] # Of course, you can filter more dim(sce.all.filt) pbmc=sce.all.filt ###harmony Remove batch effects pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 1e4) pbmc <- FindVariableFeatures(pbmc) pbmc <- ScaleData(pbmc) pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) library(harmony) seuratObj <- RunHarmony(pbmc, "Patient") names(seuratObj@reductions) seuratObj <- RunUMAP(seuratObj, dims = 1:15, reduction = "harmony") DimPlot(seuratObj,reduction = "umap",label=T ) DimPlot(seuratObj,reduction = "umap",label=T ,group.by = "Patient") pbmc=seuratObj pbmc <- FindNeighbors(pbmc, reduction = "harmony", dims = 1:15) #Set different resolutions and observe the grouping effect (which one to choose?) for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { pbmc=FindClusters(pbmc, #graph.name = "CCA_snn", resolution = res, algorithm = 1) } colnames(pbmc@meta.data) apply(pbmc@meta.data[,grep("RNA_snn",colnames(pbmc@meta.data))],2,table) p1_dim=plot_grid(ncol = 3, DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.01") + ggtitle("louvain_0.01"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.1") + ggtitle("louvain_0.1"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + ggtitle("louvain_0.2")) ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14) p1_dim=plot_grid(ncol = 3, DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + ggtitle("louvain_0.8"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.1") + ggtitle("louvain_1"), DimPlot(pbmc, reduction = "umap", group.by = "RNA_snn_res.0.5") + ggtitle("louvain_0.3")) ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18) library(clustree) p2_tree=clustree(pbmc@meta.data, prefix = "RNA_snn_res.") p2_tree ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf") #"SNHG6" %in% pbmc@assays$RNA@counts@Dimnames[[1]] # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(pbmc), 10) # plot variable features with and without labels plot1 <- VariableFeaturePlot(pbmc) plot1 #TOP10 genes marked plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) p2=plot2 ggsave(print(p2),filename = "3.Top 10 highly variable genes.pdf",height = 6,width =8) #Next, the analysis is performed at a resolution of 0.8 sel.clust = "RNA_snn_res.0.5" pbmc <- SetIdent(pbmc, value = sel.clust) table(pbmc@active.ident) DimPlot(pbmc, reduction = "umap",group.by = sel.clust) #6.2 tSNE pbmc <- RunTSNE(pbmc, dims = 1:15, check_duplicates = FALSE) head(pbmc@reductions$tsne@cell.embeddings) DimPlot(pbmc, reduction = "tsne") plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE) plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE) p3=plot1 + plot2 ggsave(print(p3),filename = "4.umap_tsne Dimensionality reduction clustering.pdf",height = 6,width =9) #It can be seen that the structures of the dimensionality reduction visualization of the two are consistent, and the UMAP method is relatively more compact. saveRDS(pbmc, "pbmc_qc.rds") ####Cell identification rm(list = ls()) pbmc=readRDS("pbmc_qc.rds") library(ggplot2) genes_to_check = c('CD3D', 'CD3E', 'CD8A', #T Cells 'CD19', 'CD79A', 'MS4A1' , #B cells 'IGHG1', 'MZB1', 'SDC1', #Plasma cells 'CD163', 'CD14', #Mono.Macr "FGFBP2", "FCG3RA", "CX3CR1", #NK Cells 'FGF7','MME', #Fibroblasts 'PECAM1', 'VWF', #Endothelial cells 'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',"CD24" , #epi or tumor "THBD","IL3RA" ,"ITGAX" ,"CD1C" , #DC cells "CPA3" ) library(stringr) p <- DotPlot(pbmc, features = genes_to_check, assay='RNA' ) + coord_flip() p # ggsave(plot=p, filename="check_marker_by_seurat_cluster.pdf") # In addition, it is recommended to use RidgePlot(), CellScatter() and DotPlot() to view the data set. # # This can also serve as some reference for subsequent manual annotation. #Cell annotation using singleR library(SingleR) library(celldex) DefaultAssay(pbmc) <- "RNA" pbmc DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE) immune_singler <- GetAssayData(pbmc,slot = "data") # Extracting expression matrix clusters <- pbmc$RNA_snn_res.0.5 # Get the current cell group name table(pbmc$RNA_snn_res.0.5) load("../../HumanPrimaryCellAtlasData.Rdata") pred.immune <- SingleR(test = immune_singler, #Your expression matrix ref = HumanPrimaryCellAtlasData, # Your annotation file labels = HumanPrimaryCellAtlasData$label.fine, #Because the samples are mainly immune cells (rather than all cells), they are set to label.fine #If you want a more detailed classification, or all of them are immune cells, you can set it to label.fine #Too detailed may not be accurate method = "cluster", clusters = clusters) table(pred.immune$labels) plotScoreHeatmap(pred.immune, clusters = pred.immune$labels) new.clusterID <- pred.immune$labels names(new.clusterID) <- levels(pbmc) new.clusterID new.clusterID[new.clusterID=="Epithelial_cells:bladder"]="Malignant" new.clusterID[new.clusterID=="T_cell:CD4+_central_memory"]="CD4_T" new.clusterID[new.clusterID=="B_cell:Plasma_cell"]="Plasma" new.clusterID[new.clusterID=="Smooth_muscle_cells:bronchial"]="Fibroblast" new.clusterID[new.clusterID=="T_cell:CD8+"]="CD8_T" new.clusterID[new.clusterID=="Monocyte:leukotriene_D4"]="Mono.Macro" new.clusterID[new.clusterID=="B_cell:Memory"]="B_cell" new.clusterID[new.clusterID=="Endothelial_cells:lymphatic:TNFa_48h"]="Endothelial_cell" new.clusterID[new.clusterID=="Tissue_stem_cells:BM_MSC:BMP2"]="Fibroblast" new.clusterID[new.clusterID=="Fibroblasts:breast"]="Fibroblast" new.clusterID[new.clusterID=="CMP"]="Mast" new.clusterID[new.clusterID=="DC:monocyte-derived:mature"]="DC_cell" new.clusterID[new.clusterID=="Smooth_muscle_cells:bronchial"]="Fibroblast" new.clusterID[new.clusterID=="T_cell:CD4+_effector_memory"]="CD4_T" new.clusterID[new.clusterID=="Neurons:Schwann_cell"]="Malignant" new.clusterID #This data is directly used singleR pbmc <- RenameIdents(pbmc,new.clusterID) DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE) pbmc@meta.data$celltype=pbmc@active.ident table(pbmc@meta.data$celltype) # ###### step6: Cell type annotation (based on marker gene) ###### # celltype=data.frame(ClusterID=0:27, # celltype='na') # celltype[celltype$ClusterID %in% c( 0 , 8, 23, 24 ,26, 27 ),2]='B cells' # celltype[celltype$ClusterID %in% c(1, 3, 4 ,5 ,9 ,11 ,12, 19,22 ,25,20),2]='Malignant' # celltype[celltype$ClusterID %in% c(2 ,6 ,7 ,17),2]='T cells' # celltype[celltype$ClusterID %in% c( 10,14),2]='Monocyte.Macrophage' # celltype[celltype$ClusterID %in% c( 13,15),2]='Fibroblasts' # celltype[celltype$ClusterID %in% c(16),2]='Endothelial cells' # celltype[celltype$ClusterID %in% c(18 ),2]='NK cells' # celltype[celltype$ClusterID %in% c( 21),2]='DC' # # # # # head(celltype) # celltype # table(celltype$celltype) # pbmc@meta.data$celltype = "NA" # for(i in 1:nrow(celltype)){ # pbmc@meta.data[which(pbmc@meta.data$RNA_snn_res.0.5 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]} # table(pbmc@meta.data$celltype) # genes_to_check = c('CD3D', 'CD3E', 'CD8A', #T Cells # 'CD19', 'CD79A', 'MS4A1' , #B cells # 'IGHG1', 'MZB1', 'SDC1', #Plasma cells # 'CD163', 'CD14', #Monocytes and macrophages # "FGFBP2", "FCG3RA", "CX3CR1", #NK Cells # "THBD","IL3RA" #DC cells # ) # library(stringr) # p_all_markers <- DotPlot(pbmc, features = unique(genes_to_check), # group.by = "RNA_snn_res.0.5",assay='RNA' ) + coord_flip() # # p_all_markers # ggsave(plot=p_all_markers, # filename="check_all_marker_by_RNA_snn_res.0.5.pdf") # DimPlot(pbmc, reduction = "umap", label = T) # DimPlot(pbmc, reduction = "umap", group.by = "celltype",label = T) # ggsave('umap_by_celltype.pdf') tab.1=table(pbmc@meta.data$celltype,pbmc@meta.data$RNA_snn_res.0.5) library(gplots) tab.1 pro='cluster' pdf(file = paste0("4." , pro,' celltype VS res.0.5.pdf'),width = 14) balloonplot(tab.1, main =" celltype VS res.0.5 ", xlab ="", ylab="", label = T, show.margins = F) dev.off() library(patchwork) th=theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) p_all_markers=DotPlot(pbmc, features = unique(genes_to_check), assay='RNA' ,group.by = 'celltype' ) + coord_flip()+ th p_umap=DimPlot(pbmc, reduction = "umap", group.by = "celltype",label = T) p_all_markers+p_umap DimPlot(pbmc, reduction = "umap", group.by = "celltype", label = T) ggsave('4.umap_by_celltype_split_orig.ident.pdf',width = 8) phe=pbmc@meta.data save(phe,file = 'phe_by_markers.Rdata') library(dplyr) library(Seurat) library(ComplexHeatmap) library(GSVA) library(GSEABase) library(limma) library(ggplot2) Sys.setenv(LANGUAGE = "en") #Display English error message options(stringsAsFactors = FALSE) #Disable conversion of chr to factor # Calculate differentially expressed genes in each cell library(magrittr) #Time consuming steps #Adjust both up and down table(pbmc@meta.data$celltype) table(pbmc@active.ident) length(pbmc@meta.data$celltype) length(pbmc@active.ident) #celltype Double quotes are required!!! pbmc <- SetIdent(pbmc, value = "celltype") table(pbmc@active.ident) library(future) # check the current active plan plan() plan("multiprocess", workers = 4) plan() start = Sys.time() pbmc.markers1 <- FindAllMarkers(pbmc,only.pos = FALSE, min.pct = 0.25, logfc.threshold = 0.25) end = Sys.time() dur = end-start dur pbmc.markers=dplyr::filter(pbmc.markers1,avg_log2FC > 0.25) library(scRNAtoolVis) #cluster.order Set the subgroup order: # ajust cluster orders p=jjVolcano(diffData = pbmc.markers1, size = 3.5, fontface = 'italic', legend.position = c(0.8,0.2), flip = T) ggsave(filename = "5.Marker_gene_pointplot.pdf", plot = p, width = 10, height = 6) p=markerVocalno(markers = pbmc.markers1, topn = 5,log2FC=0.25, labelCol = ggsci::pal_npg()(10)) ggsave(filename = "5.Marker_gene_pointplot2.pdf", plot = p, width = 12, height = 10) # Save to file write.csv(pbmc.markers1, "output_pbmc.markers.csv", quote = F) #?FindAllMarkers See more parameters. # cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, # test.use = "roc", # Inspection method # only.pos = TRUE) #Only output genes with pos #The optional parameters of test.use are: wilcox (default), bimod, roc, t, negbinom, poisson, LR, MAST, DESeq2. # 7.4 Visualization # # You can use the built-in functions of seurat to view the basic information of key genes: # # 1)VlnPlot: Gene expression probability distribution # VlnPlot(pbmc, features = c("GZMK", "CRTAM")) # # you can plot raw counts as well # VlnPlot(pbmc, features = c("GZMK", "CRTAM"), slot = "counts", log = TRUE) # # # #2)FeaturePlot:Display the expression of key genes in tSNE or PCA plots # FeaturePlot(pbmc, features = c("CCR7", "GPR183", "MAL", "NOSIP", "GZMK", "CRTAM", "FGFBP2", "GNLY", # "GZMK","CXCR3")) #3)DoHeatmap()View expression heatmaps of key gene cells and clusters top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) p4=DoHeatmap(pbmc, features = top10$gene) + NoLegend() ggsave(print(p4),filename = "6.top10Gene heat map.pdf",height = 6,width =8) save(pbmc,file = "CRC_EMTAB8107.RData")