# Copy this code to each folder as different datasets have different processing parameters rm(list = ls()) library(Seurat) library(ggplot2) library(cowplot) library(Matrix) library(dplyr) library(ggsci) library(scater) library(SingleR) library(hdf5r) library(tidyverse) library(RColorBrewer) mydata = Read10X_h5("CRC_GSE166555_expression.h5") # During data import, cells with fewer than 200 genes detected (min.features = 200), # and genes covered by fewer than 3 cells (min.cells = 3), are already filtered out. pbmc <- CreateSeuratObject(counts = mydata, project = "GSE166555", min.cells = 3, min.features = 200) pbmc <- SetIdent(pbmc, value = "GSE166555") dim(pbmc) # Modify column names before reading, special characters like parentheses are not allowed phe = data.table::fread("CRC_GSE166555_CellMetainfo_table.tsv") phe = as.data.frame(phe) rownames(phe) = phe$Cell pbmc = AddMetaData(pbmc, phe, col.name = colnames(phe)) # View data dim(mydata) # Data Preprocessing # 2.1 QC # Generally use the following three standards, also refer to commonly used # Number of unique genes detected in each cell # Low-quality cells or empty droplets usually contain few genes # Cell doublets or multiplets may show abnormally high gene counts # Total number of molecules detected in each cell # Mitochondrial gene content proportion # Low-quality or dying cells have a high mitochondrial gene content # Use PercentageFeatureSet() to calculate mitochondrial QC ratio # Genes starting with MT- are mitochondrial genes # The [[ operator can add columns to object metadata. This is a great place to stash QC stats pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") # Show QC metrics for the first 5 cells head(pbmc@meta.data, 5) # Note: PercentageFeatureSet function can calculate the percentage of a specified gene subset count in each CELL. # Violin plot: View number of genes, number of UMIs, mitochondrial gene proportion # Visualize QC metrics as a violin plot p = VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) p ggsave(print(p), filename = "1.nFeature_nCount_percent.mt.pdf", height = 5, width = 6) # nFeature_RNA represents the number of genes detected in each cell, nCount represents the sum of expression levels of all genes detected in each cell, # percent.mt represents the proportion of detected mitochondrial genes. # Correlation between gene count, mitochondrial gene proportion, and UMI count # Ratio of gene number detected per UMI (read count = molecular id) # FeatureScatter is typically used to visualize feature-feature relationships, but can be used # for anything calculated by the object, i.e. columns in object metadata, PC scores etc. plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt") plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") plot3 <- FeatureScatter(pbmc, feature1 = "percent.mt", feature2 = "nFeature_RNA") p = plot_grid(plot1, plot2, plot3, ncol = 3) ggsave(filename = "2.scatter_plot.pdf", plot = p, width = 10, height = 4) # Filtering # Filter out cells that have unique gene counts over 2500 or below 200 # Filter out cells that have more than 5% mitochondrial genes pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) # Normalization # Normalize data using the LogNormalize method pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000) # Identification of highly variable genes # FindVariableFeatures() function identifies genes that are highly variable in the dataset pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) # Visualization of variable features # Top 10 genes are plotted top10 <- head(VariableFeatures(pbmc), 10) p <- DimPlot(pbmc, reduction = "pca") ggsave(print(p), filename = "3.top10_variable_genes.pdf", height = 6, width = 8) # Scaling data # ScaleData function scales and centers the data for each gene pbmc <- ScaleData(pbmc, features = rownames(pbmc)) # Run PCA pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) # Visualization of PCA # Examine and visualize PCA results p <- DimPlot(pbmc, reduction = "pca") ggsave(print(p), filename = "4.PCA.pdf", height = 6, width = 8) # Find neighbors # This step builds a KNN graph based on the PCA results pbmc <- FindNeighbors(pbmc, dims = 1:10) # Find clusters # The FindClusters function clusters cells based on their PCA scores pbmc <- FindClusters(pbmc, resolution = 0.5) # Visualization of clusters # tSNE plot to visualize the clusters p <- DimPlot(pbmc, reduction = "tsne") ggsave(print(p), filename = "5.tSNE_clusters.pdf", height = 6, width = 8) # Differential expression analysis # The FindAllMarkers function performs differential expression analysis between clusters # It is important to check the current active plan plan() plan("multiprocess", workers = 8) plan() start = Sys.time() pbmc.markers1 <- FindAllMarkers(pbmc, only.pos = FALSE, min.pct = 0.25, logfc.threshold = 0.25) end = Sys.time() dur = end - start dur pbmc.markers = dplyr::filter(pbmc.markers1, avg_log2FC > 0.25) library(scRNAtoolVis) # Adjust cluster orders p = jjVolcano(diffData = pbmc.markers1, size = 3.5, fontface = 'italic', legend.position = c(0.8, 0.2), flip = T) ggsave(filename = "6.Marker_gene_volcano_plot.pdf", plot = p, width = 10, height = 6) p = markerVolcano(markers = pbmc.markers1, topn = 5, log2FC = 0.25, labelCol = ggsci::pal_npg()(10)) ggsave(filename = "7.Marker_gene_volcano_plot2.pdf", plot = p, width = 12, height = 10) # Save to file write.csv(pbmc.markers1, "output_pbmc.markers.csv", quote = F) # For more parameters, see ?FindAllMarkers. # cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, # test.use = "roc", # Test method # only.pos = TRUE) # Only output positive genes # Optional parameters include: wilcox (default), bimod, roc, t, negbinom, poisson, LR, MAST, DESeq2. # Visualization # # Seurat's built-in functions to view key gene basics: # # 1) VlnPlot: Gene expression probability distribution # VlnPlot(pbmc, features = c("GZMK", "CRTAM")) # # You can also plot raw counts # VlnPlot(pbmc, features = c("GZMK", "CRTAM"), slot = # "counts", log = TRUE) # 2) FeaturePlot: Display expression of key genes on tSNE or PCA plots # FeaturePlot(pbmc, features = c("CCR7", "GPR183", "MAL", "NOSIP", "GZMK", "CRTAM", "FGFBP2", "GNLY", # "GZMK", "CXCR3")) # 3) DoHeatmap: View heatmap of key gene expression in cells and clusters # top10 <- pbmc.markers %>% # group_by(cluster) %>% # top_n(n = 10, wt = avg_log2FC) # p4 = DoHeatmap(pbmc, features = top10$gene) + NoLegend() # ggsave(print(p4), filename = "8.top10_genes_heatmap.pdf", height = 6, width = 8) # Save the final object save(pbmc, file = "CRC_GSE166555.RData") # The end of the script