##The raw data was download from TCGA-LGG project,the data type we obtained is 'BAM' file. (https://portal.gdc.cancer.gov/repository?facetTab=files&filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-LGG%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Raw%20Sequencing%20Data%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_format%22%2C%22value%22%3A%5B%22BAM%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.experimental_strategy%22%2C%22value%22%3A%5B%22RNA-Seq%22%5D%7D%7D%5D%7D&searchTableTab=files), This BAM file raw data is too large to be made available online, so we upload the link where we got the BAM file. ################################################################### ##Code to map raw data to reference genome hg19 using STAR software: #!/bin/bash #PBS -k o #PBS -l nodes=1:ppn=16,vmem=40gb,mem=40gb,walltime=11:59:00 #PBS -M yw74@iu.edu #PBS -N STAR Alignment #PBS -j oe STAR --genomeDir /N/dc2/scratch/yw74/STAR/starIndex --runThreadN 8 --readFilesIn /N/dc2/scratch/yw74/TCGA_kidney_normal_tumor/data/SRR2874123_1.fastq --outFileNamePrefix /N/dc2/scratch/yw74/TCGA_kidney_normal_tumor/star_alignment_res/normal1 SRR2874123_trimmomatic_STAR_ --sjdbGTFfile /N/dc2/scratch/yw74/hg19/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf --outSAMtype BAM SortedByCoordinate #################################################################### ##Code to call transcript counts using NGSUtils: #Counts the number of reads in genes or regions #!/bin/bash #PBS -k o #PBS -l nodes=1:ppn=1,walltime=4:00:00,gres=ccm,mem=32gb #PBS -M yw74@iu.edu #PBS -N test_bamutils #PBS -j oe ##miso does not support mixed read lengths ##miso expects sample bam file to be indexed module load ccm BAM=${filearg} filename=$(basename $BAM) cd /N/dc2/projects/ngs/users/yangw/rosMap/bamfile/ OUTPUT=/N/dc2/projects/ngs/users/yangw/rosMap/fpkm/ ccmrun bamutils count -gtf $OUTPUT/genes.gtf -fpkm -norm all -multiple ignore -libray unstranded -BAM > $OUTPUT/$BAM.new.rpkm.txt #################################################################### ##Code to call transcript CPM using edgeR: ##based on the results from NGSUtils, using EdgeR ,do filtering and normalization, then get CPM y = DGEList(counts = res1, genes = gene_anno) # Filter lowly expressed genes: 15-20 counts ## For the six samples, total assigned counts range between 8-22M, ## thus use cpm > 1 keep = rowSums(cpm(y)>1) >= 1 # usually the replication number y = y[keep, , keep.lib.sizes = F] # Normalization y = calcNormFactors(y) logCPM = cpm(y, log = T, normalized.lib.sizes = T) ################################################################### ##Code to call alternative splicing events using MISO: #!/bin/bash #PBS -k o #PBS -l nodes=1:ppn=16,walltime=11:59:00 #PBS -M yw74@iu.edu #PBS -N HepG2_1 #PBS -j oe ##miso does not support mixed read lengths ##miso expects sample bam file to be indexed index_gff --index /N/dc2/scratch/yw74/MISO/hg19/SE.hg19.gff3 /N/dc2/scratch/yw74/MISO/indexed_SE_events BAM=${filearg} filename=$(basename $BAM) #BAM=/N/dc2/scratch/yw74/Tophat/res_untrim/accepted_hits.bam #OUTPU=/N/dc2/scratch/yw74/MISO/miso/res/MISO_results/miso_output #genes.min_1000.const_exons.gff= exon_utils --get-const-exons /N/dc2/scratch/yw74/hg19/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gff --min-exon-size 1000 --output-dir /N/dc2/scratch/yw74/MISO/exons/ #pe_utils --compute-insert-len /N/dc2/scratch/yw74/Tophat/res_untrim/accepted_hits.bam /N/dc2/scratch/yw74/MISO/exons/genes.min_1000.const_exons.gff --output-dir /N/dc2/scratch/yw74/MISO/indert-dist #INSERT='less -S $OUTPUT/pe_utils/*.insert_len |grep -m 1 -P -o "#mean=\\d+" |sed -s 's/#\|mean\|=//g'' #DIST='less -S $OUTPUT/pe_utils/*.insert_len |grep -m 1 -P -o "sdev=\\d+" |sed -s 's/sdev\|=//g'' miso --run /N/dc2/projects/ngs/yangwang/SF2/org_data/index/hg19/indexed_SE_events/ /N/dc2/projects/ngs/yangwang/SF2/org_data/miso_file/bam_data/HepG2_1.bam --output-dir /N/dc2/projects/ngs/yangwang/SF2/org_data/miso_file/bam_data/my_output1/ --paired-end 52.3 21.5 --read-len 50 #summarize_miso --summarize-samples /N/dc2/projects/ngs/yangwang/SF2/org_data/miso_file/bam_data/my_output1/ /N/dc2/projects/ngs/yangwang/SF2/org_data/miso_file/bam_data/summary_output1/ ################################################################### ###extract miso results and regenerate the table with psi_mean, low, high, Cpsi, fours kinds of counts args = commandArgs(T) miso_summary = args[1] output = args[2] readcountReform = function(miso_summary, output) { psi_res = read.table(miso_summary, as.is=T, header = T, sep = "\t", quote = NULL) num_event = nrow(psi_res) rc_sum = matrix(nrow = num_event, ncol = 5) rownames(rc_sum) = rownames(psi_res) colnames(rc_sum) = c("event_name", "psi_mean", "psi_low","psi_high", "Cpsi") for (i in 1:4) { rc_sum[, i] = psi_res[, i] } rc_sum[,5] = psi_res[,4]-psi_res[,3] isoform_counts = c() for (f in 1:num_event){ allcounts = psi_res[f, 6] counts = isoformCount(allcounts) isoform_counts = rbind(isoform_counts, counts) } colnames(isoform_counts) = c("(0,0)","(0,1)","(1,0)","(1,1)") single_sum = isoform_counts[, "(0,1)"] + isoform_counts[, "(1,0)"] rc_sum = data.frame(rc_sum, isoform_counts, single_sum, stringsAsFactors = F, check.names = F) filename = basename(miso_summary) write.table(rc_sum,paste(output, filename, ".txt", sep=""),sep="\t", row.names = F, col.names=T, quote = F) return(rc_sum) } isoformCount = function(allcounts) { s<-gsub(',(', ',_(', allcounts, fixed=T) s<-unlist(strsplit(s,split=',_')) patt = c("(0,0)","(0,1)","(1,0)","(1,1)") counts = c() for (m in patt) { a = grep(m,s,value=T) b = unlist(strsplit(a,split=':')) if (!is.null(b[2])) { counts[m]<-as.numeric(b[2]) } if (is.null(b[2])) { counts[m]<-0 } } return(counts) } res = readcountReform(miso_summary, output) ##################################################