Script S1: raw data analysis script ########## Data quality assessment ########## for i in Q_H1 Q_H2 Q_H3 Q_H4 Q_H5 Q_L1 Q_L2 Q_L3 Q_L4 Q_L5 S_H1 S_H2 S_H3 S_H4 S_H5 S_L1 S_L2 S_L3 S_L4 S_L5 do fastqc -t 48 -o ./2_fastqc ./1_clean_data/${i}_1_paired.fq.gz ./1_clean_data/${i}_2_paired.fq.gz done ########## Quality control ########## for i in Q_H1 Q_H2 Q_H3 Q_H4 Q_H5 Q_L1 Q_L2 Q_L3 Q_L4 Q_L5 S_H1 S_H2 S_H3 S_H4 S_H5 S_L1 S_L2 S_L3 S_L4 S_L5 do java -jar \ /home/xlkang/Software/Trimmomatic-0.39/trimmomatic-0.39.jar PE \ -threads 48 \ ./rawdata/${i}_1.fq.gz ./rawdata/${i}_2.fq.gz \ ./1_clean_data/${i}_1_paired.fq.gz ./1_clean_data/${i}_1_unpaired.fq.gz \ ./1_clean_data/${i}_2_paired.fq.gz ./1_clean_data/${i}_2_unpaired.fq.gz \ LEADING:3 \ TRAILING:3 \ MINLEN:75 \ SLIDINGWINDOW:5:20 done ########## Alignment ########## STAR \ --runThreadN 48 \ --runMode genomeGenerate \ --genomeDir ./3_star_index \ --genomeFastaFiles ./ref_genome/genomic.fa \ --sjdbGTFfile ./ref_genome/ann_genomic.gtf \ --sjdbOverhang 149 for i in Q_H1 Q_H2 Q_H3 Q_H4 Q_H5 Q_L1 Q_L2 Q_L3 Q_L4 Q_L5 S_H1 S_H2 S_H3 S_H4 S_H5 S_L1 S_L2 S_L3 S_L4 S_L5 do STAR \ --runThreadN 12 \ ulimit -n 10000 \ --genomeDir ./3_star_index \ --readFilesCommand zcat \ --readFilesIn ./1_clean_data/${i}_1_paired.fq.gz ./1_clean_data/${i}_2_paired.fq.gz \ --outFileNamePrefix ./4_star_alignment/${i} \ --outSAMstrandField intronMotif \ --outSAMtype BAM Unsorted SortedByCoordinate \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.04 \ --outFilterMultimapNmax 1 done ########## Add read groups ########## for i in Q_H1 Q_H2 Q_H3 Q_H4 Q_H5 do java -jar /home/xlkang/miniconda3/pkgs/picard-2.27.4-hdfd78af_0/share/picard-2.27.4-0/picard.jar AddOrReplaceReadGroups \ I=./4_star_alignment/${i}Aligned.sortedByCoord.out.bam \ O=./5_picard_RG/${i}Aligned.sortedByCoord.RG.out.bam \ RGID=${i} \ RGLB=hypothalamus \ RGPL=illumina \ RGPU=unit1 \ RGSM=high done for i in Q_L1 Q_L2 Q_L3 Q_L4 Q_L5 do java -jar /home/xlkang/miniconda3/pkgs/picard-2.27.4-hdfd78af_0/share/picard-2.27.4-0/picard.jar AddOrReplaceReadGroups \ I=./4_star_alignment/${i}Aligned.sortedByCoord.out.bam \ O=./5_picard_RG/${i}Aligned.sortedByCoord.RG.out.bam \ RGID=${i} \ RGLB=hypothalamus \ RGPL=illumina \ RGPU=unit1 \ RGSM=low done for i in S_H1 S_H2 S_H3 S_H4 S_H5 do java -jar /home/xlkang/miniconda3/pkgs/picard-2.27.4-hdfd78af_0/share/picard-2.27.4-0/picard.jar AddOrReplaceReadGroups \ I=./4_star_alignment/${i}Aligned.sortedByCoord.out.bam \ O=./5_picard_RG/${i}Aligned.sortedByCoord.RG.out.bam \ RGID=${i} \ RGLB=duodenum \ RGPL=illumina \ RGPU=unit1 \ RGSM=high done for i in S_L1 S_L2 S_L3 S_L4 S_L5 do java -jar /home/xlkang/miniconda3/pkgs/picard-2.27.4-hdfd78af_0/share/picard-2.27.4-0/picard.jar AddOrReplaceReadGroups \ I=./4_star_alignment/${i}Aligned.sortedByCoord.out.bam \ O=./5_picard_RG/${i}Aligned.sortedByCoord.RG.out.bam \ RGID=${i} \ RGLB=duodenum \ RGPL=illumina \ RGPU=unit1 \ RGSM=low done ########## Remove duplicates ########## for i in Q_H1 Q_H2 Q_H3 Q_H4 Q_H5 Q_L1 Q_L2 Q_L3 Q_L4 Q_L5 S_H1 S_H2 S_H3 S_H4 S_H5 S_L1 S_L2 S_L3 S_L4 S_L5 do java -jar /home/xlkang/miniconda3/pkgs/picard-2.27.4-hdfd78af_0/share/picard-2.27.4-0/picard.jar MarkDuplicates \ I=./5_picard_RG/${i}Aligned.sortedByCoord.RG.out.bam \ O=./6_remove_duplicates/${i}Aligned.sortedByCoord.RG.dedupped.out.bam \ REMOVE_DUPLICATES=true \ CREATE_INDEX=true \ VALIDATION_STRINGENCY=SILENT \ M=./6_remove_duplicates/${i}_output_metrics.txt done ########## Merge BAM file ########## samtools merge \ -@ 48 \ ./7_merged/HRFI_merged.bam \ ./6_remove_duplicates/Q_H1Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_H2Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_H3Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_H4Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_H5Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_H1Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_H2Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_H3Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_H4Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_H5Aligned.sortedByCoord.RG.dedupped.out.bam samtools merge \ -@ 48 \ ./7_merged/LRFI_merged.bam \ ./6_remove_duplicates/Q_L1Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_L2Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_L3Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_L4Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/Q_L5Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_L1Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_L2Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_L3Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_L4Aligned.sortedByCoord.RG.dedupped.out.bam \ ./6_remove_duplicates/S_L5Aligned.sortedByCoord.RG.dedupped.out.bam ########## Variant calling ########## bcftools mpileup --threads 24 -E -Ou -d10000000 -f ./ref_genome/genomic.fa ./7_merged/HRFI_merged.bam ./7_merged/LRFI_merged.bam | bcftools call --threads 24 -mv -Ob -o ./8_variant_calling/HRFI_and_LRFI_merged.bcf ########## Normalize and filter SNP data ########## bcftools norm -f ../ref_genome/genomic.fa HRFI_and_LRFI_merged.bcf > 2_norm.vcf bcftools filter \ -g 5 \ -O v \ -o 3_bcf_filtered.vcf \ 2_norm.vcf vcftools \ --vcf 3_bcf_filtered.vcf \ --minDP 10 \ --mac 2 \ --max-missing 0.5 \ --minQ 30 \ --maf 0.05 \ --recode \ --recode-INFO-all \ --out 4_vcf_filtered ########## Identify and annotation intergroup-specific SNPs ########## SnpSift filter "(GEN[high].GT='0/1') | (GEN[high].GT='1/1') & (GEN[low].GT='0/0')" ../8_snp_calling_filter/4_vcf_filtered.recode.vcf > 1_high_specific.vcf SnpSift filter "(GEN[low].GT='0/1') | (GEN[low].GT='1/1') & (GEN[high].GT='0/0')" ../8_snp_calling_filter/4_vcf_filtered.recode.vcf > 1_low_specific.vcf snpEff -csvStats 3_high_snpEff_summary_state.csv -s 3_high_snpEff_summary.html ARS-UCD1.2.105 1_high_specific.vcf > 3_high_snp.vcf snpEff -csvStats 3_high_snpEff_summary_state.csv -s 3_high_snpEff_summary.html ARS-UCD1.2.105 1_low_specific.vcf > 3_low_snp.vcf ########## Identify intergroup-specific high-impact SNPs ########## SnpSift filter "(ANN[*].IMPACT has 'HIGH')" 3_high_snp.vcf | grep -v "^#" | awk -F '\t' '{split($8,a,"|");print $1"\t"$2"\t"$4"\t"$5"\t"a[4]}' > 4_high_feature.csv SnpSift filter "(ANN[*].IMPACT has 'HIGH')" 3_low_snp.vcf | grep -v "^#" | awk -F '\t' '{split($8,a,"|");print $1"\t"$2"\t"$4"\t"$5"\t"a[4]}' > 4_low_feature.csv grep -v "^#" 3_high_snp.vcf | awk -F '\t' '{split($8,a,"|");print $1"\t"$2"\t"$4"\t"$5"\t"a[4]}' > 6_high.csv grep -v "^#" 3_low_snp.vcf | awk -F '\t' '{split($8,a,"|");print $1"\t"$2"\t"$4"\t"$5"\t"a[4]}' > 6_low.csv # ============================================== # Statistical Power Calculation # using RNASeqPower Package # ============================================== # ------------------------------- install.packages("BiocManager") BiocManager::install("RNASeqPower") library(RNASeqPower) # v1.40.0 # 1. sample (HRFI vs LRFI) n <- 5 # 2. Significance threshold (corrected for multiple testing) alpha <- 0.05 # Benjamini-Hochberg # 3. Effect size (based on log2FC) effect_size <- 2 # 4. Sequencing depth (millions of reads/sample) depth <- 30 #### 5. Gene expression dispersion (coefficient of variation) #### # Pre-experiment data: raw counts matrix (rows=genes, columns=samples) pre_counts <- read.csv(“pre_experiment_counts.csv”, row.names=1) # Calculate CPM cpm_matrix <- t(t(pre_counts) / colSums(pre_counts)) * 1e6 # Extract housekeeping gene CPM values housekeeping_cpm <- cpm_matrix[c(“GAPDH”, “ACTB”, “RPL4”, “RPS9”, “HPRT1”), ] # Calculate the CV for each housekeeping gene cv_per_gene <- apply(housekeeping_cpm, 1, function(x) { sd(x) / mean(x) }) # Output results print(cv_per_gene) # GAPDH ACTB RPL4 RPS9 HPRT1 # 0.32 0.27 0.31 0.38 0.46 0.31 # Take the median as the global CV estimate final_cv <- median(cv_per_gene) # 0.31 # ------------------------------- # Calculating statistical power (core function) # ------------------------------- power_result <- rnapower( n = n, alpha = alpha, effect = effect_size, depth = depth, cv = cv )