A comprehensive guide for bioinformaticians to effectively use High Performance Computing clusters for Next-Generation Sequencing data analysis.
A High Performance Computing (HPC) cluster is a collection of interconnected computers (nodes) that work together to perform computationally intensive tasks. For NGS analysis, HPCs provide large amounts of memory, many CPU cores, fast shared storage, and a job scheduler that lets you run many analyses in parallel.
Rocky Linux is an enterprise-grade Linux distribution and a community successor to CentOS. It is commonly used in HPC environments because of its stability, long support lifecycle, and binary compatibility with Red Hat Enterprise Linux.
# Basic connection
ssh username@hpc-cluster.institution.edu
# With X11 forwarding for GUI applications
ssh -X username@hpc-cluster.institution.edu
# Using SSH keys (recommended)
ssh -i ~/.ssh/id_rsa username@hpc-cluster.institution.edu
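If you do not yet have a key pair, one can typically be generated and installed like this (a sketch; some sites instead require keys to be registered through a web portal):
# Generate a key pair (ed25519 is a good modern default)
ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519
# Copy the public key into the cluster's authorized_keys
ssh-copy-id -i ~/.ssh/id_ed25519.pub username@hpc-cluster.institution.edu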
# Check your home directory quota
quota -u $USER
# View cluster information
sinfo
# Check available modules
module avail
# List files with details
ls -la
# Check disk usage
du -sh *
# Find files
find /path -name "*.fastq.gz"
# Count files
find /path -name "*.fastq" | wc -l
# Check system resources
top
htop # if available
# Memory usage
free -h
# CPU information
lscpu
# Storage information
df -h
/home/username/ # Home directory (limited space)
/scratch/username/ # Temporary high-speed storage
/project/groupname/ # Shared project space
/data/ # Raw data storage
/software/ # Shared software installations
Home directory (~ or /home/username)
# Check home directory usage
du -sh ~/*
Scratch space (/scratch/username)
# Create your scratch directory
mkdir -p /scratch/$USER/ngs_analysis
cd /scratch/$USER/ngs_analysis
Project space (/project/groupname)
Shared project space for collaboration and longer-term storage of results.
SLURM (Simple Linux Utility for Resource Management) manages job scheduling on HPC clusters.
# View cluster status
sinfo
# View job queue
squeue
# View your jobs
squeue -u $USER
# Job details
scontrol show job JOBID
# Cancel a job
scancel JOBID
For testing and small analyses:
# Request interactive node
sinteractive --time=2:00:00 --mem=8GB --cpus-per-task=4
# Request GPU node (if available)
sinteractive --time=1:00:00 --gres=gpu:1
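sinteractive is a site-specific wrapper; on clusters without it, the standard SLURM equivalents are salloc and srun --pty (the flags below mirror the example above):
# Standard SLURM alternative: allocate resources, then work on the node
salloc --time=2:00:00 --mem=8GB --cpus-per-task=4
# Or launch an interactive shell directly on a compute node
srun --time=2:00:00 --mem=8GB --cpus-per-task=4 --pty bash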
#!/bin/bash
#SBATCH --job-name=fastq_analysis
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
#SBATCH --time=04:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=32GB
#SBATCH --partition=general
# Load required modules
module load fastqc/0.11.9
module load trimmomatic/0.39
# Set working directory
cd /scratch/$USER/ngs_analysis
# Your analysis commands here
echo "Starting analysis at $(date)"
fastqc sample_R1.fastq.gz sample_R2.fastq.gz
echo "Analysis completed at $(date)"
sbatch fastq_analysis.slurm
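Once the job finishes, it is worth checking how much of the requested time and memory it actually used so future requests can be tuned (sacct is standard SLURM; seff is a common but optional add-on):
# Review runtime and peak memory after the job completes
sacct -j JOBID --format=JobID,JobName,Elapsed,MaxRSS,State
# On many clusters, seff prints a concise efficiency summary
seff JOBID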
Most HPC systems use environment modules to manage software:
# List available modules
module avail
# Search for specific software
module avail fastq
module avail python
# Load a module
module load fastqc/0.11.9
# List loaded modules
module list
# Unload a module
module unload fastqc
# Unload all modules
module purge
# Load conda module (if available)
module load miniconda3
# Create new environment
conda create -n ngs_analysis python=3.9
# Activate environment
conda activate ngs_analysis
# Install bioinformatics tools
conda install -c bioconda fastqc trimmomatic bwa samtools
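To make the environment reproducible on another cluster (or after a rebuild), it can be exported to a file; the filename below is arbitrary:
# Record the environment for reproducibility
conda env export > ngs_analysis_environment.yml
# Recreate it later from the exported file
conda env create -f ngs_analysis_environment.yml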
# Load Python module
module load python/3.9
# Create virtual environment
python -m venv ~/venvs/ngs_env
# Activate environment
source ~/venvs/ngs_env/bin/activate
# Install packages
pip install biopython pandas numpy
#!/bin/bash
#SBATCH --job-name=fastqc_analysis
#SBATCH --output=fastqc_%j.out
#SBATCH --time=01:00:00
#SBATCH --cpus-per-task=4
#SBATCH --mem=8GB
module load fastqc/0.11.9
# Create output directory
mkdir -p /scratch/$USER/fastqc_results
# Run FastQC on all FASTQ files in the raw data directory
fastqc -t 4 -o /scratch/$USER/fastqc_results/ /scratch/$USER/raw_fastq/*.fastq.gz
# Generate summary report
multiqc /scratch/$USER/fastqc_results/ -o /scratch/$USER/fastqc_results/
#!/bin/bash
#SBATCH --job-name=trimming
#SBATCH --output=trim_%j.out
#SBATCH --time=02:00:00
#SBATCH --cpus-per-task=8
#SBATCH --mem=16GB
module load trimmomatic/0.39
# Set variables
INPUT_DIR="/scratch/$USER/raw_fastq"
OUTPUT_DIR="/scratch/$USER/trimmed_fastq"
ADAPTERS="/path/to/adapters/TruSeq3-PE.fa"
mkdir -p $OUTPUT_DIR
# Process paired-end reads
for R1 in ${INPUT_DIR}/*_R1.fastq.gz; do
# Extract sample name
SAMPLE=$(basename $R1 _R1.fastq.gz)
R2=${INPUT_DIR}/${SAMPLE}_R2.fastq.gz
# Run Trimmomatic ($TRIMMOMATIC_PATH is usually set by the module; adjust the variable name to your site)
java -jar $TRIMMOMATIC_PATH/trimmomatic-0.39.jar PE -threads 8 \
$R1 $R2 \
${OUTPUT_DIR}/${SAMPLE}_R1_paired.fastq.gz \
${OUTPUT_DIR}/${SAMPLE}_R1_unpaired.fastq.gz \
${OUTPUT_DIR}/${SAMPLE}_R2_paired.fastq.gz \
${OUTPUT_DIR}/${SAMPLE}_R2_unpaired.fastq.gz \
ILLUMINACLIP:${ADAPTERS}:2:30:10 \
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
done
#!/bin/bash
#SBATCH --job-name=bwa_alignment
#SBATCH --output=alignment_%j.out
#SBATCH --time=06:00:00
#SBATCH --cpus-per-task=16
#SBATCH --mem=64GB
module load bwa/0.7.17
module load samtools/1.12
# Set variables
REFERENCE="/path/to/reference/genome.fa"
FASTQ_DIR="/scratch/$USER/trimmed_fastq"
OUTPUT_DIR="/scratch/$USER/alignments"
mkdir -p $OUTPUT_DIR
# Index reference genome (if not already done)
# bwa index $REFERENCE
# Align reads
for R1 in ${FASTQ_DIR}/*_R1_paired.fastq.gz; do
SAMPLE=$(basename $R1 _R1_paired.fastq.gz)
R2=${FASTQ_DIR}/${SAMPLE}_R2_paired.fastq.gz
# BWA alignment (the read group is required by GATK tools downstream)
bwa mem -t 16 -R "@RG\tID:${SAMPLE}\tSM:${SAMPLE}\tPL:ILLUMINA" $REFERENCE $R1 $R2 | \
samtools view -Sb - | \
samtools sort -@ 8 -o ${OUTPUT_DIR}/${SAMPLE}.sorted.bam
# Index BAM file
samtools index ${OUTPUT_DIR}/${SAMPLE}.sorted.bam
done
#!/bin/bash
#SBATCH --job-name=variant_calling
#SBATCH --output=variants_%j.out
#SBATCH --time=08:00:00
#SBATCH --cpus-per-task=8
#SBATCH --mem=32GB
module load gatk/4.2.0
module load samtools/1.12
# Set variables
REFERENCE="/path/to/reference/genome.fa"
BAM_DIR="/scratch/$USER/alignments"
OUTPUT_DIR="/scratch/$USER/variants"
mkdir -p $OUTPUT_DIR
# Process each sample
for BAM in ${BAM_DIR}/*.sorted.bam; do
SAMPLE=$(basename $BAM .sorted.bam)
# Mark duplicates
gatk MarkDuplicates \
-I $BAM \
-O ${OUTPUT_DIR}/${SAMPLE}.dedup.bam \
-M ${OUTPUT_DIR}/${SAMPLE}.metrics.txt
# Index deduplicated BAM
samtools index ${OUTPUT_DIR}/${SAMPLE}.dedup.bam
# Call variants
gatk HaplotypeCaller \
-R $REFERENCE \
-I ${OUTPUT_DIR}/${SAMPLE}.dedup.bam \
-O ${OUTPUT_DIR}/${SAMPLE}.vcf.gz
done
#!/bin/bash
#SBATCH --job-name=parallel_fastqc
#SBATCH --output=fastqc_%A_%a.out
#SBATCH --error=fastqc_%A_%a.err
#SBATCH --array=1-10%4 # Process 10 samples, max 4 concurrent jobs
#SBATCH --time=01:00:00
#SBATCH --cpus-per-task=2
#SBATCH --mem=4GB
module load fastqc/0.11.9
# Create array of sample files
SAMPLES=($(ls /scratch/$USER/raw_fastq/*_R1.fastq.gz))
SAMPLE=${SAMPLES[$SLURM_ARRAY_TASK_ID-1]}
SAMPLE_NAME=$(basename $SAMPLE _R1.fastq.gz)
# Make sure the output directory exists, then process the assigned sample
mkdir -p /scratch/$USER/fastqc_results
fastqc -t 2 ${SAMPLE} ${SAMPLE/_R1/_R2} \
-o /scratch/$USER/fastqc_results/
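The --array=1-10%4 range above assumes exactly 10 samples; if the count varies, the array can be sized at submission time instead (the script name below is assumed):
# Size the array from the number of R1 files; the command-line flag overrides the #SBATCH directive
N=$(ls /scratch/$USER/raw_fastq/*_R1.fastq.gz | wc -l)
sbatch --array=1-${N}%4 parallel_fastqc.slurm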
# Upload files to HPC
scp local_file.fastq.gz username@hpc:/scratch/username/
# Download files from HPC
scp username@hpc:/scratch/username/results.txt ./
# Transfer directories
scp -r username@hpc:/scratch/username/analysis_results/ ./
# Sync local directory to HPC
rsync -avz --progress local_data/ username@hpc:/scratch/username/data/
# Sync from HPC to local
rsync -avz --progress username@hpc:/scratch/username/results/ ./results/
# Resume interrupted transfer
rsync -avz --progress --partial username@hpc:/path/to/data/ ./
Many institutions provide Globus for large data transfers; it handles authentication, retries, and scheduling of transfers between registered endpoints.
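A minimal sketch with the Globus CLI, assuming it is installed and that you know the endpoint IDs (the UUIDs below are placeholders; real ones can be found with globus endpoint search):
# Authenticate (opens a browser link)
globus login
# Find endpoint IDs
globus endpoint search "institution hpc"
# Transfer a directory between endpoints
globus transfer SOURCE_ENDPOINT_UUID:/scratch/username/results/ \
    DEST_ENDPOINT_UUID:/local/path/results/ --recursive --label "NGS results"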
# Monitor memory usage during interactive session
watch -n 1 'free -h'
# Check memory usage of running job
ssh compute-node-XX
ps aux | grep $USER
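SLURM's accounting tools can report usage for a running job without logging into the node (the .batch suffix refers to the batch step of an sbatch job):
# Resource usage of a running job's batch step
sstat -j JOBID.batch --format=JobID,MaxRSS,AveCPU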
# Clean up scratch space regularly
find /scratch/$USER -type f -mtime +30 -delete
# Compress intermediate files
gzip *.sam
pigz -p 8 *.fastq # parallel compression
# Archive completed projects
tar -czf project_archive.tar.gz /scratch/$USER/completed_project/
~/ngs_projects/
├── scripts/
│ ├── 01_quality_control.slurm
│ ├── 02_trimming.slurm
│ ├── 03_alignment.slurm
│ └── utilities/
├── configs/
│ ├── sample_list.txt
│ └── reference_paths.txt
└── logs/
└── job_logs/
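The layout above can be created in one command using brace expansion:
# Create the project skeleton shown above
mkdir -p ~/ngs_projects/{scripts/utilities,configs,logs/job_logs}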
# Initialize git repository for scripts
cd ~/ngs_projects/scripts
git init
git add *.slurm
git commit -m "Initial commit of analysis scripts"
# Snakefile example
# Discover sample names from the input files
SAMPLES, = glob_wildcards("data/{sample}.fastq.gz")

rule all:
    input:
        expand("results/fastqc/{sample}_fastqc.html", sample=SAMPLES)

rule fastqc:
    input:
        "data/{sample}.fastq.gz"
    output:
        "results/fastqc/{sample}_fastqc.html"
    threads: 2
    shell:
        "fastqc -t {threads} {input} -o results/fastqc/"
snakemake --cluster "sbatch --time={cluster.time} --mem={cluster.mem} --cpus-per-task={cluster.cpus}" \
--cluster-config cluster.yaml \
--jobs 10
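--cluster-config expects a YAML file; a minimal cluster.yaml matching the placeholders in the command above might look like this (values are assumptions to adapt to your site; newer Snakemake versions replace this mechanism with profiles):
# cluster.yaml
__default__:
  time: "01:00:00"
  mem: "4G"
  cpus: 1
fastqc:
  time: "00:30:00"
  mem: "2G"
  cpus: 2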
# Check job details
scontrol show job JOBID
# Increase memory in SLURM script
#SBATCH --mem=64GB
# Use memory-efficient tools
# samtools sort -m 2G instead of default
# Increase time limit
#SBATCH --time=12:00:00
# Break large jobs into smaller chunks
# Use job arrays for parallel processing
# Check quota
quota -u $USER
# Clean up temporary files
find /scratch/$USER -name "*.tmp" -delete
# Move results to project space
rsync -av /scratch/$USER/important_results/ /project/groupname/
# Module not found
module spider software_name
# Conflicting modules
module purge
module load required_module
# Version conflicts
module list
module unload conflicting_module
# Fix file permissions
chmod 755 script.sh
chmod 644 *.fastq.gz
# Group permissions for shared data
chgrp -R groupname /project/groupname/shared_data/
chmod -R g+rw /project/groupname/shared_data/
# Use local storage for temporary files
export TMPDIR=/tmp/$SLURM_JOB_ID
mkdir -p $TMPDIR
# Parallel compression/decompression
pigz -p 8 -d *.gz
pbzip2 -p8 large_file.bz2
# Stream processing instead of loading entire files
zcat large_file.fastq.gz | head -1000000 | analysis_tool
# Use indexed, region-based access instead of reading whole files
samtools view -T reference.fa alignment.bam region
# Check system documentation
man sbatch
man squeue
man module
# HPC-specific help
# Most clusters have documentation at:
# https://hpc.institution.edu/docs/
This tutorial provides a foundation for using HPC systems running Rocky Linux for NGS data analysis: connect over SSH, stage data in scratch space, manage software with environment modules or conda, submit and monitor work through SLURM, parallelize with job arrays or workflow managers, and move finished results off scratch when you are done.
This tutorial is designed to be a living document. As HPC technologies and bioinformatics tools evolve, update the content accordingly. Always refer to your institution's specific HPC documentation for system-specific details.