Example Scheduler Submission Scripts

Below are example submission scripts used to configure and launch libEnsemble on a variety of high-performance computing (HPC) systems. See the platform guides in the documentation for more information about the respective systems and their configuration.

General Examples

Slurm - Basic
/examples/libE_submission_scripts/submit_slurm_simple.sh
#!/bin/bash
#SBATCH -J libE_simple
#SBATCH -A <myproject>
#SBATCH -p <partition_name>
#SBATCH -C <constraint_name>
#SBATCH --time 10
#SBATCH --nodes 2

# Usually either -p or -C above is used.

# On some SLURM configurations, these ensure runs can share nodes
export SLURM_EXACT=1
export SLURM_MEM_PER_NODE=0

python libe_calling_script.py -n 8
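The -n 8 option above is consumed by the calling script, which uses it to set the number of libEnsemble workers (for example via libEnsemble's parse_args). The following is a minimal sketch of such a calling script, using libEnsemble's bundled six_hump_camel and uniform_random_sample functions as stand-ins for a real simulation and generator; the actual libe_calling_script.py referenced above may differ.

# Minimal calling-script sketch: -n / --nworkers is read by parse_args().
from libensemble.libE import libE
from libensemble.tools import parse_args, add_unique_random_streams
from libensemble.sim_funcs.six_hump_camel import six_hump_camel    # sample sim
from libensemble.gen_funcs.sampling import uniform_random_sample   # sample gen

nworkers, is_manager, libE_specs, _ = parse_args()  # picks up -n 8, --comms, etc.

sim_specs = {"sim_f": six_hump_camel, "in": ["x"], "out": [("f", float)]}
gen_specs = {
    "gen_f": uniform_random_sample,
    "out": [("x", float, (2,))],
    "user": {"gen_batch_size": 100, "lb": [-3, -2], "ub": [3, 2]},
}

persis_info = add_unique_random_streams({}, nworkers + 1)  # one RNG stream per worker
exit_criteria = {"sim_max": 200}

H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria,
                            persis_info=persis_info, libE_specs=libE_specs)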

PBS - Basic
/examples/libE_submission_scripts/submit_pbs_simple.sh
#!/bin/bash -l
#PBS -l select=2
#PBS -l walltime=00:15:00
#PBS -q <queue_name>
#PBS -A <myproject>

# We selected 2 nodes - now running with 8 workers.
export MPICH_GPU_SUPPORT_ENABLED=1
cd $PBS_O_WORKDIR
python libE_calling_script.py -n 8

LSF - Basic
/examples/libE_submission_scripts/submit_lsf_simple.sh
#!/bin/bash -l
#BSUB -P <project code>
#BSUB -J libe_mproc
#BSUB -W 15
#BSUB -nnodes 2

python run_libe_forces.py -n 8

System Examples

Aurora
/examples/libE_submission_scripts/submit_pbs_aurora.sh
#!/bin/bash -l
#PBS -l select=2
#PBS -l walltime=00:30:00
#PBS -q <myqueue>
#PBS -A <myproject>

module load frameworks

export MPICH_GPU_SUPPORT_ENABLED=1
cd $PBS_O_WORKDIR

# 2 nodes - 12 sim workers (6 GPUs per node)
python libE_calling_script.py -n 13

# If using libE_specs["use_tiles_as_gpus"] = True:
# 2 nodes - 24 sim workers (12 GPU tiles per node)
# python libE_calling_script.py -n 25
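The worker counts in these comments are matched on the Python side by libE_specs settings in the calling script. A brief sketch follows; the option name is taken from the comment above, and everything else is illustrative.

# Sketch: counting each Aurora GPU tile as a schedulable GPU.
from libensemble.tools import parse_args

nworkers, is_manager, libE_specs, _ = parse_args()  # e.g. -n 25

# With 12 GPU tiles per node treated as individual GPUs, 2 nodes can
# support 24 concurrent single-tile simulations (plus one gen worker).
libE_specs["use_tiles_as_gpus"] = True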

Frontier (Large WarpX Ensemble)
/examples/libE_submission_scripts/submit_frontier_large.sh
#!/bin/bash
#SBATCH -J libE_warpX_full_sim_32x40
#SBATCH -A <myproject>
#SBATCH -p batch
#SBATCH --time 6:00:00
#SBATCH --nodes 240

module load cray-python

# Run one gen worker and 40 sim workers (each sim worker gets 6 nodes = 48 GPUs)
python run_gpcam_warpx.py -n 41
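Each of the 40 simulation workers in this run drives a multi-node WarpX simulation through libEnsemble's MPI executor. The sketch below shows roughly what the executor call inside a simulation function could look like; the app name, input file, and resource counts are illustrative rather than taken from run_gpcam_warpx.py.

# Illustrative executor call from inside a simulation function on Frontier,
# where each sim worker owns 6 nodes (8 GPUs per node = 48 GPUs).
from libensemble.executors.executor import Executor

def run_warpx_sim(H, persis_info, sim_specs, libE_info):
    exctr = Executor.executor            # executor created in the calling script
    task = exctr.submit(
        app_name="warpx",                # registered via exctr.register_app(...)
        num_nodes=6,                     # nodes assigned to this worker
        procs_per_node=8,                # one MPI rank per GPU
        app_args="warpx_inputs",         # placeholder input deck
    )
    task.wait()                          # block until the WarpX run finishes
    # ... read simulation output and return the local history array as usual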

Perlmutter
/examples/libE_submission_scripts/submit_perlmutter.sh
#!/bin/bash
#SBATCH -J libE_small_test
#SBATCH -A <myproject>
#SBATCH -C gpu
#SBATCH --time 10
#SBATCH --nodes 1

# This script requests GPU nodes (-C gpu)
export MPICH_GPU_SUPPORT_ENABLED=1
export SLURM_EXACT=1

# One worker for generator and 4 for sims (one GPU each)
python libe_calling_script.py -n 5

# Or, if using the libE_specs option gen_on_manager=True:
python libe_calling_script.py -n 4
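The two launch lines differ only in where the generator runs. A brief sketch of the calling-script side is below; the option name comes from the comment above, and the rest is illustrative.

# Sketch: run the generator on the manager so every worker does simulations.
from libensemble.tools import parse_args

nworkers, is_manager, libE_specs, _ = parse_args()  # e.g. -n 4

libE_specs["gen_on_manager"] = True  # manager hosts the generator
# All 4 workers are now free for simulations (one per GPU on the node),
# so the job needs only -n 4 instead of -n 5.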

Polaris
/examples/libE_submission_scripts/submit_pbs_polaris.sh
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l walltime=00:15:00
#PBS -l filesystems=home:grand
#PBS -q debug
#PBS -A <myproject>

export MPICH_GPU_SUPPORT_ENABLED=1
cd $PBS_O_WORKDIR
python libE_calling_script.py -n 4
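On Polaris, each of the 4 workers typically drives one of the node's 4 GPUs. In recent libEnsemble releases the executor can assign GPUs to launched applications directly; a rough sketch is below, where the app name, arguments, and one-GPU-per-sim split are illustrative.

# Illustrative single-GPU submission from a simulation function on Polaris.
from libensemble.executors.executor import Executor

def run_gpu_sim(H, persis_info, sim_specs, libE_info):
    exctr = Executor.executor
    task = exctr.submit(app_name="my_gpu_app",   # placeholder registered app
                        num_procs=1,
                        num_gpus=1,              # libEnsemble assigns the GPU
                        app_args="input.txt")    # placeholder arguments
    task.wait()
    # ... process output and return results as usual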

Bridges - Central Mode
/examples/libE_submission_scripts/bridges_submit_slurm_central.sh
#!/bin/bash
#SBATCH -J libE_test_central
#SBATCH -N 5
#SBATCH -p RM
#SBATCH -A <my_project>
#SBATCH -o tlib.%j.%N.out
#SBATCH -e tlib.%j.%N.error
#SBATCH -t 00:30:00

# Launch script for running in central mode with mpi4py.
#   libEnsemble will run on a dedicated node (or nodes).
#   The remaining nodes in the allocation are dedicated to worker-launched apps.
#   Initialize the Executor with auto_resources=True and central_mode=True.

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4

mpirun -np $(($NUM_WORKERS+1)) -ppn $(($NUM_WORKERS+1)) python $EXE

# To use local mode instead of mpi4py (with parse_args())
# python $EXE -n $NUM_WORKERS
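On the Python side, central mode keeps launched applications off the node(s) running libEnsemble. A rough sketch of the corresponding setup is below. The auto_resources/central_mode names in the script's comment reflect an older executor interface; current releases request the same behavior with libE_specs["dedicated_mode"] = True, which is what the sketch uses. The application path is a placeholder.

# Sketch of the calling-script setup referred to above (current-style options).
from libensemble.executors.mpi_executor import MPIExecutor
from libensemble.tools import parse_args

nworkers, is_manager, libE_specs, _ = parse_args()
libE_specs["dedicated_mode"] = True  # do not run apps on libEnsemble's own node(s)

exctr = MPIExecutor()                # node/core resources detected automatically
exctr.register_app(full_path="/path/to/my_app", app_name="my_app")  # placeholder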

Bebop - Central Mode
/examples/libE_submission_scripts/bebop_submit_slurm_central.sh
#!/bin/bash
#SBATCH -J libE_test_central
#SBATCH -N 5
#SBATCH -p knlall
#SBATCH -A <my_project>
#SBATCH -o tlib.%j.%N.out
#SBATCH -e tlib.%j.%N.error
#SBATCH -t 01:00:00

# Launch script for running in central mode with mpi4py.
#   libEnsemble will run on a dedicated node (or nodes).
#   The remaining nodes in the allocation are dedicated to worker-launched apps.
#   Use the Executor with auto_resources=True and central_mode=True.

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4
export I_MPI_FABRICS=shm:tmi

# Overcommit allows ntasks up to the number of hardware contexts on one node (e.g., 320 on Bebop)
srun --overcommit --ntasks=$(($NUM_WORKERS+1)) --nodes=1 python $EXE

# To use local mode instead of mpi4py (with parse_args())
# python $EXE -n $NUM_WORKERS

Bebop - Distributed Mode
/examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh
#!/bin/bash -l
#PBS -l select=2:mpiprocs=16
#PBS -l walltime=00:15:00
#PBS -q bdwall
#PBS -A [project]
#PBS -N libE_example

cd $PBS_O_WORKDIR
module load openmpi

mpirun -n 16 -npernode 8 python run_libe_example.py
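In distributed mode, libEnsemble itself is the MPI job: rank 0 becomes the manager and the remaining ranks become workers, spread across the nodes so that each worker can launch applications locally. A minimal sketch of the matching calling-script settings follows, with sim/gen specs omitted as placeholders.

# Sketch: selecting mpi4py communications for a distributed run.
from libensemble.libE import libE
from libensemble.tools import parse_args

nworkers, is_manager, libE_specs, _ = parse_args()
libE_specs["comms"] = "mpi"  # manager/workers are the mpirun-launched ranks

# sim_specs, gen_specs, and exit_criteria are defined as in any calling script:
# H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria,
#                             libE_specs=libE_specs)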

Summit (Decommissioned) - On Launch Nodes with Multiprocessing
/examples/libE_submission_scripts/summit_submit_mproc.sh
#!/bin/bash -x
#BSUB -P <project code>
#BSUB -J libe_mproc
#BSUB -W 30
#BSUB -nnodes 4
#BSUB -alloc_flags "smt1"

# Script to run libEnsemble using multiprocessing on launch nodes.
# Assumes Conda environment is set up.

# To be run with central job management
# - Manager and workers run on launch node.
# - Workers submit tasks to the compute nodes in the allocation.

# Name of calling script
export EXE=libE_calling_script.py

# Communication Method
export COMMS="--comms local"

# Number of workers.
export NWORKERS="--nworkers 4"

# Wall-clock limit for libE in minutes (allows a clean shutdown before the batch limit)
export LIBE_WALLCLOCK=25  # Only used if passed to the calling script

# Name of Conda environment
export CONDA_ENV_NAME=<conda_env_name>

# Need these if not already loaded
# module load python
# module load gcc/4.8.5

# Activate conda environment
export PYTHONNOUSERSITE=1
. activate $CONDA_ENV_NAME

# hash -d python  # Check python is picked up from the conda env
hash -r  # Clear any hashed commands (pip/python ...) so the conda env versions are used

# Launch libE
# python $EXE > out.txt 2>&1  # No args - all options defined in the calling script
# python $EXE $COMMS $NWORKERS > out.txt 2>&1  # If calling script is using parse_args()
python $EXE $LIBE_WALLCLOCK $COMMS $NWORKERS > out.txt 2>&1 # If calling script takes wall-clock as positional arg.
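The final launch line passes LIBE_WALLCLOCK as a positional argument so the calling script can stop libEnsemble before the batch allocation expires. Below is a hedged sketch of how a calling script might use it; the argument handling and shutdown margin are illustrative, and older libEnsemble releases name the exit criterion elapsed_wallclock_time instead of wallclock_max.

# Illustrative handling of the positional wall-clock argument (in minutes).
import sys

libE_wallclock = float(sys.argv[1]) if len(sys.argv) > 1 else 25

# Stop a few minutes early so workers and launched tasks can shut down cleanly.
exit_criteria = {
    "sim_max": 1000,                             # placeholder cap on simulations
    "wallclock_max": (libE_wallclock - 3) * 60,  # seconds
}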

Cobalt - Intermediate Node with Multiprocessing