diff --git a/2026/HPSF/README.md b/2026/HPSF/README.md
new file mode 100644
index 00000000..2dc31fba
--- /dev/null
+++ b/2026/HPSF/README.md
@@ -0,0 +1,71 @@
+# Flux Jupyter Tutorial
+
+This set of tutorials provides:
+
+ - [Building Tutorial Images](#build-images)
+ - [Introduction to Flux](#1-introduction-to-flux)
+ - [The Flux Operator with Agents](#2-the-flux-operator-with-agents)
+ - [Kubeflow and Flux](#3-kubeflow-and-flux)
+
+Prerequisites:
+
+ - Docker client installed locally
+ - Excitement to learn about Flux!
+
+## Build Images
+
+Build the tutorial images:
+
+```bash
+docker build -f ./docker/Dockerfile -t hpsf-flux .
+docker build -f ./docker/Dockerfile.ml -t hpsf-flux-ml .
+docker build -f ./docker/Dockerfile.flux-operator -t ghcr.io/flux-framework/tutorials:flux-operator-pytorch .
+```
+
+## 1. Introduction to Flux
+
+The introduction to Flux section is a notebook, so you can simply run the image:
+
+```bash
+docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --name jupyterhub -p 8888:8888 hpsf-flux
+```
+
+## 2. The Flux Operator with Agents
+
+The Flux Operator and MCP example runs in a Flux Framework MiniCluster. Either use a Kubernetes cluster you already have (e.g., in the cloud), or create one locally with kind:
+
+```bash
+kind create cluster --config ./flux-operator/kind-config.yaml
+kind load docker-image ghcr.io/flux-framework/tutorials:flux-operator-pytorch
+```
+
+Install the Flux Operator:
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/flux-framework/flux-operator/refs/heads/main/examples/dist/flux-operator.yaml
+```
+
+Create the MiniCluster, wait until both ranks are running, then shell inside and connect to the lead broker:
+
+```bash
+kubectl apply -f ./flux-operator/minicluster.yaml
+kubectl get pods --watch
+kubectl exec -it pytorch-0-xxxx -- bash
+. /mnt/flux/flux-view.sh
+flux proxy $fluxsocket bash
+flux resource list
+```
+
+Now let's run PyTorch, without agents. There is a simple "hello world" and a training example with CIFAR-10:
+
+```bash
+flux run -N 2 --exclusive python distributed_flux_hello_world.py
+flux run -N 2 --exclusive python distributed_flux.py
+```
+
+We could also pass `-n` to set the total number of tasks (processes), but `--exclusive` will ask for all of the cores on the allocated nodes.
+
+
+## 3. Kubeflow and Flux
+
+The Kubeflow and Flux example can use the same cluster, but we add Kubeflow.
+ +**TODO** + diff --git a/2026/HPSF/docker/Dockerfile b/2026/HPSF/docker/Dockerfile new file mode 100644 index 00000000..0224f596 --- /dev/null +++ b/2026/HPSF/docker/Dockerfile @@ -0,0 +1,104 @@ +FROM fluxrm/flux-sched:noble + +# Based off of https://github.com/jupyterhub/zero-to-jupyterhub-k8s/tree/main/images/singleuser-sample +# Local usage +# docker run -p 8888:8888 -v $(pwd):/home/jovyan/work test + +USER root + +ENV NB_USER=jovyan \ + NB_UID=1000 \ + HOME=/home/jovyan + +RUN adduser \ + --disabled-password \ + --gecos "Default user" \ + --uid ${NB_UID} \ + --home ${HOME} \ + --force-badname \ + ${NB_USER} + +RUN apt-get update \ + # && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + gcc-10 \ + g++-10 \ + ca-certificates \ + dnsutils \ + iputils-ping \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + openmpi-bin \ + openmpi-common \ + libopenmpi-dev \ + liblz4-dev \ + tini \ + nodejs \ + python3-greenlet \ + # requirement for nbgitpuller + git \ + && rm -rf /var/lib/apt/lists/* + +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +COPY ./docker/requirements_venv.txt ./requirements_venv.txt +RUN python3 -m venv $VIRTUAL_ENV && pip install --no-cache-dir -r requirements_venv.txt +COPY ./docker/requirements.txt ./requirements.txt + +RUN pip install ruamel.yaml.clib && \ + pip install -r requirements.txt && \ + pip install ipykernel pycurl IPython && \ + python -m IPython kernel install + +RUN wget https://nodejs.org/dist/v20.15.0/node-v20.15.0-linux-x64.tar.xz && \ + apt-get update && apt-get install -y xz-utils && rm -rf /var/lib/apt/lists/* && \ + xz -d -v node-v20.15.0-linux-x64.tar.xz && \ + tar -C /usr/local --strip-components=1 -xvf node-v20.15.0-linux-x64.tar + +# This customizes the launcher UI +# https://jupyter-app-launcher.readthedocs.io/en/latest/usage.html +RUN pip install jupyter_app_launcher && \ + pip install --upgrade jupyter-server && \ + pip install jupyter-launcher-shortcuts && \ + pip install jupyterhub_idle_culler && \ + mkdir -p /usr/local/share/jupyter/lab/jupyter_app_launcher && \ + pip install jupyterhub && \ + pip install ipywidgets + +# This customizes the launcher UI +# https://jupyter-app-launcher.readthedocs.io/en/latest/usage.html +RUN pip install jupyter_app_launcher && \ + pip install --upgrade jupyter-server && \ + pip install jupyter-launcher-shortcuts && \ + mkdir -p /usr/local/share/jupyter/lab/jupyter_app_launcher + +COPY ./tutorial /home/jovyan/ +COPY ./docker/jupyter-launcher.yaml /usr/local/share/jupyter/lab/jupyter_app_launcher/jp_app_launcher.yaml +ENV JUPYTER_APP_LAUNCHER_PATH=/usr/local/share/jupyter/lab/jupyter_app_launcher/ + +# Give jovyan user permissions to tutorial materials +RUN chmod -R 777 ~/ /home/jovyan + +WORKDIR $HOME +# Flux assets in the tutorial/ directory were moved to tutorial/img/ +# this shouldn't be a problem for the next command but making a note in case +COPY ./docker/flux-icon.png $HOME/flux-icon.png + +# note that previous examples are added via git volume in config.yaml +ENV SHELL=/usr/bin/bash +ENV FLUX_URI_RESOLVE_LOCAL=t + +EXPOSE 8888 +ENTRYPOINT ["tini", "--"] + +# This is for JupyterHub +COPY ./docker/entrypoint.sh /entrypoint.sh +RUN mkdir -p $HOME/.local/share && \ + chmod 777 $HOME/.local/share + +USER ${NB_USER} + +CMD ["flux", "start", "--test-size=4", "jupyter", "lab", "--ip=0.0.0.0"] diff --git a/2026/HPSF/docker/Dockerfile.flux-operator b/2026/HPSF/docker/Dockerfile.flux-operator new file mode 100644 index 00000000..8351027a --- /dev/null +++ 
b/2026/HPSF/docker/Dockerfile.flux-operator @@ -0,0 +1,14 @@ +FROM nvcr.io/nvidia/pytorch:24.02-py3 +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + wget \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /code +COPY ./flux-operator/requirements.txt /code/requirements.txt +RUN pip install flux-mcp +COPY ./flux-operator/distributed_flux.py /code/distributed_flux.py +ENV LD_LIBRARY_PATH=/opt/hpcx/ompi/lib:/opt/hpcx/mxm/lib:/opt/hpcx/ucx/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/hpcx/ompi/bin:$PATH +ENV SHELL=/usr/bin/bash diff --git a/2026/HPSF/docker/entrypoint.sh b/2026/HPSF/docker/entrypoint.sh new file mode 100755 index 00000000..8b11568e --- /dev/null +++ b/2026/HPSF/docker/entrypoint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +/usr/bin/flux start --test-size=4 /usr/local/bin/jupyterhub-singleuser \ No newline at end of file diff --git a/2026/HPSF/docker/flux-icon.png b/2026/HPSF/docker/flux-icon.png new file mode 100644 index 00000000..d50aa52d Binary files /dev/null and b/2026/HPSF/docker/flux-icon.png differ diff --git a/2026/HPSF/docker/jupyter-launcher.yaml b/2026/HPSF/docker/jupyter-launcher.yaml new file mode 100644 index 00000000..ebcd04ee --- /dev/null +++ b/2026/HPSF/docker/jupyter-launcher.yaml @@ -0,0 +1,100 @@ +- title: Flux Tutorial Notebook + description: This is the main Flux Framework Tutorial + type: jupyterlab-commands + icon: ./static/flux-icon.png + source: + - label: Flux Tutorial + id: 'filebrowser:open-path' + args: + path: ./tutorial/ch1/01_flux_tutorial.ipynb + icon: ./static/flux-icon.png + catalog: Notebook + +- title: Flux Python SDK Tutorial + description: This notebook will teach you about the Flux Python SDK + type: jupyterlab-commands + icon: ./static/flux-icon.png + source: + - label: Flux Tutorial + id: 'filebrowser:open-path' + args: + path: ./tutorial/ch2/02_flux_framework.ipynb + icon: ./static/flux-icon.png + catalog: Notebook + +- title: Flux Process, Monitoring, Utilities Tutorial + description: This will teach you about Flux utilities + type: jupyterlab-commands + icon: ./static/flux-icon.png + source: + - label: Flux Tutorial + id: 'filebrowser:open-path' + args: + path: ./tutorial/ch3/03_flux_tutorial.ipynb + icon: ./static/flux-icon.png + catalog: Notebook + +- title: Flux Framework and User-space Kubernetes + description: This will teach you about running AI/ML workloads with Flux and Kubernetes + type: jupyterlab-commands + icon: ./static/flux-icon.png + source: + - label: Flux Tutorial + id: 'filebrowser:open-path' + args: + path: ./tutorial/ch4/04_flux_framework_usernetes.ipynb + icon: ./static/flux-icon.png + catalog: Notebook + +- title: Flux Framework Portal + description: Flux Framework portal for projects, releases, and publication. 
+ source: https://flux-framework.org/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Documentation + source: https://flux-framework.readthedocs.io/en/latest/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Cheat Sheet + source: https://flux-framework.org/cheat-sheet/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Glossary of Terms + source: https://flux-framework.readthedocs.io/en/latest/glossary.html + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Comics + source: https://flux-framework.readthedocs.io/en/latest/comics/fluxonomicon.html + description: come and meet FluxBird - the pink bird who knows things! + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Learning Guide + source: https://flux-framework.readthedocs.io/en/latest/guides/learning_guide.html + description: learn about what Flux does, how it works, and real research applications + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Getting Started with Flux and Go + source: https://converged-computing.github.io/flux-go + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Getting Started with Flux in C + source: https://converged-computing.github.io/flux-c-examples/ + description: ...looking for contributors! 
+ type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] diff --git a/2026/HPSF/docker/requirements.txt b/2026/HPSF/docker/requirements.txt new file mode 100644 index 00000000..bf086b6c --- /dev/null +++ b/2026/HPSF/docker/requirements.txt @@ -0,0 +1,339 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# Use the "Run workflow" button at https://github.com/jupyterhub/zero-to-jupyterhub-k8s/actions/workflows/watch-dependencies.yaml +# +alembic==1.11.3 + # via jupyterhub +anyio==3.7.1 + # via jupyter-server +argon2-cffi==23.1.0 + # via + # jupyter-server + # nbclassic +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +arrow==1.2.3 + # via isoduration +asttokens==2.2.1 + # via stack-data +async-generator==1.10 + # via jupyterhub +async-lru==2.0.4 + # via jupyterlab +attrs==23.1.0 + # via + # jsonschema + # referencing +babel==2.12.1 + # via jupyterlab-server +backcall==0.2.0 + # via ipython +beautifulsoup4==4.12.2 + # via nbconvert +bleach==6.0.0 + # via nbconvert +certifi==2023.7.22 + # via requests +certipy==0.1.3 + # via jupyterhub +cffi==1.15.1 + # via + # argon2-cffi-bindings + # cryptography +charset-normalizer==3.2.0 + # via requests +comm==0.1.4 + # via ipykernel +cryptography==41.0.3 + # via pyopenssl +debugpy==1.6.7.post1 + # via ipykernel +decorator==5.1.1 + # via ipython +defusedxml==0.7.1 + # via nbconvert +executing==1.2.0 + # via stack-data +fastjsonschema==2.18.0 + # via nbformat +fqdn==1.5.1 + # via jsonschema +greenlet + # via sqlalchemy +idna==3.4 + # via + # anyio + # jsonschema + # requests +ipykernel==6.25.1 + # via + # jupyterlab + # nbclassic +ipython==8.13.0 + # via ipykernel +ipython-genutils==0.2.0 + # via nbclassic +isoduration==20.11.0 + # via jsonschema +jedi==0.19.0 + # via ipython +jinja2==3.1.2 + # via + # jupyter-server + # jupyterhub + # jupyterlab + # jupyterlab-server + # nbclassic + # nbconvert +json5==0.9.14 + # via jupyterlab-server +jsonpointer==2.4 + # via jsonschema +jsonschema[format-nongpl]==4.19.0 + # via + # jupyter-events + # jupyter-telemetry + # jupyterlab-server + # nbformat +jsonschema-specifications==2023.7.1 + # via jsonschema +jupyter-client==8.3.0 + # via + # ipykernel + # jupyter-server + # nbclassic + # nbclient +jupyter-core==5.3.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # nbclassic + # nbclient + # nbconvert + # nbformat +jupyter-events==0.7.0 + # via jupyter-server +jupyter-lsp==2.2.0 + # via jupyterlab +jupyter-server==2.7.2 + # via + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # nbclassic + # nbgitpuller + # notebook-shim +jupyter-server-terminals==0.4.4 + # via jupyter-server +jupyter-telemetry==0.1.0 + # via jupyterhub +jupyterhub==4.0.2 + # via -r requirements.in +jupyterlab==4.0.5 + # via -r requirements.in +jupyterlab-pygments==0.2.2 + # via nbconvert +jupyterlab-server==2.24.0 + # via jupyterlab +mako==1.2.4 + # via alembic +markupsafe==2.1.3 + # via + # jinja2 + # mako + # nbconvert +matplotlib-inline==0.1.6 + # via + # ipykernel + # ipython +mistune==3.0.1 + # via nbconvert +nbclassic==1.0.0 + # via -r requirements.in +nbclient==0.8.0 + # via nbconvert +nbconvert==7.7.4 + # via + # jupyter-server + # nbclassic +nbformat==5.9.2 + # via + # jupyter-server + # nbclassic + # nbclient + # nbconvert +nbgitpuller==1.2.0 + # via -r requirements.in +nest-asyncio==1.5.7 + # via + # ipykernel + # nbclassic +notebook-shim==0.2.3 + # via + # 
jupyterlab + # nbclassic +oauthlib==3.2.2 + # via jupyterhub +overrides==7.4.0 + # via jupyter-server +packaging==23.1 + # via + # ipykernel + # jupyter-server + # jupyterhub + # jupyterlab + # jupyterlab-server + # nbconvert +pamela==1.1.0 + # via jupyterhub +pandocfilters==1.5.0 + # via nbconvert +parso==0.8.3 + # via jedi +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +platformdirs==3.10.0 + # via jupyter-core +prometheus-client==0.17.1 + # via + # jupyter-server + # jupyterhub + # nbclassic +prompt-toolkit==3.0.39 + # via ipython +psutil==5.9.5 + # via ipykernel +ptyprocess==0.7.0 + # via + # pexpect + # terminado +pure-eval==0.2.2 + # via stack-data +pycparser==2.21 + # via cffi +pygments==2.16.1 + # via + # ipython + # nbconvert +pyopenssl==23.2.0 + # via certipy +python-dateutil==2.8.2 + # via + # arrow + # jupyter-client + # jupyterhub +python-json-logger==2.0.7 + # via + # jupyter-events + # jupyter-telemetry +pyyaml==6.0.1 + # via jupyter-events +pyzmq==25.1.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # nbclassic +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +requests==2.31.0 + # via + # jupyterhub + # jupyterlab-server +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rpds-py==0.9.2 + # via + # jsonschema + # referencing +ruamel-yaml==0.17.32 + # via jupyter-telemetry +ruamel-yaml-clib + # via ruamel-yaml +send2trash==1.8.2 + # via + # jupyter-server + # nbclassic +six==1.16.0 + # via + # asttokens + # bleach + # python-dateutil + # rfc3339-validator +sniffio==1.3.0 + # via anyio +soupsieve==2.4.1 + # via beautifulsoup4 +sqlalchemy==2.0.20 + # via + # alembic + # jupyterhub +stack-data==0.6.2 + # via ipython +terminado==0.17.1 + # via + # jupyter-server + # jupyter-server-terminals + # nbclassic +tinycss2==1.2.1 + # via nbconvert +tornado==6.3.3 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterhub + # jupyterlab + # nbclassic + # nbgitpuller + # terminado +traitlets==5.9.0 + # via + # comm + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # jupyter-events + # jupyter-server + # jupyter-telemetry + # jupyterhub + # jupyterlab + # matplotlib-inline + # nbclassic + # nbclient + # nbconvert + # nbformat +typing-extensions==4.7.1 + # via + # alembic + # sqlalchemy +uri-template==1.3.0 + # via jsonschema +urllib3==2.0.4 + # via requests +wcwidth==0.2.6 + # via prompt-toolkit +webcolors==1.13 + # via jsonschema +webencodings==0.5.1 + # via + # bleach + # tinycss2 +websocket-client==1.6.1 + # via jupyter-server \ No newline at end of file diff --git a/2026/HPSF/docker/requirements_venv.txt b/2026/HPSF/docker/requirements_venv.txt new file mode 100644 index 00000000..04bf0f62 --- /dev/null +++ b/2026/HPSF/docker/requirements_venv.txt @@ -0,0 +1,8 @@ +# Used for the DYAD notebook +Pygments +build +ipykernel +jsonschema +cffi +ply +pyyaml diff --git a/2026/HPSF/flux-operator/distributed_flux.py b/2026/HPSF/flux-operator/distributed_flux.py new file mode 100644 index 00000000..d2fac2a1 --- /dev/null +++ b/2026/HPSF/flux-operator/distributed_flux.py @@ -0,0 +1,97 @@ +import os +import torch +import torch.nn as nn +import torch.optim as optim +import torch.distributed as dist +import torchvision +import torchvision.transforms as transforms +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler +from torchvision.models 
import resnet18 + + +def setup(): + dist.init_process_group(backend="mpi") + rank = dist.get_rank() + + if torch.cuda.is_available(): + # Map local rank to specific GPU + local_rank = int(os.environ.get("FLUX_TASK_LOCAL_ID", 0)) + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + else: + device = torch.device("cpu") + + return rank, device + + +def train(): + rank, device = setup() + world_size = dist.get_world_size() + + if rank == 0: + print(f"Starting training on {world_size} nodes...") + + # 1. Data Prep (Standard CIFAR-10 transforms) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + ) + + # Download dataset (only Rank 0 downloads, others wait) + if rank == 0: + torchvision.datasets.CIFAR10(root="./data", train=True, download=True) + + # Sync all ranks + dist.barrier() + + trainset = torchvision.datasets.CIFAR10( + root="./data", train=True, download=True, transform=transform + ) + + # DistributedSampler partitions the data so each rank gets 1/N of the images + sampler = DistributedSampler(trainset, num_replicas=world_size, rank=rank) + trainloader = torch.utils.data.DataLoader( + trainset, batch_size=32, sampler=sampler, num_workers=2 + ) + + # Model Prep... + model = resnet18(num_classes=10).to(device) + + # Wrap model in DDP + # For MPI backend on CPU, we use the model as is. + # For GPU, DDP handles the internal gradient bucketing. + model = DDP(model, device_ids=[device.index] if device.type == "cuda" else None) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(model.parameters(), lr=0.001 * world_size, momentum=0.9) + + # Training Loop + model.train() + for epoch in range(2): # Just 2 epochs for demonstration + sampler.set_epoch(epoch) + running_loss = 0.0 + + for i, data in enumerate(trainloader, 0): + inputs, labels = data[0].to(device), data[1].to(device) + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + running_loss += loss.item() + if i % 100 == 99 and rank == 0: + print( + f"[Epoch {epoch + 1}, Batch {i + 1}] Loss: {running_loss / 100:.3f}" + ) + running_loss = 0.0 + + if rank == 0: + print("Finished Training.") + + dist.destroy_process_group() + + +if __name__ == "__main__": + train() diff --git a/2026/HPSF/flux-operator/distributed_flux_hello_world.py b/2026/HPSF/flux-operator/distributed_flux_hello_world.py new file mode 100644 index 00000000..71379d57 --- /dev/null +++ b/2026/HPSF/flux-operator/distributed_flux_hello_world.py @@ -0,0 +1,27 @@ +import os +import torch +import torch.distributed as dist + + +def run(): + # Don't set MASTER_ADDR or MASTER_PORT environment variables. + # Let MPI handle the handshake entirely + if not dist.is_initialized(): + dist.init_process_group(backend="mpi") + + rank = dist.get_rank() + world_size = dist.get_world_size() + print(f"Hello from Rank {rank}/{world_size}") + + # Dummy tensor for all-reduce + tensor = torch.ones(1) * (rank + 1) + dist.all_reduce(tensor) + + if rank == 0: + print(f"Success! Result: {tensor.item()}") + + dist.destroy_process_group() + + +if __name__ == "__main__": + run() diff --git a/2026/HPSF/flux-operator/kind-config.yaml b/2026/HPSF/flux-operator/kind-config.yaml new file mode 100755 index 00000000..5744a66b --- /dev/null +++ b/2026/HPSF/flux-operator/kind-config.yaml @@ -0,0 +1,7 @@ +# Run this from this directory! 
+# kind create cluster -f kind-config.yaml +apiVersion: kind.x-k8s.io/v1alpha4 +kind: Cluster +nodes: + - role: control-plane + - role: worker diff --git a/2026/HPSF/flux-operator/minicluster.yaml b/2026/HPSF/flux-operator/minicluster.yaml new file mode 100755 index 00000000..08f7f5c2 --- /dev/null +++ b/2026/HPSF/flux-operator/minicluster.yaml @@ -0,0 +1,12 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: pytorch +spec: + size: 2 + interactive: true + flux: + container: + image: ghcr.io/converged-computing/flux-view-ubuntu:tag-jammy + containers: + - image: ghcr.io/flux-framework/tutorials:flux-operator-pytorch diff --git a/2026/HPSF/flux-operator/requirements.txt b/2026/HPSF/flux-operator/requirements.txt new file mode 100644 index 00000000..6a7fbd9c --- /dev/null +++ b/2026/HPSF/flux-operator/requirements.txt @@ -0,0 +1 @@ +flux-mcp diff --git a/2026/HPSF/tutorial/assets/Flux-logo.svg b/2026/HPSF/tutorial/assets/Flux-logo.svg new file mode 100644 index 00000000..f2d126bb --- /dev/null +++ b/2026/HPSF/tutorial/assets/Flux-logo.svg @@ -0,0 +1 @@ +Flux-logo-3 \ No newline at end of file diff --git a/2026/HPSF/tutorial/assets/flux-icon.png b/2026/HPSF/tutorial/assets/flux-icon.png new file mode 100644 index 00000000..d50aa52d Binary files /dev/null and b/2026/HPSF/tutorial/assets/flux-icon.png differ diff --git a/2026/HPSF/tutorial/ch1/01_flux_tutorial.ipynb b/2026/HPSF/tutorial/ch1/01_flux_tutorial.ipynb new file mode 100644 index 00000000..59565fe3 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/01_flux_tutorial.ipynb @@ -0,0 +1,726 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2507d149-dcab-458a-a554-37388e0ee13a", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "40e867ba-f689-4301-bb60-9a448556bb84", + "metadata": { + "tags": [] + }, + "source": [ + "# Welcome to the Flux Tutorial\n", + "\n", + "> What is Flux Framework? πŸ€”οΈ\n", + " \n", + "Flux is a flexible framework for resource management, built for your site. The framework consists of a suite of projects, tools, and libraries that may be used to build site-custom resource managers for High Performance Computing centers and cloud environments. Flux is a next-generation resource manager and scheduler with many transformative capabilities like hierarchical scheduling and resource management (you can think of it as \"fractal scheduling\") and directed-graph based resource representations.\n", + "\n", + "## I'm ready! How do I do this tutorial? 😁️\n", + "\n", + "This tutorial is split into 3 chapters, each of which has a notebook. Let's first setup your workspace. Click on to create a Terminal. Drag it to be alongside this notebook so you have a terminal on the left alongside this text on the right (you should see both at once).\n", + "\n", + "When you are done, let's get started!" + ] + }, + { + "cell_type": "markdown", + "id": "15e82c38-8465-49ac-ae2b-b0bb56a79ec9", + "metadata": { + "tags": [] + }, + "source": [ + "\n", + "# Getting started with Flux\n", + "\n", + "The code and examples that this tutorial is based on can be found at [flux-framework/Tutorials](https://github.com/flux-framework/Tutorials/tree/master/2024-HPCIC-AWS). You can also find python examples in the `assets/flux-workflow-examples` directory from the sidebar navigation in this JupyterLab instance. " + ] + }, + { + "cell_type": "markdown", + "id": "ae33fef6-278c-4996-8534-fd15e548b338", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "Tip: Did you know you can get help for flux or a flux command? For example, try \"flux help\" and \"flux help jobs\"\n", + "
\n", + "\n", + "Try asking for help first across top level commands.\n", + "\n", + "```bash\n", + "flux help\n", + "```\n", + "\n", + "Now try for a specific command\n", + "\n", + "```bash\n", + "flux help jobs\n", + "```\n", + "The second will open a manpage, and you can press \"Q\" for quit to exit. Now let's get help for a specific subcommand. This works too:\n", + "\n", + "```bash\n", + "flux submit --help\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "ec052119", + "metadata": {}, + "source": [ + "## Flux Resources\n", + "\n", + "When you are interacting with Flux, you will commonly want to know what resources are available to you. Flux uses [hwloc](https://github.com/open-mpi/hwloc) to detect the resources on each node and then to populate its resource graph.\n", + "\n", + "You can access the topology information that Flux collects with the `flux resource` subcommand. Let's run `flux resource list` to see the resources available to us in this notebook.\n", + "\n", + "```bash\n", + "flux resource list\n", + " STATE NNODES NCORES NGPUS NODELIST\n", + " free 1 64 0 ip-10-0-25-10\n", + " allocated 0 0 0 \n", + " down 0 0 0 \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "0086e47e", + "metadata": {}, + "source": [ + "Flux can also bootstrap its resource graph based on static input files, like in the case of a multi-user system instance setup by site administrators. [More information on Flux's static resource configuration files](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/admin.html#configuration). Flux provides a more standard interface to listing available resources that works regardless of the resource input source: `flux resource`.\n", + "\n", + "```bash\n", + "# To view status of resources\n", + "flux resource status\n", + " STATE UP NNODES NODELIST\n", + " avail βœ” 1 ip-10-0-25-10\n", + "```\n", + "The output above may vary based on your resources. It might also be the case that you need to see queues. Here is how to do that:\n", + "\n", + "```bash\n", + "flux queue list\n", + " EN ST TDEFAULT TLIMIT NNODES NCORES NGPUS\n", + " βœ” βœ” inf inf 0-1 0-64 0-0\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "dee2d6af-43fa-490e-88e9-10f13e660125", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "\n", + "# Flux Commands \n", + "\n", + "Here are how Flux commands map to a scheduler you are likely familiar with, Slurm. A larger table with similar mappings for LSF, Moab, and Slurm can be [viewed here](https://hpc.llnl.gov/banks-jobs/running-jobs/batch-system-cross-reference-guides). For submitting jobs, you can use the `flux` `submit`, `run`, `bulksubmit`, `batch`, and `alloc` commands.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OperationSlurmFlux
One-off run of a single job (blocking)srunflux run
One-off run of a single job (interactive)srun --ptyflux run -o pty.interactive
One-off run of a single job (not blocking)NAflux submit
Bulk submission of jobs (not blocking)NAflux bulksubmit
Watching jobsNAflux watch
Querying the status of jobssqueue/scontrol show job job_idflux jobs/flux job info job_id
Canceling running jobsscancelflux cancel
Allocation for an interactive instancesallocflux alloc
Submitting batch jobssbatchflux batch
" + ] + }, + { + "cell_type": "markdown", + "id": "ac798095", + "metadata": {}, + "source": [ + "## flux run\n", + "\n", + "
\n", + "Description: Running a single job (blocking)\n", + "
\n", + "\n", + "The `flux run` command submits a job to Flux (similar to `flux submit`) but then attaches to the job with `flux job attach`, printing the job's stdout/stderr to the terminal and exiting with the same exit code as the job. It's basically doing an interactive submit, because you will be able to watch the output in your terminal, and it will block your terminal until the job completes.\n", + "\n", + "```bash\n", + "flux run sh -c 'sleep 5 && echo hello'\n", + "hello\n", + "```\n", + "\n", + "The output from the previous command is the hostname (a container ID string in this case). If the job exits with a non-zero exit code this will be reported by `flux job attach` (occurs implicitly with `flux run`). For example, execute the following:\n", + "\n", + "```bash\n", + "flux run /bin/false\n", + "flux-job: task(s) exited with exit code 1\n", + "```\n", + "A job submitted with `run` can be canceled with two rapid `Cltr-C`s in succession, or a user can detach from the job with `Ctrl-C Ctrl-Z`. The user can then re-attach to the job by using `flux job attach JOBID`. `flux submit` and `flux run` also support many other useful flags:\n", + "\n", + "```bash\n", + "flux run -N1 -n1 -c1 hostname\n", + "ip-10-0-25-10\n", + "```\n", + "\n", + "```bash\n", + "flux run -n4 --label-io --time-limit=5s --env-remove=LD_LIBRARY_PATH hostname\n", + "3: ip-10-0-25-10\n", + "2: ip-10-0-25-10\n", + "1: ip-10-0-25-10\n", + "0: ip-10-0-25-10\n", + "```\n", + "\n", + "To see all flags for a `flux run`:\n", + "\n", + "```bash\n", + "flux run --help\n", + "```\n", + "\n", + "Let's try compiling the Makefile in this directory.\n", + "\n", + "```bash\n", + "flux run make -j\n", + "mpicc -o hello hello.c\n", + "```\n", + "And run an exclusive job using all the processes on the node:\n", + "\n", + "```bash\n", + "flux run -N1 -n32 -c2 --exclusive ./hello\n", + "Ζ’2261FtDpK: completed MPI_Init in 2.171s. There are 32 tasks\n", + "Ζ’2261FtDpK: completed first barrier in 0.001s\n", + "Ζ’2261FtDpK: completed MPI_Finalize in 0.021s\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "7c09708a-74a1-4e61-b678-cb337b7df435", + "metadata": {}, + "source": [ + "## flux submit\n", + "\n", + "
\n", + "Description: Running a single job (not blocking)\n", + "
\n", + "\n", + "\n", + "The `flux submit` command submits a job to Flux and prints out the jobid. Let's peek at the help first.\n", + "\n", + "```bash\n", + "flux submit --help\n", + "```\n", + "\n", + "Let's test with a basic hostname.\n", + "\n", + "```bash\n", + "flux submit hostname\n", + "```\n", + "\n", + "But how does one get output? To quickly see output (which will block the terminal if the job is still running) after a submit, you can do:\n", + "\n", + "```bash\n", + "flux job attach $(flux job last)\n", + "```\n", + "\n", + "How does that work? The command `flux job last` provides the last job identifier.\n", + "\n", + "```bash\n", + "flux job last\n", + "```\n", + "\n", + "And when we attach to it, if the job is still running we will block the terminal. To provide a custom path to an output or error file, you can provide `--out` and `--err`, respectively. Let's try those both now. Now let's submit another one, and give it the same output and error file\n", + "\n", + "```bash\n", + "flux submit --out /tmp/harry-potter.txt --err /tmp/harry-potter.txt echo \"Yer a wizard, $(whoami)!\"\n", + "```\n", + "\n", + "Take a look!\n", + "\n", + "```bash\n", + "cat /tmp/harry-potter.txt\n", + "```\n", + "\n", + "The `submit` command supports common options like `--nnodes`, `--ntasks`, and `--cores-per-task`. There are short option equivalents (`-N`, `-n`, and `-c`, respectively) of these options as well. `--cores-per-task=1` is the default.\n", + "\n", + "```bash\n", + "flux submit -N1 -n2 sleep inf\n", + "```\n", + "\n", + "#### Some submission flags of note\n", + "\n", + "* `-N` specifies a number of nodes\n", + "* `-n` specifies a number of tasks for distributed applications, and cores for interactive allocations\n", + "* `-c` specifies a number of cores per task\n", + "* `--requires` constrains a job to run on a specific rank or hostname\n", + "* `--dependency` makes a job depend on another job\n", + "* `-cc` submits carbon copies of the same job many times\n", + "* `--out` and `--err` redirect output and error to files\n", + "\n", + "Let's work through some examples of these flags!" + ] + }, + { + "cell_type": "markdown", + "id": "91e9ed6c", + "metadata": {}, + "source": [ + "## flux bulksubmit\n", + "\n", + "
\n", + "Description: Submitting jobs in bulk (not blocking)\n", + "
\n", + "\n", + "The `flux bulksubmit` command enqueues jobs based on a set of inputs which are substituted on the command line, similar to `xargs` and the GNU `parallel` utility, except the jobs have access to the resources of an entire Flux instance instead of only the local system.\n", + "\n", + "```bash\n", + "flux bulksubmit --wait echo {} ::: harry ron hermione\n", + "Ζ’3Z7z2ZZ6o\n", + "Ζ’3Z7z2ZZ6p\n", + "Ζ’3Z7z2ZZ6q\n", + "```\n", + "\n", + "### carbon copy\n", + "\n", + "The `--cc` option (akin to \"carbon copy\") to `submit` makes repeated submission even easier via, `flux submit --cc=IDSET`:\n", + "\n", + "```bash\n", + "flux submit --watch --cc=1-4 echo {cc}\n", + "Ζ’24R3EgAAB\n", + "Ζ’24R3EgAAC\n", + "Ζ’24R3EgAAD\n", + "Ζ’24R3EgAAE\n", + "1\n", + "2\n", + "3\n", + "4\n", + "```\n", + "\n", + "Try this to add a progress bar and see the jobs/s rate report:\n", + "\n", + "```bash\n", + "flux submit --cc=1-100 --watch --progress --jps hostname\n", + "```\n", + "\n", + "Note that `--wait` is implied by `--watch`, meaning that when you are watching jobs, you are also waiting for them to finish. Here are some other carbon copy commands that are useful to try:\n", + "\n", + "```bash\n", + "# Use flux carbon copy to submit identical jobs with different inputs\n", + "flux submit --cc=\"1-10\" echo \"Hello I am job {cc}\"\n", + "\n", + "# Submits scripts myscript1.sh through myscript10.sh\n", + "flux submit --cc=0-6 flux-workflow-examples/bulksubmit/{cc}.sh\n", + "\n", + "# Bypass the key value store and write output to file with jobid\n", + "flux submit --cc=1-10 --output=job-{{id}}.out echo \"This is job {cc}\"\n", + "\n", + "# Use carbon copy to submit identical jobs with different inputs\n", + "flux bulksubmit --dry-run --cc={1} echo {0} ::: a b c ::: 0-1 0-3 0-7\n", + "```\n", + "\n", + "Of course, Flux can launch more than just single-node, single-core jobs. We can submit multiple heterogeneous jobs and Flux will co-schedule the jobs while also ensuring no oversubscription of resources (e.g., cores). Let's run the second example here, and add a clever trick to ask for output as we submit the jobs. This is a fun one, I promise!\n", + "\n", + "```bash\n", + "for jobid in $(flux submit --cc=0-7 /bin/bash bulksubmit/{cc}.sh); \n", + " do \n", + " flux job attach ${jobid}; \n", + " sleep 1; \n", + "done\n", + "```\n", + "\n", + "Note: in this tutorial, we cannot assume that the host you are running on has multiple cores, thus the examples below only vary the number of nodes per job. Varying the `cores-per-task` is also possible on Flux when the underlying hardware supports it (e.g., a multi-core node). Here are two examples.\n", + "\n", + "```bash\n", + "# Two nodes, 2 tasks, and each task has one core\n", + "flux submit --nodes=2 --ntasks=2 --cores-per-task=1 --job-name magic sleep inf\n", + "\n", + "# One node, one task, and each task has one core\n", + "flux submit --nodes=1 --ntasks=1 --cores-per-task=1 --job-name moremagic sleep inf\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "641f446c-b2e8-40d8-b6bd-eb6b9dba3c71", + "metadata": {}, + "source": [ + "## flux watch\n", + "\n", + "
\n", + "Description: πŸ‘€οΈ Watching jobs\n", + "
\n", + "\n", + "Wouldn't it be cool to submit a job and then watch it? Well, yeah! We can do this now with flux watch. Let's run a fun example, and then watch the output. We have sleeps in here interspersed with echos only to show you the live action! πŸ₯žοΈ\n", + "Also note a nice trick - you can always use `flux job last` to get the last JOBID.\n", + "Here is an example (not runnable, as notebooks don't support environment variables) for getting and saving a job id:\n", + "\n", + "```bash\n", + "flux submit hostname\n", + "JOBID=$(flux job last)\n", + "```\n", + "\n", + "And then you could use the variable `$JOBID` in your subsequent script or interactions with Flux! So what makes `flux watch` different from `flux job attach`? Aside from the fact that `flux watch` is read-only, `flux watch` can watch many (or even all (`flux watch --all`) jobs at once!\n", + "\n", + "```bash\n", + "flux submit ./job-watch.sh\n", + "flux watch $(flux job last)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "3f8c2af2", + "metadata": {}, + "source": [ + "## flux jobs\n", + "\n", + "
\n", + "Description: Querying the status of jobs\n", + "
\n", + "\n", + "We can now list the jobs in the queue with `flux jobs` and we should see both jobs that we just submitted. Jobs that are instances are colored blue in output, red jobs are failed jobs, and green jobs are those that completed successfully. Note that the JupyterLab notebook may not display these colors. You will be able to see them in the terminal.\n", + "\n", + "```bash\n", + "flux submit sleep inf\n", + "flux jobs\n", + "```\n", + "\n", + "You might also want to see all the jobs with `-a`:\n", + "\n", + "```bash\n", + "flux jobs -a\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "77ca4277", + "metadata": {}, + "source": [ + "## flux cancel\n", + "\n", + "
\n", + "Description: Canceling running jobs\n", + "
\n", + "\n", + "Since some of the jobs we see in the table above won't ever exit (and we didn't specify a timelimit), let's cancel them all now and free up the resources.\n", + "\n", + "```bash\n", + "# This was previously flux cancelall -f\n", + "flux cancel --all\n", + "flux jobs\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "2d3e314e-98eb-487a-ad8e-1442840e37d8", + "metadata": {}, + "source": [ + "## flux alloc\n", + "\n", + "
\n", + "Description: Allocation for an interactive instance\n", + "
\n", + "\n", + "You might want to request an allocation for a set of resources (an allocation) and then attach to them interactively. This is the goal of flux alloc. Try these commands in the terminal: \n", + "\n", + "```bash\n", + "# Look at the resources you have outside of the allocation\n", + "flux resource list\n", + "\n", + "# Request an allocation with 2 \"nodes\" - a subset of what you have in total\n", + "flux alloc -N 2\n", + "\n", + "# See the resources you are given\n", + "flux resource list\n", + "\n", + "# You can exit from the allocation like this!\n", + "exit\n", + "```\n", + "When you want to automate this, submitting work to an allocation, you would use `flux batch`." + ] + }, + { + "cell_type": "markdown", + "id": "544aa0a9", + "metadata": {}, + "source": [ + "## flux batch\n", + "\n", + "
\n", + "Description: Submitting batch jobs\n", + "
\n", + "\n", + "We can use the `flux batch` command to easily created nested flux instances. When `flux batch` is invoked, Flux will automatically create a nested instance that spans the resources allocated to the job, and then Flux runs the batch script passed to `flux batch` on rank 0 of the nested instance. \"Rank\" refers to the rank of the Tree-Based Overlay Network (TBON) used by the [Flux brokers](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-broker.html).\n", + "\n", + "While a batch script is expected to launch parallel jobs using `flux run` or `flux submit` at this level, nothing prevents the script from further batching other sub-batch-jobs using the `flux batch` interface, if desired. Take a quick look at [sleep_batch.sh](sleep_batch.sh) to see what we are about to run, then run:\n", + "\n", + "```bash\n", + "flux batch --nslots=2 --cores-per-slot=1 --nodes=2 ./sleep_batch.sh\n", + "flux batch --nslots=2 --cores-per-slot=1 --nodes=2 ./sleep_batch.sh\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "7f2b135c-ece7-45f7-b25d-dc90ba5f44f7", + "metadata": {}, + "source": [ + "### `flux job`\n", + "\n", + "Let's next inspect the last job we ran with `flux job info` and target the last job identifier with `flux job last`. \n", + "\n", + "```bash\n", + "# Note here we are using flux job last to see the last job id\n", + "# The \"R\" here asks for the resource spec\n", + "flux job info $(flux job last) R\n", + "\n", + "# When we attach it will direct us to our output file\n", + "flux job attach $(flux job last)\n", + "```\n", + "\n", + "We can again see a list all completed jobs with `flux jobs -a`. To restrict the output to failed (i.e., jobs that exit with nonzero exit code, time out, or are canceled or killed) jobs, run:\n", + "\n", + "```bash\n", + "flux jobs -f failed\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "6bc17bac-2fc4-4418-8939-e930f9929976", + "metadata": {}, + "source": [ + "### flux submit from within a batch\n", + "\n", + "Next open up [hello-batch.sh](hello-batch.sh) to see an example of using `flux batch` to submit jobs within the instance, and then wait for them to finish. This script is going to:\n", + "\n", + "1. Create a flux instance with the top level resources you specify\n", + "2. Submit jobs to the scheduler controlled by the broker of that sub-instance\n", + "3. Run the four jobs, with `--flags=waitable` and `flux job wait --all` to wait for the output file\n", + "4. Within the batch script, you can add `--wait` or `--flags=waitable` to individual jobs, and use `flux queue drain` to wait for the queue to drain, _or_ `flux job wait --all` to wait for the jobs you flagged to finish. \n", + "\n", + "Note that when you submit a batch job, you'll get a job id back for the _batch job_, and usually when you look at the output of that with `flux job attach $jobid` you will see the output file(s) where the internal contents are written. Since we want to print the output file easily to the terminal, we are waiting for the batch job by adding the `--flags=waitable` and then waiting for it. 
Let's try to run our batch job now.\n", + "\n", + "```bash\n", + "flux batch --flags=waitable --out /tmp/flux-batch.out -N1 ./hello-batch.sh\n", + "flux job wait\n", + "cat /tmp/hello-batch-1.out\n", + "cat /tmp/hello-batch-2.out\n", + "cat /tmp/hello-batch-3.out\n", + "cat /tmp/hello-batch-4.out\n", + "```\n", + "```console\n", + "Hello job 1 from ip-10-0-25-10 πŸ’›οΈ\n", + "Hello job 2 from ip-10-0-25-10 πŸ’šοΈ\n", + "Hello job 3 from ip-10-0-25-10 πŸ’™οΈ\n", + "Hello job 4 from ip-10-0-25-10 πŸ’œοΈ\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "04b405b1-219f-489c-abfc-e2983e82124a", + "metadata": {}, + "source": [ + "### The Flux Hierarchy πŸ‡οΈ\n", + "\n", + "One feature of the Flux Framework scheduler that is unique is its ability to submit jobs within instances, where an instance can be thought of as a level in a graph. Let's start with a basic image - this is what it might look like to submit to a scheduler that is not graph-based (left), where all jobs go to a central job queue or database. Note that our maximum job throughput is one job per second. The throughput is limited by the workload manager's ability to process a single job. We can improve upon this by simply adding another level, perhaps with three instances. For example, let's say we create a flux allocation or batch that has control of some number of child nodes. We might launch three new instances (each with its own scheduler and queue, right image) at that level two, and all of a sudden, we get a throughput of 1x3, or three jobs per second.\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "All of a sudden, the throughout can increase exponentially because we are essentially submitting to different schedulers. The example above is not impressive, but our [learning guide](https://flux-framework.readthedocs.io/en/latest/guides/learning_guide.html#fully-hierarchical-resource-management-techniques) (Figure 10) has a beautiful example of how it can scale, done via an actual experiment. We were able to submit 500 jobs/second using only three levels, vs. close to 1 job/second with one level. For an interesting detail, you can vary the scheduler algorithm or topology within each sub-instance, meaning that you can do some fairly interesting things with scheduling work, and all without stressing the top level system instance. \n", + "\n", + "Now that we understand nested instances, let's look at another batch example that better uses them. Here we have two job scripts:\n", + "\n", + "- [sub_job1.sh](sub_job1.sh): Is going to be run with `flux batch` and submit sub_job2.sh\n", + "- [sub_job2.sh](sub_job2.sh): Is going to be submitted by sub_job1.sh.\n", + "\n", + "Take a look at each script to see how they work, and then submit it!\n", + "\n", + "```bash\n", + "flux batch ./sub_job1.sh\n", + "```\n", + "\n", + "And now that we've submitted, let's look at the hierarchy for all the jobs we just ran. Here is how to try flux pstree, which normally can show jobs in an instance, but it has limited functionality given we are in a notebook! So instead of just running the single command, let's add \"-a\" to indicate \"show me ALL jobs.\"\n", + "More complex jobs and in a different environment would have deeper nesting. You can [see examples here](https://flux-framework.readthedocs.io/en/latest/jobs/hierarchies.html?h=pstree#flux-pstree-command).\n", + "\n", + "```bash\n", + "flux pstree -a\n", + ".\n", + "β”œβ”€β”€ ./sub_job1.sh:CD\n", + "β”œβ”€β”€ ./hello-batch.sh:CD\n", + "β”œβ”€β”€ ./hello-batch.sh:F\n", + "β”œβ”€β”€ 11*[echo:CD]\n", + "β”œβ”€β”€ 2*[./sleep_batch.sh:F]\n", + "β”œβ”€β”€ sleep:CA\n", + "β”œβ”€β”€ job-watch.sh:CD\n", + "β”œβ”€β”€ 104*[hostname:CD]\n", + "β”œβ”€β”€ moremagic:F\n", + "β”œβ”€β”€ magic:F\n", + "β”œβ”€β”€ 8*[bash:CD]\n", + "β”œβ”€β”€ hello:CD\n", + "β”œβ”€β”€ hello:F\n", + "β”œβ”€β”€ 2*[make:CD]\n", + "β”œβ”€β”€ make:F\n", + "β”œβ”€β”€ false:F\n", + "└── sh:CD\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "7724130f-b0db-4ccf-a01e-98907b9a27ca", + "metadata": {}, + "source": [ + "You can also try a more detailed view with `flux pstree -a -X`!" + ] + }, + { + "cell_type": "markdown", + "id": "70e3df1d-32c9-4996-b6f7-2fa85f4c02ad", + "metadata": { + "tags": [] + }, + "source": [ + "### flux start\n", + "\n", + "
\n", + "Description: Interactively starting a set of resources\n", + "
\n", + "\n", + "Sometimes you need to interactively start a set of compute resources. We call this subset a flux instance. You can launch jobs under this instance, akin to how you've done above! In fact, this entire tutorial is started (to give you 4 faux nodes) with a `flux start` command: \n", + "\n", + "```bash\n", + "flux start --test-size=4\n", + "```\n", + "\n", + "A Flux instance may be running as the default resource manager on a cluster, a job in a resource manager such as Slurm, LSF, or Flux itself, or as a test instance launched locally. This is really neat because it means you can launch Flux under other resource managers where it is not installed as the system workload manager. You can also execute \"one off\" commands to it, for example, to see the instance size:\n", + "\n", + "```bash\n", + "flux start --test-size=4 flux getattr size\n", + "4\n", + "```\n", + "\n", + "When you run `flux start` without a command, it will give you an interactive shell to the instance. When you provide a command (as we do above) it will run it and exit. This is what happens for the command above! The output indicates the number of brokers started successfully. As soon as we get and print the size, we exit." + ] + }, + { + "cell_type": "markdown", + "id": "926bd17b-b288-4b51-b984-1478dd382954", + "metadata": { + "tags": [] + }, + "source": [ + "#### **Wrap up: All the Different Ways to Do Work (from the CLI)**\n", + "Here's a basic table that shows the four submission commands we use in Flux. \n", + "\n", + "| | creates subinstance | runs distributed application |\n", + "|------------------------|-------------------------------|---------------------------------------|\n", + "| interactive | `flux alloc` | `flux run` |\n", + "| backgrounded | `flux batch` | `flux submit`πŸ‘€ |\n", + "\n", + "* `flux alloc` will allocate resources and start an interactive Flux sub-instance underneath those resources. Within that subinstance, you can submit as many jobs as you like, with no worry about backing up the parent (usually system) instance.\n", + "* `flux batch` will also allocate resources and start a Flux sub-instance, but the job is not interactive, and thus `batch` requires a script outlining the work to do.\n", + "* `flux run` runs a program under a Flux instance. It does not create a new sub-instance, and will watch until the program completes.\n", + "* `flux submit` does not exist in other resource managers, notably Slurm. It does the same thing as `flux run`, but does not watch for job output, instead writing this to a file. " + ] + }, + { + "cell_type": "markdown", + "id": "c9c3e767-0459-4218-a8cf-0f98bd32d6bf", + "metadata": {}, + "source": [ + "# This concludes Chapter 1! πŸ“—οΈ\n", + "\n", + "In this module, we covered:\n", + "1. Submitting jobs with Flux\n", + "2. 
The differences in submission commands.\n", + "\n", + "To continue with the tutorial, open [Chapter 2](../ch2/02_flux_framework.ipynb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/2026/HPSF/tutorial/ch1/Makefile b/2026/HPSF/tutorial/ch1/Makefile new file mode 100644 index 00000000..c3530067 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/Makefile @@ -0,0 +1,2 @@ +make: + mpicc -o hello hello.c diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/0.sh b/2026/HPSF/tutorial/ch1/bulksubmit/0.sh new file mode 100755 index 00000000..6570ec41 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/0.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "Enter, stranger, but take heed" +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/1.sh b/2026/HPSF/tutorial/ch1/bulksubmit/1.sh new file mode 100755 index 00000000..dd6c458a --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/1.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "Of what awaits the sin of greed," +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/2.sh b/2026/HPSF/tutorial/ch1/bulksubmit/2.sh new file mode 100755 index 00000000..cfbfa217 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/2.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "For those who take, but do not earn," +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/3.sh b/2026/HPSF/tutorial/ch1/bulksubmit/3.sh new file mode 100755 index 00000000..c1fea0ee --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/3.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "Must pay most dearly in their turn," +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/4.sh b/2026/HPSF/tutorial/ch1/bulksubmit/4.sh new file mode 100755 index 00000000..3359c352 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/4.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "So if you seek beneath our floors" +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/5.sh b/2026/HPSF/tutorial/ch1/bulksubmit/5.sh new file mode 100755 index 00000000..9a07e711 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/5.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "A treasure that was never yours," +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/6.sh b/2026/HPSF/tutorial/ch1/bulksubmit/6.sh new file mode 100755 index 00000000..6eabff7c --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/6.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "Thief, you have been warned, beware" +sleep 1 diff --git a/2026/HPSF/tutorial/ch1/bulksubmit/7.sh b/2026/HPSF/tutorial/ch1/bulksubmit/7.sh new file mode 100755 index 00000000..20f9e2f4 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/bulksubmit/7.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "Of finding more than treasure there." 
+sleep 1 diff --git a/2026/HPSF/tutorial/ch1/hello-batch.sh b/2026/HPSF/tutorial/ch1/hello-batch.sh new file mode 100755 index 00000000..3c39dc02 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/hello-batch.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-1.out echo "Hello job 1 from $(hostname) πŸ’›οΈ" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-2.out echo "Hello job 2 from $(hostname) πŸ’šοΈ" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-3.out echo "Hello job 3 from $(hostname) πŸ’™οΈ" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-4.out echo "Hello job 4 from $(hostname) πŸ’œοΈ" +# Wait for the jobs to finish +flux job wait --all \ No newline at end of file diff --git a/2026/HPSF/tutorial/ch1/hello.c b/2026/HPSF/tutorial/ch1/hello.c new file mode 100644 index 00000000..b43bfaa9 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/hello.c @@ -0,0 +1,94 @@ +/************************************************************\ + * Copyright 2014 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include + +static struct timespec ts_diff (struct timespec start, struct timespec end) +{ + struct timespec temp; + if ((end.tv_nsec-start.tv_nsec)<0) { + temp.tv_sec = end.tv_sec-start.tv_sec-1; + temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec; + } else { + temp.tv_sec = end.tv_sec-start.tv_sec; + temp.tv_nsec = end.tv_nsec-start.tv_nsec; + } + return temp; +} + +double monotime_since (struct timespec t0) +{ + struct timespec ts, d; + clock_gettime (CLOCK_MONOTONIC, &ts); + + d = ts_diff (t0, ts); + + return ((double) d.tv_sec * 1000 + (double) d.tv_nsec / 1000000); +} + +void monotime (struct timespec *tp) +{ + clock_gettime (CLOCK_MONOTONIC, tp); +} + +bool monotime_isset (struct timespec t) +{ + return (t.tv_sec || t.tv_nsec); +} + +int main (int argc, char *argv[]) +{ + int id, ntasks; + struct timespec t; + const char *label; + + if (!(label = getenv ("FLUX_JOB_CC"))) + if (!(label = getenv ("FLUX_JOB_ID"))) + label = "0"; + + monotime (&t); + MPI_Init (&argc, &argv); + MPI_Comm_rank (MPI_COMM_WORLD, &id); + MPI_Comm_size (MPI_COMM_WORLD, &ntasks); + if (id == 0) { + printf ("%s: completed MPI_Init in %0.3fs. There are %d tasks\n", + label, + monotime_since (t) / 1000, ntasks); + fflush (stdout); + } + + monotime (&t); + MPI_Barrier (MPI_COMM_WORLD); + if (id == 0) { + printf ("%s: completed first barrier in %0.3fs\n", + label, + monotime_since (t) / 1000); + fflush (stdout); + } + + monotime (&t); + MPI_Finalize (); + if (id == 0) { + printf ("%s: completed MPI_Finalize in %0.3fs\n", + label, + monotime_since (t) / 1000); + fflush (stdout); + } + return 0; +} + diff --git a/2026/HPSF/tutorial/ch1/job-watch.sh b/2026/HPSF/tutorial/ch1/job-watch.sh new file mode 100755 index 00000000..9b4c08d1 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/job-watch.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Oh you may not think me pretty," +sleep 3 +echo "But don’t judge on what you see," +sleep 3 +echo "I’ll eat myself if you can find" +sleep 2 +echo "A smarter hat than me." +sleep 2 +echo "[A scheduler smarter than me]!" 
diff --git a/2026/HPSF/tutorial/ch1/sleep_batch.sh b/2026/HPSF/tutorial/ch1/sleep_batch.sh new file mode 100644 index 00000000..58496dae --- /dev/null +++ b/2026/HPSF/tutorial/ch1/sleep_batch.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#FLUX: --nodes=2 +#FLUX: --nslots=2 +#FLUX: --cores-per-slot=1 + +echo "Starting my batch job" +echo "Print the resources allocated to this batch job" +flux resource list + +echo "Use sleep to emulate a parallel program" +echo "Run the program at a total of 2 processes each requiring" +echo "1 core. These processes are equally spread across 2 nodes." +flux run -N 2 -n 2 sleep 30 +flux run -N 2 -n 2 sleep 30 + diff --git a/2026/HPSF/tutorial/ch1/sub_job1.sh b/2026/HPSF/tutorial/ch1/sub_job1.sh new file mode 100755 index 00000000..5cf7ff47 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/sub_job1.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#FLUX: -N1 + +flux batch -N1 ./sub_job2.sh +flux queue drain + diff --git a/2026/HPSF/tutorial/ch1/sub_job2.sh b/2026/HPSF/tutorial/ch1/sub_job2.sh new file mode 100755 index 00000000..d947f191 --- /dev/null +++ b/2026/HPSF/tutorial/ch1/sub_job2.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +flux run -N1 sleep 30 + diff --git a/2026/HPSF/tutorial/img/dl-training-io.png b/2026/HPSF/tutorial/img/dl-training-io.png new file mode 100644 index 00000000..6129d0ef Binary files /dev/null and b/2026/HPSF/tutorial/img/dl-training-io.png differ diff --git a/2026/HPSF/tutorial/img/flux-batch.jpg b/2026/HPSF/tutorial/img/flux-batch.jpg new file mode 100644 index 00000000..f7282bb4 Binary files /dev/null and b/2026/HPSF/tutorial/img/flux-batch.jpg differ diff --git a/2026/HPSF/tutorial/img/flux-broker-design.png b/2026/HPSF/tutorial/img/flux-broker-design.png new file mode 100644 index 00000000..267f1a66 Binary files /dev/null and b/2026/HPSF/tutorial/img/flux-broker-design.png differ diff --git a/2026/HPSF/tutorial/img/flux-instance-pre-tbon.png b/2026/HPSF/tutorial/img/flux-instance-pre-tbon.png new file mode 100644 index 00000000..bc40a7e4 Binary files /dev/null and b/2026/HPSF/tutorial/img/flux-instance-pre-tbon.png differ diff --git a/2026/HPSF/tutorial/img/flux-instance-w-tbon.png b/2026/HPSF/tutorial/img/flux-instance-w-tbon.png new file mode 100644 index 00000000..93a276e8 Binary files /dev/null and b/2026/HPSF/tutorial/img/flux-instance-w-tbon.png differ diff --git a/2026/HPSF/tutorial/img/flux-tree.png b/2026/HPSF/tutorial/img/flux-tree.png new file mode 100644 index 00000000..a0dba825 Binary files /dev/null and b/2026/HPSF/tutorial/img/flux-tree.png differ diff --git a/2026/HPSF/tutorial/img/instance-submit.png b/2026/HPSF/tutorial/img/instance-submit.png new file mode 100644 index 00000000..84ce558e Binary files /dev/null and b/2026/HPSF/tutorial/img/instance-submit.png differ diff --git a/2026/HPSF/tutorial/img/scaled-submit.png b/2026/HPSF/tutorial/img/scaled-submit.png new file mode 100644 index 00000000..a5dc3468 Binary files /dev/null and b/2026/HPSF/tutorial/img/scaled-submit.png differ diff --git a/2026/HPSF/tutorial/img/single-submit.png b/2026/HPSF/tutorial/img/single-submit.png new file mode 100644 index 00000000..0592defe Binary files /dev/null and b/2026/HPSF/tutorial/img/single-submit.png differ