Run verl with tractorun
In this notebook we run verl on the Tracto.ai platform. We take the original single-GPU example as a starting point, but use all 8 GPUs instead.
To run this notebook, use the following Docker image.
# Training image: Python 3.10, CUDA 12.4, torch/vllm/verl/tractorun, plus the
# model weights and preprocessed dataset baked in.
FROM ubuntu:22.04
USER root
# Install Python 3.10 from the deadsnakes PPA (Ubuntu 22.04 ships 3.10 too,
# but the PPA provides the -venv/-dev packages pinned together).
RUN apt-get update && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y \
python3.10 \
python3.10-venv \
python3.10-dev
RUN apt-get install wget --yes
RUN wget https://bootstrap.pypa.io/get-pip.py && python3.10 get-pip.py
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Install the CUDA 12.4 toolkit via NVIDIA's apt repository.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
RUN dpkg -i cuda-keyring_1.1-1_all.deb
RUN apt-get update
RUN apt-get -y install cuda-toolkit-12-4
ENV PATH /usr/local/cuda-12.4/bin:$PATH
ENV CUDA_HOME /usr/local/cuda-12.4
RUN pip3 install --upgrade pip setuptools wheel
RUN pip3 install torchvision==0.19.0
RUN pip install --pre torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
RUN pip install vllm==0.6.3
RUN pip install -U "huggingface_hub[cli]"
RUN pip install tractorun
RUN apt-get install git --yes
# Install verl from source (editable, so /verl_repo stays importable).
RUN mkdir /verl_repo && git clone https://github.com/volcengine/verl /verl_repo && cd /verl_repo && pip3 install -e . -U
RUN pip3 install flash-attn --no-build-isolation
# Bake the model weights and the preprocessed GSM8K dataset into the image
# so the training job needs no network access at start-up.
RUN mkdir /models && huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct --local-dir /models/Qwen2.5-0.5B-Instruct
RUN mkdir /data && mkdir /gsm8k && python3 /verl_repo/examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k
from tractorun.run import prepare_and_get_toolbox
from tractorun.backend.generic import GenericBackend
from tractorun.run import run
from tractorun.resources import Resources
from tractorun.mesh import Mesh
from tractorun.stderr_reader import StderrMode
import subprocess
import sys
import os
def controller(toolbox):
    """Entry point executed by tractorun on the training host.

    Launches a single-node, 8-GPU verl PPO training run on GSM8K with
    Qwen2.5-0.5B-Instruct as both actor and critic. Model weights and the
    preprocessed dataset are baked into the Docker image.

    Args:
        toolbox: tractorun toolbox handle (unused here; required by the
            tractorun controller signature).

    Raises:
        subprocess.CalledProcessError: if the training command exits non-zero.
    """
    # One shell command string: with shell=True a plain string is the correct
    # form (a list would send only its first element to the shell on POSIX).
    command = (
        "PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo data.train_files=/data/gsm8k/train.parquet "
        "data.val_files=/data/gsm8k/test.parquet data.train_batch_size=128 data.val_batch_size=656 "
        "data.max_prompt_length=512 data.max_response_length=256 actor_rollout_ref.model.path=/models/Qwen2.5-0.5B-Instruct "
        "actor_rollout_ref.actor.optim.lr=1e-6 actor_rollout_ref.actor.ppo_mini_batch_size=64 "
        "actor_rollout_ref.actor.ppo_micro_batch_size=8 actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 "
        "actor_rollout_ref.rollout.tensor_model_parallel_size=1 actor_rollout_ref.rollout.gpu_memory_utilization=0.4 "
        "actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 critic.optim.lr=1e-5 "
        "critic.model.path=/models/Qwen2.5-0.5B-Instruct critic.ppo_micro_batch_size=8 "
        "algorithm.kl_ctrl.kl_coef=0.001 +trainer.val_before_train=False trainer.default_hdfs_dir=null "
        "trainer.n_gpus_per_node=8 trainer.nnodes=1 trainer.save_freq=10 trainer.test_freq=10 "
        "trainer.total_epochs=1 trainer.logger=[console]"
    )
    # Pass a copy of the environment instead of mutating os.environ globally.
    env = dict(os.environ)
    env["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
    # check=True raises CalledProcessError on failure; the previous
    # `assert process.returncode == 0` would be stripped under `python -O`.
    subprocess.run(
        command,
        shell=True,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=env,
        check=True,
    )
# Submit the controller as a tractorun job on the YTsaurus/Tracto.ai cluster.
run(
    controller,
    # Cluster path used by tractorun for coordination/state.
    yt_path="//tmp/verl",
    resources=Resources(
        memory_limit=644245094400,  # 600 GiB
        cpu_limit=20,
    ),
    mesh=Mesh(
        node_count=1,
        process_per_node=1,
        # WARNING: on the playground you have only 1 host with 1 GPU —
        # there, set gpu_per_process=1 instead (and adjust
        # trainer.n_gpus_per_node in the controller command accordingly).
        gpu_per_process=8,
        pool_trees=["gpu_h100"],
    ),
    # Stream the primary process's stderr back to the client.
    proxy_stderr_mode=StderrMode.primary,
    backend=GenericBackend(),
    # Image built from the Dockerfile above.
    docker_image="cr.eu-north1.nebius.cloud/e00faee7vas5hpsh3s/chiffa/verl:v6",
)