66 changes: 22 additions & 44 deletions docker/Dockerfile.stable.trtllm
@@ -1,7 +1,7 @@
# Base image from NGC TensorRT-LLM, which includes a pre-installed TensorRT-LLM.
# For available images, visit: https://nvidia.github.io/TensorRT-LLM/installation/containers.html
# Use TRTLLM_BASE_IMAGE to specify the base image (default: release:1.3.0rc1)
ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6
ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc1
FROM ${TRTLLM_BASE_IMAGE}


@@ -11,58 +11,36 @@ FROM ${TRTLLM_BASE_IMAGE}
# DeepEP is required for IBGDA support.
# Clone and build gdrcopy and deepep-nvshmem dependencies.
WORKDIR /home/dpsk_a2a
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout a84a248 && \
cd /home/dpsk_a2a && \
wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply /home/dpsk_a2a/DeepEP/third-party/nvshmem.patch && \
sed -i '16i#include <getopt.h>' /home/dpsk_a2a/deepep-nvshmem/examples/moe_shuffle.cu && \
sed -i 's/CUDA_STANDARD 11/CUDA_STANDARD 17/g' /home/dpsk_a2a/deepep-nvshmem/src/CMakeLists.txt && \
# Cleanup downloaded archive
rm /home/dpsk_a2a/nvshmem_src_3.2.5-1.txz

# Set environment variables
ENV CUDA_HOME=/usr/local/cuda \
CPATH=/usr/local/mpi/include \
LD_LIBRARY_PATH=/usr/local/mpi/lib:/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH \
GDRCOPY_HOME=/home/dpsk_a2a/gdrcopy

# Build deepep-nvshmem
WORKDIR /home/dpsk_a2a/deepep-nvshmem
ARG CUDA_ARCHS="80;90;100"
RUN NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
NVSHMEM_BUILD_EXAMPLES=0 \
NVSHMEM_BUILD_TESTS=0 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/home/dpsk_a2a/deepep-nvshmem/install -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" && \
cd build && make install -j && \
# Cleanup build directory
rm -rf /home/dpsk_a2a/deepep-nvshmem/build

# Build deepep
WORKDIR /home/dpsk_a2a/DeepEP
ENV NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install
RUN NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install python setup.py install
RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
pushd gdrcopy && \
make prefix=/usr/local lib_install && \
popd && rm -rf gdrcopy && \
pip install nvidia-nvshmem-cu13==3.3.20 && \
export NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem && \
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" && \
export PATH="${NVSHMEM_DIR}/bin:$PATH" && \
Contributor review comment (critical):

> The `export` commands only set environment variables for the current RUN layer and will not persist in the final container image's environment. This means `LD_LIBRARY_PATH` will not include the nvshmem library path at runtime, which can lead to "library not found" errors when the application tries to load shared libraries from that path. Use the `ENV` instruction to set environment variables that are required at runtime, so they are available to the container's processes.

pushd ${NVSHMEM_DIR}/lib && \
ln -s libnvshmem_host.so.3 libnvshmem_host.so && \
popd && \
git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git && \
pushd DeepEP && \
wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch && \
patch -p1 < deepep.patch && \
TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install && \
popd && rm -rf deepep

# Install Python dependencies
RUN pip3 install --no-cache-dir --no-deps trl && \
RUN pip3 install --no-cache-dir --no-deps trl cachetools && \
pip3 install --no-cache-dir nvtx matplotlib liger_kernel && \
pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git && \
pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.14.0rc7
pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0


# ==============================================================================
# Install verl dependencies
# ==============================================================================
RUN pip install git+https://github.com/volcengine/verl.git@v0.6.0
RUN pip uninstall -y verl
RUN pip install git+https://github.com/volcengine/verl.git@v0.7.0 && \
pip uninstall -y verl


# ==============================================================================
19 changes: 17 additions & 2 deletions docs/perf/nsight_profiling.md
@@ -27,14 +27,22 @@ Nsys options in controller nodes and worker nodes are configured in `global_profiler`
* **`global_profiler.global_tool_config.nsys.controller_nsight_options`**. This config group is for the single controller. All fields in this config group will be just sent to Nsight Systems when Ray starts the controller process. `ppo_trainer.yaml` provides a workable example. Users can reference [Nsight Systems manual](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Ray user guide](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html) for more details.
* **`global_profiler.global_tool_config.nsys.worker_nsight_options`**. This config group is for the worker processes. Similarly, all fields in this config group are sent to Nsight Systems when Ray starts the worker processes. The capture range controls when the profiler starts and stops, so `capture-range: "cudaProfilerApi"` is fixed and should not be changed. Users can set `capture-range-end` based on an accurate calculation, or just leave it `null`.
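To make the `capture-range-end` calculation concrete, here is a minimal sketch; the step list and the per-step range count are illustrative assumptions, not verl constants:

```python
# Sketch: deriving a "repeat-shutdown:N" value for capture-range-end.
# nsys stops profiling after N cudaProfilerApi capture ranges complete.
profile_steps = [2, 3, 5]  # example: training steps selected for profiling
ranges_per_step = 6        # assumed upper bound on capture ranges per step

capture_range_end = f"repeat-shutdown:{ranges_per_step * len(profile_steps)}"
print(capture_range_end)  # repeat-shutdown:18
```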

### Worker process profiling
### Actor_rollout_ref (SPMD) Worker process profiling

Verl manages multiple RL roles (_Actor_, _Ref_, _Rollout_, _Critic_, _Reward_), which are implemented in different Worker classes. These workers can be combined into one Ray Actor, running in a process group. Each RL role has its own profiling config group, `profiler`, which consists of three fields:

* **`all_ranks` and `ranks`**. When `all_ranks` is set `True` then all ranks will be profiled; when set `False`, `ranks` will be profiled. By default, verl profiles the whole training process in a series ` worker_process_<PID>.<RID>.nsys-rep` files for each process rank. PID is the process ID; RID is the capture range ID.
* **`all_ranks` and `ranks`**. When `all_ranks` is set to `True`, all ranks are profiled; when it is set to `False`, only the ranks listed in `ranks` are profiled. By default, verl profiles the whole training process into a series of `worker_process_<PID>.<RID>.nsys-rep` files, one per process rank. PID is the process ID; RID is the capture-range ID.
* **`discrete`**. When set to `False`, all the roles' actions in one training step are dumped into one database. When set to `True`, the actions annotated by `DistProfiler.annotate` are dumped into separate databases; in that case, each role's action occupies one `<RID>`.
* **Verl collocate mode**. Verl can combine two Worker subclasses into one Worker Actor. In this case, the user should make sure the combined Workers use a consistent `discrete` setting. Either way, the Nsight Systems profiler uses a `torch.cuda.profiler.start()`/`stop()` pair to dump each `<step>` database.
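The `all_ranks`/`ranks` selection above can be sketched as follows; the helper name and signature are hypothetical, not verl API:

```python
def ranks_to_profile(all_ranks: bool, ranks: list[int], world_size: int) -> list[int]:
    """Resolve the profiler rank selection: every rank, or an explicit subset."""
    if all_ranks:
        return list(range(world_size))
    # Drop configured ranks that fall outside the actual world size.
    return [r for r in ranks if 0 <= r < world_size]

print(ranks_to_profile(False, [0, 2], 4))  # [0, 2]
```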

### Rollout server worker process profiling
Verl now uses a rollout server mode. `AgentLoopManager` manages a list of rollout replicas; each replica manages a list of servers (in most cases the list has length 1); each server manages a set of worker ranks.
In the current config interface, `actor_rollout_ref.rollout.profiler` is a standalone config and is not shared with Actor/Ref.
`all_replicas=True` means all replicas are profiled; otherwise only the replicas listed in `replicas=[...]` are profiled.
`all_ranks=True` means all ranks are profiled; otherwise only the ranks listed in `ranks=[...]` are profiled.
Since a replica usually has exactly one server, there are no control knobs for individual servers within a replica.
See `examples/grpo_trainer/run_qwen2-7b_math_trtllm_nsys.sh` for an example.
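The replica selection can be sketched as a standalone function (a simplified version of the `profiling_replicas` logic this PR adds to `agent_loop.py`; the function name is illustrative):

```python
def select_profiling_replicas(replicas: list, all_replicas: bool, replica_ranks) -> list:
    """Pick which rollout replicas are profiled.

    all_replicas=True selects everything; otherwise an explicit
    replica_ranks list is used; with neither set, nothing is profiled.
    """
    if all_replicas:
        return replicas
    if replica_ranks:
        return [replicas[r] for r in replica_ranks]
    return []

print(select_profiling_replicas(["replica0", "replica1", "replica2"], False, [0, 2]))
```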

### Where to find the profiling data

By default the `*.nsys-rep` files are saved in the directory `/tmp/ray/session_latest/logs/nsight/` on each node. According to the Ray manual, this default directory is not changeable: ["however, Ray preserves the `--output` option of the default config"](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html).
@@ -64,6 +72,13 @@ To enable profiling for specific components and steps, modify your ppo_trainer.yaml:
      enable: True
      all_ranks: True
  # rollout & ref follow actor settings
  rollout:
    profiler:
      enable: True
      all_replicas: True
      # replicas: [0,2]
      all_ranks: False
      ranks: [0,2]
critic:
  profiler:
    enable: True
104 changes: 104 additions & 0 deletions examples/grpo_trainer/run_qwen2-7b_math_trtllm_nsys.sh
@@ -0,0 +1,104 @@
set -x

# Clean all slurm / MPI / PMIx env to avoid pmix mismatch error
for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
unset "$v"
done

export RAY_DEDUP_LOGS=0

# -----
# Config
# -----
TP=${1:-4}
PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
EXP_NAME=trtllm-qwen2-7b-tp${TP}-8gpus${EXP_NAME_SUFFIX:+"-"}${EXP_NAME_SUFFIX}

if [ "$TP" -eq 4 ]; then
MAX_BATCH_SIZE=1024
else
MAX_BATCH_SIZE=384
fi

# -----
# Data
# -----
DATADIR=${DATADIR:-$PWD/data}
MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-7B-Instruct"}

GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
MATH_TEST_PATH=${DATADIR}/math/test.parquet

TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"

# -----
# Launch
# -----
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
algorithm.rollout_correction.rollout_is_threshold=2.0 \
data.train_files="$TRAIN_FILES" \
data.val_files="$TEST_FILES" \
data.train_batch_size=1024 \
data.max_prompt_length=2048 \
data.max_response_length=1024 \
data.return_raw_chat=True \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.hybrid_engine=True \
actor_rollout_ref.model.path=${MODEL_PATH} \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
actor_rollout_ref.rollout.name=trtllm \
actor_rollout_ref.rollout.mode="async" \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
actor_rollout_ref.rollout.calculate_log_probs=True \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console"]' \
trainer.project_name="${PROJECT_NAME}" \
trainer.experiment_name=${EXP_NAME} \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.resume_mode=disable \
trainer.total_epochs=15 \
trainer.val_before_train=False \
trainer.total_training_steps=6 \
global_profiler.tool=nsys \
global_profiler.steps='[2,3,5]' \
global_profiler.profile_continuous_steps=True \
global_profiler.global_tool_config.nsys.discrete=False \
global_profiler.global_tool_config.nsys.worker_nsight_options.capture-range-end='repeat-shutdown:2' \
actor_rollout_ref.actor.profiler.enable=True \
actor_rollout_ref.actor.profiler.all_ranks=False \
actor_rollout_ref.actor.profiler.ranks=[0,2] \
actor_rollout_ref.rollout.profiler.enable=True \
actor_rollout_ref.rollout.profiler.all_replicas=False \
actor_rollout_ref.rollout.profiler.replicas=[0,2] \
actor_rollout_ref.rollout.profiler.all_ranks=False \
actor_rollout_ref.rollout.profiler.ranks=[0,2] \
"${@:2}"
2 changes: 1 addition & 1 deletion recipe
Submodule recipe updated 129 files
33 changes: 30 additions & 3 deletions verl/experimental/agent_loop/agent_loop.py
@@ -854,6 +854,7 @@ def __init__(
        worker_group: RayWorkerGroup = None,
        rollout_resource_pool: RayResourcePool = None,
        reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
        nsight_options: dict = None,
    ):
        """Initialize agent loop manager.

@@ -866,7 +867,19 @@
        self.config = config
        self.worker_group = worker_group
        self.reward_loop_worker_handles = reward_loop_worker_handles

        self.nsight_options = nsight_options
        profile_steps = OmegaConf.select(self.config.global_profiler, "steps")
        if OmegaConf.select(self.config.global_profiler, "tool") == "nsys":
            assert (
                OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
                is not None
            ), "worker_nsight_options must be set when using nsys with profile_steps"
            nsight_options = OmegaConf.to_container(
                OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
            )
            if nsight_options is not None and nsight_options["capture-range-end"] is None:
                nsight_options["capture-range-end"] = f"repeat-shutdown:{6 * len(profile_steps)}"
            self.nsight_options = nsight_options
        # for recipe to change
        if not hasattr(self, "rollout_replica_class"):
            self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name)
@@ -897,9 +910,19 @@ def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool):
                config=rollout_config,
                model_config=model_config,
                gpus_per_node=self.config.trainer.n_gpus_per_node,
                nsight_options=self.nsight_options,
            )
            for replica_rank in range(num_replicas)
        ]
        profiling_all_replicas = OmegaConf.select(self.config.actor_rollout_ref.rollout.profiler, "all_replicas")
        profiling_replica_ranks = OmegaConf.select(self.config.actor_rollout_ref.rollout.profiler, "replicas")
        self.profiling_replicas = (
            self.rollout_replicas
            if profiling_all_replicas
            else [self.rollout_replicas[replica_rank] for replica_rank in profiling_replica_ranks]
            if profiling_replica_ranks
            else []
        )

        if self.worker_group and rollout_config.name != "trtllm":
            self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas])
@@ -1000,14 +1023,18 @@ def clear_kv_cache(self):

    def start_profile(self, **kwargs):
        """Start profiling on all rollout replicas."""
        self._run_all([replica.start_profile(**kwargs) for replica in self.rollout_replicas])
        self._run_all([replica.start_profile(**kwargs) for replica in self.profiling_replicas])

    def stop_profile(self):
        """Stop profiling on all rollout replicas."""
        self._run_all([replica.stop_profile() for replica in self.rollout_replicas])
        self._run_all([replica.stop_profile() for replica in self.profiling_replicas])

    def _run_all(self, tasks: list[asyncio.Task]):
        async def run_all():
            await asyncio.gather(*tasks)

        asyncio.run(run_all())

    def shutdown(self):
        """Shutdown all rollout replicas."""
        self._run_all([replica.shutdown() for replica in self.rollout_replicas])
2 changes: 2 additions & 0 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -301,6 +301,8 @@ actor_rollout_ref:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
      all_replicas: false
      replicas: []
      all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
      ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
      save_path: ${oc.select:global_profiler.save_path,null}
2 changes: 2 additions & 0 deletions verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml
@@ -290,6 +290,8 @@ actor_rollout_ref:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
      all_replicas: false
      replicas: []
      all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
      ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
      save_path: ${oc.select:global_profiler.save_path,null}
2 changes: 2 additions & 0 deletions verl/trainer/config/_generated_ppo_trainer.yaml
@@ -289,6 +289,8 @@ actor_rollout_ref:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
      all_replicas: false
      replicas: []
      all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
      ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
      save_path: ${oc.select:global_profiler.save_path,null}
2 changes: 2 additions & 0 deletions verl/trainer/config/_generated_ppo_veomni_trainer.yaml
@@ -271,6 +271,8 @@ actor_rollout_ref:
      _target_: verl.utils.profiler.ProfilerConfig
      tool: ${oc.select:global_profiler.tool,null}
      enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
      all_replicas: false
      replicas: []
      all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
      ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
      save_path: ${oc.select:global_profiler.save_path,null}
6 changes: 6 additions & 0 deletions verl/trainer/config/rollout/rollout.yaml
@@ -316,6 +316,12 @@ profiler:
  # whether to enable profiling on rollout
  enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}

  # Whether to profile all replicas.
  all_replicas: false
Collaborator review comment:

> This default configuration does not profile any replicas, but the original code profiled all replicas by default. The default configuration should be consistent with the pre-change behavior, so it is recommended to set `all_replicas: True`.

Collaborator (author) reply:

> I set `all_ranks: True` in the actor yaml instead. By default all resources are profiled; users select specific items by setting the `all_*` flags to false.

  # The replicas that will be profiled. [] or [0,1,...]
  replicas: []

  # Whether to profile all ranks.
  all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
