19 changes: 17 additions & 2 deletions docs/perf/nsight_profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,22 @@ Nsys options in controller nodes and worker nodes are configured in `global_prof
* **`global_profiler.global_tool_config.nsys.controller_nsight_options`**. This config group is for the single controller. All fields in this config group will be just sent to Nsight Systems when Ray starts the controller process. `ppo_trainer.yaml` provides a workable example. Users can reference [Nsight Systems manual](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Ray user guide](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html) for more details.
* **`global_profiler.global_tool_config.nsys.worker_nsight_options`**. This config group is for the worker processes. Similarly, all fields in this config group are passed to Nsight Systems when Ray starts each worker process. The capture range controls when the profiler starts and stops, so `capture-range: "cudaProfilerApi"` is fixed and should not be changed. Users can set `capture-range-end` to an accurately calculated value or simply leave it `null`.

### Worker process profiling
### Actor_rollout_ref (SPMD) Worker process profiling

Verl manages multiple RL roles, _Actor_, _Ref_, _Rollout_, _Critic_, _Reward_, which are implemented in different Worker classes. These workers can be combined into one Ray Actor, running in a process group. Each RL role has its own profiling config group, `profiler`, which consists of three fields:

* **`all_ranks` and `ranks`**. When `all_ranks` is set `True` then all ranks will be profiled; when set `False`, `ranks` will be profiled. By default, verl profiles the whole training process in a series ` worker_process_<PID>.<RID>.nsys-rep` files for each process rank. PID is the process ID; RID is the capture range ID.
* **`all_ranks` and `ranks`**. When `all_ranks` is set `True`, all ranks are profiled; when set `False`, only the ranks listed in `ranks` are profiled. By default, verl profiles the whole training process into a series of `worker_process_<PID>.<RID>.nsys-rep` files, one per process rank. PID is the process ID; RID is the capture range ID.
* **`discrete`**. When set `False`, all the roles' actions in one training step are dumped into one database. When set `True`, the actions annotated by `DistProfiler.annotate` are dumped into discrete databases; in this case, each role's action occupies one `<RID>`.
* **Verl collocate mode**. Verl can combine two Worker subclasses into one Worker Actor. In this case, the user should make sure the combined Workers use a consistent `discrete` setting. Either way, the Nsight Systems profiler uses a `torch.cuda.profiler.start()` and `stop()` pair to dump one database per `<step>`.
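The `all_ranks`/`ranks` selection rule above can be sketched as follows (a minimal illustration of the semantics, not verl's actual implementation; the function name is hypothetical):

```python
def ranks_to_profile(all_ranks: bool, ranks: list[int], world_size: int) -> list[int]:
    """Return the ranks that will emit worker_process_<PID>.<RID>.nsys-rep files."""
    if all_ranks:
        return list(range(world_size))
    # Only the explicitly listed (and valid) ranks are profiled.
    return [r for r in ranks if 0 <= r < world_size]

# With all_ranks=False and ranks=[0, 2] on an 8-GPU job,
# only ranks 0 and 2 are profiled.
print(ranks_to_profile(False, [0, 2], 8))  # [0, 2]
```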

### Rollout server worker process profiling
Verl now uses a rollout server mode: the `AgentLoopManager` manages a list of rollout replicas; each replica manages a list of servers (in most cases the list length is 1); each server manages a list of worker ranks.
In the current config interface, `actor_rollout_ref.rollout.profiler` is a standalone config and is not shared with Actor/Ref:

* `all_replicas=True` means all replicas are profiled; otherwise only the replicas listed in `replicas=[...]` are profiled.
* `all_ranks=True` means all ranks are profiled; otherwise only the ranks listed in `ranks=[...]` are profiled.

Since a replica usually has only one server, there are no control knobs for the servers within a replica.
An example is available in `verl/examples/grpo_trainer/run_qwen2-7b_math_trtllm_nsys.sh`.
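The replica-selection rule can be sketched as follows (a simplified standalone sketch of the selection logic; variable and function names are illustrative):

```python
def replicas_to_profile(all_replicas, replica_ranks, replicas):
    """Select which rollout replicas get profiled.

    all_replicas=True selects every replica; otherwise only the listed
    replica ranks are selected; if neither is set, nothing is profiled.
    """
    if all_replicas:
        return replicas
    if replica_ranks:
        return [replicas[r] for r in replica_ranks]
    return []

servers = ["replica0", "replica1", "replica2", "replica3"]
print(replicas_to_profile(False, [0, 2], servers))  # ['replica0', 'replica2']
```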

### Where to find the profiling data

By default the `*.nsys-rep` files are saved in the directory `/tmp/ray/session_latest/logs/nsight/` on each node. According to the Ray manual, this default directory is not changeable: ["however, Ray preserves the `--output` option of the default config"](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html).
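Given the `worker_process_<PID>.<RID>.nsys-rep` naming convention described above, the PID and capture-range ID can be recovered from a report filename. A small helper sketch (not part of verl):

```python
import re

def parse_report_name(filename: str):
    """Extract (PID, RID) from a worker_process_<PID>.<RID>.nsys-rep filename,
    or return None if the name does not match the convention."""
    m = re.fullmatch(r"worker_process_(\d+)\.(\d+)\.nsys-rep", filename)
    return (int(m.group(1)), int(m.group(2))) if m else None

print(parse_report_name("worker_process_12345.3.nsys-rep"))  # (12345, 3)
```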
Expand Down Expand Up @@ -64,6 +72,13 @@ To enable profiling for specific components and steps, modify your ppo_trainer.y
enable: True
all_ranks: True
# rollout & ref follow actor settings
rollout:
profiler:
enable: True
all_replicas: True
#replicas: [0,2]
all_ranks: False
ranks: [0,2]
critic:
profiler:
enable: True
Expand Down
104 changes: 104 additions & 0 deletions examples/grpo_trainer/run_qwen2-7b_math_trtllm_nsys.sh
@@ -0,0 +1,104 @@
set -x

# Clean all slurm / MPI / PMIx env to avoid pmix mismatch error
for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
unset "$v"
done

export RAY_DEDUP_LOGS=0

# -----
# Config
# -----
TP=${1:-4}
PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
EXP_NAME=trtllm-qwen2-7b-tp${TP}-8gpus${EXP_NAME_SUFFIX:+"-"}${EXP_NAME_SUFFIX}

if [ $TP -eq 4 ]; then
MAX_BATCH_SIZE=1024
else
MAX_BATCH_SIZE=384
fi

# -----
# Data
# -----
DATADIR=${DATADIR:-$PWD/data}
MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-7B-Instruct"}

GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
MATH_TEST_PATH=${DATADIR}/math/test.parquet

TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"

# -----
# Launch
# -----
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
algorithm.rollout_correction.rollout_is_threshold=2.0 \
data.train_files="$TRAIN_FILES" \
data.val_files="$TEST_FILES" \
data.train_batch_size=1024 \
data.max_prompt_length=2048 \
data.max_response_length=1024 \
data.return_raw_chat=True \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.hybrid_engine=True \
actor_rollout_ref.model.path=${MODEL_PATH} \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
actor_rollout_ref.rollout.name=trtllm \
actor_rollout_ref.rollout.mode="async" \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
actor_rollout_ref.rollout.calculate_log_probs=True \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console"]' \
trainer.project_name="${PROJECT_NAME}" \
trainer.experiment_name=${EXP_NAME} \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.resume_mode=disable \
trainer.total_epochs=15 \
trainer.val_before_train=False \
trainer.total_training_steps=6 \
global_profiler.tool=nsys \
global_profiler.steps='[2,3,5]' \
global_profiler.profile_continuous_steps=True \
global_profiler.global_tool_config.nsys.discrete=False \
global_profiler.global_tool_config.nsys.worker_nsight_options.capture-range-end='repeat-shutdown:2' \
actor_rollout_ref.actor.profiler.enable=True \
actor_rollout_ref.actor.profiler.all_ranks=False \
actor_rollout_ref.actor.profiler.ranks=[0,2] \
actor_rollout_ref.rollout.profiler.enable=True \
actor_rollout_ref.rollout.profiler.all_replicas=False \
actor_rollout_ref.rollout.profiler.replicas=[0,2] \
actor_rollout_ref.rollout.profiler.all_ranks=False \
actor_rollout_ref.rollout.profiler.ranks=[0,2] \
"${@:2}"
2 changes: 1 addition & 1 deletion recipe
Submodule recipe updated 129 files
15 changes: 12 additions & 3 deletions tests/workers/rollout/rollout_trtllm/test_adapter.py
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.
import asyncio
import os
import subprocess
from unittest.mock import AsyncMock, Mock, patch

import aiohttp
Expand Down Expand Up @@ -142,7 +141,17 @@ def test_init_without_device_mesh(self):

try:
os.environ.setdefault("TLLM_RAY_FORCE_LOCAL_CLUSTER", "1")
ray.init(address="local", ignore_reinit_error=True, include_dashboard=False)
ray.init(
runtime_env={
"env_vars": {
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
"VLLM_LOGGING_LEVEL": "INFO",
"VLLM_USE_V1": "1",
}
},
ignore_reinit_error=True,
)

config_dir = os.path.abspath("verl/verl/trainer/config")
if not os.path.exists(config_dir):
Expand Down Expand Up @@ -187,5 +196,5 @@ def test_init_without_device_mesh(self):
os.environ.pop("RANK", None)
else:
os.environ["RANK"] = prev_rank
print("\nShutting down Ray...")
ray.shutdown()
subprocess.run(["ray", "stop"], capture_output=True)
29 changes: 24 additions & 5 deletions tests/workers/rollout/rollout_trtllm/test_async_server.py
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.

import os
import subprocess
import time
from unittest.mock import MagicMock, patch

Expand Down Expand Up @@ -170,7 +169,17 @@ def test_async_generate(self):
"""Test TRT-LLM generate method with real model."""
try:
os.environ.setdefault("TLLM_RAY_FORCE_LOCAL_CLUSTER", "1")
ray.init(address="local", ignore_reinit_error=True, include_dashboard=False)
ray.init(
runtime_env={
"env_vars": {
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
"VLLM_LOGGING_LEVEL": "INFO",
"VLLM_USE_V1": "1",
}
},
ignore_reinit_error=True,
)

rollout_config, model_config = self._build_rollout_config(response_length=50)

Expand Down Expand Up @@ -209,14 +218,24 @@ def test_async_generate(self):
print(f"Log probs: {result.log_probs[:10]}...") # Print first 10 log probs

finally:
print("\nShutting down Ray...")
ray.shutdown()
subprocess.run(["ray", "stop"], capture_output=True)

def test_async_memory_management(self):
"""Test TRT-LLM async memory management (sleep) reduces memory usage."""
try:
os.environ.setdefault("TLLM_RAY_FORCE_LOCAL_CLUSTER", "1")
ray.init(address="local", ignore_reinit_error=True, include_dashboard=False)
ray.init(
runtime_env={
"env_vars": {
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
"VLLM_LOGGING_LEVEL": "INFO",
"VLLM_USE_V1": "1",
}
},
ignore_reinit_error=True,
)

rollout_config, model_config = self._build_rollout_config(free_cache_engine=True)

Expand Down Expand Up @@ -271,5 +290,5 @@ def get_gpu_memory_mb_for_device(device_uuid: str) -> float:
)

finally:
print("\nShutting down Ray...")
ray.shutdown()
subprocess.run(["ray", "stop"], capture_output=True)
18 changes: 15 additions & 3 deletions verl/experimental/agent_loop/agent_loop.py
Expand Up @@ -866,7 +866,6 @@ def __init__(
self.config = config
self.worker_group = worker_group
self.reward_loop_worker_handles = reward_loop_worker_handles

# for recipe to change
if not hasattr(self, "rollout_replica_class"):
self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name)
Expand Down Expand Up @@ -900,6 +899,15 @@ def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool):
)
for replica_rank in range(num_replicas)
]
profiling_all_replicas = OmegaConf.select(self.config.actor_rollout_ref.rollout.profiler, "all_replicas")
profiling_replica_ranks = OmegaConf.select(self.config.actor_rollout_ref.rollout.profiler, "replicas")
self.profiling_replicas = (
self.rollout_replicas
if profiling_all_replicas
else [self.rollout_replicas[replica_rank] for replica_rank in profiling_replica_ranks]
if profiling_replica_ranks
else []
)

if self.worker_group and rollout_config.name != "trtllm":
self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas])
Expand Down Expand Up @@ -1000,14 +1008,18 @@ def clear_kv_cache(self):

def start_profile(self, **kwargs):
"""Start profiling on all rollout replicas."""
self._run_all([replica.start_profile(**kwargs) for replica in self.rollout_replicas])
self._run_all([replica.start_profile(**kwargs) for replica in self.profiling_replicas])

def stop_profile(self):
"""Stop profiling on all rollout replicas."""
self._run_all([replica.stop_profile() for replica in self.rollout_replicas])
self._run_all([replica.stop_profile() for replica in self.profiling_replicas])

def _run_all(self, tasks: list[asyncio.Task]):
async def run_all():
await asyncio.gather(*tasks)

asyncio.run(run_all())

def shutdown(self):
"""Shutdown all rollout replicas."""
self._run_all([replica.shutdown() for replica in self.rollout_replicas])
2 changes: 0 additions & 2 deletions verl/single_controller/ray/base.py
Expand Up @@ -457,8 +457,6 @@ def __init__(
self.profile_steps = kwargs.get("profile_steps", None)
self.worker_nsight_options = kwargs.get("worker_nsight_options", None)
self.customized_worker_env = kwargs.get("worker_env", {})
if self.worker_nsight_options is not None and self.worker_nsight_options["capture-range-end"] is None:
self.worker_nsight_options["capture-range-end"] = f"repeat-shutdown:{6 * len(self.profile_steps)}"

if worker_names is not None and (not self.fused_worker_used):
assert self._is_init_with_detached_workers
Expand Down
8 changes: 6 additions & 2 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
Expand Up @@ -113,7 +113,7 @@ actor_rollout_ref:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
enable: false
all_ranks: false
all_ranks: true
ranks: []
save_path: ${oc.select:global_profiler.save_path,null}
tool_config:
Expand Down Expand Up @@ -300,7 +300,10 @@ actor_rollout_ref:
profiler:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
global_tool_config: ${oc.select:global_profiler.global_tool_config,null}
enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
all_replicas: true
replicas: []
all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
save_path: ${oc.select:global_profiler.save_path,null}
Expand Down Expand Up @@ -724,6 +727,7 @@ global_profiler:
save_path: outputs/profile
global_tool_config:
nsys:
_target_: verl.utils.profiler.config.NsightToolConfig
discrete: false
controller_nsight_options:
trace: cuda,nvtx,cublas,ucx
Expand All @@ -734,7 +738,7 @@ global_profiler:
cuda-memory-usage: 'true'
cuda-graph-trace: graph
capture-range: cudaProfilerApi
capture-range-end: null
capture-range-end: repeat-shutdown:6
kill: none
torch_memory:
trace_alloc_max_entries: 100000
Expand Down
7 changes: 5 additions & 2 deletions verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml
Expand Up @@ -95,7 +95,7 @@ actor_rollout_ref:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
enable: false
all_ranks: false
all_ranks: true
ranks: []
save_path: ${oc.select:global_profiler.save_path,null}
tool_config:
Expand Down Expand Up @@ -289,7 +289,10 @@ actor_rollout_ref:
profiler:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
global_tool_config: ${oc.select:global_profiler.global_tool_config,null}
enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
all_replicas: true
replicas: []
all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
save_path: ${oc.select:global_profiler.save_path,null}
Expand Down Expand Up @@ -657,7 +660,7 @@ global_profiler:
cuda-memory-usage: 'true'
cuda-graph-trace: graph
capture-range: cudaProfilerApi
capture-range-end: null
capture-range-end: repeat-shutdown:6
kill: none
torch_memory:
trace_alloc_max_entries: 100000
Expand Down
7 changes: 5 additions & 2 deletions verl/trainer/config/_generated_ppo_trainer.yaml
Expand Up @@ -94,7 +94,7 @@ actor_rollout_ref:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
enable: false
all_ranks: false
all_ranks: true
ranks: []
save_path: ${oc.select:global_profiler.save_path,null}
tool_config:
Expand Down Expand Up @@ -288,7 +288,10 @@ actor_rollout_ref:
profiler:
_target_: verl.utils.profiler.ProfilerConfig
tool: ${oc.select:global_profiler.tool,null}
global_tool_config: ${oc.select:global_profiler.global_tool_config,null}
enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
all_replicas: true
replicas: []
all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
save_path: ${oc.select:global_profiler.save_path,null}
Expand Down Expand Up @@ -669,7 +672,7 @@ global_profiler:
cuda-memory-usage: 'true'
cuda-graph-trace: graph
capture-range: cudaProfilerApi
capture-range-end: null
capture-range-end: repeat-shutdown:6
kill: none
torch_memory:
trace_alloc_max_entries: 100000
Expand Down