From 9bf16faf21e7539326cf9cf185b43d025cef30b6 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 17:03:21 +0800 Subject: [PATCH 01/10] [BREAKING][rollout,cfg] refactor: get rid of actor_rollout_ref config from agent loop --- tests/experimental/agent_loop/agent_utils.py | 4 +- verl/experimental/agent_loop/agent_loop.py | 186 ++++++++++-------- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 39 ++-- .../_generated_ppo_megatron_trainer.yaml | 4 + .../_generated_ppo_torchtitan_trainer.yaml | 4 + .../config/_generated_ppo_trainer.yaml | 4 + .../config/_generated_ppo_veomni_trainer.yaml | 4 + verl/trainer/config/rollout/rollout.yaml | 12 ++ verl/workers/config/rollout.py | 4 + 10 files changed, 154 insertions(+), 113 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 20e6848746a..34f955faee1 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -80,7 +80,9 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG rm_resource_pool=rm_resource_pool, ) agent_loop_manager = AgentLoopManager( - config=config, + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, worker_group=actor_rollout_wg, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index b591d093696..7afeeda3aee 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -35,10 +35,9 @@ from verl.experimental.agent_loop.utils import resolve_config_path from verl.protocol import DataProto from verl.single_controller.ray.base import RayResourcePool, RayWorkerGroup -from verl.utils import hf_processor, hf_tokenizer from verl.utils.chat_template import initialize_system_prompt +from verl.utils.config import omega_conf_to_dataclass from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class -from verl.utils.fs import copy_to_local from verl.utils.model import compute_position_id_with_mask from verl.utils.ray_utils import get_event_loop from verl.utils.rollout_trace import ( @@ -47,6 +46,7 @@ rollout_trace_op, ) from verl.utils.transferqueue_utils import tqbridge +from verl.workers.config import HFModelConfig, RolloutConfig from verl.workers.rollout.replica import TokenOutput, get_rollout_replica_class logger = logging.getLogger(__file__) @@ -60,15 +60,17 @@ class AsyncLLMServerManager: - Sticky session: send multi-turn chat completions to same server for automatic prefix caching """ - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): + def __init__( + self, rollout_config: RolloutConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000 + ): """Initialize the AsyncLLMServerManager. Args: - config (DictConfig): YAML config. + rollout_config (RolloutConfig): rollout config. server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. 
""" - self.config = config + self.rollout_config = rollout_config self.server_handles = server_handles random.shuffle(self.server_handles) @@ -190,35 +192,34 @@ def __init__(self, config: DictConfig): class AgentLoopBase(ABC): """An agent loop takes an input message, chat with OpenAI compatible LLM server and interact with various - environments.""" + environments. + + Args: + rollout_config (RolloutConfig): rollout config. + server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. + processor (AutoProcessor): Processor for process messages. + dataset_cls (type[Dataset]): Dataset class for creating dataset, Defaults to RLHFDataset. + data_config (DictConfigWrap): Dataset config. + """ def __init__( self, - trainer_config: DictConfigWrap, + rollout_config: DictConfigWrap, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, processor: AutoProcessor, dataset_cls: type[RLHFDataset], - dataset_config: DictConfigWrap, + data_config: DictConfigWrap, **kwargs, ): - """Initialize agent loop, each sample will have its own loop instance. - - Args: - trainer_config (DictConfigWrap): trainer config. - server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - processor (AutoProcessor): Processor for process messages. - dataset_cls (type[Dataset]): Dataset class for creating dataset, Defaults to RLHFDataset. - dataset_config (DictConfigWrap): Dataset config. - """ - self.config = trainer_config.config + self.rollout_config = rollout_config.config self.server_manager = server_manager self.tokenizer = tokenizer self.processor = processor self.dataset_cls = dataset_cls - self.dataset_config = dataset_config.config - self.apply_chat_template_kwargs = self.dataset_config.get("apply_chat_template_kwargs", {}) + self.data_config = data_config.config + self.apply_chat_template_kwargs = self.data_config.get("apply_chat_template_kwargs", {}) self.system_prompt = initialize_system_prompt(self.tokenizer, **self.apply_chat_template_kwargs) self.loop = get_event_loop() @@ -234,7 +235,7 @@ async def process_vision_info(self, messages: list[dict]) -> dict: multi_modal_data = {} if self.processor is not None: images, videos = await self.dataset_cls.process_vision_info( - messages, image_patch_size=self.processor.image_processor.patch_size, config=self.dataset_config + messages, image_patch_size=self.processor.image_processor.patch_size, config=self.data_config ) if images is not None: multi_modal_data["images"] = images @@ -342,50 +343,53 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: class AgentLoopWorker: - """Agent loop worker takes a batch of messages and run each message in an agent loop.""" + """Agent loop worker takes a batch of messages and run each message in an agent loop. + + Args: + rollout_config (RolloutConfig): rollout config. + model_config (HFModelConfig): model config. + data_config (DictConfig): data config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. + reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. + """ def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - """Initialize agent loop manager. 
- Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. - reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. - """ - self.config = config + self.rollout_config: RolloutConfig = omega_conf_to_dataclass(rollout_config) + self.model_config: HFModelConfig = omega_conf_to_dataclass(model_config) + self.data_config = data_config # for recipe to change if not hasattr(self, "server_manager"): - self.server_manager = AsyncLLMServerManager(config, server_handles) + self.server_manager = AsyncLLMServerManager(self.rollout_config, server_handles) - self.dataset_cls = get_dataset_class(config.data) + self.dataset_cls = get_dataset_class(data_config) self.reward_loop_worker_handles = reward_loop_worker_handles - model_path = config.actor_rollout_ref.model.path - self.model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(config.actor_rollout_ref.model.path) - self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) - self.processor = hf_processor(local_path, trust_remote_code=True) + self.tokenizer = self.model_config.tokenizer + self.processor = self.model_config.processor - agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path + agent_loop_config_path = self.rollout_config.agent.agent_loop_config_path if agent_loop_config_path: resolved_path = resolve_config_path(agent_loop_config_path) agent_loop_configs = OmegaConf.load(resolved_path) for agent_loop_config in agent_loop_configs: _agent_loop_registry[agent_loop_config.name] = agent_loop_config - if self.config.actor_rollout_ref.model.get("custom_chat_template", None) is not None: - if self.processor is not None: - self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template + if self.model_config.get("custom_chat_template", None) is not None: + if self.model_config.processor is not None: + self.model_config.processor.chat_template = self.model_config.custom_chat_template + self.model_config.tokenizer.chat_template = self.model_config.custom_chat_template - trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) + trace_config = self.rollout_config.trace RolloutTraceConfig.init( - self.config.trainer.project_name, - self.config.trainer.experiment_name, + self.rollout_config.trace.project_name, + self.rollout_config.trace.experiment_name, trace_config.get("backend"), trace_config.get("token2text", False), trace_config.get("max_samples_per_step_per_worker", None), @@ -413,7 +417,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->| response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0| """ - config = self.config.actor_rollout_ref.rollout + config = self.rollout_config sampling_params = dict( temperature=config.temperature, top_p=config.top_p, @@ -497,12 +501,12 @@ async def _run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=DictConfigWrap(config=self.config), + rollout_config=DictConfigWrap(self.rollout_config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, dataset_cls=self.dataset_cls, - dataset_config=DictConfigWrap(self.config.data), + 
data_config=DictConfigWrap(self.data_config),
         )
         output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
         return await self._agent_loop_postprocess(output, **kwargs)
@@ -536,7 +540,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         prompt_output = self.tokenizer.pad(
             {"input_ids": output.prompt_ids},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.prompt_length,
+            max_length=self.rollout_config.prompt_length,
             return_tensors="pt",
             return_attention_mask=True,
         )
@@ -548,7 +552,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         response_output = self.tokenizer.pad(
             {"input_ids": output.response_ids},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            max_length=self.rollout_config.response_length,
             return_tensors="pt",
             return_attention_mask=True,
         )
@@ -559,7 +563,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         response_mask_output = self.tokenizer.pad(
             {"input_ids": output.response_mask},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            max_length=self.rollout_config.response_length,
             return_tensors="pt",
             return_attention_mask=False,
         )
@@ -568,7 +572,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
 
         response_logprobs = None
         if output.response_logprobs is not None:
-            pad_size = self.config.actor_rollout_ref.rollout.response_length - len(output.response_logprobs)
+            pad_size = self.rollout_config.response_length - len(output.response_logprobs)
             response_logprobs = torch.tensor(output.response_logprobs + [0.0] * pad_size).unsqueeze(0)
 
         response_mask = response_mask_output["input_ids"] * response_output["attention_mask"]
@@ -846,67 +850,77 @@ async def get_trajectory_info(step, index, validate):
 
 
 class AgentLoopManager:
-    """Agent loop manager that manages a group of agent loop workers."""
+    """Agent loop manager that manages a group of agent loop workers.
+
+    - If worker_group is not None, the rollout server runs in hybrid mode and shares GPUs with the training engine.
+    - Otherwise, the rollout server runs in standalone mode on separate GPUs, e.g., for one-step-off/fully async training.
+
+    Args:
+        rollout_config (RolloutConfig): rollout config.
+        model_config (HFModelConfig): model config.
+        data_config (DictConfig): data config.
+        worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
+        rollout_resource_pool (RayResourcePool): Resource pool for hybrid mode; only used by TensorRT-LLM.
+        reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
+    """
 
     def __init__(
         self,
-        config: DictConfig,
+        rollout_config: RolloutConfig,
+        model_config: HFModelConfig,
+        data_config: DictConfig,
         worker_group: RayWorkerGroup = None,
         rollout_resource_pool: RayResourcePool = None,
         reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
     ):
-        """Initialize agent loop manager.
+        assert worker_group is not None or rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode"
 
-        Args:
-            config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
-            rollout_resource_pool (RayResourcePool): Resource pool for actor rollout (Colocate or Standalone mode).
-            reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
- """ - self.config = config + self.rollout_config = rollout_config + self.model_config = model_config + self.data_config = data_config self.worker_group = worker_group + self.rollout_resource_pool = rollout_resource_pool self.reward_loop_worker_handles = reward_loop_worker_handles # for recipe to change if not hasattr(self, "rollout_replica_class"): - self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name) + self.rollout_replica_class = get_rollout_replica_class(self.rollout_config.name) if not hasattr(self, "agent_loop_workers_class"): self.agent_loop_workers_class = ray.remote(AgentLoopWorker) - self._initialize_llm_servers(rollout_resource_pool) + self._initialize_llm_servers() self._init_agent_loop_workers() - def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): + def _initialize_llm_servers(self): rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size + self.rollout_config.tensor_model_parallel_size + * self.rollout_config.data_parallel_size + * self.rollout_config.pipeline_model_parallel_size ) world_size = ( self.worker_group.world_size if self.worker_group - else self.config.trainer.n_gpus_per_node * self.config.trainer.nnodes + else self.rollout_config.n_gpus_per_node * self.rollout_config.nnodes ) num_replicas = world_size // rollout_world_size - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model self.rollout_replicas = [ self.rollout_replica_class( replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.trainer.n_gpus_per_node, + config=self.rollout_config, + model_config=self.model_config, + gpus_per_node=self.rollout_config.n_gpus_per_node, ) for replica_rank in range(num_replicas) ] - if self.worker_group and rollout_config.name != "trtllm": + if self.worker_group and self.rollout_config.name != "trtllm": self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - elif self.worker_group and rollout_config.name == "trtllm": + # TODO: unify trtllm to init_hybrid + elif self.worker_group and self.rollout_config.name == "trtllm": self._run_all( [ - server.init_hybrid_colocated(self.worker_group, rollout_resource_pool) + server.init_hybrid_colocated(self.worker_group, self.rollout_resource_pool) for server in self.rollout_replicas ] ) @@ -919,14 +933,14 @@ def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): print(f"AgentLoopManager: {self.server_addresses}") # Update Prometheus configuration with server addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: + if self.rollout_config.prometheus.enable: + if self.rollout_config.disable_log_stats: raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - update_prometheus_config(rollout_config.prometheus, self.server_addresses, rollout_config.name) + update_prometheus_config(self.rollout_config.prometheus, self.server_addresses, self.rollout_config.name) def _init_agent_loop_workers(self): self.agent_loop_workers = [] - num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers + num_workers = self.rollout_config.agent.num_workers node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] for i in range(num_workers): 
@@ -938,7 +952,13 @@ def _init_agent_loop_workers(self): scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=True ), - ).remote(self.config, self.server_handles, self.reward_loop_worker_handles) + ).remote( + self.rollout_config, + self.model_config, + self.data_config, + self.server_handles, + self.reward_loop_worker_handles, + ) ) def generate_sequences(self, prompts: DataProto) -> DataProto: diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index 40c60743281..2a5831db771 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -30,10 +30,10 @@ class SingleTurnAgentLoop(AgentLoopBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length - tool_config_path = self.config.data.tool_config_path + tool_config_path = self.rollout_config.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index ee6176775e0..d8b0f11e88d 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -21,13 +21,10 @@ import torch from PIL import Image -from transformers import AutoProcessor, AutoTokenizer from verl.experimental.agent_loop.agent_loop import ( AgentLoopBase, AgentLoopOutput, - AsyncLLMServerManager, - DictConfigWrap, register, ) from verl.experimental.agent_loop.tool_parser import FunctionCall, ToolParser @@ -96,37 +93,27 @@ def __init__( @register("tool_agent") class ToolAgentLoop(AgentLoopBase): - def __init__( - self, - trainer_config: DictConfigWrap, - server_manager: AsyncLLMServerManager, - tokenizer: AutoTokenizer, - processor: AutoProcessor, - **kwargs, - ): - super().__init__(trainer_config, server_manager, tokenizer, processor, **kwargs) - config = trainer_config.config + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) # Initialize tools from config file - self.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns - self.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns - self.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls - self.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length - self.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side - tool_config_path = config.actor_rollout_ref.rollout.multi_turn.tool_config_path + self.max_user_turns = self.rollout_config.multi_turn.max_user_turns + self.max_assistant_turns = self.rollout_config.multi_turn.max_assistant_turns + self.max_parallel_calls = self.rollout_config.multi_turn.max_parallel_calls + self.max_tool_response_length = self.rollout_config.multi_turn.max_tool_response_length + self.tool_response_truncate_side = self.rollout_config.multi_turn.tool_response_truncate_side + tool_config_path = 
self.rollout_config.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] self.tools = {tool.name: tool for tool in tool_list} self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] - self.tool_parser = ToolParser.get_tool_parser( - config.actor_rollout_ref.rollout.multi_turn.format, self.tokenizer - ) - self.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format + self.tool_parser = ToolParser.get_tool_parser(self.rollout_config.multi_turn.format, self.tokenizer) + self.tool_parser_name = self.rollout_config.multi_turn.format - self.prompt_length = config.actor_rollout_ref.rollout.prompt_length - self.response_length = config.actor_rollout_ref.rollout.response_length + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length # Initialize interactions from config file - self.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path + self.interaction_config_file = self.rollout_config.multi_turn.interaction_config_path if self.interaction_config_file: self.interaction_map: dict[str, BaseInteraction] = self._initialize_interactions( self.interaction_config_file diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index ea60c881619..09391ec6af3 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -216,6 +216,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -290,6 +292,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml index b9a8b3aaf84..b923da853ec 100644 --- a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml @@ -205,6 +205,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -279,6 +281,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index 6b97103ae9f..1cdc21b1ec8 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -204,6 +204,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? 
mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -278,6 +280,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml index 4528e0d667d..ccaf6582902 100644 --- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml @@ -186,6 +186,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -260,6 +262,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index e1a4d2dad6d..894538d1d87 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -7,6 +7,12 @@ name: ??? # sync: LLM, async: AsyncLLM mode: async +# Number of nodes for standalone rollout server, must be > 0 in one-step-off/fully async training. +nnodes: 0 + +# Number of GPUs per node for rollout server. +n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} + # Sampling temperature for rollout. temperature: 1.0 @@ -273,6 +279,12 @@ trace: # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs _target_: verl.workers.config.TraceConfig + # Project name for experiment tracking (e.g., wandb) + project_name: ${oc.select:trainer.project_name,null} + + # Experiment name for run identification in tracking tools + experiment_name: ${oc.select:trainer.experiment_name,null} + # trace backend, support mlflow, weave backend: null diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 8d0d732e263..d1d5c8f1768 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -80,6 +80,8 @@ class AgentLoopConfig(BaseConfig): @dataclass class TraceConfig(BaseConfig): + project_name: Optional[str] = None + experiment_name: Optional[str] = None backend: Optional[str] = None token2text: bool = False max_samples_per_step_per_worker: Optional[int] = None @@ -138,6 +140,8 @@ class RolloutConfig(BaseConfig): name: Optional[str] = MISSING mode: str = "async" + nnodes: int = 0 + n_gpus_per_node: int = 8 temperature: float = 1.0 top_k: int = -1 From 05f879c1d30005ed0d33020523f9e95657adf9a4 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 17:56:02 +0800 Subject: [PATCH 02/10] one_step_off_policy --- tests/special_e2e/run_one_step_off_policy.sh | 4 +- .../agent_loop/agent_loop.py | 50 ------------------- .../one_step_off_ppo_megatron_trainer.yaml | 7 --- .../config/one_step_off_ppo_trainer.yaml | 7 --- .../one_step_off_policy/ray_trainer.py | 5 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_ris.sh | 4 +- .../shell/dapo_7b_math_fsdp2_sglang_4_12.sh | 4 +- .../shell/dapo_7b_math_megatron_4_12.sh | 4 +- 
.../shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh | 4 +- .../shell/grpo_3b_gsm8k_fsdp2_2_6.sh | 4 +- .../grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh | 4 +- 14 files changed, 24 insertions(+), 85 deletions(-) diff --git a/tests/special_e2e/run_one_step_off_policy.sh b/tests/special_e2e/run_one_step_off_policy.sh index bdcba5caaaf..9bbe16045c2 100755 --- a/tests/special_e2e/run_one_step_off_policy.sh +++ b/tests/special_e2e/run_one_step_off_policy.sh @@ -90,6 +90,8 @@ common_params=( actor_rollout_ref.rollout.val_kwargs.n=1 actor_rollout_ref.rollout.enable_chunked_prefill=True actor_rollout_ref.rollout.name=vllm + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} actor_rollout_ref.rollout.checkpoint_engine.backend='nccl' actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=1024 reward.reward_manager.name=dapo @@ -109,8 +111,6 @@ common_params=( trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} ) diff --git a/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py b/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py index 2ae476df4da..85455d655b2 100644 --- a/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py +++ b/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py @@ -18,9 +18,7 @@ import ray from verl.experimental.agent_loop.agent_loop import AgentLoopManager -from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config from verl.protocol import DataProto -from verl.single_controller.ray import RayResourcePool logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -56,54 +54,6 @@ async def generate_sequences_async(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing, **outputs[0].meta_info} return output - def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): - rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size - ) - world_size = ( - self.worker_group.world_size - if self.worker_group - else self.config.rollout.n_gpus_per_node * self.config.rollout.nnodes - ) - num_replicas = world_size // rollout_world_size - - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model - self.rollout_replicas = [ - self.rollout_replica_class( - replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.rollout.n_gpus_per_node, - ) - for replica_rank in range(num_replicas) - ] - - if self.worker_group and rollout_config.name != "trtllm": - self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - elif self.worker_group and rollout_config.name == "trtllm": - self._run_all( - [ - server.init_hybrid_colocated(self.worker_group, rollout_resource_pool) - for server in self.rollout_replicas - ] - ) - else: - self._run_all([server.init_standalone() for server in self.rollout_replicas]) - - self.server_handles = [server._server_handle for server in self.rollout_replicas] - self.server_addresses = [server._server_address for server in self.rollout_replicas] - - print(f"AgentLoopManager: {self.server_addresses}") - - # Update Prometheus configuration with server 
addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: - raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - update_prometheus_config(rollout_config.prometheus, self.server_addresses, rollout_config.name) - async def wake_up(self): await asyncio.gather(*[replica.wake_up() for replica in self.rollout_replicas]) diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml index 0e4677be368..19d77597dc1 100644 --- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml +++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml @@ -9,13 +9,6 @@ defaults: trainer: use_legacy_worker_impl: disable -# config for the rollout (only for resource isolation) -rollout: - # Number of nodes used in the rollout - nnodes: 1 - # Number of GPUs per node - n_gpus_per_node: 8 - # To adapt to the current logic of AgentLoopManager actor_rollout_ref: rollout: diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml index dc784b2ae73..1a74af3df34 100644 --- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml +++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml @@ -9,13 +9,6 @@ defaults: trainer: use_legacy_worker_impl: disable -# config for the rollout (only for resource isolation) -rollout: - # Number of nodes used in the rollout - nnodes: 1 - # Number of GPUs per node - n_gpus_per_node: 8 - # To adapt to the current logic of AgentLoopManager actor_rollout_ref: rollout: diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py index 70a2a3d3d90..caba2cf6ad3 100644 --- a/verl/experimental/one_step_off_policy/ray_trainer.py +++ b/verl/experimental/one_step_off_policy/ray_trainer.py @@ -183,7 +183,10 @@ def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = OneStepOffAgentLoopManager( - config=self.config, reward_loop_worker_handles=reward_loop_worker_handles + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, + reward_loop_worker_handles=reward_loop_worker_handles, ) def _create_continuous_iterator(self): diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh index cbefe87424b..4df41235c03 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -135,5 +135,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh index c35513cf9f2..e785e02c6e7 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh +++ 
b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -136,5 +136,5 @@ python -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh index 10ce9122269..6a462aeca91 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh @@ -146,8 +146,8 @@ python -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ algorithm.rollout_correction.rollout_is=null \ algorithm.rollout_correction.rollout_is_threshold=null \ algorithm.rollout_correction.rollout_rs=seq_mean_k1 \ diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh index 2725bb5bc3d..c92a2ad6bca 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh @@ -136,5 +136,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh index a0da86affea..03fb457c090 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh @@ -142,5 +142,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh index facabdf58e8..971e77e583e 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh @@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - 
rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh index 5c959f49961..6a5338e2269 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh @@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh index c5c5eb11d2a..935869c0575 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh @@ -60,5 +60,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh index d6f884ad53a..756c4009ad1 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh @@ -89,5 +89,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file From d23b2829e523282094708b8ac16316878a5b0554 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 20:23:38 +0800 Subject: [PATCH 03/10] fully_async --- tests/experimental/agent_loop/agent_utils.py | 2 +- ...t_agent_loop_extra_fields_schema_on_cpu.py | 16 ++-- .../test_agent_reward_loop_colocate.py | 7 +- .../test_agent_reward_loop_standalone.py | 7 +- tests/special_e2e/run_fully_async_policy.sh | 4 +- tests/special_npu/run_fully_async_policy.sh | 4 +- tests/special_npu/run_one_step_off_policy.sh | 4 +- verl/experimental/agent_loop/agent_loop.py | 62 ++++++++------ .../experimental/fully_async_policy/README.md | 8 +- .../fully_async_policy/README_zh.md | 8 +- .../agent_loop/agent_loop.py | 82 ++++--------------- .../partial_single_turn_agent_loop.py | 6 +- .../config/fully_async_ppo_trainer.yaml | 6 -- .../fully_async_policy/fully_async_main.py | 10 ++- .../fully_async_rollouter.py | 8 +- 
.../fully_async_policy/fully_async_trainer.py | 8 +- .../shell/dapo_30b_a3b_base_math_fsdp.sh | 4 +- .../shell/dapo_7b_async_retool.sh | 4 +- .../shell/dapo_7b_math_fsdp2_16_16.sh | 4 +- .../shell/dapo_7b_math_fsdp2_32_32.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_4.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_mis.sh | 4 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 4 +- .../shell/geo3k_qwen25vl_7b_megatron_4_4.sh | 4 +- .../grpo_30b_a3b_base_math_megatron_96_32.sh | 4 +- ...po_30b_a3b_base_math_megatron_96_32_mis.sh | 4 +- verl/trainer/ppo/ray_trainer.py | 6 +- 29 files changed, 138 insertions(+), 158 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 34f955faee1..5103ff9c357 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -79,7 +79,7 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG config=config, rm_resource_pool=rm_resource_pool, ) - agent_loop_manager = AgentLoopManager( + agent_loop_manager = AgentLoopManager.create( rollout_config=config.actor_rollout_ref.rollout, model_config=config.actor_rollout_ref.model, data_config=config.data, diff --git a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py index f8bda825ab2..a5c5ab3dde3 100644 --- a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py +++ b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py @@ -147,7 +147,9 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( # Minimal config surface used by the agent loops. 
config = OmegaConf.create( { - "actor_rollout_ref": {"rollout": {"prompt_length": 16, "response_length": 16}}, + "actor_rollout_ref": { + "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}} + }, "data": { "tool_config_path": None, "apply_chat_template_kwargs": {}, @@ -159,24 +161,24 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( tokenizer = _FakeTokenizer() processor = None - trainer_config = DictConfigWrap(config) - dataset_config = DictConfigWrap(config.data) + rollout_config = DictConfigWrap(config.actor_rollout_ref.rollout) + data_config = DictConfigWrap(config.data) single_turn = SingleTurnAgentLoop( - trainer_config=trainer_config, + rollout_config=rollout_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, dataset_cls=RLHFDataset, - dataset_config=dataset_config, + data_config=data_config, ) partial_single_turn = PartialSingleTurnAgentLoop( - trainer_config=trainer_config, + rollout_config=rollout_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, dataset_cls=RLHFDataset, - dataset_config=dataset_config, + data_config=data_config, ) raw_prompt = [{"role": "user", "content": "hi"}] diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py index 0e4e6b93683..1cf1014602e 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py @@ -98,7 +98,12 @@ def test_agent_reward_loop_standalone(): ) actor_rollout_wg.init_model() - agent_loop_manager = AgentLoopManager(config, worker_group=actor_rollout_wg) + agent_loop_manager = AgentLoopManager.create( + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, + worker_group=actor_rollout_wg, + ) # sleep rollout replicas checkpoint_manager = CheckpointEngineManager( config=omega_conf_to_dataclass(config.actor_rollout_ref.rollout.checkpoint_engine), diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py index bd9011b9874..b9c1f9a3f9d 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py @@ -76,8 +76,11 @@ def test_agent_reward_loop_standalone(): # 1. init reward model manager reward_loop_manager = RewardLoopManager(config) - agent_loop_manager = AgentLoopManager( - config=config, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers + agent_loop_manager = AgentLoopManager.create( + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, + reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) # 2. 
init test data diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 01d807ba63a..4f7882b60d4 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -124,8 +124,8 @@ common_params=( trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} trainer.log_val_generations=10 - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 rollout.test_freq=${test_freq} diff --git a/tests/special_npu/run_fully_async_policy.sh b/tests/special_npu/run_fully_async_policy.sh index fa517e81ae4..e5908798bcf 100644 --- a/tests/special_npu/run_fully_async_policy.sh +++ b/tests/special_npu/run_fully_async_policy.sh @@ -124,8 +124,8 @@ common_params=( trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} trainer.log_val_generations=10 - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 rollout.test_freq=${test_freq} diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh index 2426a380fec..4c1ad9ce204 100644 --- a/tests/special_npu/run_one_step_off_policy.sh +++ b/tests/special_npu/run_one_step_off_policy.sh @@ -108,8 +108,8 @@ common_params=( trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_npus_training} - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_npus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_npus_rollout} ) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 7afeeda3aee..27d31458b88 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -39,7 +39,7 @@ from verl.utils.config import omega_conf_to_dataclass from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class from verl.utils.model import compute_position_id_with_mask -from verl.utils.ray_utils import get_event_loop +from verl.utils.ray_utils import auto_await, get_event_loop from verl.utils.rollout_trace import ( RolloutTraceConfig, rollout_trace_attr, @@ -888,10 +888,26 @@ def __init__( if not hasattr(self, "agent_loop_workers_class"): self.agent_loop_workers_class = ray.remote(AgentLoopWorker) - self._initialize_llm_servers() - self._init_agent_loop_workers() + @classmethod + @auto_await + async def create( + cls, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, + worker_group: RayWorkerGroup = None, + rollout_resource_pool: RayResourcePool = None, + reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, + ): + """Create agent loop manager.""" + instance = cls( + rollout_config, model_config, data_config, worker_group, rollout_resource_pool, reward_loop_worker_handles + ) + await instance._initialize_llm_servers() + await instance._init_agent_loop_workers() + return instance - def _initialize_llm_servers(self): + async def _initialize_llm_servers(self): rollout_world_size = ( self.rollout_config.tensor_model_parallel_size * self.rollout_config.data_parallel_size @@ -915,17 +931,17 @@ def _initialize_llm_servers(self): ] if self.worker_group and self.rollout_config.name != "trtllm": - 
self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) + await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) # TODO: unify trtllm to init_hybrid elif self.worker_group and self.rollout_config.name == "trtllm": - self._run_all( - [ + await asyncio.gather( + *[ server.init_hybrid_colocated(self.worker_group, self.rollout_resource_pool) for server in self.rollout_replicas ] ) else: - self._run_all([server.init_standalone() for server in self.rollout_replicas]) + await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas]) self.server_handles = [server._server_handle for server in self.rollout_replicas] self.server_addresses = [server._server_address for server in self.rollout_replicas] @@ -938,7 +954,7 @@ def _initialize_llm_servers(self): raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") update_prometheus_config(self.rollout_config.prometheus, self.server_addresses, self.rollout_config.name) - def _init_agent_loop_workers(self): + async def _init_agent_loop_workers(self): self.agent_loop_workers = [] num_workers = self.rollout_config.agent.num_workers @@ -961,7 +977,8 @@ def _init_agent_loop_workers(self): ) ) - def generate_sequences(self, prompts: DataProto) -> DataProto: + @auto_await + async def generate_sequences(self, prompts: DataProto) -> DataProto: """Split input batch and dispatch to agent loop workers. Args: @@ -972,8 +989,8 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: """ chunkes = prompts.chunk(len(self.agent_loop_workers)) - outputs = ray.get( - [ + outputs = await asyncio.gather( + *[ worker.generate_sequences.remote(chunk) for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) ] @@ -1014,20 +1031,17 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data return timing - def clear_kv_cache(self): + @auto_await + async def clear_kv_cache(self): """Clear all rollout kv cache, but don`t sleep.""" - self._run_all([replica.clear_kv_cache() for replica in self.rollout_replicas]) + await asyncio.gather(*[replica.clear_kv_cache() for replica in self.rollout_replicas]) - def start_profile(self, **kwargs): + @auto_await + async def start_profile(self, **kwargs): """Start profiling on all rollout replicas.""" - self._run_all([replica.start_profile(**kwargs) for replica in self.rollout_replicas]) + await asyncio.gather(*[replica.start_profile(**kwargs) for replica in self.rollout_replicas]) - def stop_profile(self): + @auto_await + async def stop_profile(self): """Stop profiling on all rollout replicas.""" - self._run_all([replica.stop_profile() for replica in self.rollout_replicas]) - - def _run_all(self, tasks: list[asyncio.Task]): - async def run_all(): - await asyncio.gather(*tasks) - - asyncio.run(run_all()) + await asyncio.gather(*[replica.stop_profile() for replica in self.rollout_replicas]) diff --git a/verl/experimental/fully_async_policy/README.md b/verl/experimental/fully_async_policy/README.md index b7ff1756459..311e8dfc0ea 100644 --- a/verl/experimental/fully_async_policy/README.md +++ b/verl/experimental/fully_async_policy/README.md @@ -92,8 +92,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|------------------------------------------------------------------------------------------------| | `trainer.nnodes` | Number of nodes for Trainer | | 
`trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | -| `rollout.nnodes` | Number of nodes for Rollouter | -| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `actor_rollout_ref.rollout.nnodes` | Number of nodes for Rollouter | +| `actor_rollout_ref.rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | | `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | | `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | | `rollout.total_rollout_steps` | Total number of rollout samples | @@ -313,8 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/README_zh.md b/verl/experimental/fully_async_policy/README_zh.md index ad2e52e4167..6861114debe 100644 --- a/verl/experimental/fully_async_policy/README_zh.md +++ b/verl/experimental/fully_async_policy/README_zh.md @@ -69,8 +69,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|-----------------------------------------------------------------| | `trainer.nnodes` | Trainer的node数量 | | `trainer.n_gpus_per_node` | Trainer每个node上gpu的数量 | -| `rollout.nnodes` | Rollouter的node数量 | -| `rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | +| `actor_rollout_ref.rollout.nnodes` | Rollouter的node数量 | +| `actor_rollout_ref.rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | | `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | | `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | | `rollout.total_rollout_steps` | 总的rollout的sample数量 | @@ -256,8 +256,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py index 9240000c61c..c545032fece 100644 --- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py @@ -30,13 +30,13 @@ _agent_loop_registry, get_trajectory_info, ) -from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config from verl.protocol import DataProto -from verl.single_controller.ray import RayWorkerGroup +from verl.single_controller.ray import RayResourcePool, RayWorkerGroup from verl.utils.rollout_trace import ( rollout_trace_attr, rollout_trace_op, ) +from verl.workers.config import HFModelConfig, RolloutConfig logger = 
logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -81,12 +81,14 @@ async def generate_for_partial( class FullyAsyncAgentLoopWorker(AgentLoopWorker): def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.server_manager = FullyAsyncLLMServerManager(config, server_handles) - super().__init__(config, server_handles, reward_loop_worker_handles) + self.server_manager = FullyAsyncLLMServerManager(rollout_config, server_handles) + super().__init__(rollout_config, model_config, data_config, server_handles, reward_loop_worker_handles) # A shared cancellation event for all agent loops running on this worker. self.cancellation_event = asyncio.Event() @@ -102,7 +104,7 @@ async def generate_sequences_no_post( Returns: list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. """ - config = self.config.actor_rollout_ref.rollout + config = self.rollout_config sampling_params = dict( temperature=config.temperature, top_p=config.top_p, @@ -217,17 +219,22 @@ async def resume_agent_loops(self): class FullyAsyncAgentLoopManager(AgentLoopManager): def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, worker_group: RayWorkerGroup = None, + rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.config = config + self.rollout_config = rollout_config + self.model_config = model_config + self.data_config = data_config self.worker_group = worker_group self.reward_loop_worker_handles = reward_loop_worker_handles self.agent_loop_workers_class = FullyAsyncAgentLoopWorker # Select rollout replica class based on rollout name - rollout_name = config.actor_rollout_ref.rollout.name + rollout_name = rollout_config.name if rollout_name == "sglang": from verl.experimental.fully_async_policy.sglang_rollout.sglang_async_server import FullyAsyncSGLangReplica @@ -246,63 +253,6 @@ def __init__( self.server_addresses = None self.agent_loop_workers = None - @classmethod - async def create( - cls, - config: DictConfig, - worker_group: RayWorkerGroup = None, - reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, - ): - instance = cls(config, worker_group, reward_loop_worker_handles) - await instance._async_init() - return instance - - async def _async_init(self): - await self._initialize_llm_servers_async() - self._init_agent_loop_workers() - - async def _initialize_llm_servers_async(self): - rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size - ) - world_size = ( - self.worker_group.world_size - if self.worker_group - else self.config.rollout.n_gpus_per_node * self.config.rollout.nnodes - ) - num_replicas = world_size // rollout_world_size - - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model - self.rollout_replicas = [ - self.rollout_replica_class( - replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.rollout.n_gpus_per_node, - ) - for replica_rank in range(num_replicas) - ] - - if self.worker_group: - await 
asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - else: - await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas]) - - self.server_handles = [server._server_handle for server in self.rollout_replicas] - self.server_addresses = [server._server_address for server in self.rollout_replicas] - - print(f"AgentLoopManager: {self.server_addresses}") - # Update Prometheus configuration with server addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: - raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - await asyncio.to_thread( - update_prometheus_config, rollout_config.prometheus, self.server_addresses, rollout_config.name - ) - async def generate_single_sample_async( self, sample: DataProto, diff --git a/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index b0aef45bd67..6982184f8f6 100644 --- a/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -30,9 +30,9 @@ class PartialSingleTurnAgentLoop(AgentLoopBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length - self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {}) + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length + self.apply_chat_template_kwargs = self.data_config.get("apply_chat_template_kwargs", {}) async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: output: Optional[AgentLoopOutput] = kwargs.get("output", None) diff --git a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml index 1f4b4db8c82..d4cff51de3b 100644 --- a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -30,12 +30,6 @@ async_training: # Rollout config rollout: - # Number of nodes used in the rollout - nnodes: 1 - - # Number of GPUs per node - n_gpus_per_node: 8 - # number of responses (i.e. num sample times). 
> 1 for grpo n: 4 diff --git a/verl/experimental/fully_async_policy/fully_async_main.py b/verl/experimental/fully_async_policy/fully_async_main.py index fe43abb1b6e..80332364ad1 100644 --- a/verl/experimental/fully_async_policy/fully_async_main.py +++ b/verl/experimental/fully_async_policy/fully_async_main.py @@ -59,10 +59,14 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: # Rollout resource pool if Role.Rollout in roles: - assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" - assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" + assert config.actor_rollout_ref.rollout.n_gpus_per_node > 0, ( + "config.actor_rollout_ref.rollout.n_gpus_per_node must be greater than 0" + ) + assert config.actor_rollout_ref.rollout.nnodes > 0, ( + "config.actor_rollout_ref.rollout.nnodes must be greater than 0" + ) - rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes + rollout_pool = [config.actor_rollout_ref.rollout.n_gpus_per_node] * config.actor_rollout_ref.rollout.nnodes resource_pool_spec["rollout_pool"] = rollout_pool mapping[Role.Rollout] = "rollout_pool" diff --git a/verl/experimental/fully_async_policy/fully_async_rollouter.py b/verl/experimental/fully_async_policy/fully_async_rollouter.py index 4810a3730da..5ad6bd503ce 100644 --- a/verl/experimental/fully_async_policy/fully_async_rollouter.py +++ b/verl/experimental/fully_async_policy/fully_async_rollouter.py @@ -104,7 +104,7 @@ def __init__( self._validate_config() if self.config.async_training.use_trainer_do_validate: - rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node + rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node train_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node total_gpus = rollout_gpus + train_gpus print(f"[FullyAsyncRollouter] split before val_dataset total len: {len(val_dataset)}") @@ -444,7 +444,11 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - config=self.config, worker_group=self.rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, + worker_group=self.rollout_wg, + reward_loop_worker_handles=reward_loop_worker_handles, ) # Add samples to the pending_queue diff --git a/verl/experimental/fully_async_policy/fully_async_trainer.py b/verl/experimental/fully_async_policy/fully_async_trainer.py index 9519c594dbd..31df91f4e44 100644 --- a/verl/experimental/fully_async_policy/fully_async_trainer.py +++ b/verl/experimental/fully_async_policy/fully_async_trainer.py @@ -137,7 +137,7 @@ def __init__( self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node - + config.rollout.nnodes * config.rollout.n_gpus_per_node + + config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node ) self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus) @@ -147,7 +147,7 @@ def __init__( from verl.utils.dataset.rl_dataset import collate_fn val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) - rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node + rollout_gpus = 
config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node print(f"[FullyAsyncTrainer] split before val_dataset total len: {len(val_dataset)}") split_dataset = val_dataset.split(total_gpus) rollout_val_dataset0 = split_dataset[rollout_gpus:] @@ -311,7 +311,9 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - config=self.config, + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, worker_group=self.actor_rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles, ) diff --git a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh index cc936f50dc1..209930aeb59 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh @@ -176,8 +176,8 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.resume_mode=auto \ trainer.nnodes="${n_nodes_train}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${n_nodes_rollout}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${n_nodes_rollout}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq=${test_freq} \ rollout.total_epochs=10 \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh index 2a5eb1bb966..a94387f2af9 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh @@ -129,8 +129,8 @@ python3 -m verl.experimental.fully_async_policy.fully_async_main \ data.gen_batch_size=${gen_prompt_bsz} \ trainer.nnodes=$NNODES \ trainer.n_gpus_per_node=$n_gpus_training \ - rollout.nnodes=$NNODES \ - rollout.n_gpus_per_node=$n_gpus_rollout \ + actor_rollout_ref.rollout.nnodes=$NNODES \ + actor_rollout_ref.rollout.n_gpus_per_node=$n_gpus_rollout \ rollout.total_rollout_steps=$total_rollout_steps \ rollout.total_epochs=10 \ rollout.test_freq=$test_freq \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh index ba8e6804fdb..1dcb5018c68 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh index 5561208ee6d..6577caada6e 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh +++ 
b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index 242a5117a5e..9823231aed1 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -153,8 +153,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh index ee0657eace7..aef1bac704d 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh @@ -152,8 +152,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh index 002c1206b8a..4a273c2c8ba 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh index f01fb8184e7..e1146d79d26 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh @@ -156,8 +156,8 
@@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index 2b2143ffa21..18291a62bf7 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh index 8b32c6e0078..741c695de0b 100644 --- a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh +++ b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh @@ -99,8 +99,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs="${total_epochs}" \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh index ebcb634ff72..1b95a5becd8 100644 --- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh +++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh @@ -217,8 +217,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh index c04a09d3266..3ea5196f1c6 100644 --- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh +++ 
b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
@@ -226,8 +226,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
+    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 9d6560881be..8ecaa1c0c70 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -831,8 +831,10 @@ def init_workers(self):
         # if enable_agent_reward_loop, we directly pass reward_loop_workers to agent loop manager
         # to stream reward computation with actor rollout
         reward_loop_worker_handles = self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
-        self.async_rollout_manager = AgentLoopManager(
-            config=self.config,
+        self.async_rollout_manager = AgentLoopManager.create(
+            rollout_config=self.config.actor_rollout_ref.rollout,
+            model_config=self.config.actor_rollout_ref.model,
+            data_config=self.config.actor_rollout_ref.data,
             worker_group=self.actor_rollout_wg,
             rollout_resource_pool=actor_rollout_resource_pool,
             reward_loop_worker_handles=reward_loop_worker_handles,

From 80c8f3dd30f945726b5e90baf1ba1409584b466f Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Thu, 26 Feb 2026 22:26:34 +0800
Subject: [PATCH 04/10] fix auto_await

---
 .../one_step_off_policy/ray_trainer.py |  2 +-
 verl/utils/ray_utils.py                | 29 +++++++++++++++----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py
index caba2cf6ad3..d2c64ec577d 100644
--- a/verl/experimental/one_step_off_policy/ray_trainer.py
+++ b/verl/experimental/one_step_off_policy/ray_trainer.py
@@ -182,7 +182,7 @@ def _init_async_rollout_manager(self):
         from verl.experimental.one_step_off_policy.agent_loop import OneStepOffAgentLoopManager
         self.async_rollout_mode = True
-        self.async_rollout_manager = OneStepOffAgentLoopManager(
+        self.async_rollout_manager = OneStepOffAgentLoopManager.create(
             rollout_config=self.config.actor_rollout_ref.rollout,
             model_config=self.config.actor_rollout_ref.model,
             data_config=self.config.data,
diff --git a/verl/utils/ray_utils.py b/verl/utils/ray_utils.py
index 5ba20649365..eff3d91085f 100644
--- a/verl/utils/ray_utils.py
+++ b/verl/utils/ray_utils.py
@@ -97,9 +97,13 @@ def get_event_loop():
 def auto_await(func):
     """Auto await a coroutine function.
-    If the function is called in an async context (with a running event loop),
-    it will return the coroutine object. Otherwise, it will block the current thread
-    and run the coroutine until completion.
+    Handles three cases:
+    1. When called directly and there is no running event loop: runs the
+       coroutine with asyncio.run() and returns the result.
+    2. When the decorated function is called with await: returns the coroutine
+       so the caller can await it.
+    3. When called directly and the event loop is already running: runs the
+       coroutine in a thread pool (to avoid deadlock) and returns the result.
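+
+    Example (illustrative sketch only; ``ping`` is a stand-in coroutine, not
+    part of this patch)::
+
+        @auto_await
+        async def ping():
+            return "pong"
+
+        ping()        # sync caller, no running loop: case 1, runs via asyncio.run()
+        await ping()  # async caller: case 2, the coroutine is returned and awaited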
""" @functools.wraps(func) @@ -114,9 +118,22 @@ def wrapper(*args, **kwargs): except RuntimeError: loop = None - if loop and loop.is_running(): - return coro - else: + # Case 1: No running loop -> run with asyncio.run() + if loop is None: return asyncio.run(coro) + # Case 2: Running loop -> return coro if caller will await + caller_frame = inspect.currentframe() + if caller_frame is not None: + caller_frame = caller_frame.f_back + caller_is_async = caller_frame is not None and (caller_frame.f_code.co_flags & inspect.CO_COROUTINE) != 0 + if caller_is_async: + return coro + + # Case 3: Running loop -> run coro in thread pool + # (cannot block the loop thread without deadlock) + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + future = pool.submit(asyncio.run, coro) + return future.result() + return wrapper From 231ec7c115b28a2e727783c27b4651781ab9da6e Mon Sep 17 00:00:00 2001 From: wuxibin Date: Fri, 27 Feb 2026 01:35:51 +0800 Subject: [PATCH 05/10] revert --- tests/experimental/agent_loop/agent_utils.py | 4 +- ...t_agent_loop_extra_fields_schema_on_cpu.py | 9 ++- .../test_agent_reward_loop_colocate.py | 4 +- .../test_agent_reward_loop_standalone.py | 4 +- tests/special_e2e/run_fully_async_policy.sh | 4 +- tests/special_e2e/run_one_step_off_policy.sh | 4 +- verl/experimental/agent_loop/agent_loop.py | 75 ++++++++----------- .../experimental/fully_async_policy/README.md | 8 +- .../fully_async_policy/README_zh.md | 8 +- .../agent_loop/agent_loop.py | 20 ++--- .../agent_loop/partial_tool_agent_loop.py | 6 +- .../config/fully_async_ppo_trainer.yaml | 6 ++ .../fully_async_policy/fully_async_main.py | 13 ++-- .../fully_async_rollouter.py | 8 +- .../fully_async_policy/fully_async_trainer.py | 8 +- .../shell/dapo_30b_a3b_base_math_fsdp.sh | 4 +- .../shell/dapo_7b_async_retool.sh | 4 +- .../shell/dapo_7b_math_fsdp2_16_16.sh | 4 +- .../shell/dapo_7b_math_fsdp2_32_32.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_4.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_mis.sh | 4 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 4 +- .../shell/geo3k_qwen25vl_7b_megatron_4_4.sh | 4 +- .../grpo_30b_a3b_base_math_megatron_96_32.sh | 4 +- ...po_30b_a3b_base_math_megatron_96_32_mis.sh | 4 +- .../one_step_off_ppo_megatron_trainer.yaml | 7 ++ .../config/one_step_off_ppo_trainer.yaml | 7 ++ .../one_step_off_policy/main_ppo.py | 4 + .../one_step_off_policy/ray_trainer.py | 5 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_ris.sh | 4 +- .../shell/dapo_7b_math_fsdp2_sglang_4_12.sh | 4 +- .../shell/dapo_7b_math_megatron_4_12.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh | 4 +- .../shell/grpo_3b_gsm8k_fsdp2_2_6.sh | 4 +- .../grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh | 4 +- verl/trainer/ppo/ray_trainer.py | 4 +- 41 files changed, 141 insertions(+), 151 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 5103ff9c357..4596236bc78 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -80,9 +80,7 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG rm_resource_pool=rm_resource_pool, ) agent_loop_manager = AgentLoopManager.create( - rollout_config=config.actor_rollout_ref.rollout, - model_config=config.actor_rollout_ref.model, - data_config=config.data, 
+ config=config, worker_group=actor_rollout_wg, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) diff --git a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py index a5c5ab3dde3..e5d296a8756 100644 --- a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py +++ b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py @@ -148,7 +148,8 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( config = OmegaConf.create( { "actor_rollout_ref": { - "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}} + "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}}, + "model": {}, }, "data": { "tool_config_path": None, @@ -161,11 +162,11 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( tokenizer = _FakeTokenizer() processor = None - rollout_config = DictConfigWrap(config.actor_rollout_ref.rollout) + trainer_config = DictConfigWrap(config) data_config = DictConfigWrap(config.data) single_turn = SingleTurnAgentLoop( - rollout_config=rollout_config, + trainer_config=trainer_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, @@ -173,7 +174,7 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( data_config=data_config, ) partial_single_turn = PartialSingleTurnAgentLoop( - rollout_config=rollout_config, + trainer_config=trainer_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py index 1cf1014602e..0ea96dca409 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py @@ -99,9 +99,7 @@ def test_agent_reward_loop_standalone(): actor_rollout_wg.init_model() agent_loop_manager = AgentLoopManager.create( - rollout_config=config.actor_rollout_ref.rollout, - model_config=config.actor_rollout_ref.model, - data_config=config.data, + config=config, worker_group=actor_rollout_wg, ) # sleep rollout replicas diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py index b9c1f9a3f9d..99af766cbbe 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py @@ -77,9 +77,7 @@ def test_agent_reward_loop_standalone(): # 1. 
init reward model manager
     reward_loop_manager = RewardLoopManager(config)
     agent_loop_manager = AgentLoopManager.create(
-        rollout_config=config.actor_rollout_ref.rollout,
-        model_config=config.actor_rollout_ref.model,
-        data_config=config.data,
+        config=config,
         reward_loop_worker_handles=reward_loop_manager.reward_loop_workers,
     )
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 4f7882b60d4..01d807ba63a 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -124,8 +124,8 @@ common_params=(
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
     trainer.log_val_generations=10
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
     rollout.test_freq=${test_freq}
diff --git a/tests/special_e2e/run_one_step_off_policy.sh b/tests/special_e2e/run_one_step_off_policy.sh
index 9bbe16045c2..bdcba5caaaf 100755
--- a/tests/special_e2e/run_one_step_off_policy.sh
+++ b/tests/special_e2e/run_one_step_off_policy.sh
@@ -90,8 +90,6 @@ common_params=(
     actor_rollout_ref.rollout.val_kwargs.n=1
     actor_rollout_ref.rollout.enable_chunked_prefill=True
     actor_rollout_ref.rollout.name=vllm
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
     actor_rollout_ref.rollout.checkpoint_engine.backend='nccl'
     actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=1024
     reward.reward_manager.name=dapo
@@ -111,6 +109,8 @@ common_params=(
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
 )
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 27d31458b88..6f089033b6c 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -53,6 +53,14 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+def _get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
+    # TODO: backward compatibility, remove this once we switch to new trainer.
+    if config.get("actor_rollout_ref"):
+        return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
+    else:
+        return config.rollout, config.model
+
+
 class AsyncLLMServerManager:
     """
     A class to manage multiple OpenAI compatible LLM servers. This class provides
@@ -60,17 +68,15 @@ class AsyncLLMServerManager:
     - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
     """
-    def __init__(
-        self, rollout_config: RolloutConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000
-    ):
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
         """Initialize the AsyncLLMServerManager.
         Args:
-            rollout_config (RolloutConfig): rollout config.
+            config (DictConfig): whole config for main entrypoint.
             server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
             max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
         """
-        self.rollout_config = rollout_config
+        self.config = config
         self.server_handles = server_handles
         random.shuffle(self.server_handles)
@@ -195,7 +201,7 @@ class AgentLoopBase(ABC):
     environments.
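+
+    Example (illustrative sketch; ``EchoAgentLoop`` is a hypothetical subclass,
+    not part of this patch)::
+
+        class EchoAgentLoop(AgentLoopBase):
+            async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+                # chat with the LLM server through self.server_manager, then
+                # wrap the generated token ids in an AgentLoopOutput
+                ...
+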
Args:
-        rollout_config (RolloutConfig): rollout config.
+        trainer_config (DictConfig): whole config for main entrypoint.
         server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
         tokenizer (AutoTokenizer): Tokenizer for tokenizing messages.
         processor (AutoProcessor): Processor for processing messages.
@@ -205,7 +211,7 @@ class AgentLoopBase(ABC):
     def __init__(
         self,
-        rollout_config: DictConfigWrap,
+        trainer_config: DictConfigWrap,
         server_manager: AsyncLLMServerManager,
         tokenizer: AutoTokenizer,
         processor: AutoProcessor,
@@ -213,7 +219,8 @@ def __init__(
         data_config: DictConfigWrap,
         **kwargs,
     ):
-        self.rollout_config = rollout_config.config
+        self.config = trainer_config.config
+        self.rollout_config, _ = _get_rollout_and_model_config(self.config)
         self.server_manager = server_manager
         self.tokenizer = tokenizer
         self.processor = processor
@@ -346,30 +353,27 @@ class AgentLoopWorker:
     """Agent loop worker takes a batch of messages and runs each message in an agent loop.
     Args:
-        rollout_config (RolloutConfig): rollout config.
-        model_config (HFModelConfig): model config.
-        data_config (DictConfig): data config.
+        config (DictConfig): whole config for main entrypoint.
         server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
         reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
     """
     def __init__(
         self,
-        rollout_config: RolloutConfig,
-        model_config: HFModelConfig,
-        data_config: DictConfig,
+        config: DictConfig,
         server_handles: list[ray.actor.ActorHandle],
        reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
    ):
+        self.config = config
+        rollout_config, model_config = _get_rollout_and_model_config(config)
         self.rollout_config: RolloutConfig = omega_conf_to_dataclass(rollout_config)
         self.model_config: HFModelConfig = omega_conf_to_dataclass(model_config)
-        self.data_config = data_config
         # for recipe to change
         if not hasattr(self, "server_manager"):
-            self.server_manager = AsyncLLMServerManager(self.rollout_config, server_handles)
+            self.server_manager = AsyncLLMServerManager(config, server_handles)
-        self.dataset_cls = get_dataset_class(data_config)
+        self.dataset_cls = get_dataset_class(config.data)
         self.reward_loop_worker_handles = reward_loop_worker_handles
         self.tokenizer = self.model_config.tokenizer
@@ -501,12 +505,12 @@ async def _run_agent_loop(
             agent_loop_config = _agent_loop_registry[agent_name]
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
-                rollout_config=DictConfigWrap(self.rollout_config),
+                trainer_config=DictConfigWrap(config=self.config),
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
                 dataset_cls=self.dataset_cls,
-                data_config=DictConfigWrap(self.data_config),
+                data_config=DictConfigWrap(self.config.data),
             )
             output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
             return await self._agent_loop_postprocess(output, **kwargs)
@@ -856,9 +860,7 @@ class AgentLoopManager:
     - otherwise, rollout server is in standalone mode, use separate GPUs, e.g., one-step-off/fully async training.
     Args:
-        rollout_config (RolloutConfig): rollout config.
-        model_config (HFModelConfig): model config.
-        data_config (DictConfig): data config.
+        config (DictConfig): whole config for main entrypoint.
         worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
         rollout_resource_pool (RayResourcePool): Resource pool for hybrid mode, only used by TensorRT-LLM.
reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. @@ -866,22 +868,19 @@ class AgentLoopManager: def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - assert worker_group is not None or rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode" - - self.rollout_config = rollout_config - self.model_config = model_config - self.data_config = data_config + self.config = config + self.rollout_config, self.model_config = _get_rollout_and_model_config(config) self.worker_group = worker_group self.rollout_resource_pool = rollout_resource_pool self.reward_loop_worker_handles = reward_loop_worker_handles + assert worker_group is not None or self.rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode" + # for recipe to change if not hasattr(self, "rollout_replica_class"): self.rollout_replica_class = get_rollout_replica_class(self.rollout_config.name) @@ -892,17 +891,13 @@ def __init__( @auto_await async def create( cls, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): """Create agent loop manager.""" - instance = cls( - rollout_config, model_config, data_config, worker_group, rollout_resource_pool, reward_loop_worker_handles - ) + instance = cls(config, worker_group, rollout_resource_pool, reward_loop_worker_handles) await instance._initialize_llm_servers() await instance._init_agent_loop_workers() return instance @@ -968,13 +963,7 @@ async def _init_agent_loop_workers(self): scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=True ), - ).remote( - self.rollout_config, - self.model_config, - self.data_config, - self.server_handles, - self.reward_loop_worker_handles, - ) + ).remote(self.config, self.server_handles, self.reward_loop_worker_handles) ) @auto_await diff --git a/verl/experimental/fully_async_policy/README.md b/verl/experimental/fully_async_policy/README.md index 311e8dfc0ea..b7ff1756459 100644 --- a/verl/experimental/fully_async_policy/README.md +++ b/verl/experimental/fully_async_policy/README.md @@ -92,8 +92,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|------------------------------------------------------------------------------------------------| | `trainer.nnodes` | Number of nodes for Trainer | | `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | -| `actor_rollout_ref.rollout.nnodes` | Number of nodes for Rollouter | -| `actor_rollout_ref.rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `rollout.nnodes` | Number of nodes for Rollouter | +| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | | `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | | `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | | `rollout.total_rollout_steps` | Total number of rollout samples | @@ -313,8 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \ 
actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ - actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/README_zh.md b/verl/experimental/fully_async_policy/README_zh.md index 6861114debe..ad2e52e4167 100644 --- a/verl/experimental/fully_async_policy/README_zh.md +++ b/verl/experimental/fully_async_policy/README_zh.md @@ -69,8 +69,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|-----------------------------------------------------------------| | `trainer.nnodes` | Trainer的node数量 | | `trainer.n_gpus_per_node` | Trainer每个node上gpu的数量 | -| `actor_rollout_ref.rollout.nnodes` | Rollouter的node数量 | -| `actor_rollout_ref.rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | +| `rollout.nnodes` | Rollouter的node数量 | +| `rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | | `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | | `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | | `rollout.total_rollout_steps` | 总的rollout的sample数量 | @@ -256,8 +256,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ - actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py index c545032fece..d23c700d7c6 100644 --- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py @@ -36,7 +36,6 @@ rollout_trace_attr, rollout_trace_op, ) -from verl.workers.config import HFModelConfig, RolloutConfig logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -81,14 +80,12 @@ async def generate_for_partial( class FullyAsyncAgentLoopWorker(AgentLoopWorker): def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.server_manager = FullyAsyncLLMServerManager(rollout_config, server_handles) - super().__init__(rollout_config, model_config, data_config, server_handles, reward_loop_worker_handles) + self.server_manager = FullyAsyncLLMServerManager(config, server_handles) + super().__init__(config, server_handles, reward_loop_worker_handles) # A shared cancellation event for all agent loops running on this worker. 
self.cancellation_event = asyncio.Event() @@ -219,22 +216,19 @@ async def resume_agent_loops(self): class FullyAsyncAgentLoopManager(AgentLoopManager): def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.rollout_config = rollout_config - self.model_config = model_config - self.data_config = data_config + self.config = config + self.rollout_config = config.actor_rollout_ref.rollout self.worker_group = worker_group self.reward_loop_worker_handles = reward_loop_worker_handles self.agent_loop_workers_class = FullyAsyncAgentLoopWorker # Select rollout replica class based on rollout name - rollout_name = rollout_config.name + rollout_name = self.rollout_config.name if rollout_name == "sglang": from verl.experimental.fully_async_policy.sglang_rollout.sglang_async_server import FullyAsyncSGLangReplica diff --git a/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py index 0082fc13bc8..370587f0364 100644 --- a/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py @@ -33,9 +33,9 @@ class AsyncPartialToolAgentLoop(ToolAgentLoop): """ - def __init__(self, trainer_config, **kwargs): - super().__init__(trainer_config, **kwargs) - self.enable_partial_rollout = trainer_config.config.async_training.get("partial_rollout", False) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.enable_partial_rollout = self.config.async_training.get("partial_rollout", False) # async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: async def run( diff --git a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml index d4cff51de3b..1f4b4db8c82 100644 --- a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -30,6 +30,12 @@ async_training: # Rollout config rollout: + # Number of nodes used in the rollout + nnodes: 1 + + # Number of GPUs per node + n_gpus_per_node: 8 + # number of responses (i.e. num sample times). 
> 1 for grpo n: 4 diff --git a/verl/experimental/fully_async_policy/fully_async_main.py b/verl/experimental/fully_async_policy/fully_async_main.py index 80332364ad1..4e9e509475f 100644 --- a/verl/experimental/fully_async_policy/fully_async_main.py +++ b/verl/experimental/fully_async_policy/fully_async_main.py @@ -59,14 +59,10 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: # Rollout resource pool if Role.Rollout in roles: - assert config.actor_rollout_ref.rollout.n_gpus_per_node > 0, ( - "config.actor_rollout_ref.rollout.n_gpus_per_node must be greater than 0" - ) - assert config.actor_rollout_ref.rollout.nnodes > 0, ( - "config.actor_rollout_ref.rollout.nnodes must be greater than 0" - ) + assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" + assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" - rollout_pool = [config.actor_rollout_ref.rollout.n_gpus_per_node] * config.actor_rollout_ref.rollout.nnodes + rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes resource_pool_spec["rollout_pool"] = rollout_pool mapping[Role.Rollout] = "rollout_pool" @@ -291,6 +287,9 @@ def main(config): from time import time start_time = time() + # TODO: unify rollout config with actor_rollout_ref + config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes + config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node run_ppo(config, task_runner_class=FullyAsyncTaskRunner) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/verl/experimental/fully_async_policy/fully_async_rollouter.py b/verl/experimental/fully_async_policy/fully_async_rollouter.py index 5ad6bd503ce..4810a3730da 100644 --- a/verl/experimental/fully_async_policy/fully_async_rollouter.py +++ b/verl/experimental/fully_async_policy/fully_async_rollouter.py @@ -104,7 +104,7 @@ def __init__( self._validate_config() if self.config.async_training.use_trainer_do_validate: - rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node + rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node train_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node total_gpus = rollout_gpus + train_gpus print(f"[FullyAsyncRollouter] split before val_dataset total len: {len(val_dataset)}") @@ -444,11 +444,7 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - rollout_config=self.config.actor_rollout_ref.rollout, - model_config=self.config.actor_rollout_ref.model, - data_config=self.config.data, - worker_group=self.rollout_wg, - reward_loop_worker_handles=reward_loop_worker_handles, + config=self.config, worker_group=self.rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles ) # Add samples to the pending_queue diff --git a/verl/experimental/fully_async_policy/fully_async_trainer.py b/verl/experimental/fully_async_policy/fully_async_trainer.py index 31df91f4e44..9519c594dbd 100644 --- a/verl/experimental/fully_async_policy/fully_async_trainer.py +++ b/verl/experimental/fully_async_policy/fully_async_trainer.py @@ -137,7 +137,7 @@ def __init__( self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node - + config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node + + config.rollout.nnodes * 
config.rollout.n_gpus_per_node
         )
         self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus)
@@ -147,7 +147,7 @@ def __init__(
         from verl.utils.dataset.rl_dataset import collate_fn
 
         val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
-        rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node
+        rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node
         print(f"[FullyAsyncTrainer] split before val_dataset total len: {len(val_dataset)}")
         split_dataset = val_dataset.split(total_gpus)
         rollout_val_dataset0 = split_dataset[rollout_gpus:]
@@ -311,9 +311,7 @@ async def _init_async_rollout_manager(self):
 
         self.async_rollout_mode = True
         self.async_rollout_manager = await FullyAsyncAgentLoopManager.create(
-            rollout_config=self.config.actor_rollout_ref.rollout,
-            model_config=self.config.actor_rollout_ref.model,
-            data_config=self.config.data,
+            config=self.config,
             worker_group=self.actor_rollout_wg,
             reward_loop_worker_handles=reward_loop_worker_handles,
         )
diff --git a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
index 209930aeb59..cc936f50dc1 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
@@ -176,8 +176,8 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     trainer.resume_mode=auto \
     trainer.nnodes="${n_nodes_train}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${n_nodes_rollout}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${n_nodes_rollout}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.test_freq=${test_freq} \
     rollout.total_epochs=10 \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
index a94387f2af9..2a5eb1bb966 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
@@ -129,8 +129,8 @@ python3 -m verl.experimental.fully_async_policy.fully_async_main \
     data.gen_batch_size=${gen_prompt_bsz} \
     trainer.nnodes=$NNODES \
     trainer.n_gpus_per_node=$n_gpus_training \
-    actor_rollout_ref.rollout.nnodes=$NNODES \
-    actor_rollout_ref.rollout.n_gpus_per_node=$n_gpus_rollout \
+    rollout.nnodes=$NNODES \
+    rollout.n_gpus_per_node=$n_gpus_rollout \
     rollout.total_rollout_steps=$total_rollout_steps \
     rollout.total_epochs=10 \
    rollout.test_freq=$test_freq \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
index 1dcb5018c68..ba8e6804fdb 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
index 6577caada6e..5561208ee6d 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index 9823231aed1..242a5117a5e 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -153,8 +153,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     async_training.staleness_threshold="${staleness_threshold}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
index aef1bac704d..ee0657eace7 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
@@ -152,8 +152,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index 4a273c2c8ba..002c1206b8a 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
index e1146d79d26..f01fb8184e7 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
@@ -156,8 +156,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index 18291a62bf7..2b2143ffa21 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
index 741c695de0b..8b32c6e0078 100644
--- a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
+++ b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
@@ -99,8 +99,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs="${total_epochs}" \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
index 1b95a5becd8..ebcb634ff72 100644
--- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
+++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
@@ -217,8 +217,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
index 3ea5196f1c6..c04a09d3266 100644
--- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
+++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
@@ -226,8 +226,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
index 19d77597dc1..0e4677be368 100644
--- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
+++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
@@ -9,6 +9,13 @@ defaults:
 trainer:
   use_legacy_worker_impl: disable
 
+# config for the rollout (only for resource isolation)
+rollout:
+  # Number of nodes used in the rollout
+  nnodes: 1
+  # Number of GPUs per node
+  n_gpus_per_node: 8
+
 # To adapt to the current logic of AgentLoopManager
 actor_rollout_ref:
   rollout:
diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
index 1a74af3df34..dc784b2ae73 100644
--- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
+++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
@@ -9,6 +9,13 @@ defaults:
 trainer:
   use_legacy_worker_impl: disable
 
+# config for the rollout (only for resource isolation)
+rollout:
+  # Number of nodes used in the rollout
+  nnodes: 1
+  # Number of GPUs per node
+  n_gpus_per_node: 8
+
 # To adapt to the current logic of AgentLoopManager
 actor_rollout_ref:
   rollout:
diff --git a/verl/experimental/one_step_off_policy/main_ppo.py b/verl/experimental/one_step_off_policy/main_ppo.py
index 2c2fe6108ea..0c6ecaedf0e 100644
--- a/verl/experimental/one_step_off_policy/main_ppo.py
+++ b/verl/experimental/one_step_off_policy/main_ppo.py
@@ -182,6 +182,10 @@ def main(config):
     # Automatically set `config.trainer.device = npu` when running on Ascend NPU.
     auto_set_device(config)
 
+    # TODO: unify rollout config with actor_rollout_ref
+    config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes
+    config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node
+
     run_ppo(config, task_runner_class=OneStepTaskRunner)
 
     print(f"total time: {time() - start_time:.2f} seconds")
diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py
index d2c64ec577d..144632dead5 100644
--- a/verl/experimental/one_step_off_policy/ray_trainer.py
+++ b/verl/experimental/one_step_off_policy/ray_trainer.py
@@ -183,10 +183,7 @@ def _init_async_rollout_manager(self):
 
         self.async_rollout_mode = True
         self.async_rollout_manager = OneStepOffAgentLoopManager.create(
-            rollout_config=self.config.actor_rollout_ref.rollout,
-            model_config=self.config.actor_rollout_ref.model,
-            data_config=self.config.data,
-            reward_loop_worker_handles=reward_loop_worker_handles,
+            config=self.config, reward_loop_worker_handles=reward_loop_worker_handles
         )
 
     def _create_continuous_iterator(self):
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index 4df41235c03..cbefe87424b 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -135,5 +135,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index e785e02c6e7..c35513cf9f2 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -136,5 +136,5 @@ python -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}"
\ No newline at end of file
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}"
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
index 6a462aeca91..10ce9122269 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
@@ -146,8 +146,8 @@ python -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     algorithm.rollout_correction.rollout_is=null \
     algorithm.rollout_correction.rollout_is_threshold=null \
     algorithm.rollout_correction.rollout_rs=seq_mean_k1 \
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
index c92a2ad6bca..2725bb5bc3d 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
@@ -136,5 +136,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
index 03fb457c090..a0da86affea 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
@@ -142,5 +142,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
index 971e77e583e..facabdf58e8 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
@@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
index 6a5338e2269..5c959f49961 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
@@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
index 935869c0575..c5c5eb11d2a 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
@@ -60,5 +60,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
index 756c4009ad1..d6f884ad53a 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
@@ -89,5 +89,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 8ecaa1c0c70..ae43d2bad5c 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -832,9 +832,7 @@ def init_workers(self):
             # to stream reward computation with actor rollout
             reward_loop_worker_handles = self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
             self.async_rollout_manager = AgentLoopManager.create(
-                rollout_config=self.config.actor_rollout_ref.rollout,
-                model_config=self.config.actor_rollout_ref.model,
-                data_config=self.config.actor_rollout_ref.data,
+                config=self.config,
                 worker_group=self.actor_rollout_wg,
                 rollout_resource_pool=actor_rollout_resource_pool,
                 reward_loop_worker_handles=reward_loop_worker_handles,

From 273f8e66b0182f6ae7813096b36bc08f0f3a0c31 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:37:25 +0800
Subject: [PATCH 06/10] fix: resolve rollout and model config via
 _get_rollout_and_model_config in the fully async agent loop

---
 verl/experimental/fully_async_policy/agent_loop/agent_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
index d23c700d7c6..89b8cb0fe86 100644
--- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
+++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
@@ -28,6 +28,7 @@
     AsyncLLMServerManager,
     DictConfigWrap,
     _agent_loop_registry,
+    _get_rollout_and_model_config,
     get_trajectory_info,
 )
 from verl.protocol import DataProto
@@ -222,7 +223,7 @@ def __init__(
         reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
     ):
         self.config = config
-        self.rollout_config = config.actor_rollout_ref.rollout
+        self.rollout_config, self.model_config = _get_rollout_and_model_config(config)
         self.worker_group = worker_group
         self.reward_loop_worker_handles = reward_loop_worker_handles
         self.agent_loop_workers_class = FullyAsyncAgentLoopWorker

From 7a6417e90e86dd89e99cd25f9f85a075092d8cf2 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:44:01 +0800
Subject: [PATCH 07/10] fix: rename dataset_config kwarg to data_config in
 _partial_run_agent_loop

---
 verl/experimental/fully_async_policy/agent_loop/agent_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
index 89b8cb0fe86..88a012224eb 100644
--- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
+++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
@@ -191,7 +191,7 @@ async def _partial_run_agent_loop(
                 tokenizer=self.tokenizer,
                 processor=self.processor,
                 dataset_cls=self.dataset_cls,
-                dataset_config=DictConfigWrap(config=self.config.data),
+                data_config=DictConfigWrap(config=self.config.data),
             )
             output: AgentLoopOutput = await agent_loop.run(
                 sampling_params, cancellation_event=self.cancellation_event, **kwargs

From 380467c8a16145f5c164958522adcb27bd3a16da Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:47:21 +0800
Subject: [PATCH 08/10] revert: restore top-level rollout.* resource keys in
 the NPU test scripts

---
 tests/special_npu/run_fully_async_policy.sh  | 4 ++--
 tests/special_npu/run_one_step_off_policy.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/special_npu/run_fully_async_policy.sh b/tests/special_npu/run_fully_async_policy.sh
index e5908798bcf..fa517e81ae4 100644
--- a/tests/special_npu/run_fully_async_policy.sh
+++ b/tests/special_npu/run_fully_async_policy.sh
@@ -124,8 +124,8 @@ common_params=(
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
     trainer.log_val_generations=10
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
     rollout.test_freq=${test_freq}
diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh
index 4c1ad9ce204..2426a380fec 100644
--- a/tests/special_npu/run_one_step_off_policy.sh
+++ b/tests/special_npu/run_one_step_off_policy.sh
@@ -108,8 +108,8 @@ common_params=(
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_npus_training}
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_npus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_npus_rollout}
 )
 
 

From 16e1f8782dc1acc2f103260c75b16327adc69b48 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 16:14:20 +0800
Subject: [PATCH 09/10] fix ci: set rollout.nnodes in the standalone reward
 loop test and use AgentLoopManager.create in the transfer queue trainer

---
 .../reward_loop/test_agent_reward_loop_standalone.py | 1 +
 verl/experimental/transfer_queue/ray_trainer.py      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
index 99af766cbbe..80a0945bec7 100644
--- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
+++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
@@ -56,6 +56,7 @@ def test_agent_reward_loop_standalone():
     config.actor_rollout_ref.rollout.prompt_length = 1024
     config.actor_rollout_ref.rollout.response_length = 4096
     config.actor_rollout_ref.rollout.skip_tokenizer_init = True
+    config.actor_rollout_ref.rollout.nnodes = 1
 
     config.trainer.n_gpus_per_node = 4
     config.trainer.nnodes = 1
diff --git a/verl/experimental/transfer_queue/ray_trainer.py b/verl/experimental/transfer_queue/ray_trainer.py
index 96c6d181334..dfb2e721d66 100644
--- a/verl/experimental/transfer_queue/ray_trainer.py
+++ b/verl/experimental/transfer_queue/ray_trainer.py
@@ -817,7 +817,7 @@ def init_workers(self):
             reward_loop_worker_handles = (
                 self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
             )
-            self.async_rollout_manager = AgentLoopManager(
+            self.async_rollout_manager = AgentLoopManager.create(
                 config=self.config,
                 worker_group=self.actor_rollout_wg,
                 reward_loop_worker_handles=reward_loop_worker_handles,

From 66a9c5e17d759d52be93180e032f74b488be9a79 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:48:35 +0800
Subject: [PATCH 10/10] fix: correct the return type annotation of
 _get_rollout_and_model_config

---
 verl/experimental/agent_loop/agent_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 6f089033b6c..c60baa6abbb 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -53,7 +53,7 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 
-def _get_rollout_and_model_config(config: DictConfig) -> RolloutConfig:
+def _get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
     # TODO: backward compatibility, remove this once we switch to new trainer.
     if config.get("actor_rollout_ref"):
         return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
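
---

Note on PATCH 10/10: only the legacy `actor_rollout_ref` branch of
_get_rollout_and_model_config is visible in the hunk above. The sketch below
illustrates the shim's expected behavior; the fallback to top-level
`rollout`/`model` keys is an assumption about the elided branch, and the model
path is a made-up value.

    from omegaconf import DictConfig, OmegaConf

    def get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
        # Legacy trainers nest everything under `actor_rollout_ref` (branch taken
        # verbatim from the hunk above).
        if config.get("actor_rollout_ref"):
            return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
        # Assumed fallback for configs that already use top-level sections.
        return config.rollout, config.model

    legacy = OmegaConf.create(
        {
            "actor_rollout_ref": {
                "rollout": {"nnodes": 1, "n_gpus_per_node": 8},
                "model": {"path": "Qwen/Qwen2.5-7B-Instruct"},  # hypothetical value
            }
        }
    )
    rollout_cfg, model_cfg = get_rollout_and_model_config(legacy)
    assert rollout_cfg.n_gpus_per_node == 8
    assert model_cfg.path == "Qwen/Qwen2.5-7B-Instruct"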
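Note on the resource-isolation bridge in main_ppo.py above: the new top-level
rollout.nnodes / rollout.n_gpus_per_node keys are copied into the legacy
actor_rollout_ref.rollout location so AgentLoopManager keeps reading its usual
path. A minimal sketch of that copy follows; the concrete values are made up
for the example.

    from omegaconf import OmegaConf

    config = OmegaConf.create(
        {
            "rollout": {"nnodes": 2, "n_gpus_per_node": 8},
            "actor_rollout_ref": {"rollout": {"nnodes": 1, "n_gpus_per_node": 8}},
        }
    )
    # Same two assignments as the main_ppo.py hunk; the TODO there notes that
    # this duplication should disappear once the rollout configs are unified.
    config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes
    config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node
    assert config.actor_rollout_ref.rollout.nnodes == 2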