From 9bf16faf21e7539326cf9cf185b43d025cef30b6 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 17:03:21 +0800 Subject: [PATCH 01/10] [BREAKING][rollout,cfg] refactor: get rid of actor_rollout_ref config from agent loop --- tests/experimental/agent_loop/agent_utils.py | 4 +- verl/experimental/agent_loop/agent_loop.py | 186 ++++++++++-------- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 39 ++-- .../_generated_ppo_megatron_trainer.yaml | 4 + .../_generated_ppo_torchtitan_trainer.yaml | 4 + .../config/_generated_ppo_trainer.yaml | 4 + .../config/_generated_ppo_veomni_trainer.yaml | 4 + verl/trainer/config/rollout/rollout.yaml | 12 ++ verl/workers/config/rollout.py | 4 + 10 files changed, 154 insertions(+), 113 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 20e6848746a..34f955faee1 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -80,7 +80,9 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG rm_resource_pool=rm_resource_pool, ) agent_loop_manager = AgentLoopManager( - config=config, + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, worker_group=actor_rollout_wg, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index b591d093696..7afeeda3aee 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -35,10 +35,9 @@ from verl.experimental.agent_loop.utils import resolve_config_path from verl.protocol import DataProto from verl.single_controller.ray.base import RayResourcePool, RayWorkerGroup -from verl.utils import hf_processor, hf_tokenizer from verl.utils.chat_template import initialize_system_prompt +from verl.utils.config import omega_conf_to_dataclass from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class -from verl.utils.fs import copy_to_local from verl.utils.model import compute_position_id_with_mask from verl.utils.ray_utils import get_event_loop from verl.utils.rollout_trace import ( @@ -47,6 +46,7 @@ rollout_trace_op, ) from verl.utils.transferqueue_utils import tqbridge +from verl.workers.config import HFModelConfig, RolloutConfig from verl.workers.rollout.replica import TokenOutput, get_rollout_replica_class logger = logging.getLogger(__file__) @@ -60,15 +60,17 @@ class AsyncLLMServerManager: - Sticky session: send multi-turn chat completions to same server for automatic prefix caching """ - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): + def __init__( + self, rollout_config: RolloutConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000 + ): """Initialize the AsyncLLMServerManager. Args: - config (DictConfig): YAML config. + rollout_config (RolloutConfig): rollout config. server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. 
""" - self.config = config + self.rollout_config = rollout_config self.server_handles = server_handles random.shuffle(self.server_handles) @@ -190,35 +192,34 @@ def __init__(self, config: DictConfig): class AgentLoopBase(ABC): """An agent loop takes an input message, chat with OpenAI compatible LLM server and interact with various - environments.""" + environments. + + Args: + rollout_config (RolloutConfig): rollout config. + server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. + processor (AutoProcessor): Processor for process messages. + dataset_cls (type[Dataset]): Dataset class for creating dataset, Defaults to RLHFDataset. + data_config (DictConfigWrap): Dataset config. + """ def __init__( self, - trainer_config: DictConfigWrap, + rollout_config: DictConfigWrap, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, processor: AutoProcessor, dataset_cls: type[RLHFDataset], - dataset_config: DictConfigWrap, + data_config: DictConfigWrap, **kwargs, ): - """Initialize agent loop, each sample will have its own loop instance. - - Args: - trainer_config (DictConfigWrap): trainer config. - server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - processor (AutoProcessor): Processor for process messages. - dataset_cls (type[Dataset]): Dataset class for creating dataset, Defaults to RLHFDataset. - dataset_config (DictConfigWrap): Dataset config. - """ - self.config = trainer_config.config + self.rollout_config = rollout_config.config self.server_manager = server_manager self.tokenizer = tokenizer self.processor = processor self.dataset_cls = dataset_cls - self.dataset_config = dataset_config.config - self.apply_chat_template_kwargs = self.dataset_config.get("apply_chat_template_kwargs", {}) + self.data_config = data_config.config + self.apply_chat_template_kwargs = self.data_config.get("apply_chat_template_kwargs", {}) self.system_prompt = initialize_system_prompt(self.tokenizer, **self.apply_chat_template_kwargs) self.loop = get_event_loop() @@ -234,7 +235,7 @@ async def process_vision_info(self, messages: list[dict]) -> dict: multi_modal_data = {} if self.processor is not None: images, videos = await self.dataset_cls.process_vision_info( - messages, image_patch_size=self.processor.image_processor.patch_size, config=self.dataset_config + messages, image_patch_size=self.processor.image_processor.patch_size, config=self.data_config ) if images is not None: multi_modal_data["images"] = images @@ -342,50 +343,53 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: class AgentLoopWorker: - """Agent loop worker takes a batch of messages and run each message in an agent loop.""" + """Agent loop worker takes a batch of messages and run each message in an agent loop. + + Args: + rollout_config (RolloutConfig): rollout config. + model_config (HFModelConfig): model config. + data_config (DictConfig): data config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. + reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. + """ def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - """Initialize agent loop manager. 
- Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. - reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. - """ - self.config = config + self.rollout_config: RolloutConfig = omega_conf_to_dataclass(rollout_config) + self.model_config: HFModelConfig = omega_conf_to_dataclass(model_config) + self.data_config = data_config # for recipe to change if not hasattr(self, "server_manager"): - self.server_manager = AsyncLLMServerManager(config, server_handles) + self.server_manager = AsyncLLMServerManager(self.rollout_config, server_handles) - self.dataset_cls = get_dataset_class(config.data) + self.dataset_cls = get_dataset_class(data_config) self.reward_loop_worker_handles = reward_loop_worker_handles - model_path = config.actor_rollout_ref.model.path - self.model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(config.actor_rollout_ref.model.path) - self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) - self.processor = hf_processor(local_path, trust_remote_code=True) + self.tokenizer = self.model_config.tokenizer + self.processor = self.model_config.processor - agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path + agent_loop_config_path = self.rollout_config.agent.agent_loop_config_path if agent_loop_config_path: resolved_path = resolve_config_path(agent_loop_config_path) agent_loop_configs = OmegaConf.load(resolved_path) for agent_loop_config in agent_loop_configs: _agent_loop_registry[agent_loop_config.name] = agent_loop_config - if self.config.actor_rollout_ref.model.get("custom_chat_template", None) is not None: - if self.processor is not None: - self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template + if self.model_config.get("custom_chat_template", None) is not None: + if self.model_config.processor is not None: + self.model_config.processor.chat_template = self.model_config.custom_chat_template + self.model_config.tokenizer.chat_template = self.model_config.custom_chat_template - trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) + trace_config = self.rollout_config.trace RolloutTraceConfig.init( - self.config.trainer.project_name, - self.config.trainer.experiment_name, + self.rollout_config.trace.project_name, + self.rollout_config.trace.experiment_name, trace_config.get("backend"), trace_config.get("token2text", False), trace_config.get("max_samples_per_step_per_worker", None), @@ -413,7 +417,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->| response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0| """ - config = self.config.actor_rollout_ref.rollout + config = self.rollout_config sampling_params = dict( temperature=config.temperature, top_p=config.top_p, @@ -497,12 +501,12 @@ async def _run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=DictConfigWrap(config=self.config), + rollout_config=DictConfigWrap(self.rollout_config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, dataset_cls=self.dataset_cls, - dataset_config=DictConfigWrap(self.config.data), + 
data_config=DictConfigWrap(self.data_config),
         )
         output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
         return await self._agent_loop_postprocess(output, **kwargs)
@@ -536,7 +540,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         prompt_output = self.tokenizer.pad(
             {"input_ids": output.prompt_ids},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.prompt_length,
+            max_length=self.rollout_config.prompt_length,
             return_tensors="pt",
             return_attention_mask=True,
         )
@@ -548,7 +552,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         response_output = self.tokenizer.pad(
             {"input_ids": output.response_ids},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            max_length=self.rollout_config.response_length,
             return_tensors="pt",
             return_attention_mask=True,
         )
@@ -559,7 +563,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         response_mask_output = self.tokenizer.pad(
             {"input_ids": output.response_mask},
             padding="max_length",
-            max_length=self.config.actor_rollout_ref.rollout.response_length,
+            max_length=self.rollout_config.response_length,
             return_tensors="pt",
             return_attention_mask=False,
         )
@@ -568,7 +572,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
 
         response_logprobs = None
         if output.response_logprobs is not None:
-            pad_size = self.config.actor_rollout_ref.rollout.response_length - len(output.response_logprobs)
+            pad_size = self.rollout_config.response_length - len(output.response_logprobs)
             response_logprobs = torch.tensor(output.response_logprobs + [0.0] * pad_size).unsqueeze(0)
 
         response_mask = response_mask_output["input_ids"] * response_output["attention_mask"]
@@ -846,67 +850,77 @@ async def get_trajectory_info(step, index, validate):
 
 
 class AgentLoopManager:
-    """Agent loop manager that manages a group of agent loop workers."""
+    """Agent loop manager that manages a group of agent loop workers.
+
+    - If worker_group is not None, the rollout server runs in hybrid mode and shares GPUs with the training engine.
+    - Otherwise, the rollout server runs in standalone mode on separate GPUs, e.g., for one-step-off/fully async training.
+
+    Args:
+        rollout_config (RolloutConfig): rollout config.
+        model_config (HFModelConfig): model config.
+        data_config (DictConfig): data config.
+        worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
+        rollout_resource_pool (RayResourcePool): Resource pool for hybrid mode; only used by TensorRT-LLM.
+        reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
+    """
 
     def __init__(
         self,
-        config: DictConfig,
+        rollout_config: RolloutConfig,
+        model_config: HFModelConfig,
+        data_config: DictConfig,
         worker_group: RayWorkerGroup = None,
         rollout_resource_pool: RayResourcePool = None,
         reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
     ):
-        """Initialize agent loop manager.
+        assert worker_group is not None or rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode"
 
-        Args:
-            config (DictConfig): trainer config.
-            worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
-            rollout_resource_pool (RayResourcePool): Resource pool for actor rollout (Colocate or Standalone mode).
-            reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
- """ - self.config = config + self.rollout_config = rollout_config + self.model_config = model_config + self.data_config = data_config self.worker_group = worker_group + self.rollout_resource_pool = rollout_resource_pool self.reward_loop_worker_handles = reward_loop_worker_handles # for recipe to change if not hasattr(self, "rollout_replica_class"): - self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name) + self.rollout_replica_class = get_rollout_replica_class(self.rollout_config.name) if not hasattr(self, "agent_loop_workers_class"): self.agent_loop_workers_class = ray.remote(AgentLoopWorker) - self._initialize_llm_servers(rollout_resource_pool) + self._initialize_llm_servers() self._init_agent_loop_workers() - def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): + def _initialize_llm_servers(self): rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size + self.rollout_config.tensor_model_parallel_size + * self.rollout_config.data_parallel_size + * self.rollout_config.pipeline_model_parallel_size ) world_size = ( self.worker_group.world_size if self.worker_group - else self.config.trainer.n_gpus_per_node * self.config.trainer.nnodes + else self.rollout_config.n_gpus_per_node * self.rollout_config.nnodes ) num_replicas = world_size // rollout_world_size - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model self.rollout_replicas = [ self.rollout_replica_class( replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.trainer.n_gpus_per_node, + config=self.rollout_config, + model_config=self.model_config, + gpus_per_node=self.rollout_config.n_gpus_per_node, ) for replica_rank in range(num_replicas) ] - if self.worker_group and rollout_config.name != "trtllm": + if self.worker_group and self.rollout_config.name != "trtllm": self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - elif self.worker_group and rollout_config.name == "trtllm": + # TODO: unify trtllm to init_hybrid + elif self.worker_group and self.rollout_config.name == "trtllm": self._run_all( [ - server.init_hybrid_colocated(self.worker_group, rollout_resource_pool) + server.init_hybrid_colocated(self.worker_group, self.rollout_resource_pool) for server in self.rollout_replicas ] ) @@ -919,14 +933,14 @@ def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): print(f"AgentLoopManager: {self.server_addresses}") # Update Prometheus configuration with server addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: + if self.rollout_config.prometheus.enable: + if self.rollout_config.disable_log_stats: raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - update_prometheus_config(rollout_config.prometheus, self.server_addresses, rollout_config.name) + update_prometheus_config(self.rollout_config.prometheus, self.server_addresses, self.rollout_config.name) def _init_agent_loop_workers(self): self.agent_loop_workers = [] - num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers + num_workers = self.rollout_config.agent.num_workers node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] for i in range(num_workers): 
@@ -938,7 +952,13 @@ def _init_agent_loop_workers(self): scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=True ), - ).remote(self.config, self.server_handles, self.reward_loop_worker_handles) + ).remote( + self.rollout_config, + self.model_config, + self.data_config, + self.server_handles, + self.reward_loop_worker_handles, + ) ) def generate_sequences(self, prompts: DataProto) -> DataProto: diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index 40c60743281..2a5831db771 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -30,10 +30,10 @@ class SingleTurnAgentLoop(AgentLoopBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length - tool_config_path = self.config.data.tool_config_path + tool_config_path = self.rollout_config.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index ee6176775e0..d8b0f11e88d 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -21,13 +21,10 @@ import torch from PIL import Image -from transformers import AutoProcessor, AutoTokenizer from verl.experimental.agent_loop.agent_loop import ( AgentLoopBase, AgentLoopOutput, - AsyncLLMServerManager, - DictConfigWrap, register, ) from verl.experimental.agent_loop.tool_parser import FunctionCall, ToolParser @@ -96,37 +93,27 @@ def __init__( @register("tool_agent") class ToolAgentLoop(AgentLoopBase): - def __init__( - self, - trainer_config: DictConfigWrap, - server_manager: AsyncLLMServerManager, - tokenizer: AutoTokenizer, - processor: AutoProcessor, - **kwargs, - ): - super().__init__(trainer_config, server_manager, tokenizer, processor, **kwargs) - config = trainer_config.config + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) # Initialize tools from config file - self.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns - self.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns - self.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls - self.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length - self.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side - tool_config_path = config.actor_rollout_ref.rollout.multi_turn.tool_config_path + self.max_user_turns = self.rollout_config.multi_turn.max_user_turns + self.max_assistant_turns = self.rollout_config.multi_turn.max_assistant_turns + self.max_parallel_calls = self.rollout_config.multi_turn.max_parallel_calls + self.max_tool_response_length = self.rollout_config.multi_turn.max_tool_response_length + self.tool_response_truncate_side = self.rollout_config.multi_turn.tool_response_truncate_side + tool_config_path = 
self.rollout_config.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] self.tools = {tool.name: tool for tool in tool_list} self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] - self.tool_parser = ToolParser.get_tool_parser( - config.actor_rollout_ref.rollout.multi_turn.format, self.tokenizer - ) - self.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format + self.tool_parser = ToolParser.get_tool_parser(self.rollout_config.multi_turn.format, self.tokenizer) + self.tool_parser_name = self.rollout_config.multi_turn.format - self.prompt_length = config.actor_rollout_ref.rollout.prompt_length - self.response_length = config.actor_rollout_ref.rollout.response_length + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length # Initialize interactions from config file - self.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path + self.interaction_config_file = self.rollout_config.multi_turn.interaction_config_path if self.interaction_config_file: self.interaction_map: dict[str, BaseInteraction] = self._initialize_interactions( self.interaction_config_file diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index ea60c881619..09391ec6af3 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -216,6 +216,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -290,6 +292,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml index b9a8b3aaf84..b923da853ec 100644 --- a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml @@ -205,6 +205,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -279,6 +281,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index 6b97103ae9f..1cdc21b1ec8 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -204,6 +204,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? 
mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -278,6 +280,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml index 4528e0d667d..ccaf6582902 100644 --- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml @@ -186,6 +186,8 @@ actor_rollout_ref: _target_: verl.workers.config.RolloutConfig name: ??? mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} temperature: 1.0 top_k: -1 top_p: 1 @@ -260,6 +262,8 @@ actor_rollout_ref: engine_kwargs: {} trace: _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} backend: null token2text: false max_samples_per_step_per_worker: null diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index e1a4d2dad6d..894538d1d87 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -7,6 +7,12 @@ name: ??? # sync: LLM, async: AsyncLLM mode: async +# Number of nodes for standalone rollout server, must be > 0 in one-step-off/fully async training. +nnodes: 0 + +# Number of GPUs per node for rollout server. +n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} + # Sampling temperature for rollout. temperature: 1.0 @@ -273,6 +279,12 @@ trace: # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs _target_: verl.workers.config.TraceConfig + # Project name for experiment tracking (e.g., wandb) + project_name: ${oc.select:trainer.project_name,null} + + # Experiment name for run identification in tracking tools + experiment_name: ${oc.select:trainer.experiment_name,null} + # trace backend, support mlflow, weave backend: null diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 8d0d732e263..d1d5c8f1768 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -80,6 +80,8 @@ class AgentLoopConfig(BaseConfig): @dataclass class TraceConfig(BaseConfig): + project_name: Optional[str] = None + experiment_name: Optional[str] = None backend: Optional[str] = None token2text: bool = False max_samples_per_step_per_worker: Optional[int] = None @@ -138,6 +140,8 @@ class RolloutConfig(BaseConfig): name: Optional[str] = MISSING mode: str = "async" + nnodes: int = 0 + n_gpus_per_node: int = 8 temperature: float = 1.0 top_k: int = -1 From 05f879c1d30005ed0d33020523f9e95657adf9a4 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 17:56:02 +0800 Subject: [PATCH 02/10] one_step_off_policy --- tests/special_e2e/run_one_step_off_policy.sh | 4 +- .../agent_loop/agent_loop.py | 50 ------------------- .../one_step_off_ppo_megatron_trainer.yaml | 7 --- .../config/one_step_off_ppo_trainer.yaml | 7 --- .../one_step_off_policy/ray_trainer.py | 5 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_ris.sh | 4 +- .../shell/dapo_7b_math_fsdp2_sglang_4_12.sh | 4 +- .../shell/dapo_7b_math_megatron_4_12.sh | 4 +- 
.../shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh | 4 +- .../shell/grpo_3b_gsm8k_fsdp2_2_6.sh | 4 +- .../grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh | 4 +- 14 files changed, 24 insertions(+), 85 deletions(-) diff --git a/tests/special_e2e/run_one_step_off_policy.sh b/tests/special_e2e/run_one_step_off_policy.sh index bdcba5caaaf..9bbe16045c2 100755 --- a/tests/special_e2e/run_one_step_off_policy.sh +++ b/tests/special_e2e/run_one_step_off_policy.sh @@ -90,6 +90,8 @@ common_params=( actor_rollout_ref.rollout.val_kwargs.n=1 actor_rollout_ref.rollout.enable_chunked_prefill=True actor_rollout_ref.rollout.name=vllm + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} actor_rollout_ref.rollout.checkpoint_engine.backend='nccl' actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=1024 reward.reward_manager.name=dapo @@ -109,8 +111,6 @@ common_params=( trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} ) diff --git a/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py b/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py index 2ae476df4da..85455d655b2 100644 --- a/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py +++ b/verl/experimental/one_step_off_policy/agent_loop/agent_loop.py @@ -18,9 +18,7 @@ import ray from verl.experimental.agent_loop.agent_loop import AgentLoopManager -from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config from verl.protocol import DataProto -from verl.single_controller.ray import RayResourcePool logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -56,54 +54,6 @@ async def generate_sequences_async(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing, **outputs[0].meta_info} return output - def _initialize_llm_servers(self, rollout_resource_pool: RayResourcePool): - rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size - ) - world_size = ( - self.worker_group.world_size - if self.worker_group - else self.config.rollout.n_gpus_per_node * self.config.rollout.nnodes - ) - num_replicas = world_size // rollout_world_size - - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model - self.rollout_replicas = [ - self.rollout_replica_class( - replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.rollout.n_gpus_per_node, - ) - for replica_rank in range(num_replicas) - ] - - if self.worker_group and rollout_config.name != "trtllm": - self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - elif self.worker_group and rollout_config.name == "trtllm": - self._run_all( - [ - server.init_hybrid_colocated(self.worker_group, rollout_resource_pool) - for server in self.rollout_replicas - ] - ) - else: - self._run_all([server.init_standalone() for server in self.rollout_replicas]) - - self.server_handles = [server._server_handle for server in self.rollout_replicas] - self.server_addresses = [server._server_address for server in self.rollout_replicas] - - print(f"AgentLoopManager: {self.server_addresses}") - - # Update Prometheus configuration with server 
addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: - raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - update_prometheus_config(rollout_config.prometheus, self.server_addresses, rollout_config.name) - async def wake_up(self): await asyncio.gather(*[replica.wake_up() for replica in self.rollout_replicas]) diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml index 0e4677be368..19d77597dc1 100644 --- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml +++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml @@ -9,13 +9,6 @@ defaults: trainer: use_legacy_worker_impl: disable -# config for the rollout (only for resource isolation) -rollout: - # Number of nodes used in the rollout - nnodes: 1 - # Number of GPUs per node - n_gpus_per_node: 8 - # To adapt to the current logic of AgentLoopManager actor_rollout_ref: rollout: diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml index dc784b2ae73..1a74af3df34 100644 --- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml +++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml @@ -9,13 +9,6 @@ defaults: trainer: use_legacy_worker_impl: disable -# config for the rollout (only for resource isolation) -rollout: - # Number of nodes used in the rollout - nnodes: 1 - # Number of GPUs per node - n_gpus_per_node: 8 - # To adapt to the current logic of AgentLoopManager actor_rollout_ref: rollout: diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py index 70a2a3d3d90..caba2cf6ad3 100644 --- a/verl/experimental/one_step_off_policy/ray_trainer.py +++ b/verl/experimental/one_step_off_policy/ray_trainer.py @@ -183,7 +183,10 @@ def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = OneStepOffAgentLoopManager( - config=self.config, reward_loop_worker_handles=reward_loop_worker_handles + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, + reward_loop_worker_handles=reward_loop_worker_handles, ) def _create_continuous_iterator(self): diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh index cbefe87424b..4df41235c03 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -135,5 +135,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh index c35513cf9f2..e785e02c6e7 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh +++ 
b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -136,5 +136,5 @@ python -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh index 10ce9122269..6a462aeca91 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh @@ -146,8 +146,8 @@ python -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ algorithm.rollout_correction.rollout_is=null \ algorithm.rollout_correction.rollout_is_threshold=null \ algorithm.rollout_correction.rollout_rs=seq_mean_k1 \ diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh index 2725bb5bc3d..c92a2ad6bca 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh @@ -136,5 +136,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh index a0da86affea..03fb457c090 100644 --- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh +++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh @@ -142,5 +142,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh index facabdf58e8..971e77e583e 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh @@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - 
rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh index 5c959f49961..6a5338e2269 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh @@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh index c5c5eb11d2a..935869c0575 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh @@ -60,5 +60,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.total_epochs=2 \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file diff --git a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh index d6f884ad53a..756c4009ad1 100644 --- a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh +++ b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh @@ -89,5 +89,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@ \ No newline at end of file From d23b2829e523282094708b8ac16316878a5b0554 Mon Sep 17 00:00:00 2001 From: wuxibin Date: Thu, 26 Feb 2026 20:23:38 +0800 Subject: [PATCH 03/10] fully_async --- tests/experimental/agent_loop/agent_utils.py | 2 +- ...t_agent_loop_extra_fields_schema_on_cpu.py | 16 ++-- .../test_agent_reward_loop_colocate.py | 7 +- .../test_agent_reward_loop_standalone.py | 7 +- tests/special_e2e/run_fully_async_policy.sh | 4 +- tests/special_npu/run_fully_async_policy.sh | 4 +- tests/special_npu/run_one_step_off_policy.sh | 4 +- verl/experimental/agent_loop/agent_loop.py | 62 ++++++++------ .../experimental/fully_async_policy/README.md | 8 +- .../fully_async_policy/README_zh.md | 8 +- .../agent_loop/agent_loop.py | 82 ++++--------------- .../partial_single_turn_agent_loop.py | 6 +- .../config/fully_async_ppo_trainer.yaml | 6 -- .../fully_async_policy/fully_async_main.py | 10 ++- .../fully_async_rollouter.py | 8 +- 
.../fully_async_policy/fully_async_trainer.py | 8 +- .../shell/dapo_30b_a3b_base_math_fsdp.sh | 4 +- .../shell/dapo_7b_async_retool.sh | 4 +- .../shell/dapo_7b_math_fsdp2_16_16.sh | 4 +- .../shell/dapo_7b_math_fsdp2_32_32.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_4.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_mis.sh | 4 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 4 +- .../shell/geo3k_qwen25vl_7b_megatron_4_4.sh | 4 +- .../grpo_30b_a3b_base_math_megatron_96_32.sh | 4 +- ...po_30b_a3b_base_math_megatron_96_32_mis.sh | 4 +- verl/trainer/ppo/ray_trainer.py | 6 +- 29 files changed, 138 insertions(+), 158 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 34f955faee1..5103ff9c357 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -79,7 +79,7 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG config=config, rm_resource_pool=rm_resource_pool, ) - agent_loop_manager = AgentLoopManager( + agent_loop_manager = AgentLoopManager.create( rollout_config=config.actor_rollout_ref.rollout, model_config=config.actor_rollout_ref.model, data_config=config.data, diff --git a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py index f8bda825ab2..a5c5ab3dde3 100644 --- a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py +++ b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py @@ -147,7 +147,9 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( # Minimal config surface used by the agent loops. 
config = OmegaConf.create( { - "actor_rollout_ref": {"rollout": {"prompt_length": 16, "response_length": 16}}, + "actor_rollout_ref": { + "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}} + }, "data": { "tool_config_path": None, "apply_chat_template_kwargs": {}, @@ -159,24 +161,24 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( tokenizer = _FakeTokenizer() processor = None - trainer_config = DictConfigWrap(config) - dataset_config = DictConfigWrap(config.data) + rollout_config = DictConfigWrap(config.actor_rollout_ref.rollout) + data_config = DictConfigWrap(config.data) single_turn = SingleTurnAgentLoop( - trainer_config=trainer_config, + rollout_config=rollout_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, dataset_cls=RLHFDataset, - dataset_config=dataset_config, + data_config=data_config, ) partial_single_turn = PartialSingleTurnAgentLoop( - trainer_config=trainer_config, + rollout_config=rollout_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, dataset_cls=RLHFDataset, - dataset_config=dataset_config, + data_config=data_config, ) raw_prompt = [{"role": "user", "content": "hi"}] diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py index 0e4e6b93683..1cf1014602e 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py @@ -98,7 +98,12 @@ def test_agent_reward_loop_standalone(): ) actor_rollout_wg.init_model() - agent_loop_manager = AgentLoopManager(config, worker_group=actor_rollout_wg) + agent_loop_manager = AgentLoopManager.create( + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, + worker_group=actor_rollout_wg, + ) # sleep rollout replicas checkpoint_manager = CheckpointEngineManager( config=omega_conf_to_dataclass(config.actor_rollout_ref.rollout.checkpoint_engine), diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py index bd9011b9874..b9c1f9a3f9d 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py @@ -76,8 +76,11 @@ def test_agent_reward_loop_standalone(): # 1. init reward model manager reward_loop_manager = RewardLoopManager(config) - agent_loop_manager = AgentLoopManager( - config=config, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers + agent_loop_manager = AgentLoopManager.create( + rollout_config=config.actor_rollout_ref.rollout, + model_config=config.actor_rollout_ref.model, + data_config=config.data, + reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) # 2. 
init test data diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 01d807ba63a..4f7882b60d4 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -124,8 +124,8 @@ common_params=( trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} trainer.log_val_generations=10 - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 rollout.test_freq=${test_freq} diff --git a/tests/special_npu/run_fully_async_policy.sh b/tests/special_npu/run_fully_async_policy.sh index fa517e81ae4..e5908798bcf 100644 --- a/tests/special_npu/run_fully_async_policy.sh +++ b/tests/special_npu/run_fully_async_policy.sh @@ -124,8 +124,8 @@ common_params=( trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} trainer.log_val_generations=10 - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_gpus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 rollout.test_freq=${test_freq} diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh index 2426a380fec..4c1ad9ce204 100644 --- a/tests/special_npu/run_one_step_off_policy.sh +++ b/tests/special_npu/run_one_step_off_policy.sh @@ -108,8 +108,8 @@ common_params=( trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_npus_training} - rollout.nnodes=1 - rollout.n_gpus_per_node=${n_npus_rollout} + actor_rollout_ref.rollout.nnodes=1 + actor_rollout_ref.rollout.n_gpus_per_node=${n_npus_rollout} ) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 7afeeda3aee..27d31458b88 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -39,7 +39,7 @@ from verl.utils.config import omega_conf_to_dataclass from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class from verl.utils.model import compute_position_id_with_mask -from verl.utils.ray_utils import get_event_loop +from verl.utils.ray_utils import auto_await, get_event_loop from verl.utils.rollout_trace import ( RolloutTraceConfig, rollout_trace_attr, @@ -888,10 +888,26 @@ def __init__( if not hasattr(self, "agent_loop_workers_class"): self.agent_loop_workers_class = ray.remote(AgentLoopWorker) - self._initialize_llm_servers() - self._init_agent_loop_workers() + @classmethod + @auto_await + async def create( + cls, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, + worker_group: RayWorkerGroup = None, + rollout_resource_pool: RayResourcePool = None, + reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, + ): + """Create agent loop manager.""" + instance = cls( + rollout_config, model_config, data_config, worker_group, rollout_resource_pool, reward_loop_worker_handles + ) + await instance._initialize_llm_servers() + await instance._init_agent_loop_workers() + return instance - def _initialize_llm_servers(self): + async def _initialize_llm_servers(self): rollout_world_size = ( self.rollout_config.tensor_model_parallel_size * self.rollout_config.data_parallel_size @@ -915,17 +931,17 @@ def _initialize_llm_servers(self): ] if self.worker_group and self.rollout_config.name != "trtllm": - 
self._run_all([server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) + await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) # TODO: unify trtllm to init_hybrid elif self.worker_group and self.rollout_config.name == "trtllm": - self._run_all( - [ + await asyncio.gather( + *[ server.init_hybrid_colocated(self.worker_group, self.rollout_resource_pool) for server in self.rollout_replicas ] ) else: - self._run_all([server.init_standalone() for server in self.rollout_replicas]) + await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas]) self.server_handles = [server._server_handle for server in self.rollout_replicas] self.server_addresses = [server._server_address for server in self.rollout_replicas] @@ -938,7 +954,7 @@ def _initialize_llm_servers(self): raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") update_prometheus_config(self.rollout_config.prometheus, self.server_addresses, self.rollout_config.name) - def _init_agent_loop_workers(self): + async def _init_agent_loop_workers(self): self.agent_loop_workers = [] num_workers = self.rollout_config.agent.num_workers @@ -961,7 +977,8 @@ def _init_agent_loop_workers(self): ) ) - def generate_sequences(self, prompts: DataProto) -> DataProto: + @auto_await + async def generate_sequences(self, prompts: DataProto) -> DataProto: """Split input batch and dispatch to agent loop workers. Args: @@ -972,8 +989,8 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: """ chunkes = prompts.chunk(len(self.agent_loop_workers)) - outputs = ray.get( - [ + outputs = await asyncio.gather( + *[ worker.generate_sequences.remote(chunk) for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) ] @@ -1014,20 +1031,17 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data return timing - def clear_kv_cache(self): + @auto_await + async def clear_kv_cache(self): """Clear all rollout kv cache, but don`t sleep.""" - self._run_all([replica.clear_kv_cache() for replica in self.rollout_replicas]) + await asyncio.gather(*[replica.clear_kv_cache() for replica in self.rollout_replicas]) - def start_profile(self, **kwargs): + @auto_await + async def start_profile(self, **kwargs): """Start profiling on all rollout replicas.""" - self._run_all([replica.start_profile(**kwargs) for replica in self.rollout_replicas]) + await asyncio.gather(*[replica.start_profile(**kwargs) for replica in self.rollout_replicas]) - def stop_profile(self): + @auto_await + async def stop_profile(self): """Stop profiling on all rollout replicas.""" - self._run_all([replica.stop_profile() for replica in self.rollout_replicas]) - - def _run_all(self, tasks: list[asyncio.Task]): - async def run_all(): - await asyncio.gather(*tasks) - - asyncio.run(run_all()) + await asyncio.gather(*[replica.stop_profile() for replica in self.rollout_replicas]) diff --git a/verl/experimental/fully_async_policy/README.md b/verl/experimental/fully_async_policy/README.md index b7ff1756459..311e8dfc0ea 100644 --- a/verl/experimental/fully_async_policy/README.md +++ b/verl/experimental/fully_async_policy/README.md @@ -92,8 +92,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|------------------------------------------------------------------------------------------------| | `trainer.nnodes` | Number of nodes for Trainer | | 
`trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | -| `rollout.nnodes` | Number of nodes for Rollouter | -| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `actor_rollout_ref.rollout.nnodes` | Number of nodes for Rollouter | +| `actor_rollout_ref.rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | | `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | | `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | | `rollout.total_rollout_steps` | Total number of rollout samples | @@ -313,8 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/README_zh.md b/verl/experimental/fully_async_policy/README_zh.md index ad2e52e4167..6861114debe 100644 --- a/verl/experimental/fully_async_policy/README_zh.md +++ b/verl/experimental/fully_async_policy/README_zh.md @@ -69,8 +69,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|-----------------------------------------------------------------| | `trainer.nnodes` | Trainer的node数量 | | `trainer.n_gpus_per_node` | Trainer每个node上gpu的数量 | -| `rollout.nnodes` | Rollouter的node数量 | -| `rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | +| `actor_rollout_ref.rollout.nnodes` | Rollouter的node数量 | +| `actor_rollout_ref.rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | | `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | | `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | | `rollout.total_rollout_steps` | 总的rollout的sample数量 | @@ -256,8 +256,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py index 9240000c61c..c545032fece 100644 --- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py @@ -30,13 +30,13 @@ _agent_loop_registry, get_trajectory_info, ) -from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config from verl.protocol import DataProto -from verl.single_controller.ray import RayWorkerGroup +from verl.single_controller.ray import RayResourcePool, RayWorkerGroup from verl.utils.rollout_trace import ( rollout_trace_attr, rollout_trace_op, ) +from verl.workers.config import HFModelConfig, RolloutConfig logger = 
logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -81,12 +81,14 @@ async def generate_for_partial( class FullyAsyncAgentLoopWorker(AgentLoopWorker): def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.server_manager = FullyAsyncLLMServerManager(config, server_handles) - super().__init__(config, server_handles, reward_loop_worker_handles) + self.server_manager = FullyAsyncLLMServerManager(rollout_config, server_handles) + super().__init__(rollout_config, model_config, data_config, server_handles, reward_loop_worker_handles) # A shared cancellation event for all agent loops running on this worker. self.cancellation_event = asyncio.Event() @@ -102,7 +104,7 @@ async def generate_sequences_no_post( Returns: list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. """ - config = self.config.actor_rollout_ref.rollout + config = self.rollout_config sampling_params = dict( temperature=config.temperature, top_p=config.top_p, @@ -217,17 +219,22 @@ async def resume_agent_loops(self): class FullyAsyncAgentLoopManager(AgentLoopManager): def __init__( self, - config: DictConfig, + rollout_config: RolloutConfig, + model_config: HFModelConfig, + data_config: DictConfig, worker_group: RayWorkerGroup = None, + rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.config = config + self.rollout_config = rollout_config + self.model_config = model_config + self.data_config = data_config self.worker_group = worker_group self.reward_loop_worker_handles = reward_loop_worker_handles self.agent_loop_workers_class = FullyAsyncAgentLoopWorker # Select rollout replica class based on rollout name - rollout_name = config.actor_rollout_ref.rollout.name + rollout_name = rollout_config.name if rollout_name == "sglang": from verl.experimental.fully_async_policy.sglang_rollout.sglang_async_server import FullyAsyncSGLangReplica @@ -246,63 +253,6 @@ def __init__( self.server_addresses = None self.agent_loop_workers = None - @classmethod - async def create( - cls, - config: DictConfig, - worker_group: RayWorkerGroup = None, - reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, - ): - instance = cls(config, worker_group, reward_loop_worker_handles) - await instance._async_init() - return instance - - async def _async_init(self): - await self._initialize_llm_servers_async() - self._init_agent_loop_workers() - - async def _initialize_llm_servers_async(self): - rollout_world_size = ( - self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - * self.config.actor_rollout_ref.rollout.data_parallel_size - * self.config.actor_rollout_ref.rollout.pipeline_model_parallel_size - ) - world_size = ( - self.worker_group.world_size - if self.worker_group - else self.config.rollout.n_gpus_per_node * self.config.rollout.nnodes - ) - num_replicas = world_size // rollout_world_size - - rollout_config = self.config.actor_rollout_ref.rollout - model_config = self.config.actor_rollout_ref.model - self.rollout_replicas = [ - self.rollout_replica_class( - replica_rank=replica_rank, - config=rollout_config, - model_config=model_config, - gpus_per_node=self.config.rollout.n_gpus_per_node, - ) - for replica_rank in range(num_replicas) - ] - - if self.worker_group: - await 
asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) - else: - await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas]) - - self.server_handles = [server._server_handle for server in self.rollout_replicas] - self.server_addresses = [server._server_address for server in self.rollout_replicas] - - print(f"AgentLoopManager: {self.server_addresses}") - # Update Prometheus configuration with server addresses - if rollout_config.prometheus.enable: - if rollout_config.disable_log_stats: - raise ValueError("PROMETHEUS needs disable_log_stats==False, but it is currently True.") - await asyncio.to_thread( - update_prometheus_config, rollout_config.prometheus, self.server_addresses, rollout_config.name - ) - async def generate_single_sample_async( self, sample: DataProto, diff --git a/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index b0aef45bd67..6982184f8f6 100644 --- a/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -30,9 +30,9 @@ class PartialSingleTurnAgentLoop(AgentLoopBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length - self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {}) + self.prompt_length = self.rollout_config.prompt_length + self.response_length = self.rollout_config.response_length + self.apply_chat_template_kwargs = self.data_config.get("apply_chat_template_kwargs", {}) async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: output: Optional[AgentLoopOutput] = kwargs.get("output", None) diff --git a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml index 1f4b4db8c82..d4cff51de3b 100644 --- a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -30,12 +30,6 @@ async_training: # Rollout config rollout: - # Number of nodes used in the rollout - nnodes: 1 - - # Number of GPUs per node - n_gpus_per_node: 8 - # number of responses (i.e. num sample times). 
> 1 for grpo n: 4 diff --git a/verl/experimental/fully_async_policy/fully_async_main.py b/verl/experimental/fully_async_policy/fully_async_main.py index fe43abb1b6e..80332364ad1 100644 --- a/verl/experimental/fully_async_policy/fully_async_main.py +++ b/verl/experimental/fully_async_policy/fully_async_main.py @@ -59,10 +59,14 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: # Rollout resource pool if Role.Rollout in roles: - assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" - assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" + assert config.actor_rollout_ref.rollout.n_gpus_per_node > 0, ( + "config.actor_rollout_ref.rollout.n_gpus_per_node must be greater than 0" + ) + assert config.actor_rollout_ref.rollout.nnodes > 0, ( + "config.actor_rollout_ref.rollout.nnodes must be greater than 0" + ) - rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes + rollout_pool = [config.actor_rollout_ref.rollout.n_gpus_per_node] * config.actor_rollout_ref.rollout.nnodes resource_pool_spec["rollout_pool"] = rollout_pool mapping[Role.Rollout] = "rollout_pool" diff --git a/verl/experimental/fully_async_policy/fully_async_rollouter.py b/verl/experimental/fully_async_policy/fully_async_rollouter.py index 4810a3730da..5ad6bd503ce 100644 --- a/verl/experimental/fully_async_policy/fully_async_rollouter.py +++ b/verl/experimental/fully_async_policy/fully_async_rollouter.py @@ -104,7 +104,7 @@ def __init__( self._validate_config() if self.config.async_training.use_trainer_do_validate: - rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node + rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node train_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node total_gpus = rollout_gpus + train_gpus print(f"[FullyAsyncRollouter] split before val_dataset total len: {len(val_dataset)}") @@ -444,7 +444,11 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - config=self.config, worker_group=self.rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, + worker_group=self.rollout_wg, + reward_loop_worker_handles=reward_loop_worker_handles, ) # Add samples to the pending_queue diff --git a/verl/experimental/fully_async_policy/fully_async_trainer.py b/verl/experimental/fully_async_policy/fully_async_trainer.py index 9519c594dbd..31df91f4e44 100644 --- a/verl/experimental/fully_async_policy/fully_async_trainer.py +++ b/verl/experimental/fully_async_policy/fully_async_trainer.py @@ -137,7 +137,7 @@ def __init__( self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node - + config.rollout.nnodes * config.rollout.n_gpus_per_node + + config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node ) self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus) @@ -147,7 +147,7 @@ def __init__( from verl.utils.dataset.rl_dataset import collate_fn val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) - rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node + rollout_gpus = 
config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node print(f"[FullyAsyncTrainer] split before val_dataset total len: {len(val_dataset)}") split_dataset = val_dataset.split(total_gpus) rollout_val_dataset0 = split_dataset[rollout_gpus:] @@ -311,7 +311,9 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - config=self.config, + rollout_config=self.config.actor_rollout_ref.rollout, + model_config=self.config.actor_rollout_ref.model, + data_config=self.config.data, worker_group=self.actor_rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles, ) diff --git a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh index cc936f50dc1..209930aeb59 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh @@ -176,8 +176,8 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.resume_mode=auto \ trainer.nnodes="${n_nodes_train}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${n_nodes_rollout}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${n_nodes_rollout}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq=${test_freq} \ rollout.total_epochs=10 \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh index 2a5eb1bb966..a94387f2af9 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh @@ -129,8 +129,8 @@ python3 -m verl.experimental.fully_async_policy.fully_async_main \ data.gen_batch_size=${gen_prompt_bsz} \ trainer.nnodes=$NNODES \ trainer.n_gpus_per_node=$n_gpus_training \ - rollout.nnodes=$NNODES \ - rollout.n_gpus_per_node=$n_gpus_rollout \ + actor_rollout_ref.rollout.nnodes=$NNODES \ + actor_rollout_ref.rollout.n_gpus_per_node=$n_gpus_rollout \ rollout.total_rollout_steps=$total_rollout_steps \ rollout.total_epochs=10 \ rollout.test_freq=$test_freq \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh index ba8e6804fdb..1dcb5018c68 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh index 5561208ee6d..6577caada6e 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh +++ 
b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index 242a5117a5e..9823231aed1 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -153,8 +153,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh index ee0657eace7..aef1bac704d 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh @@ -152,8 +152,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh index 002c1206b8a..4a273c2c8ba 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh index f01fb8184e7..e1146d79d26 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh @@ -156,8 +156,8 
@@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index 2b2143ffa21..18291a62bf7 100644 --- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh index 8b32c6e0078..741c695de0b 100644 --- a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh +++ b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh @@ -99,8 +99,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + actor_rollout_ref.rollout.nnodes="${NNODES}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs="${total_epochs}" \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh index ebcb634ff72..1b95a5becd8 100644 --- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh +++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh @@ -217,8 +217,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \ trainer.log_val_generations=10 \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ + actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh index c04a09d3266..3ea5196f1c6 100644 --- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh +++ 
b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
@@ -226,8 +226,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    rollout.nnodes="${NNODES_ROLLOUT}" \
-    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
+    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 9d6560881be..8ecaa1c0c70 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -831,8 +831,10 @@ def init_workers(self):
         # if enable_agent_reward_loop, we directly pass reward_loop_workers to agent loop manager
         # to stream reward computation with actor rollout
         reward_loop_worker_handles = self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
-        self.async_rollout_manager = AgentLoopManager(
-            config=self.config,
+        self.async_rollout_manager = AgentLoopManager.create(
+            rollout_config=self.config.actor_rollout_ref.rollout,
+            model_config=self.config.actor_rollout_ref.model,
+            data_config=self.config.actor_rollout_ref.data,
             worker_group=self.actor_rollout_wg,
             rollout_resource_pool=actor_rollout_resource_pool,
             reward_loop_worker_handles=reward_loop_worker_handles,

From 80c8f3dd30f945726b5e90baf1ba1409584b466f Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Thu, 26 Feb 2026 22:26:34 +0800
Subject: [PATCH 04/10] fix auto_await

---
 .../one_step_off_policy/ray_trainer.py |  2 +-
 verl/utils/ray_utils.py                | 29 +++++++++++++++----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py
index caba2cf6ad3..d2c64ec577d 100644
--- a/verl/experimental/one_step_off_policy/ray_trainer.py
+++ b/verl/experimental/one_step_off_policy/ray_trainer.py
@@ -182,7 +182,7 @@ def _init_async_rollout_manager(self):
         from verl.experimental.one_step_off_policy.agent_loop import OneStepOffAgentLoopManager
         self.async_rollout_mode = True
-        self.async_rollout_manager = OneStepOffAgentLoopManager(
+        self.async_rollout_manager = OneStepOffAgentLoopManager.create(
             rollout_config=self.config.actor_rollout_ref.rollout,
             model_config=self.config.actor_rollout_ref.model,
             data_config=self.config.data,
diff --git a/verl/utils/ray_utils.py b/verl/utils/ray_utils.py
index 5ba20649365..eff3d91085f 100644
--- a/verl/utils/ray_utils.py
+++ b/verl/utils/ray_utils.py
@@ -97,9 +97,13 @@ def get_event_loop():
 def auto_await(func):
     """Auto await a coroutine function.
-    If the function is called in an async context (with a running event loop),
-    it will return the coroutine object. Otherwise, it will block the current thread
-    and run the coroutine until completion.
+    Handles three cases:
+    1. When called directly and there is no running event loop: runs the
+       coroutine with asyncio.run() and returns the result.
+    2. When the decorated function is called with await: returns the coroutine
+       so the caller can await it.
+    3. When called directly and the event loop is already running: runs the
+       coroutine in a thread pool (to avoid deadlock) and returns the result.
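+
+    Example (illustrative sketch only; ``ping`` is a stand-in coroutine, not
+    part of this patch)::
+
+        @auto_await
+        async def ping():
+            return "pong"
+
+        ping()        # sync caller, no running loop: case 1, runs via asyncio.run()
+        await ping()  # async caller: case 2, the coroutine is returned and awaited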
""" @functools.wraps(func) @@ -114,9 +118,22 @@ def wrapper(*args, **kwargs): except RuntimeError: loop = None - if loop and loop.is_running(): - return coro - else: + # Case 1: No running loop -> run with asyncio.run() + if loop is None: return asyncio.run(coro) + # Case 2: Running loop -> return coro if caller will await + caller_frame = inspect.currentframe() + if caller_frame is not None: + caller_frame = caller_frame.f_back + caller_is_async = caller_frame is not None and (caller_frame.f_code.co_flags & inspect.CO_COROUTINE) != 0 + if caller_is_async: + return coro + + # Case 3: Running loop -> run coro in thread pool + # (cannot block the loop thread without deadlock) + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + future = pool.submit(asyncio.run, coro) + return future.result() + return wrapper From 231ec7c115b28a2e727783c27b4651781ab9da6e Mon Sep 17 00:00:00 2001 From: wuxibin Date: Fri, 27 Feb 2026 01:35:51 +0800 Subject: [PATCH 05/10] revert --- tests/experimental/agent_loop/agent_utils.py | 4 +- ...t_agent_loop_extra_fields_schema_on_cpu.py | 9 ++- .../test_agent_reward_loop_colocate.py | 4 +- .../test_agent_reward_loop_standalone.py | 4 +- tests/special_e2e/run_fully_async_policy.sh | 4 +- tests/special_e2e/run_one_step_off_policy.sh | 4 +- verl/experimental/agent_loop/agent_loop.py | 75 ++++++++----------- .../experimental/fully_async_policy/README.md | 8 +- .../fully_async_policy/README_zh.md | 8 +- .../agent_loop/agent_loop.py | 20 ++--- .../agent_loop/partial_tool_agent_loop.py | 6 +- .../config/fully_async_ppo_trainer.yaml | 6 ++ .../fully_async_policy/fully_async_main.py | 13 ++-- .../fully_async_rollouter.py | 8 +- .../fully_async_policy/fully_async_trainer.py | 8 +- .../shell/dapo_30b_a3b_base_math_fsdp.sh | 4 +- .../shell/dapo_7b_async_retool.sh | 4 +- .../shell/dapo_7b_math_fsdp2_16_16.sh | 4 +- .../shell/dapo_7b_math_fsdp2_32_32.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_4_4.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_mis.sh | 4 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 4 +- .../shell/geo3k_qwen25vl_7b_megatron_4_4.sh | 4 +- .../grpo_30b_a3b_base_math_megatron_96_32.sh | 4 +- ...po_30b_a3b_base_math_megatron_96_32_mis.sh | 4 +- .../one_step_off_ppo_megatron_trainer.yaml | 7 ++ .../config/one_step_off_ppo_trainer.yaml | 7 ++ .../one_step_off_policy/main_ppo.py | 4 + .../one_step_off_policy/ray_trainer.py | 5 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 4 +- .../shell/dapo_7b_math_fsdp2_64_64_ris.sh | 4 +- .../shell/dapo_7b_math_fsdp2_sglang_4_12.sh | 4 +- .../shell/dapo_7b_math_megatron_4_12.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh | 4 +- .../shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh | 4 +- .../shell/grpo_3b_gsm8k_fsdp2_2_6.sh | 4 +- .../grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh | 4 +- verl/trainer/ppo/ray_trainer.py | 4 +- 41 files changed, 141 insertions(+), 151 deletions(-) diff --git a/tests/experimental/agent_loop/agent_utils.py b/tests/experimental/agent_loop/agent_utils.py index 5103ff9c357..4596236bc78 100644 --- a/tests/experimental/agent_loop/agent_utils.py +++ b/tests/experimental/agent_loop/agent_utils.py @@ -80,9 +80,7 @@ def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerG rm_resource_pool=rm_resource_pool, ) agent_loop_manager = AgentLoopManager.create( - rollout_config=config.actor_rollout_ref.rollout, - model_config=config.actor_rollout_ref.model, - data_config=config.data, 
+ config=config, worker_group=actor_rollout_wg, reward_loop_worker_handles=reward_loop_manager.reward_loop_workers, ) diff --git a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py index a5c5ab3dde3..e5d296a8756 100644 --- a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py +++ b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py @@ -148,7 +148,8 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( config = OmegaConf.create( { "actor_rollout_ref": { - "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}} + "rollout": {"prompt_length": 16, "response_length": 16, "multi_turn": {"tool_config_path": None}}, + "model": {}, }, "data": { "tool_config_path": None, @@ -161,11 +162,11 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( tokenizer = _FakeTokenizer() processor = None - rollout_config = DictConfigWrap(config.actor_rollout_ref.rollout) + trainer_config = DictConfigWrap(config) data_config = DictConfigWrap(config.data) single_turn = SingleTurnAgentLoop( - rollout_config=rollout_config, + trainer_config=trainer_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, @@ -173,7 +174,7 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( data_config=data_config, ) partial_single_turn = PartialSingleTurnAgentLoop( - rollout_config=rollout_config, + trainer_config=trainer_config, server_manager=server_manager, tokenizer=tokenizer, processor=processor, diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py index 1cf1014602e..0ea96dca409 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py @@ -99,9 +99,7 @@ def test_agent_reward_loop_standalone(): actor_rollout_wg.init_model() agent_loop_manager = AgentLoopManager.create( - rollout_config=config.actor_rollout_ref.rollout, - model_config=config.actor_rollout_ref.model, - data_config=config.data, + config=config, worker_group=actor_rollout_wg, ) # sleep rollout replicas diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py index b9c1f9a3f9d..99af766cbbe 100644 --- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py +++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py @@ -77,9 +77,7 @@ def test_agent_reward_loop_standalone(): # 1. 
init reward model manager
     reward_loop_manager = RewardLoopManager(config)
     agent_loop_manager = AgentLoopManager.create(
-        rollout_config=config.actor_rollout_ref.rollout,
-        model_config=config.actor_rollout_ref.model,
-        data_config=config.data,
+        config=config,
         reward_loop_worker_handles=reward_loop_manager.reward_loop_workers,
     )
diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh
index 4f7882b60d4..01d807ba63a 100644
--- a/tests/special_e2e/run_fully_async_policy.sh
+++ b/tests/special_e2e/run_fully_async_policy.sh
@@ -124,8 +124,8 @@ common_params=(
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
     trainer.log_val_generations=10
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
     rollout.test_freq=${test_freq}
diff --git a/tests/special_e2e/run_one_step_off_policy.sh b/tests/special_e2e/run_one_step_off_policy.sh
index 9bbe16045c2..bdcba5caaaf 100755
--- a/tests/special_e2e/run_one_step_off_policy.sh
+++ b/tests/special_e2e/run_one_step_off_policy.sh
@@ -90,8 +90,6 @@ common_params=(
     actor_rollout_ref.rollout.val_kwargs.n=1
     actor_rollout_ref.rollout.enable_chunked_prefill=True
     actor_rollout_ref.rollout.name=vllm
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
     actor_rollout_ref.rollout.checkpoint_engine.backend='nccl'
     actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=1024
     reward.reward_manager.name=dapo
@@ -111,6 +109,8 @@ common_params=(
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
 )
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 27d31458b88..6f089033b6c 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -53,6 +53,14 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+def _get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
+    # TODO: backward compatibility, remove this once we switch to new trainer.
+    if config.get("actor_rollout_ref"):
+        return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
+    else:
+        return config.rollout, config.model
+
+
 class AsyncLLMServerManager:
     """
     A class to manage multiple OpenAI compatible LLM servers. This class provides
@@ -60,17 +68,15 @@ class AsyncLLMServerManager:
     - Sticky session: send multi-turn chat completions to same server for automatic prefix caching
     """
-    def __init__(
-        self, rollout_config: RolloutConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000
-    ):
+    def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000):
         """Initialize the AsyncLLMServerManager.
         Args:
-            rollout_config (RolloutConfig): rollout config.
+            config (DictConfig): whole config for main entrypoint.
             server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
             max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000.
         """
-        self.rollout_config = rollout_config
+        self.config = config
         self.server_handles = server_handles
         random.shuffle(self.server_handles)
@@ -195,7 +201,7 @@ class AgentLoopBase(ABC):
     environments.
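+
+    Example (illustrative sketch; ``EchoAgentLoop`` is a hypothetical subclass,
+    not part of this patch)::
+
+        class EchoAgentLoop(AgentLoopBase):
+            async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+                # chat with the LLM server through self.server_manager, then
+                # wrap the generated token ids in an AgentLoopOutput
+                ...
+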
Args:
-        rollout_config (RolloutConfig): rollout config.
+        trainer_config (DictConfig): whole config for main entrypoint.
         server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager.
         tokenizer (AutoTokenizer): Tokenizer for tokenizing messages.
         processor (AutoProcessor): Processor for processing messages.
@@ -205,7 +211,7 @@ class AgentLoopBase(ABC):
     def __init__(
         self,
-        rollout_config: DictConfigWrap,
+        trainer_config: DictConfigWrap,
         server_manager: AsyncLLMServerManager,
         tokenizer: AutoTokenizer,
         processor: AutoProcessor,
@@ -213,7 +219,8 @@ def __init__(
         data_config: DictConfigWrap,
         **kwargs,
     ):
-        self.rollout_config = rollout_config.config
+        self.config = trainer_config.config
+        self.rollout_config, _ = _get_rollout_and_model_config(self.config)
         self.server_manager = server_manager
         self.tokenizer = tokenizer
         self.processor = processor
@@ -346,30 +353,27 @@ class AgentLoopWorker:
     """Agent loop worker takes a batch of messages and runs each message in an agent loop.
     Args:
-        rollout_config (RolloutConfig): rollout config.
-        model_config (HFModelConfig): model config.
-        data_config (DictConfig): data config.
+        config (DictConfig): whole config for main entrypoint.
         server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
         reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
     """
     def __init__(
         self,
-        rollout_config: RolloutConfig,
-        model_config: HFModelConfig,
-        data_config: DictConfig,
+        config: DictConfig,
         server_handles: list[ray.actor.ActorHandle],
        reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
    ):
+        self.config = config
+        rollout_config, model_config = _get_rollout_and_model_config(config)
         self.rollout_config: RolloutConfig = omega_conf_to_dataclass(rollout_config)
         self.model_config: HFModelConfig = omega_conf_to_dataclass(model_config)
-        self.data_config = data_config
         # for recipe to change
         if not hasattr(self, "server_manager"):
-            self.server_manager = AsyncLLMServerManager(self.rollout_config, server_handles)
+            self.server_manager = AsyncLLMServerManager(config, server_handles)
-        self.dataset_cls = get_dataset_class(data_config)
+        self.dataset_cls = get_dataset_class(config.data)
         self.reward_loop_worker_handles = reward_loop_worker_handles
         self.tokenizer = self.model_config.tokenizer
@@ -501,12 +505,12 @@ async def _run_agent_loop(
             agent_loop_config = _agent_loop_registry[agent_name]
             agent_loop = hydra.utils.instantiate(
                 config=agent_loop_config,
-                rollout_config=DictConfigWrap(self.rollout_config),
+                trainer_config=DictConfigWrap(config=self.config),
                 server_manager=self.server_manager,
                 tokenizer=self.tokenizer,
                 processor=self.processor,
                 dataset_cls=self.dataset_cls,
-                data_config=DictConfigWrap(self.data_config),
+                data_config=DictConfigWrap(self.config.data),
             )
             output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
             return await self._agent_loop_postprocess(output, **kwargs)
@@ -856,9 +860,7 @@ class AgentLoopManager:
     - otherwise, rollout server is in standalone mode, use separate GPUs, e.g., one-step-off/fully async training.
     Args:
-        rollout_config (RolloutConfig): rollout config.
-        model_config (HFModelConfig): model config.
-        data_config (DictConfig): data config.
+        config (DictConfig): whole config for main entrypoint.
         worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
         rollout_resource_pool (RayResourcePool): Resource pool for hybrid mode, only used by TensorRT-LLM.
reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. @@ -866,22 +868,19 @@ class AgentLoopManager: def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - assert worker_group is not None or rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode" - - self.rollout_config = rollout_config - self.model_config = model_config - self.data_config = data_config + self.config = config + self.rollout_config, self.model_config = _get_rollout_and_model_config(config) self.worker_group = worker_group self.rollout_resource_pool = rollout_resource_pool self.reward_loop_worker_handles = reward_loop_worker_handles + assert worker_group is not None or self.rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode" + # for recipe to change if not hasattr(self, "rollout_replica_class"): self.rollout_replica_class = get_rollout_replica_class(self.rollout_config.name) @@ -892,17 +891,13 @@ def __init__( @auto_await async def create( cls, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): """Create agent loop manager.""" - instance = cls( - rollout_config, model_config, data_config, worker_group, rollout_resource_pool, reward_loop_worker_handles - ) + instance = cls(config, worker_group, rollout_resource_pool, reward_loop_worker_handles) await instance._initialize_llm_servers() await instance._init_agent_loop_workers() return instance @@ -968,13 +963,7 @@ async def _init_agent_loop_workers(self): scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=True ), - ).remote( - self.rollout_config, - self.model_config, - self.data_config, - self.server_handles, - self.reward_loop_worker_handles, - ) + ).remote(self.config, self.server_handles, self.reward_loop_worker_handles) ) @auto_await diff --git a/verl/experimental/fully_async_policy/README.md b/verl/experimental/fully_async_policy/README.md index 311e8dfc0ea..b7ff1756459 100644 --- a/verl/experimental/fully_async_policy/README.md +++ b/verl/experimental/fully_async_policy/README.md @@ -92,8 +92,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|------------------------------------------------------------------------------------------------| | `trainer.nnodes` | Number of nodes for Trainer | | `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | -| `actor_rollout_ref.rollout.nnodes` | Number of nodes for Rollouter | -| `actor_rollout_ref.rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `rollout.nnodes` | Number of nodes for Rollouter | +| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | | `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | | `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | | `rollout.total_rollout_steps` | Total number of rollout samples | @@ -313,8 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \ 
actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ - actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/README_zh.md b/verl/experimental/fully_async_policy/README_zh.md index 6861114debe..ad2e52e4167 100644 --- a/verl/experimental/fully_async_policy/README_zh.md +++ b/verl/experimental/fully_async_policy/README_zh.md @@ -69,8 +69,8 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev |------------------------------------------------------------------|-----------------------------------------------------------------| | `trainer.nnodes` | Trainer的node数量 | | `trainer.n_gpus_per_node` | Trainer每个node上gpu的数量 | -| `actor_rollout_ref.rollout.nnodes` | Rollouter的node数量 | -| `actor_rollout_ref.rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | +| `rollout.nnodes` | Rollouter的node数量 | +| `rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | | `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | | `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | | `rollout.total_rollout_steps` | 总的rollout的sample数量 | @@ -256,8 +256,8 @@ python -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.mode=${rollout_mode} \ trainer.nnodes="${NNODES_TRAIN}" \ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \ - actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py index c545032fece..d23c700d7c6 100644 --- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py @@ -36,7 +36,6 @@ rollout_trace_attr, rollout_trace_op, ) -from verl.workers.config import HFModelConfig, RolloutConfig logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -81,14 +80,12 @@ async def generate_for_partial( class FullyAsyncAgentLoopWorker(AgentLoopWorker): def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, server_handles: list[ray.actor.ActorHandle], reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.server_manager = FullyAsyncLLMServerManager(rollout_config, server_handles) - super().__init__(rollout_config, model_config, data_config, server_handles, reward_loop_worker_handles) + self.server_manager = FullyAsyncLLMServerManager(config, server_handles) + super().__init__(config, server_handles, reward_loop_worker_handles) # A shared cancellation event for all agent loops running on this worker. 
self.cancellation_event = asyncio.Event() @@ -219,22 +216,19 @@ async def resume_agent_loops(self): class FullyAsyncAgentLoopManager(AgentLoopManager): def __init__( self, - rollout_config: RolloutConfig, - model_config: HFModelConfig, - data_config: DictConfig, + config: DictConfig, worker_group: RayWorkerGroup = None, rollout_resource_pool: RayResourcePool = None, reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, ): - self.rollout_config = rollout_config - self.model_config = model_config - self.data_config = data_config + self.config = config + self.rollout_config = config.actor_rollout_ref.rollout self.worker_group = worker_group self.reward_loop_worker_handles = reward_loop_worker_handles self.agent_loop_workers_class = FullyAsyncAgentLoopWorker # Select rollout replica class based on rollout name - rollout_name = rollout_config.name + rollout_name = self.rollout_config.name if rollout_name == "sglang": from verl.experimental.fully_async_policy.sglang_rollout.sglang_async_server import FullyAsyncSGLangReplica diff --git a/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py index 0082fc13bc8..370587f0364 100644 --- a/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/partial_tool_agent_loop.py @@ -33,9 +33,9 @@ class AsyncPartialToolAgentLoop(ToolAgentLoop): """ - def __init__(self, trainer_config, **kwargs): - super().__init__(trainer_config, **kwargs) - self.enable_partial_rollout = trainer_config.config.async_training.get("partial_rollout", False) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.enable_partial_rollout = self.config.async_training.get("partial_rollout", False) # async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: async def run( diff --git a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml index d4cff51de3b..1f4b4db8c82 100644 --- a/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/verl/experimental/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -30,6 +30,12 @@ async_training: # Rollout config rollout: + # Number of nodes used in the rollout + nnodes: 1 + + # Number of GPUs per node + n_gpus_per_node: 8 + # number of responses (i.e. num sample times). 
> 1 for grpo n: 4 diff --git a/verl/experimental/fully_async_policy/fully_async_main.py b/verl/experimental/fully_async_policy/fully_async_main.py index 80332364ad1..4e9e509475f 100644 --- a/verl/experimental/fully_async_policy/fully_async_main.py +++ b/verl/experimental/fully_async_policy/fully_async_main.py @@ -59,14 +59,10 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: # Rollout resource pool if Role.Rollout in roles: - assert config.actor_rollout_ref.rollout.n_gpus_per_node > 0, ( - "config.actor_rollout_ref.rollout.n_gpus_per_node must be greater than 0" - ) - assert config.actor_rollout_ref.rollout.nnodes > 0, ( - "config.actor_rollout_ref.rollout.nnodes must be greater than 0" - ) + assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" + assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" - rollout_pool = [config.actor_rollout_ref.rollout.n_gpus_per_node] * config.actor_rollout_ref.rollout.nnodes + rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes resource_pool_spec["rollout_pool"] = rollout_pool mapping[Role.Rollout] = "rollout_pool" @@ -291,6 +287,9 @@ def main(config): from time import time start_time = time() + # TODO: unify rollout config with actor_rollout_ref + config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes + config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node run_ppo(config, task_runner_class=FullyAsyncTaskRunner) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/verl/experimental/fully_async_policy/fully_async_rollouter.py b/verl/experimental/fully_async_policy/fully_async_rollouter.py index 5ad6bd503ce..4810a3730da 100644 --- a/verl/experimental/fully_async_policy/fully_async_rollouter.py +++ b/verl/experimental/fully_async_policy/fully_async_rollouter.py @@ -104,7 +104,7 @@ def __init__( self._validate_config() if self.config.async_training.use_trainer_do_validate: - rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node + rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node train_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node total_gpus = rollout_gpus + train_gpus print(f"[FullyAsyncRollouter] split before val_dataset total len: {len(val_dataset)}") @@ -444,11 +444,7 @@ async def _init_async_rollout_manager(self): self.async_rollout_mode = True self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( - rollout_config=self.config.actor_rollout_ref.rollout, - model_config=self.config.actor_rollout_ref.model, - data_config=self.config.data, - worker_group=self.rollout_wg, - reward_loop_worker_handles=reward_loop_worker_handles, + config=self.config, worker_group=self.rollout_wg, reward_loop_worker_handles=reward_loop_worker_handles ) # Add samples to the pending_queue diff --git a/verl/experimental/fully_async_policy/fully_async_trainer.py b/verl/experimental/fully_async_policy/fully_async_trainer.py index 31df91f4e44..9519c594dbd 100644 --- a/verl/experimental/fully_async_policy/fully_async_trainer.py +++ b/verl/experimental/fully_async_policy/fully_async_trainer.py @@ -137,7 +137,7 @@ def __init__( self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node - + config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node + + config.rollout.nnodes * 
config.rollout.n_gpus_per_node
         )
         self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus)
@@ -147,7 +147,7 @@ def __init__(
         from verl.utils.dataset.rl_dataset import collate_fn
 
         val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor)
-        rollout_gpus = config.actor_rollout_ref.rollout.nnodes * config.actor_rollout_ref.rollout.n_gpus_per_node
+        rollout_gpus = config.rollout.nnodes * config.rollout.n_gpus_per_node
         print(f"[FullyAsyncTrainer] split before val_dataset total len: {len(val_dataset)}")
         split_dataset = val_dataset.split(total_gpus)
         rollout_val_dataset0 = split_dataset[rollout_gpus:]
@@ -311,9 +311,7 @@ async def _init_async_rollout_manager(self):
 
         self.async_rollout_mode = True
         self.async_rollout_manager = await FullyAsyncAgentLoopManager.create(
-            rollout_config=self.config.actor_rollout_ref.rollout,
-            model_config=self.config.actor_rollout_ref.model,
-            data_config=self.config.data,
+            config=self.config,
             worker_group=self.actor_rollout_wg,
             reward_loop_worker_handles=reward_loop_worker_handles,
         )
diff --git a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
index 209930aeb59..cc936f50dc1 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh
@@ -176,8 +176,8 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     trainer.resume_mode=auto \
     trainer.nnodes="${n_nodes_train}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${n_nodes_rollout}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${n_nodes_rollout}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.test_freq=${test_freq} \
     rollout.total_epochs=10 \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
index a94387f2af9..2a5eb1bb966 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_async_retool.sh
@@ -129,8 +129,8 @@ python3 -m verl.experimental.fully_async_policy.fully_async_main \
     data.gen_batch_size=${gen_prompt_bsz} \
     trainer.nnodes=$NNODES \
     trainer.n_gpus_per_node=$n_gpus_training \
-    actor_rollout_ref.rollout.nnodes=$NNODES \
-    actor_rollout_ref.rollout.n_gpus_per_node=$n_gpus_rollout \
+    rollout.nnodes=$NNODES \
+    rollout.n_gpus_per_node=$n_gpus_rollout \
     rollout.total_rollout_steps=$total_rollout_steps \
     rollout.total_epochs=10 \
    rollout.test_freq=$test_freq \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
index 1dcb5018c68..ba8e6804fdb 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_16_16.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
index 6577caada6e..5561208ee6d 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index 9823231aed1..242a5117a5e 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -153,8 +153,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     async_training.staleness_threshold="${staleness_threshold}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
index aef1bac704d..ee0657eace7 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh
@@ -152,8 +152,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index 4a273c2c8ba..002c1206b8a 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
index e1146d79d26..f01fb8184e7 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64_mis.sh
@@ -156,8 +156,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
index 18291a62bf7..2b2143ffa21 100644
--- a/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
+++ b/verl/experimental/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh
@@ -150,8 +150,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
index 741c695de0b..8b32c6e0078 100644
--- a/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
+++ b/verl/experimental/fully_async_policy/shell/geo3k_qwen25vl_7b_megatron_4_4.sh
@@ -99,8 +99,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs="${total_epochs}" \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
index 1b95a5becd8..ebcb634ff72 100644
--- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
+++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32.sh
@@ -217,8 +217,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
index 3ea5196f1c6..c04a09d3266 100644
--- a/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
+++ b/verl/experimental/fully_async_policy/shell/grpo_30b_a3b_base_math_megatron_96_32_mis.sh
@@ -226,8 +226,8 @@ python -m verl.experimental.fully_async_policy.fully_async_main \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     rollout.total_rollout_steps="${total_rollout_steps}" \
     rollout.total_epochs=10 \
     rollout.test_freq="${test_freq}" \
diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
index 19d77597dc1..0e4677be368 100644
--- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
+++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_megatron_trainer.yaml
@@ -9,6 +9,13 @@ defaults:
 trainer:
   use_legacy_worker_impl: disable
 
+# config for the rollout (only for resource isolation)
+rollout:
+  # Number of nodes used in the rollout
+  nnodes: 1
+  # Number of GPUs per node
+  n_gpus_per_node: 8
+
 # To adapt to the current logic of AgentLoopManager
 actor_rollout_ref:
   rollout:
diff --git a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
index 1a74af3df34..dc784b2ae73 100644
--- a/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
+++ b/verl/experimental/one_step_off_policy/config/one_step_off_ppo_trainer.yaml
@@ -9,6 +9,13 @@ defaults:
 trainer:
   use_legacy_worker_impl: disable
 
+# config for the rollout (only for resource isolation)
+rollout:
+  # Number of nodes used in the rollout
+  nnodes: 1
+  # Number of GPUs per node
+  n_gpus_per_node: 8
+
 # To adapt to the current logic of AgentLoopManager
 actor_rollout_ref:
   rollout:
diff --git a/verl/experimental/one_step_off_policy/main_ppo.py b/verl/experimental/one_step_off_policy/main_ppo.py
index 2c2fe6108ea..0c6ecaedf0e 100644
--- a/verl/experimental/one_step_off_policy/main_ppo.py
+++ b/verl/experimental/one_step_off_policy/main_ppo.py
@@ -182,6 +182,10 @@ def main(config):
     # Automatically set `config.trainer.device = npu` when running on Ascend NPU.
     auto_set_device(config)
 
+    # TODO: unify rollout config with actor_rollout_ref
+    config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes
+    config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node
+
     run_ppo(config, task_runner_class=OneStepTaskRunner)
 
     print(f"total time: {time() - start_time:.2f} seconds")
diff --git a/verl/experimental/one_step_off_policy/ray_trainer.py b/verl/experimental/one_step_off_policy/ray_trainer.py
index d2c64ec577d..144632dead5 100644
--- a/verl/experimental/one_step_off_policy/ray_trainer.py
+++ b/verl/experimental/one_step_off_policy/ray_trainer.py
@@ -183,10 +183,7 @@ def _init_async_rollout_manager(self):
 
         self.async_rollout_mode = True
         self.async_rollout_manager = OneStepOffAgentLoopManager.create(
-            rollout_config=self.config.actor_rollout_ref.rollout,
-            model_config=self.config.actor_rollout_ref.model,
-            data_config=self.config.data,
-            reward_loop_worker_handles=reward_loop_worker_handles,
+            config=self.config, reward_loop_worker_handles=reward_loop_worker_handles
         )
 
     def _create_continuous_iterator(self):
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
index 4df41235c03..cbefe87424b 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_4_12.sh
@@ -135,5 +135,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
index e785e02c6e7..c35513cf9f2 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64.sh
@@ -136,5 +136,5 @@ python -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}"
\ No newline at end of file
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}"
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
index 6a462aeca91..10ce9122269 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_64_64_ris.sh
@@ -146,8 +146,8 @@ python -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES_TRAIN}" \
     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES_ROLLOUT}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
     algorithm.rollout_correction.rollout_is=null \
     algorithm.rollout_correction.rollout_is_threshold=null \
     algorithm.rollout_correction.rollout_rs=seq_mean_k1 \
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
index c92a2ad6bca..2725bb5bc3d 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_fsdp2_sglang_4_12.sh
@@ -136,5 +136,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
index 03fb457c090..a0da86affea 100644
--- a/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
+++ b/verl/experimental/one_step_off_policy/shell/dapo_7b_math_megatron_4_12.sh
@@ -142,5 +142,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.log_val_generations=10 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}"
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}"
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
index 971e77e583e..facabdf58e8 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_2_6.sh
@@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
index 6a5338e2269..5c959f49961 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_0.6b_gsm8k_fsdp2_sglang_2_6.sh
@@ -61,5 +61,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
index 935869c0575..c5c5eb11d2a 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_3b_gsm8k_fsdp2_2_6.sh
@@ -60,5 +60,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.total_epochs=2 \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
index 756c4009ad1..d6f884ad53a 100644
--- a/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
+++ b/verl/experimental/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh
@@ -89,5 +89,5 @@ python3 -m verl.experimental.one_step_off_policy.main_ppo \
     trainer.resume_mode=auto \
     trainer.nnodes="${NNODES}" \
     trainer.n_gpus_per_node="${n_gpus_training}" \
-    actor_rollout_ref.rollout.nnodes="${NNODES}" \
-    actor_rollout_ref.rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" $@
\ No newline at end of file
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 8ecaa1c0c70..ae43d2bad5c 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -832,9 +832,7 @@ def init_workers(self):
             # to stream reward computation with actor rollout
             reward_loop_worker_handles = self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
             self.async_rollout_manager = AgentLoopManager.create(
-                rollout_config=self.config.actor_rollout_ref.rollout,
-                model_config=self.config.actor_rollout_ref.model,
-                data_config=self.config.actor_rollout_ref.data,
+                config=self.config,
                 worker_group=self.actor_rollout_wg,
                 rollout_resource_pool=actor_rollout_resource_pool,
                 reward_loop_worker_handles=reward_loop_worker_handles,

From 273f8e66b0182f6ae7813096b36bc08f0f3a0c31 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:37:25 +0800
Subject: [PATCH 06/10] fix: resolve rollout and model config via
 _get_rollout_and_model_config in the fully async agent loop

---
 verl/experimental/fully_async_policy/agent_loop/agent_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
index d23c700d7c6..89b8cb0fe86 100644
--- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
+++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
@@ -28,6 +28,7 @@
     AsyncLLMServerManager,
     DictConfigWrap,
     _agent_loop_registry,
+    _get_rollout_and_model_config,
     get_trajectory_info,
 )
 from verl.protocol import DataProto
@@ -222,7 +223,7 @@ def __init__(
         reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
     ):
         self.config = config
-        self.rollout_config = config.actor_rollout_ref.rollout
+        self.rollout_config, self.model_config = _get_rollout_and_model_config(config)
         self.worker_group = worker_group
         self.reward_loop_worker_handles = reward_loop_worker_handles
         self.agent_loop_workers_class = FullyAsyncAgentLoopWorker

From 7a6417e90e86dd89e99cd25f9f85a075092d8cf2 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:44:01 +0800
Subject: [PATCH 07/10] fix: rename dataset_config kwarg to data_config in
 _partial_run_agent_loop

---
 verl/experimental/fully_async_policy/agent_loop/agent_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
index 89b8cb0fe86..88a012224eb 100644
--- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
+++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py
@@ -191,7 +191,7 @@ async def _partial_run_agent_loop(
                 tokenizer=self.tokenizer,
                 processor=self.processor,
                 dataset_cls=self.dataset_cls,
-                dataset_config=DictConfigWrap(config=self.config.data),
+                data_config=DictConfigWrap(config=self.config.data),
             )
             output: AgentLoopOutput = await agent_loop.run(
                 sampling_params, cancellation_event=self.cancellation_event, **kwargs

From 380467c8a16145f5c164958522adcb27bd3a16da Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:47:21 +0800
Subject: [PATCH 08/10] revert: restore top-level rollout.* resource keys in
 the NPU test scripts

---
 tests/special_npu/run_fully_async_policy.sh  | 4 ++--
 tests/special_npu/run_one_step_off_policy.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/special_npu/run_fully_async_policy.sh b/tests/special_npu/run_fully_async_policy.sh
index e5908798bcf..fa517e81ae4 100644
--- a/tests/special_npu/run_fully_async_policy.sh
+++ b/tests/special_npu/run_fully_async_policy.sh
@@ -124,8 +124,8 @@ common_params=(
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_gpus_training}
     trainer.log_val_generations=10
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_gpus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_gpus_rollout}
     rollout.total_rollout_steps=${total_rollout_steps}
     rollout.total_epochs=2
     rollout.test_freq=${test_freq}
diff --git a/tests/special_npu/run_one_step_off_policy.sh b/tests/special_npu/run_one_step_off_policy.sh
index 4c1ad9ce204..2426a380fec 100644
--- a/tests/special_npu/run_one_step_off_policy.sh
+++ b/tests/special_npu/run_one_step_off_policy.sh
@@ -108,8 +108,8 @@ common_params=(
     trainer.resume_mode=disable
     trainer.nnodes=1
     trainer.n_gpus_per_node=${n_npus_training}
-    actor_rollout_ref.rollout.nnodes=1
-    actor_rollout_ref.rollout.n_gpus_per_node=${n_npus_rollout}
+    rollout.nnodes=1
+    rollout.n_gpus_per_node=${n_npus_rollout}
 )
 
 

From 16e1f8782dc1acc2f103260c75b16327adc69b48 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 16:14:20 +0800
Subject: [PATCH 09/10] fix ci: set rollout.nnodes in the standalone reward
 loop test and use AgentLoopManager.create in the transfer queue trainer

---
 .../reward_loop/test_agent_reward_loop_standalone.py | 1 +
 verl/experimental/transfer_queue/ray_trainer.py      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
index 99af766cbbe..80a0945bec7 100644
--- a/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
+++ b/tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
@@ -56,6 +56,7 @@ def test_agent_reward_loop_standalone():
     config.actor_rollout_ref.rollout.prompt_length = 1024
     config.actor_rollout_ref.rollout.response_length = 4096
     config.actor_rollout_ref.rollout.skip_tokenizer_init = True
+    config.actor_rollout_ref.rollout.nnodes = 1
 
     config.trainer.n_gpus_per_node = 4
     config.trainer.nnodes = 1
diff --git a/verl/experimental/transfer_queue/ray_trainer.py b/verl/experimental/transfer_queue/ray_trainer.py
index 96c6d181334..dfb2e721d66 100644
--- a/verl/experimental/transfer_queue/ray_trainer.py
+++ b/verl/experimental/transfer_queue/ray_trainer.py
@@ -817,7 +817,7 @@ def init_workers(self):
             reward_loop_worker_handles = (
                 self.reward_loop_manager.reward_loop_workers if enable_agent_reward_loop else None
             )
-            self.async_rollout_manager = AgentLoopManager(
+            self.async_rollout_manager = AgentLoopManager.create(
                 config=self.config,
                 worker_group=self.actor_rollout_wg,
                 reward_loop_worker_handles=reward_loop_worker_handles,

From 66a9c5e17d759d52be93180e032f74b488be9a79 Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Fri, 27 Feb 2026 01:48:35 +0800
Subject: [PATCH 10/10] fix: correct the return type annotation of
 _get_rollout_and_model_config

---
 verl/experimental/agent_loop/agent_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 6f089033b6c..c60baa6abbb 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -53,7 +53,7 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 
-def _get_rollout_and_model_config(config: DictConfig) -> RolloutConfig:
+def _get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
     # TODO: backward compatibility, remove this once we switch to new trainer.
     if config.get("actor_rollout_ref"):
         return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
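
---

Note on PATCH 10/10: only the legacy `actor_rollout_ref` branch of
_get_rollout_and_model_config is visible in the hunk above. The sketch below
illustrates the shim's expected behavior; the fallback to top-level
`rollout`/`model` keys is an assumption about the elided branch, and the model
path is a made-up value.

    from omegaconf import DictConfig, OmegaConf

    def get_rollout_and_model_config(config: DictConfig) -> tuple[DictConfig, DictConfig]:
        # Legacy trainers nest everything under `actor_rollout_ref` (branch taken
        # verbatim from the hunk above).
        if config.get("actor_rollout_ref"):
            return config.actor_rollout_ref.rollout, config.actor_rollout_ref.model
        # Assumed fallback for configs that already use top-level sections.
        return config.rollout, config.model

    legacy = OmegaConf.create(
        {
            "actor_rollout_ref": {
                "rollout": {"nnodes": 1, "n_gpus_per_node": 8},
                "model": {"path": "Qwen/Qwen2.5-7B-Instruct"},  # hypothetical value
            }
        }
    )
    rollout_cfg, model_cfg = get_rollout_and_model_config(legacy)
    assert rollout_cfg.n_gpus_per_node == 8
    assert model_cfg.path == "Qwen/Qwen2.5-7B-Instruct"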
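Note on the resource-isolation bridge in main_ppo.py above: the new top-level
rollout.nnodes / rollout.n_gpus_per_node keys are copied into the legacy
actor_rollout_ref.rollout location so AgentLoopManager keeps reading its usual
path. A minimal sketch of that copy follows; the concrete values are made up
for the example.

    from omegaconf import OmegaConf

    config = OmegaConf.create(
        {
            "rollout": {"nnodes": 2, "n_gpus_per_node": 8},
            "actor_rollout_ref": {"rollout": {"nnodes": 1, "n_gpus_per_node": 8}},
        }
    )
    # Same two assignments as the main_ppo.py hunk; the TODO there notes that
    # this duplication should disappear once the rollout configs are unified.
    config.actor_rollout_ref.rollout.nnodes = config.rollout.nnodes
    config.actor_rollout_ref.rollout.n_gpus_per_node = config.rollout.n_gpus_per_node
    assert config.actor_rollout_ref.rollout.nnodes == 2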