Skip to content

Commit 5d76fbc

Browse files
committed
[vllm] feat: implement chunked weight handling in vllm rollout for large tensors
Signed-off-by: jianjunzhong <jianjunzhong@foxmail.com>
1 parent f56c893 commit 5d76fbc

File tree

3 files changed

+236
-9
lines changed

3 files changed

+236
-9
lines changed
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
15+
16+
import numpy as np
17+
import pytest
18+
import ray
19+
from omegaconf import DictConfig
20+
21+
from verl.checkpoint_engine import CheckpointEngineManager
22+
from verl.experimental.agent_loop.agent_loop import AgentLoopManager
23+
from verl.protocol import DataProto
24+
from verl.single_controller.ray import (
25+
RayClassWithInitArgs,
26+
RayResourcePool,
27+
RayWorkerGroup,
28+
)
29+
from verl.single_controller.ray.base import create_colocated_worker_cls
30+
from verl.utils.device import get_device_name
31+
from verl.utils.tokenizer import hf_tokenizer
32+
from verl.workers.engine_workers import ActorRolloutRefWorker
33+
34+
35+
@pytest.fixture
def init_config() -> DictConfig:
    """Compose the ``ppo_trainer`` Hydra config with overrides for this test.

    Overrides select the rollout backend from the ``ROLLOUT_NAME`` env var,
    enable the naive checkpoint engine, and size the weight-update bucket to
    256 MB so large tensors exercise the chunked transfer path.
    """
    from hydra import compose, initialize_config_dir

    with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
        cfg = compose(config_name="ppo_trainer")

    # One node with 8 devices, colocated actor + rollout.
    cfg.trainer.n_gpus_per_node = 8
    cfg.trainer.nnodes = 1

    actor_rollout_ref = cfg.actor_rollout_ref
    actor_rollout_ref.actor.use_dynamic_bsz = True
    actor_rollout_ref.model.path = os.path.expanduser("~/models/Qwen/Qwen3-VL-2B-Instruct")

    rollout = actor_rollout_ref.rollout
    rollout.name = os.environ["ROLLOUT_NAME"]
    rollout.skip_tokenizer_init = False
    rollout.max_num_seqs = 256
    rollout.gpu_memory_utilization = 0.8
    rollout.agent.num_workers = 2
    rollout.checkpoint_engine.backend = "naive"
    rollout.checkpoint_engine.update_weights_bucket_megabytes = 256
    rollout.enforce_eager = True

    return cfg
56+
57+
58+
@pytest.mark.skip(reason="This test costs too much to run in CI.")
def test_server_adapter_colocated_weight_update(init_config):
    """End-to-end check that colocated rollout replicas pick up weight updates.

    Flow: build a colocated actor/rollout worker group, attach an
    ``AgentLoopManager`` and a ``CheckpointEngineManager``, then repeatedly
    push weights and generate sequences, asserting each decoded response shows
    the model followed the prompt (i.e. the pushed weights were applied).

    Fixes vs. original:
      * Removed the stray ``@pytest.mark.asyncio`` marker — this is a plain
        synchronous test function, and pytest-asyncio rejects the marker on
        non-async tests.
      * ``ray.shutdown()`` now runs in a ``finally`` block so the Ray cluster
        is torn down even when setup or an assertion fails.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
                "VLLM_DISABLE_COMPILE_CACHE": "1",
                "HCCL_HOST_SOCKET_PORT_RANGE": "60000-60050",
                "HCCL_NPU_SOCKET_PORT_RANGE": "61000-61050",
            }
        }
    )
    try:
        # 0. init actor rollout worker group
        resource_pool = RayResourcePool(
            process_on_nodes=[init_config.trainer.n_gpus_per_node] * init_config.trainer.nnodes,
            max_colocate_count=3,
        )
        actor_rollout_cls = ray.remote(ActorRolloutRefWorker)
        cls_dict = {
            "actor_rollout": RayClassWithInitArgs(
                cls=actor_rollout_cls, config=init_config.actor_rollout_ref, role="actor_rollout"
            )
        }
        ray_cls_with_init = create_colocated_worker_cls(cls_dict)
        wg_dict = RayWorkerGroup(
            resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name()
        )
        spawn_wg = wg_dict.spawn(prefix_set=cls_dict.keys())
        actor_rollout_wg = spawn_wg["actor_rollout"]
        actor_rollout_wg.init_model()

        # 1. create AgentLoopManager
        agent_loop_manager = AgentLoopManager(
            config=init_config,
            worker_group=actor_rollout_wg,
            rollout_resource_pool=resource_pool,
        )

        # 2. create CheckpointEngineManager
        checkpoint_manager = CheckpointEngineManager(
            backend=init_config.actor_rollout_ref.rollout.checkpoint_engine.backend,
            trainer=actor_rollout_wg,
            replicas=agent_loop_manager.rollout_replicas,
        )
        checkpoint_manager.sleep_replicas()

        # 3. generate prompts — two identical probes so batching is exercised
        raw_prompts = [
            [
                {
                    "role": "user",
                    "content": "This is a test for weight update. If the weight has been correctly "
                    'updated and you understand my meaning, please respond with "Test Passed".',
                }
            ]
            for _ in range(2)
        ]
        batch = DataProto(
            non_tensor_batch={
                "raw_prompt": np.array(raw_prompts),
                "agent_name": np.array(["single_turn_agent"] * len(raw_prompts)),
                "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
                "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
            },
        )

        # 4. update weights and generate sequences, check if the responses are correct
        tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
        for _ in range(3):
            checkpoint_manager.update_weights()
            result = agent_loop_manager.generate_sequences(batch)
            checkpoint_manager.sleep_replicas()

            # Check every response of this round, not just the last one.
            responses = result.batch["responses"]
            response_mask = result.batch["response_mask"]
            for i in range(len(responses)):
                valid_tokens = responses[i][response_mask[i].bool()]
                response = tokenizer.decode(valid_tokens)
                assert "test passed" in response.lower(), f"Response does not contain 'test passed': {response}"

                print("=========================")
                print("[OUTPUT]:", response)
                print("---")
    finally:
        # Always tear down the Ray cluster, even when the test fails.
        ray.shutdown()

verl/workers/rollout/vllm_rollout/utils.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False
236236
patch_vllm_moe_model_weight_loader(self.model_runner.model)
237237

238238
# receive bucket and update weights
239+
# Buffer to collect chunks for weights that were sliced
240+
pending_chunks = {} # name -> {chunk_idx: tensor, ...}
239241
while True:
240242
metadata = socket.recv_pyobj()
241243
weights, tensor = [], None
@@ -250,14 +252,40 @@ def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False
250252
tensor = tensor.clone()
251253
else:
252254
tensor = tensor.to(self.device)
253-
weights.append((name, tensor))
255+
256+
# Check if this is a chunk of a sliced weight
257+
if "chunk_idx" in meta and "total_chunks" in meta:
258+
# This is a chunk, store it for later merging
259+
original_name = meta["name"]
260+
chunk_idx = meta["chunk_idx"]
261+
if original_name not in pending_chunks:
262+
pending_chunks[original_name] = {}
263+
pending_chunks[original_name][chunk_idx] = tensor
264+
265+
# Check if we have all chunks for this weight
266+
if len(pending_chunks[original_name]) == meta["total_chunks"]:
267+
# Merge all chunks back into one tensor
268+
chunks_dict = pending_chunks[original_name]
269+
sorted_chunks = [chunks_dict[i] for i in range(meta["total_chunks"])]
270+
merged_tensor = torch.cat(sorted_chunks, dim=0)
271+
weights.append((original_name, merged_tensor))
272+
del pending_chunks[original_name]
273+
else:
274+
weights.append((name, tensor))
275+
254276
get_torch_device().synchronize()
255277
socket.send(b"")
256278
self._update_weights(weights, peft_config=peft_config, base_sync_done=base_sync_done)
257279
del weights, tensor
258280
if metadata["is_last"]:
259281
break
260282

283+
# Check if there are any remaining chunks that weren't processed
284+
if pending_chunks:
285+
raise RuntimeError(
286+
f"Received chunks for weights {list(pending_chunks.keys())} but did not receive all chunks for them."
287+
)
288+
261289
if self._is_qat_model:
262290
# QAT: call process_weights_after_loading AFTER all buckets are received
263291
from verl.utils.qat import manual_process_weights_after_loading

verl/workers/rollout/vllm_rollout/vllm_rollout.py

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import logging
3131
import os
3232
import time
33+
from functools import reduce
3334
from typing import Any, Generator, Optional
3435

3536
import ray
@@ -198,27 +199,71 @@ async def update_weights(self, weights: Generator[tuple[str, torch.Tensor], None
198199
# transfer volume.
199200
# weight = weight.to(dtype, non_blocking=True)
200201

202+
# Check if the weight needs to be sliced into chunks
203+
# (e.g., large embedding layer that exceeds bucket_size)
204+
weight_size = weight.nbytes
205+
if weight_size > bucket_size:
206+
# Slice the weight along the first dimension into chunks
207+
dtype_size = weight.element_size()
208+
numel_per_chunk = bucket_size // dtype_size
209+
210+
# Calculate chunk size along the first dimension
211+
first_dim_size = weight.shape[0]
212+
chunk_dim_size = numel_per_chunk // reduce(lambda x, y: x * y, weight.shape[1:], 1)
213+
214+
num_chunks = (first_dim_size + chunk_dim_size - 1) // chunk_dim_size
215+
logger.info(
216+
f"Slicing weight {name} ({weight.shape}, {weight.dtype}, {weight_size} bytes) "
217+
f"into {num_chunks} chunks"
218+
)
219+
220+
start_idx = 0
221+
for chunk_idx in range(num_chunks):
222+
end_idx = min(start_idx + chunk_dim_size, first_dim_size)
223+
224+
# Extract chunk along first dimension
225+
chunk = weight[start_idx:end_idx]
226+
chunk_size = chunk.nbytes
227+
228+
# Fill bucket with chunk
229+
if offset + chunk_size > bucket_size:
230+
get_torch_device().synchronize()
231+
s.send_pyobj({"bucket_meta": bucket_meta, "is_last": False})
232+
s.recv()
233+
bucket_meta = {}
234+
offset = 0
235+
236+
bucket_meta[f"{name}_chunk_{chunk_idx}"] = {
237+
"name": name,
238+
"shape": chunk.shape,
239+
"dtype": chunk.dtype,
240+
"offset": offset,
241+
"chunk_idx": chunk_idx,
242+
"total_chunks": num_chunks,
243+
}
244+
buffer[offset : offset + chunk_size].copy_(chunk.view(-1).view(torch.uint8), non_blocking=True)
245+
offset += chunk_size
246+
247+
start_idx = end_idx
248+
249+
continue
250+
201251
# fill the tensor bucket
202-
if offset + weight.nbytes > bucket_size:
252+
if offset + weight_size > bucket_size:
203253
get_torch_device().synchronize()
204254
s.send_pyobj({"bucket_meta": bucket_meta, "is_last": False})
205255
s.recv()
206256
bucket_meta = {}
207257
offset = 0
208258

209-
# TODO: slice embedding layer weight into chunks
210-
assert offset + weight.nbytes <= bucket_size, (
211-
f"Weight {name}({weight.shape}, {weight.dtype}) is too large to fit in the bucket."
212-
f"Please increase rollout.update_weights_bucket_megabytes({bucket_size_mb} MB)."
213-
)
214259
bucket_meta[name] = {
215260
"name": name,
216261
"shape": weight.shape,
217262
"dtype": weight.dtype,
218263
"offset": offset,
219264
}
220-
buffer[offset : offset + weight.nbytes].copy_(weight.view(-1).view(torch.uint8), non_blocking=True)
221-
offset += weight.nbytes
265+
buffer[offset : offset + weight_size].copy_(weight.view(-1).view(torch.uint8), non_blocking=True)
266+
offset += weight_size
222267

223268
# send the last bucket
224269
get_torch_device().synchronize()

0 commit comments

Comments
 (0)