Skip to content

Commit ae13dde

Browse files
authored
Merge branch 'verl-project:main' into main
2 parents 529e576 + 32705dc commit ae13dde

File tree

10 files changed

+75
-28
lines changed

10 files changed

+75
-28
lines changed

docs/ascend_tutorial/ascend_sglang_quick_start.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ Atlas 800T A3
7676
git clone https://github.com/volcengine/verl.git
7777
# Make sure you have activated verl conda env
7878
# NPU_DEVICE=A3 or A2 depends on your device
79-
NPU_DEVICE=A3 bash verl/scripts/install_sglang_mcore_npu.sh
79+
# USE_MEGATRON=1 if you need to install megatron backend
80+
NPU_DEVICE=A3 USE_MEGATRON=1 bash verl/scripts/install_sglang_mcore_npu.sh
8081
8182
**4. 安装verl**
8283

docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,11 @@ SGLang 是当前主流的高性能开源推理引擎, 昇腾已经全面原生
4343
^^^^^^^^^^^
4444
**下载模型权重**
4545

46-
--local-dir: 模型保存路径
47-
48-
.. code-block:: bash
49-
50-
export HF_ENDPOINT=https://hf-mirror.com
51-
hf download --resume-download Qwen/Qwen3-30B-A3B --local-dir /path/to/local_dir
46+
Qwen3-30B: https://huggingface.co/Qwen/Qwen3-30B-A3B
5247

5348
**下载数据集**
5449

55-
.. code-block:: bash
56-
57-
git clone https://www.modelscope.cn/datasets/AI-ModelScope/DAPO-Math-17k.git
50+
DAPO-Math-17k: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
5851

5952
**HuggingFace To Megatron权重转换(可选)**
6053

examples/dppo_trainer/run_qwen30b_dppo.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ bypass_mode=True
5858
# We recommend using Dr.GRPO to remove the length and difficulty bias in the original GRPO.
5959
# See Section 3.1 in https://arxiv.org/pdf/2503.20783 for more details.
6060
norm_adv_by_std_in_grpo=False # remove the difficulty bias
61-
loss_agg_mode="seq-mean-token-sum" # remove the length bias
61+
loss_agg_mode="seq-mean-token-sum-norm" # remove the length bias
6262

6363
# reference policy
6464
use_kl_in_reward=False

scripts/install_sglang_mcore_npu.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
22
set -e
33
NPU_DEVICE=${NPU_DEVICE:=A3}
4+
USE_MEGATRON=${USE_MEGATRON:-1}
45

56
export MAX_JOBS=32
67

verl/models/mcore/util.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,19 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import logging
1617
import math
18+
import os
1719

1820
import torch
1921
from megatron.core import parallel_state as mpu
2022
from megatron.core.packed_seq_params import PackedSeqParams
2123

2224
from verl.utils.model import CausalLMOutputForPPO
2325

26+
logger = logging.getLogger(__file__)
27+
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
28+
2429

2530
def preprocess_packed_seqs(
2631
input_ids: torch.Tensor, attention_mask: torch.Tensor, pre_process: bool = True, use_fp8_padding=False
@@ -333,6 +338,19 @@ def preprocess_thd_no_padding(
333338
start_idx = cu_seqlens_padded_cpu[i] // cp_size
334339
# split to 2 chunks
335340
d = input_ids[i]
341+
# If the number of elements in `d` is smaller than the required
342+
# alignment size, pad the tensor with zeros so that its total
343+
# length matches `align_size`. This ensures size alignment for
344+
# downstream operations (e.g., communication or memory alignment).
345+
if d.numel() < align_size:
346+
original_size = d.numel()
347+
pad = torch.zeros(align_size - d.numel(), dtype=d.dtype, device=d.device)
348+
d = torch.cat([d, pad], dim=0)
349+
logger.warning_once(
350+
f"Padding tensor for context parallel alignment, original_size={original_size}, "
351+
f"align_size={align_size}"
352+
)
353+
336354
input_ids_rmpad[start_idx : start_idx + half_seqlen] = d[
337355
half_seqlen * cp_rank : half_seqlen * (cp_rank + 1)
338356
]

verl/trainer/ppo/core_algos.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,14 +1058,19 @@ def agg_loss(
10581058
raise ValueError("(global) batch_num_tokens is required when dp_size > 1")
10591059
batch_num_tokens = loss_mask.sum()
10601060
loss = verl_F.masked_sum(loss_mat, loss_mask) / batch_num_tokens * dp_size
1061-
elif loss_agg_mode == "seq-mean-token-sum":
1061+
elif loss_agg_mode in ["seq-mean-token-sum", "seq-mean-token-sum-norm"]:
10621062
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
10631063
seq_mask = (torch.sum(loss_mask, dim=-1) > 0).float() # exclude fully masked sequences
10641064
if global_batch_size is None:
10651065
if dp_size > 1:
10661066
raise ValueError("global_batch_size is required when dp_size > 1")
10671067
global_batch_size = seq_mask.sum()
10681068
loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size # seq-mean
1069+
if loss_agg_mode == "seq-mean-token-sum-norm":
1070+
if loss_scale_factor is None:
1071+
horizon = loss_mask.shape[-1]
1072+
loss_scale_factor = horizon
1073+
loss /= loss_scale_factor
10691074
elif loss_agg_mode == "seq-mean-token-mean":
10701075
seq_mask = torch.sum(loss_mask, dim=-1) # per-sequence token count
10711076
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / (seq_mask + 1e-8) # token-mean
@@ -1075,14 +1080,6 @@ def agg_loss(
10751080
raise ValueError("global_batch_size is required when dp_size > 1")
10761081
global_batch_size = seq_mask.sum()
10771082
loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size # seq-mean
1078-
elif loss_agg_mode == "seq-mean-token-sum-norm":
1079-
if loss_scale_factor is None:
1080-
raise ValueError(
1081-
f"{loss_agg_mode=} but {loss_scale_factor=}. "
1082-
'If not intented for custom scaling factor, try setting loss_agg_mode="seq-mean-token-sum".'
1083-
)
1084-
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)
1085-
loss = torch.sum(seq_losses) / loss_scale_factor * dp_size
10861083
else:
10871084
raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
10881085

verl/utils/megatron_utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,11 @@ def offload_megatron_model_to_cpu(models):
442442
# if the grad_data size is already zero, we assume that it is already offloaded
443443
buffer.grad_data_size = buffer.grad_data.storage().size()
444444
buffer.grad_data.storage().resize_(0)
445+
# Offload frozen parameters not in DDP buffers (e.g. base model in LoRA/PEFT)
446+
# DDP buffers only contain requires_grad=True params, so frozen params must be offloaded separately.
447+
for param in model_chunk.module.parameters():
448+
if not param.requires_grad and param.device.type != "cpu":
449+
param.data = param.data.to("cpu", non_blocking=True)
445450
else:
446451
# we need this for ref module
447452
for _, param in model_chunk.named_parameters():
@@ -453,7 +458,14 @@ def offload_megatron_model_to_cpu(models):
453458

454459

455460
@torch.no_grad()
456-
def load_megatron_model_to_gpu(models, load_grad=True):
461+
def load_megatron_model_to_gpu(models, load_grad=True, load_frozen_params=True):
462+
"""
463+
Load megatron model to GPU.
464+
Args:
465+
models: The model to load.
466+
load_grad: Whether to load gradients.
467+
load_frozen_params: Whether to load frozen parameters.
468+
"""
457469
for model_chunk in models:
458470
if isinstance(model_chunk, DDP):
459471
model_chunk_all_buffers = [model_chunk.buffers, model_chunk.expert_parallel_buffers]
@@ -468,6 +480,13 @@ def load_megatron_model_to_gpu(models, load_grad=True):
468480
buffer.param_data.storage().resize_(buffer.param_data_size)
469481
# copy data from cpu to cuda
470482
buffer.param_data.copy_(buffer.param_data.cpu_data, non_blocking=True)
483+
484+
# Load frozen parameters that were offloaded (e.g. base model in LoRA/PEFT)
485+
if load_frozen_params:
486+
device_id = get_device_id()
487+
for param in model_chunk.module.parameters():
488+
if not param.requires_grad and param.device.type == "cpu":
489+
param.data = param.data.to(device_id, non_blocking=True)
471490
else:
472491
# we need this for ref module
473492
device_id = get_device_id()

verl/utils/profiler/torch_profile.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import functools
1616
import os
17+
from datetime import datetime, timezone
1718
from typing import Callable, Optional
1819

1920
import torch
@@ -34,7 +35,11 @@ def get_torch_profiler(
3435

3536
os.makedirs(save_path, exist_ok=True)
3637

37-
save_file_name = f"prof_rank-{rank}.json.gz"
38+
current_time = datetime.now(tz=timezone.utc).astimezone()
39+
timestamp = current_time.strftime("%Y%m%d%H%M%S%f")[:-3]
40+
pid = os.getpid()
41+
42+
save_file_name = f"prof_rank-{rank}_{pid}_{timestamp}.json.gz"
3843
if save_file_prefix:
3944
save_file_name = f"{save_file_prefix}_{save_file_name}"
4045
save_path = os.path.join(save_path, save_file_name)

verl/workers/engine/megatron/transformer_impl.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,8 @@ def initialize(self):
319319
if self.engine_config.forward_only:
320320
self.optimizer = None
321321
self.lr_scheduler = None
322+
self.to(device="cpu", model=self._is_offload_param, optimizer=False, grad=False)
323+
log_gpu_memory_usage("After offload model during init (forward_only)", logger=logger)
322324
return
323325

324326
self.optimizer = self._build_optimizer()
@@ -602,12 +604,14 @@ def forward_backward_batch(self, data: TensorDict, loss_function: Callable, forw
602604
return {}
603605

604606
def get_per_tensor_param(self, base_sync_done=False, **kwargs):
605-
load_megatron_model_to_gpu(self.module, load_grad=False)
606607
peft_config = None
607608
non_merge_lora_sync = self.peft_cls is not None and not self.model_config.lora.get("merge", False)
609+
adapter_only = base_sync_done and non_merge_lora_sync
610+
# When syncing only the LoRA adapter, load just the adapter weights once the base sync is done; otherwise load all weights.
611+
load_megatron_model_to_gpu(self.module, load_grad=False, load_frozen_params=not adapter_only)
608612
if self.vanilla_bridge:
609613
per_tensor_param = self.bridge.export_weights(self.module)
610-
elif base_sync_done and non_merge_lora_sync:
614+
elif adapter_only:
611615
# Only export adapter weights
612616
peft_config = build_peft_config_for_vllm(self.model_config.lora)
613617
per_tensor_param = self.bridge.export_adapter_weights(self.module)

verl/workers/rollout/vllm_rollout/vllm_async_server.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ def get_server_address(self):
180180
assert self._server_port is not None, "http server is not launched, port is None"
181181
return self._server_address, self._server_port
182182

183+
@property
184+
def lora_as_adapter(self) -> bool:
185+
return (
186+
self.model_config.lora_rank > 0 or self.model_config.lora.get("rank", 0) > 0
187+
) and not self.model_config.lora.get("merge", False)
188+
183189
async def collective_rpc(
184190
self,
185191
method: str | Callable,
@@ -543,9 +549,7 @@ async def generate(
543549

544550
# Add lora request
545551
lora_request = None
546-
if (
547-
self.model_config.lora_rank > 0 or self.model_config.lora.get("rank", 0) > 0
548-
) and not self.model_config.lora.get("merge", False):
552+
if self.lora_as_adapter:
549553
# Make sure we also check that the lora is already loaded in the engine
550554
lora_loaded = VLLM_LORA_INT_ID in await self.engine.list_loras()
551555
if lora_loaded:
@@ -618,7 +622,12 @@ async def sleep(self):
618622

619623
if self.rollout_mode == RolloutMode.HYBRID:
620624
# Don't use engine.sleep(level=2) here
621-
await self.engine.collective_rpc("sleep", kwargs={"level": 2})
625+
# LoRA only updates adapter weights, so set sleep level to 1
626+
if self.lora_as_adapter:
627+
sleep_level = 1
628+
else:
629+
sleep_level = 2
630+
await self.engine.collective_rpc("sleep", kwargs={"level": sleep_level})
622631

623632
# clear encoder cache: https://github.com/vllm-project/vllm/pull/33452
624633
# await self.engine.reset_encoder_cache()

0 commit comments

Comments
 (0)