Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,16 @@ actor_rollout_ref:
mode: disabled
record_file: null
replay_file: null
qat:
enable: false
mode: w4a16
group_size: 16
ignore_patterns:
- lm_head
- embed_tokens
- re:.*mlp.gate$
activation_observer: static_minmax
quantization_config_path: null
load_weight: true
ref:
rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
Expand Down
10 changes: 10 additions & 0 deletions verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ actor_rollout_ref:
mode: disabled
record_file: null
replay_file: null
qat:
enable: false
mode: w4a16
group_size: 16
ignore_patterns:
- lm_head
- embed_tokens
- re:.*mlp.gate$
activation_observer: static_minmax
quantization_config_path: null
ref:
optim:
_target_: verl.workers.config.TorchtitanOptimizerConfig
Expand Down
14 changes: 7 additions & 7 deletions verl/trainer/config/_generated_ppo_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,6 @@ actor_rollout_ref:
mode: disabled
record_file: null
replay_file: null
grad_clip: 1.0
ulysses_sequence_parallel_size: 1
entropy_from_logits_with_chunking: false
entropy_checkpointing: false
use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
calculate_sum_pi_squared: false
sum_pi_squared_checkpointing: false
qat:
enable: false
mode: w4a16
Expand All @@ -137,6 +130,13 @@ actor_rollout_ref:
- re:.*mlp.gate$
activation_observer: static_minmax
quantization_config_path: null
grad_clip: 1.0
ulysses_sequence_parallel_size: 1
entropy_from_logits_with_chunking: false
entropy_checkpointing: false
use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
calculate_sum_pi_squared: false
sum_pi_squared_checkpointing: false
ref:
rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
strategy: ${actor_rollout_ref.actor.strategy}
Expand Down
10 changes: 10 additions & 0 deletions verl/trainer/config/_generated_ppo_veomni_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ actor_rollout_ref:
mode: disabled
record_file: null
replay_file: null
qat:
enable: false
mode: w4a16
group_size: 16
ignore_patterns:
- lm_head
- embed_tokens
- re:.*mlp.gate$
activation_observer: static_minmax
quantization_config_path: null
ref:
rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
strategy: veomni
Expand Down
32 changes: 32 additions & 0 deletions verl/trainer/config/actor/actor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -259,3 +259,35 @@ router_replay:
# Required when mode is 'replay'
replay_file: null

# QAT (Quantization-Aware Training) configuration
# When enabled:
# - QAT is automatically applied to actor model during training
# - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
# - Fast quantization is used when syncing weights to vLLM rollout
# Supported modes: "w4a16" (NVFP4 weight-only)
# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
qat:

# Whether to enable QAT
enable: false

# Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
mode: "w4a16"

# Quantization group size (NVFP4 requires 16)
group_size: 16

# Module-name patterns to leave unquantized (e.g., lm_head, embed_tokens); entries prefixed with "re:" are regular expressions
ignore_patterns:

- "lm_head"
- "embed_tokens"
- "re:.*mlp.gate$"

# Activation observer for W4A4 mode: "static_minmax", "memoryless_minmax", or "minmax"
activation_observer: "static_minmax"

# Path to vLLM quantization config JSON file
quantization_config_path: null

32 changes: 0 additions & 32 deletions verl/trainer/config/actor/dp_actor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,35 +48,3 @@ calculate_sum_pi_squared: False

# Enable gradient checkpointing for sum_pi_squared computation (saves memory)
sum_pi_squared_checkpointing: False

# QAT (Quantization-Aware Training) configuration
# When enabled:
# - QAT is automatically applied to actor model during training
# - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
# - Fast quantization is used when syncing weights to vLLM rollout
# Supported modes: "w4a16" (NVFP4 weight-only)
# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
qat:

# Whether to enable QAT
enable: false

# Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
mode: "w4a16"

# Quantization group size (NVFP4 requires 16)
group_size: 16

# Patterns to ignore (e.g., lm_head, embed_tokens)
ignore_patterns:

- "lm_head"
- "embed_tokens"
- "re:.*mlp.gate$"

# Activation observer for W4A4 mode: "static_minmax", "memoryless_minmax", or "minmax"
activation_observer: "static_minmax"

# Path to vLLM quantization config JSON file
quantization_config_path: null
42 changes: 42 additions & 0 deletions verl/utils/modelopt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ModelOpt integration for NVFP4 quantization with Megatron QAT training and vLLM inference."""

from verl.utils.modelopt.megatron_qat_patch import (
apply_qat_patch,
revert_qat_patch,
)
from verl.utils.modelopt.qat_weight_exporter import QATWeightExporter
from verl.utils.modelopt.quantize import (
apply_qat,
build_quantize_config,
)
from verl.utils.modelopt.vllm_modelopt_patch import (
apply_modelopt_nvfp4_patches,
modelopt_process_weights_after_loading,
prepare_modelopt_for_weight_reload,
)

# Names exported by `from verl.utils.modelopt import *`, grouped by the
# submodule each one is imported from at the top of this file.
__all__ = [
    # quantize
    "build_quantize_config",
    "apply_qat",
    # qat_weight_exporter
    "QATWeightExporter",
    # vllm_modelopt_patch
    "apply_modelopt_nvfp4_patches",
    "prepare_modelopt_for_weight_reload",
    "modelopt_process_weights_after_loading",
    # megatron_qat_patch
    "apply_qat_patch",
    "revert_qat_patch",
]
Loading