Skip to content

Commit 6e815f8

Browse files
committed
Refactor modelopt utils and unify QAT config under actor
1 parent 964ccac commit 6e815f8

File tree

12 files changed

+221
-230
lines changed

12 files changed

+221
-230
lines changed

verl/trainer/config/actor/actor.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,35 @@ router_replay:
259259
# Required when mode is 'replay'
260260
replay_file: null
261261

262+
# QAT (Quantization-Aware Training) configuration
263+
# When enabled:
264+
# - QAT is automatically applied to actor model during training
265+
# - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
266+
# - Fast quantization is used when syncing weights to vLLM rollout
267+
# Supported modes: "w4a16" (NVFP4 weight-only)
268+
# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
269+
# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
270+
qat:
271+
272+
# Whether to enable QAT
273+
enable: false
274+
275+
# Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
276+
mode: "w4a16"
277+
278+
# Quantization group size (NVFP4 requires 16)
279+
group_size: 16
280+
281+
# Patterns to ignore (e.g., lm_head, embed_tokens)
282+
ignore_patterns:
283+
284+
- "lm_head"
285+
- "embed_tokens"
286+
- "re:.*mlp.gate$"
287+
288+
# Activation observer for W4A4 mode: "static_minmax", "memoryless_minmax", or "minmax"
289+
activation_observer: "static_minmax"
290+
291+
# Path to vLLM quantization config JSON file
292+
quantization_config_path: null
293+

verl/trainer/config/actor/dp_actor.yaml

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -48,35 +48,3 @@ calculate_sum_pi_squared: False
4848

4949
# Enable gradient checkpointing for sum_pi_squared computation (saves memory)
5050
sum_pi_squared_checkpointing: False
51-
52-
# QAT (Quantization-Aware Training) configuration
53-
# When enabled:
54-
# - QAT is automatically applied to actor model during training
55-
# - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
56-
# - Fast quantization is used when syncing weights to vLLM rollout
57-
# Supported modes: "w4a16" (NVFP4 weight-only)
58-
# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
59-
# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
60-
qat:
61-
62-
# Whether to enable QAT
63-
enable: false
64-
65-
# Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
66-
mode: "w4a16"
67-
68-
# Quantization group size (NVFP4 requires 16)
69-
group_size: 16
70-
71-
# Patterns to ignore (e.g., lm_head, embed_tokens)
72-
ignore_patterns:
73-
74-
- "lm_head"
75-
- "embed_tokens"
76-
- "re:.*mlp.gate$"
77-
78-
# Activation observer for W4A4 mode: "static_minmax", "memoryless_minmax", or "minmax"
79-
activation_observer: "static_minmax"
80-
81-
# Path to vLLM quantization config JSON file
82-
quantization_config_path: null

verl/trainer/config/engine/megatron.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,6 @@ override_transformer_config:
7979
# Attention backend to use (flash,fused,unfused,local,auto). Defaults to auto in mcore, flash in verl
8080
attention_backend: flash
8181

82-
# Quantization method. None for no quantization, "nvfp4" for NVFP4 quantization
83-
quantization: null
84-
85-
# Whether to enable Quantization-Aware Training (QAT). Default False.
86-
enable_qat: False
87-
8882
override_mcore_model_config: {}
8983

9084
# oc.select: default val for ref.megatron.use_mbridge

verl/utils/modelopt/__init__.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright 2025 Bytedance Ltd. and/or its affiliates
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""
17+
ModelOpt integration for verl.
18+
19+
Supports NVFP4 quantization with Megatron QAT training + vLLM low-precision inference.
20+
21+
Module Structure:
22+
- qat.py: QAT quantization config, apply_qat, QuantizationMetadata
23+
- weight_processor.py: QATWeightPostProcessor for converting QAT weights to quantized format
24+
- vllm_patch.py: vLLM monkey patches for NVFP4 inference (Linear, MoE, KV Cache)
25+
26+
Usage:
27+
# Training side
28+
from verl.utils.modelopt import apply_qat, QATWeightPostProcessor
29+
30+
# Inference side
31+
from verl.utils.modelopt import apply_vllm_modelopt_patches
32+
"""
33+
34+
from verl.utils.modelopt.qat import NVFP4_WEIGHT_ONLY_CFG, QuantizationMetadata, apply_qat
35+
from verl.utils.modelopt.vllm_patch import apply_vllm_modelopt_patches
36+
from verl.utils.modelopt.weight_processor import QATWeightPostProcessor
37+
38+
# Public API re-exported at the package root (see module docstring for the
# training-side vs. inference-side split).
__all__ = [
    "NVFP4_WEIGHT_ONLY_CFG",
    "apply_qat",
    "QuantizationMetadata",
    "QATWeightPostProcessor",
    "apply_vllm_modelopt_patches",
]

verl/utils/modelopt/qat.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Copyright 2025 Bytedance Ltd. and/or its affiliates
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
17+
from dataclasses import dataclass
18+
from typing import Any, Optional
19+
20+
import torch
21+
import torch.nn as nn
22+
23+
import modelopt.torch.quantization as mtq
24+
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg
25+
26+
# ---------------------------------------------------------------------------
27+
# NVFP4 quantization config
28+
# ---------------------------------------------------------------------------
29+
30+
# ModelOpt config for NVFP4 weight-only ("w4a16") QAT:
# - weights quantized with num_bits=(2, 1) (FP4 E2M1) in dynamic 16-element
#   blocks along the last dim, scale_bits=(4, 3) (FP8 E4M3 block scales)
# - activations left unquantized (input quantizer disabled)
# - "max" calibration algorithm for amax collection
NVFP4_WEIGHT_ONLY_CFG = {
    "quant_cfg": {
        "*weight_quantizer": {
            "num_bits": (2, 1),
            "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
            "axis": None,
            "enable": True,
        },
        # Weight-only mode: no activation quantization.
        "*input_quantizer": {"enable": False},
        # Disable every other quantizer ModelOpt would otherwise insert.
        **_default_disabled_quantizer_cfg,
    },
    "algorithm": "max",
}
43+
44+
# ---------------------------------------------------------------------------
45+
# QAT application
46+
# ---------------------------------------------------------------------------
47+
48+
49+
def apply_qat(model: nn.Module, qat_mode: str):
    """Apply Quantization-Aware Training to the model.

    Args:
        model: The Megatron model to apply QAT to.
        qat_mode: QAT mode; only "w4a16" (NVFP4 weight-only) is supported.

    Returns:
        The quantized model.

    Raises:
        ValueError: If ``qat_mode`` is anything other than "w4a16".
    """
    if qat_mode == "w4a16":
        # ModelOpt mutates the model in place and also returns it.
        mtq.quantize(model, NVFP4_WEIGHT_ONLY_CFG)
        return model
    raise ValueError(f"Only 'w4a16' is supported, got: {qat_mode}")
64+
65+
66+
@dataclass
class QuantizationMetadata:
    """Metadata recorded for one quantized module.

    Captures the ModelOpt quantizer handles plus enough placement info
    (pipeline stage, expert indices) for later weight post-processing and
    expert-parallel (EP) amax synchronization.
    """

    # Quantization format identifier (e.g. the qformat string used downstream).
    qformat: str
    # ModelOpt weight/input quantizer objects attached to the module.
    weight_quantizer: Any
    input_quantizer: Any
    # The quantized module itself.
    module: torch.nn.Module
    # Index of the (virtual) pipeline stage this module lives in — presumably
    # the vpp chunk index; confirm against the collector that fills this in.
    vpp_idx: int
    block_size: int = 16  # Default NVFP4 block size
    # Fields for EP synchronization - store amax values for non-local experts
    weight_amax: Optional[torch.Tensor] = None
    input_amax: Optional[torch.Tensor] = None
    is_local: bool = True  # Whether this expert is local to current EP rank
    global_expert_idx: Optional[int] = None  # Global expert index for MoE experts
    local_expert_idx: Optional[int] = None  # Local expert index on this EP rank
Lines changed: 1 addition & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -23,72 +23,6 @@
2323
from torch.nn import Parameter
2424

2525

26-
def generate_nvfp4_ignore_list(num_layers: int, is_moe: bool) -> list[str]:
    """
    Build the list of module names excluded from NVFP4 quantization.

    Args:
        num_layers: Number of hidden layers in the model (from hf_config.num_hidden_layers)
        is_moe: Whether the model is a Mixture of Experts model

    Returns:
        List of layer names to ignore during quantization
    """
    # MoE routing (gate) layers are kept unquantized; dense models skip this.
    ignored = [f"model.layers.{idx}.mlp.gate" for idx in range(num_layers)] if is_moe else []
    # The output head is always excluded for stability.
    ignored.append("lm_head")
    return ignored
48-
49-
50-
def get_nvfp4_block_quant_kwargs(num_layers: int, is_moe: bool) -> dict:
    """
    Generate complete NVFP4 quantization configuration based on model properties.

    Args:
        num_layers: Number of hidden layers in the model (from hf_config.num_hidden_layers)
        is_moe: Whether the model is a Mixture of Experts model

    Returns:
        Complete quantization configuration dictionary compatible with ModelOpt
    """
    # Shared 4-bit float scheme for both weights and activations. NOTE: the
    # "false" strings are the literal values the consumer expects here — not
    # Python booleans.
    fp4_scheme = {
        "dynamic": "false",
        "num_bits": 4,
        "type": "float",
        "group_size": 16,
    }
    return {
        "config_groups": {
            "group_0": {
                # Copy the scheme so the two entries are independent dicts.
                "input_activations": dict(fp4_scheme),
                "weights": dict(fp4_scheme),
                "targets": ["Linear"],
            },
        },
        "ignore": generate_nvfp4_ignore_list(num_layers, is_moe),
        "quant_algo": "NVFP4",
        "producer": {"name": "modelopt"},
        "quant_method": "modelopt",
    }
89-
90-
91-
9226
def _create_param_from_subclass_attributes(custom_data: torch.Tensor, custom_weight) -> Parameter:
9327
"""
9428
Helper to preserve custom attributes from ModelWeightParameter and
@@ -838,4 +772,4 @@ def apply_vllm_modelopt_patches():
838772
# Static scales mode: patch process_weights_after_loading to preserve k_scale/v_scale for manual updates
839773
func5_path = "vllm.model_executor.layers.quantization.kv_cache.BaseKVCacheMethod.process_weights_after_loading"
840774
patcher5 = patch(func5_path, process_weights_after_loading_kv)
841-
patcher5.start()
775+
patcher5.start()

0 commit comments

Comments
 (0)