verl-project · jQizhang · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026
diff --git a/recipe b/recipe
@@ -139,6 +139,16 @@ actor_rollout_ref:
       mode: disabled
       record_file: null
       replay_file: null
+    qat:
+      enable: false
+      mode: w4a16
+      group_size: 16
+      ignore_patterns:
+      - lm_head
+      - embed_tokens
+      - re:.*mlp.gate$
+      activation_observer: static_minmax
+      quantization_config_path: null
     load_weight: true
   ref:
     rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}

@@ -121,6 +121,16 @@ actor_rollout_ref:
       mode: disabled
       record_file: null
       replay_file: null
+    qat:
+      enable: false
+      mode: w4a16
+      group_size: 16
+      ignore_patterns:
+      - lm_head
+      - embed_tokens
+      - re:.*mlp.gate$
+      activation_observer: static_minmax
+      quantization_config_path: null
   ref:
     optim:
       _target_: verl.workers.config.TorchtitanOptimizerConfig

@@ -120,13 +120,6 @@ actor_rollout_ref:
       mode: disabled
       record_file: null
       replay_file: null
-    grad_clip: 1.0
-    ulysses_sequence_parallel_size: 1
-    entropy_from_logits_with_chunking: false
-    entropy_checkpointing: false
-    use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
-    calculate_sum_pi_squared: false
-    sum_pi_squared_checkpointing: false
     qat:
       enable: false
       mode: w4a16
@@ -137,6 +130,13 @@ actor_rollout_ref:
       - re:.*mlp.gate$
       activation_observer: static_minmax
       quantization_config_path: null
+    grad_clip: 1.0
+    ulysses_sequence_parallel_size: 1
+    entropy_from_logits_with_chunking: false
+    entropy_checkpointing: false
+    use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+    calculate_sum_pi_squared: false
+    sum_pi_squared_checkpointing: false
   ref:
     rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
     strategy: ${actor_rollout_ref.actor.strategy}

@@ -120,6 +120,16 @@ actor_rollout_ref:
       mode: disabled
       record_file: null
       replay_file: null
+    qat:
+      enable: false
+      mode: w4a16
+      group_size: 16
+      ignore_patterns:
+      - lm_head
+      - embed_tokens
+      - re:.*mlp.gate$
+      activation_observer: static_minmax
+      quantization_config_path: null
   ref:
     rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
     strategy: veomni

@@ -259,3 +259,35 @@ router_replay:
   # Required when mode is 'replay'
   replay_file: null
 
+# QAT (Quantization-Aware Training) configuration
+# When enabled:
+#   - QAT is automatically applied to actor model during training
+#   - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
+#   - Fast quantization is used when syncing weights to vLLM rollout
+# Supported modes: "w4a16" (NVFP4 weight-only)
+# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
+# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
+qat:
+
+  # Whether to enable QAT
+  enable: false
+
+  # Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
+  mode: "w4a16"
+
+  # Quantization group size (NVFP4 requires 16)
+  group_size: 16
+
+  # Module-name patterns to exclude from quantization (e.g., lm_head, embed_tokens); a "re:" prefix appears to mark a regular expression
+  ignore_patterns:
+
+    - "lm_head"
+    - "embed_tokens"
+    - "re:.*mlp.gate$"
+
+  # Activation observer used in "w4a4" mode; one of "static_minmax", "memoryless_minmax", or "minmax"
+  activation_observer: "static_minmax"
+
+  # Path to vLLM quantization config JSON file
+  quantization_config_path: null
+
@@ -48,35 +48,3 @@ calculate_sum_pi_squared: False
 
 # Enable gradient checkpointing for sum_pi_squared computation (saves memory)
 sum_pi_squared_checkpointing: False
-
-# QAT (Quantization-Aware Training) configuration
-# When enabled:
-#   - QAT is automatically applied to actor model during training
-#   - Fused scales (QKV/GateUp) are automatically enabled for training-inference consistency
-#   - Fast quantization is used when syncing weights to vLLM rollout
-# Supported modes: "w4a16" (NVFP4 weight-only)
-# Note: "w4a4" mode is included in the code but currently has KL divergence issues and is NOT recommended for use.
-# For usage examples, see: https://github.com/verl-project/verl-recipe/blob/main/qat/README.md
-qat:
-
-  # Whether to enable QAT
-  enable: false
-
-  # Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
-  mode: "w4a16"
-
-  # Quantization group size (NVFP4 requires 16)
-  group_size: 16
-
-  # Patterns to ignore (e.g., lm_head, embed_tokens)
-  ignore_patterns:
-
-    - "lm_head"
-    - "embed_tokens"
-    - "re:.*mlp.gate$"
-
-  # Activation observer for W4A4 mode: "static_minmax", "memoryless_minmax", or "minmax"
-  activation_observer: "static_minmax"
-
-  # Path to vLLM quantization config JSON file
-  quantization_config_path: null
diff --git a/verl/utils/modelopt/__init__.py b/verl/utils/modelopt/__init__.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ModelOpt integration for NVFP4 quantization with Megatron QAT training and vLLM inference."""
+
+from verl.utils.modelopt.megatron_qat_patch import (
+    apply_qat_patch,
+    revert_qat_patch,
+)
+from verl.utils.modelopt.qat_weight_exporter import QATWeightExporter
+from verl.utils.modelopt.quantize import (
+    apply_qat,
+    build_quantize_config,
+)
+from verl.utils.modelopt.vllm_modelopt_patch import (
+    apply_modelopt_nvfp4_patches,
+    modelopt_process_weights_after_loading,
+    prepare_modelopt_for_weight_reload,
+)
+
+__all__ = [
+    "build_quantize_config",
+    "apply_qat",
+    "QATWeightExporter",
+    "apply_modelopt_nvfp4_patches",
+    "prepare_modelopt_for_weight_reload",
+    "modelopt_process_weights_after_loading",
+    "apply_qat_patch",
+    "revert_qat_patch",
+]
+49 −0		qat/config/dapo_qat_megatron_trainer.yaml
+29 −0		qat/config/nvfp4_w4a16_megatron.json
+243 −0		qat/run_qwen3_30b_w4a16_megatron.sh