feat(qat): support QAT in FSDPEngine for the new unified engine_workers architecture

zhangyimi · zhangyimi · commit 0442d6f830f6 · 2026-02-26T00:03:47.000-08:00
diff --git a/recipe b/recipe
@@ -1 +1 @@
-Subproject commit 3490a22a0a3adeb7e4787fe70b1060b642efbae4
+Subproject commit 75a4507abf6dd5cc434ce07cd6041ac9c37ab589
diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -324,7 +324,7 @@ actor_rollout_ref:
     quantization: null
     quantization_config_file: null
     mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
-    qat: ${oc.select:actor_rollout_ref.actor.qat,null}
+    qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,null}
     layer_name_map:
       qkv_layer_name: qkv
       gate_proj_layer_name: gate_up
diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml
@@ -313,7 +313,7 @@ actor_rollout_ref:
     quantization: null
     quantization_config_file: null
     mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
-    qat: ${oc.select:actor_rollout_ref.actor.qat,null}
+    qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,null}
     layered_summon: false
   model:
     _target_: verl.workers.config.HFModelConfig
diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml
@@ -45,6 +45,17 @@ actor_rollout_ref:
       forward_only: false
       strategy: fsdp
       dtype: bfloat16
+      qat:
+        _target_: verl.workers.config.QATEngineConfig
+        enable: false
+        mode: w4a16
+        group_size: 16
+        ignore_patterns:
+        - lm_head
+        - embed_tokens
+        - re:.*mlp.gate$
+        activation_observer: static_minmax
+        quantization_config_path: null
     _target_: verl.workers.config.FSDPActorConfig
     rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
     strategy: fsdp
@@ -196,6 +207,17 @@ actor_rollout_ref:
       forward_only: true
       strategy: fsdp
       dtype: bfloat16
+      qat:
+        _target_: verl.workers.config.QATEngineConfig
+        enable: false
+        mode: w4a16
+        group_size: 16
+        ignore_patterns:
+        - lm_head
+        - embed_tokens
+        - re:.*mlp.gate$
+        activation_observer: static_minmax
+        quantization_config_path: null
     _target_: verl.workers.config.FSDPActorConfig
     ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
     entropy_from_logits_with_chunking: false
@@ -312,7 +334,7 @@ actor_rollout_ref:
     quantization: null
     quantization_config_file: null
     mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
-    qat: ${oc.select:actor_rollout_ref.actor.qat,null}
+    qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,null}
     layered_summon: false
   model:
     _target_: verl.workers.config.HFModelConfig
@@ -436,6 +458,17 @@ critic:
       forward_only: false
       strategy: fsdp
       dtype: bfloat16
+      qat:
+        _target_: verl.workers.config.QATEngineConfig
+        enable: false
+        mode: w4a16
+        group_size: 16
+        ignore_patterns:
+        - lm_head
+        - embed_tokens
+        - re:.*mlp.gate$
+        activation_observer: static_minmax
+        quantization_config_path: null
     path: ~/models/deepseek-llm-7b-chat
     tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
     override_config: {}
diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml
@@ -294,7 +294,7 @@ actor_rollout_ref:
     quantization: null
     quantization_config_file: null
     mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
-    qat: ${oc.select:actor_rollout_ref.actor.qat,null}
+    qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,null}
     layered_summon: false
   model:
     _target_: verl.workers.config.HFModelConfig
diff --git a/verl/trainer/config/engine/fsdp.yaml b/verl/trainer/config/engine/fsdp.yaml
@@ -61,3 +61,31 @@ strategy: fsdp
 
 # Mixed precision training param dtype
 dtype: bfloat16 # ["bfloat16", "float16"]
+
+# QAT (Quantization-Aware Training) configuration
+qat:
+
+  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+  _target_: verl.workers.config.QATEngineConfig
+
+  # Whether to enable QAT
+  enable: false
+
+  # Quantization mode: "w4a16" (weight-only). "w4a4" is experimental and not recommended.
+  mode: "w4a16"
+
+  # Quantization group size (NVFP4 requires 16)
+  group_size: 16
+
+  # Module name patterns to exclude from quantization (e.g., lm_head, embed_tokens)
+  ignore_patterns:
+
+    - "lm_head"
+    - "embed_tokens"
+    - "re:.*mlp.gate$"
+
+  # Observer strategy for the activation global_scale (only used in W4A4 mode)
+  activation_observer: "static_minmax"
+
+  # Path to vLLM quantization config JSON file
+  quantization_config_path: null
diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml
@@ -387,5 +387,5 @@ quantization_config_file: null
 # MTP configuration, reuse model configuration
 mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
 
-# QAT configuration (inherited from actor.qat)
-qat: ${oc.select:actor_rollout_ref.actor.qat,null}
+# QAT configuration (inherited from actor.fsdp_config.qat; resolves to null when that key is absent)
+qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,null}
diff --git a/verl/workers/config/engine.py b/verl/workers/config/engine.py
@@ -31,6 +31,7 @@
     "VeOmniEngineConfig",
     "EngineConfig",
     "EngineRouterReplayConfig",
+    "QATEngineConfig",
 ]
 
 
@@ -177,6 +178,27 @@ def __post_init__(self) -> None:
             self.sequence_parallel = False
 
 
+@dataclass
+class QATEngineConfig(BaseConfig):
+    """Configuration for QAT (Quantization-Aware Training) within an engine.
+
+    Args:
+        enable (bool): Whether to enable QAT, default False
+        mode (str): Quantization mode, "w4a16" (weight-only) or "w4a4" (experimental), default "w4a16"
+        group_size (int): Group size for blockwise quantization (NVFP4 requires 16), default 16
+        ignore_patterns (list[str]): Module name patterns to exclude from quantization
+        activation_observer (str): Observer strategy for activation global_scale (W4A4 only)
+        quantization_config_path (Optional[str]): Path to quantization config JSON for vLLM, default None
+    """
+
+    enable: bool = False
+    mode: str = "w4a16"  # "w4a4" is experimental and not recommended
+    group_size: int = 16  # NVFP4 requires 16
+    ignore_patterns: list[str] = field(default_factory=lambda: ["lm_head", "embed_tokens", "re:.*mlp.gate$"])
+    activation_observer: str = "static_minmax"  # only consulted in W4A4 mode
+    quantization_config_path: Optional[str] = None
+
+
 @dataclass
 class FSDPEngineConfig(EngineConfig):
     """Configuration for FSDP (Fully Sharded Data Parallel).
@@ -199,6 +221,7 @@ class FSDPEngineConfig(EngineConfig):
             debugging.
         mixed_precision (Optional[dict[str, Any]]): Mixed precision configuration for FSDP, default None
         dtype (str): Mixed precision training param dtype, default "bfloat16"
+        qat (QATEngineConfig): QAT configuration, default disabled
     """
 
     # ulysses_sequence_parallel_size is mutable for backward compatibility
@@ -218,6 +241,7 @@ class FSDPEngineConfig(EngineConfig):
     use_torch_compile: bool = True
     entropy_checkpointing: bool = False
     strategy: str = "fsdp"
+    qat: QATEngineConfig = field(default_factory=QATEngineConfig)
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/verl/workers/engine/fsdp/transformer_impl.py b/verl/workers/engine/fsdp/transformer_impl.py
@@ -133,6 +133,12 @@ def __init__(
         self._is_offload_optimizer = self.engine_config.optimizer_offload
         self._is_lora = self.model_config.lora_rank > 0
 
+        # QAT (Quantization-Aware Training)
+        self._qat_config = getattr(self.engine_config, "qat", None)
+        self._qat_enabled = self._qat_config is not None and getattr(self._qat_config, "enable", False)
+        if self._qat_enabled:
+            logger.info(f"QAT enabled: mode={self._qat_config.mode}, group_size={self._qat_config.group_size}")
+
         if self.engine_config.entropy_from_logits_with_chunking:
             entropy_from_logits = verl_F.entropy_from_logits_with_chunking
         else:
@@ -435,6 +441,98 @@ def _build_lr_scheduler(self, optimizer):
             raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported")
         return lr_scheduler
 
+    def _apply_qat(self, module):
+        """Apply QAT transformations to the model before FSDP wrapping.
+
+        Converts ``self._qat_config`` into the plain dict passed to ``apply_qat``, enables QAT
+        fusion on the result and, for W4A4 mode only, restores the activation scales stored in
+        the checkpoint at ``self.model_config.local_path``.
+
+        Args:
+            module: Model to transform (not yet wrapped by FSDP).
+
+        Returns:
+            The module returned by ``apply_qat``.
+        """
+        from verl.utils.qat.core import apply_qat, enable_qat_fuse
+
+        module = apply_qat(
+            module,
+            {
+                "enable": self._qat_config.enable,
+                "mode": self._qat_config.mode,
+                "group_size": self._qat_config.group_size,
+                "ignore_patterns": list(self._qat_config.ignore_patterns),
+                "activation_observer": self._qat_config.activation_observer,
+            },
+        )
+        enable_qat_fuse(module)
+
+        if self._qat_config.mode == "w4a4":
+            self._restore_w4a4_input_scales(module, self.model_config.local_path)
+
+        return module
+
+    def _restore_w4a4_input_scales(self, model, model_path):
+        """Restore input_global_scale and input_amax from checkpoint for W4A4 mode.
+
+        Scans every ``model*.safetensors`` file directly under ``model_path`` for keys ending in
+        ``.input_global_scale``, resolves the owning submodule of ``model`` from the key prefix and
+        fills its ``input_global_scale`` / ``input_amax`` tensors in place. Multi-element
+        checkpoint tensors are reduced to their maximum before being written.
+
+        Args:
+            model: Model whose QAT submodules expose ``input_global_scale`` and ``input_amax``.
+            model_path: Directory containing the checkpoint's safetensors shards.
+
+        Raises:
+            AttributeError: If a checkpoint key names a submodule that ``model`` does not have.
+        """
+        import glob
+        import os
+
+        from safetensors import safe_open
+
+        scale_suffix = ".input_global_scale"
+        # Escape the directory so glob metacharacters in the path are taken literally; sort for a
+        # deterministic shard order.
+        pattern = os.path.join(glob.escape(str(model_path)), "model*.safetensors")
+        safetensor_files = sorted(glob.glob(pattern))
+        loaded_count = 0
+
+        for sf_path in safetensor_files:
+            with safe_open(sf_path, framework="pt") as f:
+                for key in f.keys():
+                    # Match on the suffix only: a substring test would also accept keys that merely
+                    # contain the name, and str.replace could then mangle the module path.
+                    if not key.endswith(scale_suffix):
+                        continue
+                    module_path = key[: -len(scale_suffix)]
+                    amax_key = f"{module_path}.input_amax"
+
+                    module = model
+                    for part in module_path.split("."):
+                        module = getattr(module, part)
+
+                    scale_val = f.get_tensor(key)
+                    val = scale_val.item() if scale_val.numel() == 1 else scale_val.max().item()
+                    amax_val = f.get_tensor(amax_key)
+                    amax = amax_val.item() if amax_val.numel() == 1 else amax_val.max().item()
+
+                    # Read both values before writing either so a missing input_amax cannot leave
+                    # the module half-restored; no_grad keeps the in-place fill legal even when
+                    # the targets are parameters that require grad.
+                    with torch.no_grad():
+                        module.input_global_scale.fill_(val)
+                        module.input_amax.fill_(amax)
+                    loaded_count += 1
+
+        if not safetensor_files:
+            logger.warning(f"[QAT W4A4] No model*.safetensors files found under {model_path}")
+        elif loaded_count == 0:
+            logger.warning(f"[QAT W4A4] No input_global_scale entries found in checkpoint at {model_path}")
+        logger.info(f"[QAT W4A4] Restored {loaded_count} input_global_scale/input_amax from {model_path}")
+
     def _build_model_optimizer(self):
         from verl.utils.model import print_model_size
 
@@ -444,6 +502,10 @@ def _build_model_optimizer(self):
         if self._is_lora:
             module = self._build_lora_module(module)
 
+        # Apply QAT before FSDP wrapping (training only)
+        if self._qat_enabled and not self.engine_config.forward_only:
+            module = self._apply_qat(module)
+
         # Synchronize all distributed processes before proceeding
         torch.distributed.barrier()
         if self.rank == 0:
@@ -567,6 +629,12 @@ def optimizer_step(self):
             self.optimizer.zero_grad()
         else:
             self.optimizer.step()
+
+        if self._qat_enabled:
+            from verl.utils.qat.core import invalidate_all_scales
+
+            invalidate_all_scales(self.module)
+
         return grad_norm.item()
 
     def lr_scheduler_step(self):
@@ -699,8 +767,29 @@ def get_per_tensor_param(self, layered_summon=False, base_sync_done=False, **kwa
                 )
                 for name, param in params.items()
             )
-        # return per_tensor_param, peft_config
-        # Convert peft_config to dict for vLLM compatibility (PEFTHelper.from_dict expects dict)
+
+        if self._qat_enabled:
+            from verl.utils.qat.quantizer import QATQuantizer
+            from verl.utils.torch_dtypes import PrecisionType
+
+            mixed_precision_config = self.engine_config.mixed_precision
+            if mixed_precision_config is not None:
+                param_dtype = PrecisionType.to_dtype(mixed_precision_config.get("param_dtype", "bf16"))
+            else:
+                param_dtype = torch.bfloat16
+
+            quantizer = QATQuantizer(
+                mode=self._qat_config.mode,
+                group_size=self._qat_config.group_size,
+                ignore_patterns=list(self._qat_config.ignore_patterns),
+                device=torch.device(get_device_id()),
+                param_dtype=param_dtype,
+            )
+            per_tensor_param = quantizer.quantize_with_fusion(
+                per_tensor_param,
+                target_device=torch.device("cpu"),
+            )
+
         peft_config_dict = peft_config.to_dict() if peft_config is not None else None
         return per_tensor_param, peft_config_dict