Commit e3b9333

Re-upload
Signed-off-by: MrZ20 <2609716663@qq.com>
2 parents: 4264067 + 218bc70

File tree: 6 files changed, +85 / -219 lines


.github/workflows/_e2e_nightly.yaml

Lines changed: 0 additions & 115 deletions
This file was deleted.

.github/workflows/vllm_ascend_test_nightly.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.

docs/source/user_guide/feature_guide/eplb_swift_balancer.md

Lines changed: 7 additions & 0 deletions
@@ -12,6 +12,13 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa
 - Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
 - Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.
 
+## Support Scenarios
+
+### Models:
+DeepseekV3/V3.1/R1、Qwen3-MOE
+### MOE QuantType:
+W8A8-dynamic
+
 ## How to Use EPLB
 
 ### Dynamic EPLB
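Note: the new "Support Scenarios" section pairs with the runtime guard added in `common_fused_moe.py` below. As a rough, hedged illustration only (not part of this commit), enabling EPLB might look like the sketch below, assuming the `dynamic_eplb` and `expert_map_path` knobs are read from vLLM's `additional_config`; the exact keys and model paths are placeholders, so consult the EPLB guide for your version.

```python
# Hypothetical sketch: assumes vllm-ascend reads `dynamic_eplb` (and optionally
# `expert_map_path`) from vLLM's `additional_config`. The model path is a
# placeholder; the checkpoint must be a W8A8-dynamic quantized MoE model
# (DeepseekV3/V3.1/R1 or Qwen3-MOE per the doc above).
from vllm import LLM

llm = LLM(
    model="path/to/DeepSeek-V3-w8a8-dynamic",
    additional_config={
        "dynamic_eplb": True,                            # dynamic expert load balancing
        # "expert_map_path": "/path/to/expert_map.json", # or a static expert placement map
    },
)
```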

vllm_ascend/ops/common_fused_moe.py

Lines changed: 7 additions & 0 deletions
@@ -38,6 +38,8 @@
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.quantization.w8a8_dynamic import \
+    AscendW8A8DynamicFusedMoEMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
                                is_enable_nz, npu_stream_switch,
                                shared_expert_dp_enabled,
@@ -247,6 +249,11 @@ def __init__(self, *args, **kwargs):
         self.moe_load = torch.zeros(local_num_experts,
                                     dtype=torch.int64).npu()
 
+        eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
+        if eplb_enable and (not isinstance(self.quant_method,
+                                           AscendW8A8DynamicFusedMoEMethod)):
+            raise ValueError("Eplb supports only w8a8_dynamic quantization.")
+
         self.moe_config.num_experts = self.global_num_experts
         self.moe_config.num_local_experts = self.local_num_experts
         self.moe_config.original_num_experts = num_experts
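The guard above treats EPLB as enabled when either dynamic EPLB is on or a static expert map is supplied, and rejects any quantization method other than W8A8-dynamic. Below is a minimal standalone sketch of the same check; `QuantMethod` and `W8A8DynamicMethod` are hypothetical stand-ins, not the real vLLM/vllm-ascend classes.

```python
# Standalone sketch of the EPLB/quantization compatibility check added above.
# QuantMethod / W8A8DynamicMethod are illustrative stand-ins for the real
# quantization method classes (e.g. AscendW8A8DynamicFusedMoEMethod).
from typing import Optional


class QuantMethod:                       # stand-in base quant method
    ...


class W8A8DynamicMethod(QuantMethod):    # stand-in for the W8A8-dynamic method
    ...


def check_eplb_compatibility(dynamic_eplb: bool,
                             expert_map_path: Optional[str],
                             quant_method: QuantMethod) -> None:
    """Raise if EPLB is requested together with an unsupported quant method."""
    eplb_enable = dynamic_eplb or (expert_map_path is not None)
    if eplb_enable and not isinstance(quant_method, W8A8DynamicMethod):
        raise ValueError("Eplb supports only w8a8_dynamic quantization.")


# A static expert map alone already counts as "EPLB enabled":
check_eplb_compatibility(False, "/path/to/expert_map.json", W8A8DynamicMethod())  # ok
# check_eplb_compatibility(True, None, QuantMethod())  # would raise ValueError
```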

vllm_ascend/platform.py

Lines changed: 14 additions & 1 deletion
@@ -32,7 +32,8 @@
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
-                               update_aclgraph_sizes)
+                               is_vl_model, update_aclgraph_sizes,
+                               update_default_aclgraph_sizes)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -182,6 +183,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         # set cudagraph sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
+        # There are cases where the default cudagraph_capture_sizes are not
+        # friendly to Ascend ops and hardware. We update these sizes here to
+        # improve default performance.
+        update_default_aclgraph_sizes(vllm_config)
         # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
@@ -298,6 +303,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             vllm_config.scheduler_config)
         vllm_config.scheduler_config = recompute_scheduler_config
 
+        if is_vl_model(vllm_config):
+            if bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))) or \
+               bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM1", '0'))):
+                raise ValueError(
+                    "Currently, VL models don't support "
+                    "FLASHCOMM in vllm-ascend. We will fix this in the future. "
+                    "Please set VLLM_ASCEND_ENABLE_FLASHCOMM1=0.")
+
     @classmethod
     def get_attn_backend_cls(
         cls,
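The new VL-model gate relies on the `bool(int(os.getenv(..., '0')))` idiom: an unset variable falls back to `'0'`, any numeric string other than zero is truthy, and non-numeric values raise in `int()`. A small sketch of that parsing behaviour follows; the `_env_flag` helper name is illustrative only.

```python
import os


def _env_flag(name: str) -> bool:
    """Mirror the bool(int(os.getenv(name, '0'))) idiom used in the gate above."""
    return bool(int(os.getenv(name, '0')))


os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
print(_env_flag("VLLM_ASCEND_ENABLE_FLASHCOMM"))   # False: unset falls back to '0'
print(_env_flag("VLLM_ASCEND_ENABLE_FLASHCOMM1"))  # True: "1" -> 1 -> True
# A value like "true" would raise ValueError inside int(), so only numeric
# strings are accepted; VL models must keep both flags unset or at "0".
```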

vllm_ascend/utils.py

Lines changed: 57 additions & 0 deletions
@@ -57,6 +57,7 @@
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
 _IS_MOE_MODEL = None
+_IS_VL_MODEL = None
 _ENABLE_SP = None
 _HAS_LAYER_IDX = None
 _ENABLE_NZ = None
@@ -319,6 +320,53 @@ def _rec_find(d):
     return max(layer_counts)
 
 
+def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
+    """
+    Check whether the configured capture sizes are the vLLM defaults.
+    """
+
+    cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
+    if len(cuda_graph_sizes) == 1:
+        default_size_capture_list = [1, 2, 4] + [
+            i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+        ]
+
+        if sorted(default_size_capture_list, reverse=True) == \
+                vllm_config.compilation_config.cudagraph_capture_sizes:
+            return True
+
+    return False
+
+
+def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
+    """
+    Update the default ACL graph capture sizes so that the new sizes
+    are more friendly to Ascend ops and hardware.
+    """
+
+    if vllm_config.model_config is None or \
+            vllm_config.model_config.enforce_eager or \
+            not _is_default_capture_sizes(vllm_config):
+        return
+
+    # Modify the default capture_sizes for Qwen3-MoE models under DP settings.
+    # This is mainly because the performance of _npu_paged_attention might
+    # degrade on special shapes.
+    # TODO(Angazenn): remove this once _npu_paged_attention is fully replaced
+    # by npu_fused_infer_attention_score, which does not contain such bugs.
+    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+            and vllm_config.parallel_config.tensor_parallel_size == 1 \
+            and vllm_config.parallel_config.data_parallel_size > 1:
+        max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
+        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
+            i for i in range(24, max_capture_size + 1, 8)
+        ]
+
+        vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
+        vllm_config.compilation_config.init_with_cudagraph_sizes(
+            new_cudagraph_capture_sizes)
+
+
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # NOTE: Currently, we can only capture 1800 graphs at most,
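For a concrete sense of what changes, the illustrative helpers below (not part of the commit) reproduce the two size ladders: the vLLM default that `_is_default_capture_sizes` detects, and the Ascend-friendly ladder that `update_default_aclgraph_sizes` installs for Qwen3-MoE with TP=1 and DP>1.

```python
# Illustrative only: compare the default vLLM capture-size ladder with the
# ladder substituted by update_default_aclgraph_sizes for Qwen3-MoE under DP.
def default_capture_sizes(max_capture_size: int) -> list[int]:
    # vLLM default: 1, 2, 4, then multiples of 8 up to the max capture size.
    return [1, 2, 4] + list(range(8, max_capture_size + 1, 8))


def qwen3_moe_dp_capture_sizes(max_capture_size: int) -> list[int]:
    # New ladder: denser small sizes, then multiples of 8 starting at 24.
    return [1, 2, 5, 10, 15, 20] + list(range(24, max_capture_size + 1, 8))


print(default_capture_sizes(64))       # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
print(qwen3_moe_dp_capture_sizes(64))  # [1, 2, 5, 10, 15, 20, 24, 32, 40, 48, 56, 64]
```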
@@ -649,6 +697,15 @@ def _is_contain_expert(config: Any):
     return False
 
 
+def is_vl_model(vllm_config: VllmConfig):
+    """Checks if the model is a VL model by config"""
+    global _IS_VL_MODEL
+    if _IS_VL_MODEL is None:
+        model_configs = vllm_config.model_config.hf_config.to_dict()
+        _IS_VL_MODEL = "VL" in model_configs["architectures"][0]
+    return _IS_VL_MODEL
+
+
 def weak_ref_tensor(tensor: Any) -> Any:
     """
     Create a weak reference to a tensor.
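`is_vl_model` is a simple, process-wide cached heuristic: it checks whether the first entry of the HF config's `architectures` list contains the substring "VL". A quick sketch of the same test on plain dicts; the architecture names are examples only.

```python
# Sketch of the architecture-name heuristic behind is_vl_model (example names only).
def looks_like_vl(hf_config: dict) -> bool:
    return "VL" in hf_config["architectures"][0]


print(looks_like_vl({"architectures": ["Qwen2_5_VLForConditionalGeneration"]}))  # True
print(looks_like_vl({"architectures": ["Qwen3MoeForCausalLM"]}))                 # False
```

Because the result is stored in the module-level `_IS_VL_MODEL`, the check runs once per process and is reused on later calls.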
