Commit e3b9333

Re-upload
Signed-off-by: MrZ20 <2609716663@qq.com>
2 parents: 4264067 + 218bc70

File tree: 6 files changed, +85 / -219 lines


.github/workflows/_e2e_nightly.yaml

Lines changed: 0 additions & 115 deletions
This file was deleted.

.github/workflows/vllm_ascend_test_nightly.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.

docs/source/user_guide/feature_guide/eplb_swift_balancer.md

Lines changed: 7 additions & 0 deletions
@@ -12,6 +12,13 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa
 - Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
 - Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.
 
+## Support Scenarios
+
+### Models:
+DeepseekV3/V3.1/R1、Qwen3-MOE
+### MOE QuantType:
+W8A8-dynamic
+
 ## How to Use EPLB
 
 ### Dynamic EPLB
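Note: the new "Support Scenarios" section pairs with the runtime guard added in `common_fused_moe.py` below. As a rough, hedged illustration only (not part of this commit), enabling EPLB might look like the sketch below, assuming the `dynamic_eplb` and `expert_map_path` knobs are read from vLLM's `additional_config`; the exact keys and model paths are placeholders, so consult the EPLB guide for your version.

```python
# Hypothetical sketch: assumes vllm-ascend reads `dynamic_eplb` (and optionally
# `expert_map_path`) from vLLM's `additional_config`. The model path is a
# placeholder; the checkpoint must be a W8A8-dynamic quantized MoE model
# (DeepseekV3/V3.1/R1 or Qwen3-MOE per the doc above).
from vllm import LLM

llm = LLM(
    model="path/to/DeepSeek-V3-w8a8-dynamic",
    additional_config={
        "dynamic_eplb": True,                            # dynamic expert load balancing
        # "expert_map_path": "/path/to/expert_map.json", # or a static expert placement map
    },
)
```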

vllm_ascend/ops/common_fused_moe.py

Lines changed: 7 additions & 0 deletions
@@ -38,6 +38,8 @@
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.quantization.w8a8_dynamic import \
+    AscendW8A8DynamicFusedMoEMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
                                is_enable_nz, npu_stream_switch,
                                shared_expert_dp_enabled,
@@ -247,6 +249,11 @@ def __init__(self, *args, **kwargs):
         self.moe_load = torch.zeros(local_num_experts,
                                     dtype=torch.int64).npu()
 
+        eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
+        if eplb_enable and (not isinstance(self.quant_method,
+                                           AscendW8A8DynamicFusedMoEMethod)):
+            raise ValueError("Eplb supports only w8a8_dynamic quantization.")
+
         self.moe_config.num_experts = self.global_num_experts
         self.moe_config.num_local_experts = self.local_num_experts
         self.moe_config.original_num_experts = num_experts
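The guard above treats EPLB as enabled when either dynamic EPLB is on or a static expert map is supplied, and rejects any quantization method other than W8A8-dynamic. Below is a minimal standalone sketch of the same check; `QuantMethod` and `W8A8DynamicMethod` are hypothetical stand-ins, not the real vLLM/vllm-ascend classes.

```python
# Standalone sketch of the EPLB/quantization compatibility check added above.
# QuantMethod / W8A8DynamicMethod are illustrative stand-ins for the real
# quantization method classes (e.g. AscendW8A8DynamicFusedMoEMethod).
from typing import Optional


class QuantMethod:                       # stand-in base quant method
    ...


class W8A8DynamicMethod(QuantMethod):    # stand-in for the W8A8-dynamic method
    ...


def check_eplb_compatibility(dynamic_eplb: bool,
                             expert_map_path: Optional[str],
                             quant_method: QuantMethod) -> None:
    """Raise if EPLB is requested together with an unsupported quant method."""
    eplb_enable = dynamic_eplb or (expert_map_path is not None)
    if eplb_enable and not isinstance(quant_method, W8A8DynamicMethod):
        raise ValueError("Eplb supports only w8a8_dynamic quantization.")


# A static expert map alone already counts as "EPLB enabled":
check_eplb_compatibility(False, "/path/to/expert_map.json", W8A8DynamicMethod())  # ok
# check_eplb_compatibility(True, None, QuantMethod())  # would raise ValueError
```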

vllm_ascend/platform.py

Lines changed: 14 additions & 1 deletion
@@ -32,7 +32,8 @@
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
-                               update_aclgraph_sizes)
+                               is_vl_model, update_aclgraph_sizes,
+                               update_default_aclgraph_sizes)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -182,6 +183,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         # set cudagraph sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
+        # There are cases where the default cudagraph_capture_sizes are not
+        # friendly to Ascend ops and hardware. We update these sizes here to
+        # improve default performance.
+        update_default_aclgraph_sizes(vllm_config)
         # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
@@ -298,6 +303,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             vllm_config.scheduler_config)
         vllm_config.scheduler_config = recompute_scheduler_config
 
+        if is_vl_model(vllm_config):
+            if bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))) or \
+               bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM1", '0'))):
+                raise ValueError(
+                    "Currently, VL models don't support "
+                    "FLASHCOMM in vllm-ascend. We will fix this in the future. "
+                    "Please set VLLM_ASCEND_ENABLE_FLASHCOMM1=0.")
+
     @classmethod
     def get_attn_backend_cls(
         cls,
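The new VL-model gate relies on the `bool(int(os.getenv(..., '0')))` idiom: an unset variable falls back to `'0'`, any numeric string other than zero is truthy, and non-numeric values raise in `int()`. A small sketch of that parsing behaviour follows; the `_env_flag` helper name is illustrative only.

```python
import os


def _env_flag(name: str) -> bool:
    """Mirror the bool(int(os.getenv(name, '0'))) idiom used in the gate above."""
    return bool(int(os.getenv(name, '0')))


os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
print(_env_flag("VLLM_ASCEND_ENABLE_FLASHCOMM"))   # False: unset falls back to '0'
print(_env_flag("VLLM_ASCEND_ENABLE_FLASHCOMM1"))  # True: "1" -> 1 -> True
# A value like "true" would raise ValueError inside int(), so only numeric
# strings are accepted; VL models must keep both flags unset or at "0".
```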

vllm_ascend/utils.py

Lines changed: 57 additions & 0 deletions
@@ -57,6 +57,7 @@
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
 _IS_MOE_MODEL = None
+_IS_VL_MODEL = None
 _ENABLE_SP = None
 _HAS_LAYER_IDX = None
 _ENABLE_NZ = None
@@ -319,6 +320,53 @@ def _rec_find(d):
     return max(layer_counts)
 
 
+def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
+    """
+    Check whether the configured capture sizes are the vLLM defaults.
+    """
+
+    cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
+    if len(cuda_graph_sizes) == 1:
+        default_size_capture_list = [1, 2, 4] + [
+            i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+        ]
+
+        if sorted(default_size_capture_list, reverse=True) == \
+                vllm_config.compilation_config.cudagraph_capture_sizes:
+            return True
+
+    return False
+
+
+def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
+    """
+    Update the default ACL graph capture sizes so that the new sizes
+    are more friendly to Ascend ops and hardware.
+    """
+
+    if vllm_config.model_config is None or \
+            vllm_config.model_config.enforce_eager or \
+            not _is_default_capture_sizes(vllm_config):
+        return
+
+    # Modify the default capture_sizes for Qwen3-MoE models under DP settings.
+    # This is mainly because the performance of _npu_paged_attention might
+    # degrade on special shapes.
+    # TODO(Angazenn): remove this once _npu_paged_attention is fully replaced
+    # by npu_fused_infer_attention_score, which does not contain such bugs.
+    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+            and vllm_config.parallel_config.tensor_parallel_size == 1 \
+            and vllm_config.parallel_config.data_parallel_size > 1:
+        max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
+        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
+            i for i in range(24, max_capture_size + 1, 8)
+        ]
+
+        vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
+        vllm_config.compilation_config.init_with_cudagraph_sizes(
+            new_cudagraph_capture_sizes)
+
+
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # NOTE: Currently, we can only capture 1800 graphs at most,
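For a concrete sense of what changes, the illustrative helpers below (not part of the commit) reproduce the two size ladders: the vLLM default that `_is_default_capture_sizes` detects, and the Ascend-friendly ladder that `update_default_aclgraph_sizes` installs for Qwen3-MoE with TP=1 and DP>1.

```python
# Illustrative only: compare the default vLLM capture-size ladder with the
# ladder substituted by update_default_aclgraph_sizes for Qwen3-MoE under DP.
def default_capture_sizes(max_capture_size: int) -> list[int]:
    # vLLM default: 1, 2, 4, then multiples of 8 up to the max capture size.
    return [1, 2, 4] + list(range(8, max_capture_size + 1, 8))


def qwen3_moe_dp_capture_sizes(max_capture_size: int) -> list[int]:
    # New ladder: denser small sizes, then multiples of 8 starting at 24.
    return [1, 2, 5, 10, 15, 20] + list(range(24, max_capture_size + 1, 8))


print(default_capture_sizes(64))       # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
print(qwen3_moe_dp_capture_sizes(64))  # [1, 2, 5, 10, 15, 20, 24, 32, 40, 48, 56, 64]
```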
@@ -649,6 +697,15 @@ def _is_contain_expert(config: Any):
     return False
 
 
+def is_vl_model(vllm_config: VllmConfig):
+    """Checks if the model is a VL model by config"""
+    global _IS_VL_MODEL
+    if _IS_VL_MODEL is None:
+        model_configs = vllm_config.model_config.hf_config.to_dict()
+        _IS_VL_MODEL = "VL" in model_configs["architectures"][0]
+    return _IS_VL_MODEL
+
+
 def weak_ref_tensor(tensor: Any) -> Any:
     """
     Create a weak reference to a tensor.
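`is_vl_model` is a simple, process-wide cached heuristic: it checks whether the first entry of the HF config's `architectures` list contains the substring "VL". A quick sketch of the same test on plain dicts; the architecture names are examples only.

```python
# Sketch of the architecture-name heuristic behind is_vl_model (example names only).
def looks_like_vl(hf_config: dict) -> bool:
    return "VL" in hf_config["architectures"][0]


print(looks_like_vl({"architectures": ["Qwen2_5_VLForConditionalGeneration"]}))  # True
print(looks_like_vl({"architectures": ["Qwen3MoeForCausalLM"]}))                 # False
```

Because the result is stored in the module-level `_IS_VL_MODEL`, the check runs once per process and is reused on later calls.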
