@@ -245,14 +245,19 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
 @patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
-    example_prompts = [
-        "Hello, my name is ",
+    short_example_prompts = [
+        "Hello ",
     ]
-    max_tokens = 5
+    # "max_position_embeddings": 163840,
+    long_example_prompts = [
+        "Hello " * (163839 - 500) + "Hello"
+    ]
+    max_tokens = 500
     with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
                     tensor_parallel_size=2,
                     quantization="ascend",
                     enable_expert_parallel=True,
+                    max_model_len=163840,
                     compilation_config={
                         "cudagraph_capture_sizes": [3, 6, 9, 12],
                         "cudagraph_mode": "FULL_DECODE_ONLY"
@@ -266,7 +271,8 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
                     },
                     reasoning_parser="deepseek_v3",
                     tokenizer_mode="deepseek_v32") as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_model.generate_greedy(short_example_prompts, max_tokens)
+        vllm_model.generate_greedy(long_example_prompts, max_tokens)


 @pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
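
Note on the long-prompt arithmetic: a minimal sketch of the token budget behind "Hello " * (163839 - 500) + "Hello", assuming each "Hello " repeat tokenizes to roughly one token. The exact count depends on the DeepSeek tokenizer, so the numbers below are illustrative, not part of the diff.

# Illustrative sketch: why the prompt uses 163839 - 500 repeats.
# Assumes ~1 token per "Hello " repeat (tokenizer-dependent).
max_position_embeddings = 163840  # model config value, per the comment in the diff
max_tokens = 500                  # decode budget reserved for generation
num_repeats = max_position_embeddings - max_tokens - 1  # 163339 repeats
long_prompt = "Hello " * num_repeats + "Hello"          # ~163340 prompt tokens
# prompt tokens (~163340) + generated tokens (500) ~= 163840 == max_model_len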