|
24 | 24 | from tests.e2e.conftest import RemoteOpenAIServer, MooncakeLauncher |
25 | 25 | from tools.aisbench import run_aisbench_cases, maybe_download_from_modelscope |
26 | 26 |
|
27 | | - |
28 | 27 | MODELS = [ |
29 | 28 | "vllm-ascend/Qwen3-30B-A3B-W8A8", |
30 | 29 | ] |
@@ -93,30 +92,37 @@ async def test_models(model: str, tp_size: int) -> None: |
93 | 92 | "kv_connector": "AscendStoreConnector", |
94 | 93 | "kv_role": "kv_both", |
95 | 94 | "kv_connector_extra_config": { |
96 | | - "register_buffer": True, |
97 | | - "use_layerwise": False, |
98 | | - "mooncake_rpc_port":"0" |
| 95 | + "register_buffer": True, |
| 96 | + "use_layerwise": False, |
| 97 | + "mooncake_rpc_port": "0" |
99 | 98 | } |
100 | 99 | } |
101 | | - speculative_config = {"method": "eagle3","model": eagle_model, "num_speculative_tokens": 3} |
| 100 | + speculative_config = { |
| 101 | + "method": "eagle3", |
| 102 | + "model": eagle_model, |
| 103 | + "num_speculative_tokens": 3 |
| 104 | + } |
102 | 105 | server_args = [ |
103 | | - "--trust-remote-code", "--max-num-seqs", "100", "--max-model-len", "37364", |
104 | | - "--max-num-batched-tokens", "16384", "--tensor-parallel-size", |
| 106 | + "--trust-remote-code", "--max-num-seqs", "100", "--max-model-len", |
| 107 | + "37364", "--max-num-batched-tokens", "16384", "--tensor-parallel-size", |
105 | 108 | str(tp_size), "--enable-expert-parallel", "--port", |
106 | | - str(port), "--distributed_executor_backend", "mp", "--async-scheduling", "True", |
107 | | - "--quantization", "ascend", "--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}', |
108 | | - "--gpu-memory-utilization", "0.95", "--speculative-config", json.dumps(speculative_config), |
109 | | - "--kv-transfer-config", json.dumps(kv_transfer_config) |
| 109 | + str(port), "--distributed_executor_backend", "mp", |
| 110 | + "--async-scheduling", "--quantization", "ascend", |
| 111 | + "--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}', |
| 112 | + "--gpu-memory-utilization", "0.95", "--speculative-config", |
| 113 | + json.dumps(speculative_config), "--kv-transfer-config", |
| 114 | + json.dumps(kv_transfer_config) |
110 | 115 | ] |
111 | 116 | request_keyword_args: dict[str, Any] = { |
112 | 117 | **api_keyword_args, |
113 | 118 | } |
114 | | - with MooncakeLauncher(mooncake_port, mooncake_metrics_port) as mooncake_server: |
| 119 | + with MooncakeLauncher(mooncake_port, |
| 120 | + mooncake_metrics_port) as mooncake_server: |
115 | 121 | with RemoteOpenAIServer(model, |
116 | | - server_args, |
117 | | - server_port=port, |
118 | | - env_dict=env_dict, |
119 | | - auto_port=False) as server: |
| 122 | + server_args, |
| 123 | + server_port=port, |
| 124 | + env_dict=env_dict, |
| 125 | + auto_port=False) as server: |
120 | 126 | client = server.get_async_client() |
121 | 127 | for _ in range(2): |
122 | 128 | batch = await client.completions.create( |
|
0 commit comments