Commit dc225e1

SkychenLee and l00832868 authored
[v0.13.0][Lora][BugFix] Fix crash on base model requests with LoRA enabled (#6457)
Problem: when LoRA is used in compile mode, requests to a LoRA module succeed, but requests to the base model fail and the model process core-dumps.

### What this PR does / why we need it?
When a model is started with LoRA, LoRA requests work, but a request to the base model causes the model process to core-dump (a dangerous problem).

Related issues: #6279

### Does this PR introduce _any_ user-facing change?
No user-facing change.

### How was this patch tested?
vLLM version: v0.13.0rc2

Signed-off-by: l00832868 <[email protected]>
Co-authored-by: l00832868 <[email protected]>
1 parent cb1212f commit dc225e1

File tree

1 file changed (+2, -3 lines)


vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 3 deletions
@@ -2131,9 +2131,8 @@ def _dummy_run(
         if self.is_kv_producer and not self.is_kv_consumer:
             with_prefill = True

-        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
         _ag_mode, batch_descriptor = \
-            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=activate_lora)

         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
@@ -2189,7 +2188,7 @@ def _dummy_run(
         _ag_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
             num_tokens=num_tokens,
             uniform_decode=uniform_decode,
-            has_lora=has_lora,
+            has_lora=activate_lora,
             disable_full=synced_cudagraph_mode
                 <= CUDAGraphMode.PIECEWISE.value)

