
Commit b81e0a8

Authored by yenuo26, wangyu31577, hsliuustc0106, and Copilot
[Test] Add example test cases for omni online (#1086)
Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Signed-off-by: yenuo26 <410167048@qq.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9af6fb9 · commit b81e0a8

5 files changed: +648 −16 lines changed

.buildkite/test-nightly.yaml

Lines changed: 23 additions & 1 deletion

@@ -10,12 +10,13 @@ steps:
      queue: "cpu_queue_premerge"

  - label: "Omni Model Test with H100"
-    timeout_in_minutes: 180
+    timeout_in_minutes: 60
    depends_on: image-build
    if: build.env("NIGHTLY") == "1"
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
+      - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py
    agents:
      queue: "mithril-h100-pool"
    plugins:
@@ -44,3 +45,24 @@ steps:
          hostPath:
            path: /mnt/hf-cache
            type: DirectoryOrCreate
+
+
+
+  - label: "Omni Model Test"
+    timeout_in_minutes: 60
+    depends_on: image-build
+    commands:
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py
+    agents:
+      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"

tests/conftest.py

Lines changed: 18 additions & 15 deletions

@@ -588,22 +588,31 @@ def convert_audio_to_text(audio_data):
    """
    Convert base64 encoded audio data to text using speech recognition.
    """
-    import whisper
-
    audio_data = base64.b64decode(audio_data)
    output_path = f"./test_{int(time.time())}"
    with open(output_path, "wb") as audio_file:
        audio_file.write(audio_data)

    print(f"audio data is saved: {output_path}")

+    text = convert_audio_file_to_text(output_path=output_path)
+    return text
+
+
+def convert_audio_file_to_text(output_path):
+    import whisper
+
    model = whisper.load_model("base")
    text = model.transcribe(
        output_path,
        temperature=0.0,
        word_timestamps=True,
        condition_on_previous_text=False,
    )["text"]
+    del model
+    if torch.cuda.is_available():
+        gc.collect()
+        torch.cuda.empty_cache()
    if text:
        return text
    else:
@@ -614,7 +623,6 @@ def merge_base64_and_convert_to_text(base64_list):
    """
    Merge a list of base64 encoded audio data and convert to text.
    """
-    import whisper
    from pydub import AudioSegment

    merged_audio = None
@@ -627,17 +635,8 @@ def merge_base64_and_convert_to_text(base64_list):
            merged_audio += seg
    output_path = f"./test_{int(time.time())}"
    merged_audio.export(output_path, format="wav")
-    model = whisper.load_model("base")
-    text = model.transcribe(
-        output_path,
-        temperature=0.0,
-        word_timestamps=True,
-        condition_on_previous_text=False,
-    )["text"]
-    if text:
-        return text
-    else:
-        return ""
+    text = convert_audio_file_to_text(output_path)
+    return text


def modify_stage_config(
@@ -886,6 +885,7 @@ def __init__(
        model: str,
        serve_args: list[str],
        *,
+        port: int | None = None,
        env_dict: dict[str, str] | None = None,
    ) -> None:
        _run_pre_test_cleanup(enable_force=True)
@@ -896,7 +896,10 @@ def __init__(
        self.env_dict = env_dict
        self.proc: subprocess.Popen | None = None
        self.host = "127.0.0.1"
-        self.port = get_open_port()
+        if port is None:
+            self.port = get_open_port()
+        else:
+            self.port = port

    def _start_server(self) -> None:
        """Start the vLLM-Omni server subprocess."""
New file — Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
+# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
+# This config is optimized for CI e2e tests.
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true # Run this stage in a separate process
+      devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 32768
+      max_num_batched_tokens: 32768
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true # Now we only support eager mode
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+      is_comprehension: true
+      final_output: true
+      final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 32768
+      max_num_batched_tokens: 32768
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+      engine_input_source: [0]
+      custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio
+      max_num_batched_tokens: 4069
+      engine_input_source: [1]
+      final_output: true
+      final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1 # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1 # Simplified: process serially within each stage
+  edges:
+    - from: 0 # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1 # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
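Since the new file is plain YAML, its pipeline shape can be sanity-checked with a short script. A minimal sketch (assumptions: PyYAML is installed and the config has been saved locally as qwen2_5_omni_ci.yaml; the actual filename and location in the repository are not shown in this view):

import yaml

# Hypothetical local copy of the stage config shown above.
with open("qwen2_5_omni_ci.yaml") as f:
    cfg = yaml.safe_load(f)

# The three stages form the thinker -> talker -> code2wav pipeline.
stages = [s["engine_args"]["model_stage"] for s in cfg["stage_args"]]
assert stages == ["thinker", "talker", "code2wav"]

# Every edge uses window_size -1: a stage is triggered only after its
# upstream stage has produced its full output.
for edge in cfg["runtime"]["edges"]:
    assert edge["window_size"] == -1
    print(f"stage {edge['from']} -> stage {edge['to']}")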
