
Commit b81e0a8

Authored by yenuo26, wangyu31577, hsliuustc0106, and Copilot
[Test] Add example test cases for omni online (#1086)
Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Signed-off-by: yenuo26 <410167048@qq.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9af6fb9 · commit b81e0a8

5 files changed: +648 −16 lines changed

.buildkite/test-nightly.yaml

Lines changed: 23 additions & 1 deletion

@@ -10,12 +10,13 @@ steps:
      queue: "cpu_queue_premerge"

  - label: "Omni Model Test with H100"
-    timeout_in_minutes: 180
+    timeout_in_minutes: 60
    depends_on: image-build
    if: build.env("NIGHTLY") == "1"
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
+      - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py
    agents:
      queue: "mithril-h100-pool"
    plugins:
@@ -44,3 +45,24 @@ steps:
          hostPath:
            path: /mnt/hf-cache
            type: DirectoryOrCreate
+
+
+
+  - label: "Omni Model Test"
+    timeout_in_minutes: 60
+    depends_on: image-build
+    commands:
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py
+    agents:
+      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"

tests/conftest.py

Lines changed: 18 additions & 15 deletions

@@ -588,22 +588,31 @@ def convert_audio_to_text(audio_data):
    """
    Convert base64 encoded audio data to text using speech recognition.
    """
-    import whisper
-
    audio_data = base64.b64decode(audio_data)
    output_path = f"./test_{int(time.time())}"
    with open(output_path, "wb") as audio_file:
        audio_file.write(audio_data)

    print(f"audio data is saved: {output_path}")

+    text = convert_audio_file_to_text(output_path=output_path)
+    return text
+
+
+def convert_audio_file_to_text(output_path):
+    import whisper
+
    model = whisper.load_model("base")
    text = model.transcribe(
        output_path,
        temperature=0.0,
        word_timestamps=True,
        condition_on_previous_text=False,
    )["text"]
+    del model
+    if torch.cuda.is_available():
+        gc.collect()
+        torch.cuda.empty_cache()
    if text:
        return text
    else:
@@ -614,7 +623,6 @@ def merge_base64_and_convert_to_text(base64_list):
    """
    Merge a list of base64 encoded audio data and convert to text.
    """
-    import whisper
    from pydub import AudioSegment

    merged_audio = None
@@ -627,17 +635,8 @@ def merge_base64_and_convert_to_text(base64_list):
            merged_audio += seg
    output_path = f"./test_{int(time.time())}"
    merged_audio.export(output_path, format="wav")
-    model = whisper.load_model("base")
-    text = model.transcribe(
-        output_path,
-        temperature=0.0,
-        word_timestamps=True,
-        condition_on_previous_text=False,
-    )["text"]
-    if text:
-        return text
-    else:
-        return ""
+    text = convert_audio_file_to_text(output_path)
+    return text


def modify_stage_config(
@@ -886,6 +885,7 @@ def __init__(
        model: str,
        serve_args: list[str],
        *,
+        port: int | None = None,
        env_dict: dict[str, str] | None = None,
    ) -> None:
        _run_pre_test_cleanup(enable_force=True)
@@ -896,7 +896,10 @@ def __init__(
        self.env_dict = env_dict
        self.proc: subprocess.Popen | None = None
        self.host = "127.0.0.1"
-        self.port = get_open_port()
+        if port is None:
+            self.port = get_open_port()
+        else:
+            self.port = port

    def _start_server(self) -> None:
        """Start the vLLM-Omni server subprocess."""
New file — Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
+# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
+# This config is optimized for CI e2e tests.
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true # Run this stage in a separate process
+      devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 32768
+      max_num_batched_tokens: 32768
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true # Now we only support eager mode
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+      is_comprehension: true
+      final_output: true
+      final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 32768
+      max_num_batched_tokens: 32768
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+      engine_input_source: [0]
+      custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio
+      max_num_batched_tokens: 4069
+      engine_input_source: [1]
+      final_output: true
+      final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1 # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1 # Simplified: process serially within each stage
+  edges:
+    - from: 0 # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1 # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
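Since the new file is plain YAML, its pipeline shape can be sanity-checked with a short script. A minimal sketch (assumptions: PyYAML is installed and the config has been saved locally as qwen2_5_omni_ci.yaml; the actual filename and location in the repository are not shown in this view):

import yaml

# Hypothetical local copy of the stage config shown above.
with open("qwen2_5_omni_ci.yaml") as f:
    cfg = yaml.safe_load(f)

# The three stages form the thinker -> talker -> code2wav pipeline.
stages = [s["engine_args"]["model_stage"] for s in cfg["stage_args"]]
assert stages == ["thinker", "talker", "code2wav"]

# Every edge uses window_size -1: a stage is triggered only after its
# upstream stage has produced its full output.
for edge in cfg["runtime"]["edges"]:
    assert edge["window_size"] == -1
    print(f"stage {edge['from']} -> stage {edge['to']}")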
