Commit 6d92db4

Merge branch 'vllm-project:main' into feature_mimo_audio

2 parents: f97de11 + 4d8e290

16 files changed (+354, -42 lines)

docs/.nav.yml

Lines changed: 7 additions & 0 deletions
```diff
@@ -11,17 +11,23 @@ nav:
   - Examples:
     - examples/README.md
     - Offline Inference:
+      - BAGEL-7B-MoT: user_guide/examples/offline_inference/bagel.md
       - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
       - Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
+      - LoRA Inference Examples: user_guide/examples/offline_inference/lora_inference.md
       - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
       - Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
       - Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
+      - Text-To-Audio: user_guide/examples/offline_inference/text_to_audio.md
       - Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
       - Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
     - Online Serving:
+      - BAGEL-7B-MoT: user_guide/examples/online_serving/bagel.md
       - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
+      - Online LoRA Inference (Diffusion): user_guide/examples/online_serving/lora_inference.md
       - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
       - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
+      - Qwen3-TTS Online Serving: user_guide/examples/online_serving/qwen3_tts.md
       - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
   - General:
     - usage/*
@@ -54,6 +60,7 @@ nav:
   - Feature Design:
     - design/feature/disaggregated_inference.md
     - design/feature/ray_based_execution.md
+    - design/feature/omni_connectors/
   - Module Design:
     - design/module/ar_module.md
     - design/module/dit_module.md
```

docs/api/README.md

Lines changed: 20 additions & 11 deletions
```diff
@@ -10,6 +10,9 @@ Main entry points for vLLM-Omni inference and serving.
 - [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
 - [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
 - [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
+- [vllm_omni.entrypoints.cli.benchmark.base.OmniBenchmarkSubcommandBase][]
+- [vllm_omni.entrypoints.cli.benchmark.main.OmniBenchmarkSubcommand][]
+- [vllm_omni.entrypoints.cli.benchmark.serve.OmniBenchmarkServingSubcommand][]
 - [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
 - [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
 - [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
@@ -26,7 +29,9 @@ Main entry points for vLLM-Omni inference and serving.
 
 Input data structures for multi-modal inputs.
 
+- [vllm_omni.inputs.data.OmniDiffusionSamplingParams][]
 - [vllm_omni.inputs.data.OmniEmbedsPrompt][]
+- [vllm_omni.inputs.data.OmniTextPrompt][]
 - [vllm_omni.inputs.data.OmniTokenInputs][]
 - [vllm_omni.inputs.data.OmniTokensPrompt][]
 - [vllm_omni.inputs.parse.parse_singleton_prompt_omni][]
@@ -58,6 +63,7 @@ Core scheduling and caching components.
 - [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
 - [vllm_omni.core.sched.output.OmniCachedRequestData][]
 - [vllm_omni.core.sched.output.OmniNewRequestData][]
+- [vllm_omni.core.sched.output.OmniSchedulerOutput][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
@@ -88,20 +94,23 @@ Configuration classes.
 
 Worker classes and model runners for distributed inference.
 
-- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
-- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
-- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
-- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
-- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
+- [vllm_omni.diffusion.worker.diffusion_model_runner.DiffusionModelRunner][]
+- [vllm_omni.diffusion.worker.diffusion_worker.DiffusionWorker][]
+- [vllm_omni.diffusion.worker.diffusion_worker.WorkerProc][]
+- [vllm_omni.platforms.npu.worker.npu_ar_model_runner.ExecuteModelState][]
+- [vllm_omni.platforms.npu.worker.npu_ar_model_runner.NPUARModelRunner][]
+- [vllm_omni.platforms.npu.worker.npu_ar_worker.NPUARWorker][]
+- [vllm_omni.platforms.npu.worker.npu_generation_model_runner.NPUGenerationModelRunner][]
+- [vllm_omni.platforms.npu.worker.npu_generation_worker.NPUGenerationWorker][]
+- [vllm_omni.platforms.npu.worker.npu_model_runner.OmniNPUModelRunner][]
+- [vllm_omni.platforms.xpu.worker.xpu_ar_model_runner.XPUARModelRunner][]
+- [vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker][]
+- [vllm_omni.platforms.xpu.worker.xpu_generation_model_runner.XPUGenerationModelRunner][]
+- [vllm_omni.platforms.xpu.worker.xpu_generation_worker.XPUGenerationWorker][]
 - [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]
 - [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
 - [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
 - [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
 - [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
 - [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
-- [vllm_omni.worker.npu.npu_ar_model_runner.ExecuteModelState][]
-- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][]
-- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][]
-- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][]
-- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][]
-- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][]
+- [vllm_omni.worker.mixins.OmniWorkerMixin][]
```

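Note on the worker refactor in this file: NPU worker classes move from `vllm_omni.worker.npu.*` to `vllm_omni.platforms.npu.worker.*`, and XPU workers now appear under `vllm_omni.platforms.xpu.worker.*`. A hedged sketch of updated imports, using only paths listed in the diff above (whether they are importable depends on the vLLM-Omni release you have installed):

```python
# Illustrative only: import paths taken from the API reference entries in this diff.
# Verify against your installed vllm_omni version before relying on them.
from vllm_omni.platforms.npu.worker.npu_ar_worker import NPUARWorker
from vllm_omni.platforms.xpu.worker.xpu_generation_worker import XPUGenerationWorker
from vllm_omni.worker.gpu_ar_worker import GPUARWorker  # GPU workers keep their old location
```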
docs/user_guide/examples/offline_inference/bagel.md

Lines changed: 9 additions & 1 deletion
```diff
@@ -2,6 +2,7 @@
 
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/bagel>.
 
+
 ## Set up
 
 Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.
@@ -99,7 +100,7 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \
 
 BAGEL-7B-MoT supports **multiple modality modes** for different use cases.
 
-The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)
+The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)
 
 #### 📌 Command Line Arguments (end2end.py)
 
@@ -177,3 +178,10 @@ sudo apt install ffmpeg
 | Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** |
 | Stage-1 (DiT) | **26.50 GiB** |
 | Total | **~42 GiB + KV Cache** |
+
+## Example materials
+
+??? abstract "end2end.py"
+    ``````py
+    --8<-- "examples/offline_inference/bagel/end2end.py"
+    ``````
```

docs/user_guide/examples/offline_inference/image_to_image.md

Lines changed: 6 additions & 1 deletion
```diff
@@ -47,10 +47,15 @@ Key arguments:
 - `--image`: path(s) to the source image(s) (PNG/JPG, converted to RGB). Can specify multiple images.
 - `--prompt` / `--negative_prompt`: text description (string).
 - `--cfg_scale`: true classifier-free guidance scale (default: 4.0). Classifier-free guidance is enabled by setting cfg_scale > 1 and providing a negative_prompt. Higher guidance scale encourages images closely linked to the text prompt, usually at the expense of lower image quality.
-- `--cfg_parallel_size`: the number of devices to run CFG Parallel. CFG Parallel is valid only if classifier-free guidance is enabled and `cfg_parallel_size` is set to 2.
 - `--guidance_scale`: guidance scale for guidance-distilled models (default: 1.0, disabled). Unlike classifier-free guidance (--cfg_scale), guidance-distilled models take the guidance scale directly as an input parameter. Enabled when guidance_scale > 1. Ignored when not using guidance-distilled models.
 - `--num_inference_steps`: diffusion sampling steps (more steps = higher quality, slower).
 - `--output`: path to save the generated PNG.
+- `--vae_use_slicing`: enable VAE slicing for memory optimization.
+- `--vae_use_tiling`: enable VAE tiling for memory optimization.
+- `--cfg_parallel_size`: set it to 2 to enable CFG Parallel. See more examples in the [user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel).
+- `--enable-cpu-offload`: enable CPU offloading for diffusion models.
+
+> ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage.
 
 ## Example materials
 
```

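The new `--vae_use_slicing`, `--vae_use_tiling`, and `--enable-cpu-offload` flags documented above correspond to standard diffusion-pipeline memory optimizations. A hedged sketch in plain `diffusers` (not the vLLM-Omni runtime; the model id and file names are placeholders) of roughly what those switches do:

```python
# Illustrative only: plain diffusers, not the vLLM-Omni code path.
import torch
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image

pipe = AutoPipelineForImage2Image.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
)
pipe.vae.enable_slicing()        # decode batched latents one image at a time (--vae_use_slicing)
pipe.vae.enable_tiling()         # decode large latents tile by tile (--vae_use_tiling)
pipe.enable_model_cpu_offload()  # keep idle submodules on CPU (--enable-cpu-offload)

image = load_image("input.png").convert("RGB")  # placeholder input file
result = pipe(
    prompt="a watercolor painting of the same scene",
    image=image,
    num_inference_steps=30,
)
result.images[0].save("output.png")
```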
docs/user_guide/examples/offline_inference/image_to_video.md

Lines changed: 6 additions & 1 deletion
```diff
@@ -52,12 +52,17 @@ Key arguments:
 - `--num_frames`: Number of frames (default 81).
 - `--guidance_scale` and `--guidance_scale_high`: CFG scale (applied to low/high-noise stages for MoE).
 - `--negative_prompt`: Optional list of artifacts to suppress.
-- `--cfg_parallel_size`: the number of devices to run CFG Parallel. CFG Parallel is valid only if classifier-free guidance is enabled and `cfg_parallel_size` is set to 2.
 - `--boundary_ratio`: Boundary split ratio for two-stage MoE models.
 - `--flow_shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p).
 - `--num_inference_steps`: Number of denoising steps (default 50).
 - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video).
 - `--output`: Path to save the generated video.
+- `--vae_use_slicing`: Enable VAE slicing for memory optimization.
+- `--vae_use_tiling`: Enable VAE tiling for memory optimization.
+- `--cfg_parallel_size`: Set it to 2 to enable CFG Parallel. See more examples in the [user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel).
+- `--enable-cpu-offload`: Enable CPU offloading for diffusion models.
+
+> ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage.
 
 ## Example materials
 
```

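For context on why `--cfg_parallel_size` is specifically 2: classifier-free guidance evaluates two independent denoiser branches per step, and that pair is what CFG Parallel spreads across devices. A conceptual sketch only (the `denoise` function is a stand-in, not the vLLM-Omni DiT):

```python
# Conceptual sketch: one CFG step blends a conditional and an unconditional forward pass.
import torch

def denoise(latents: torch.Tensor, text_embeds: torch.Tensor) -> torch.Tensor:
    # stand-in for one transformer/UNet forward pass
    return latents * 0.99 + text_embeds.mean() * 0.01

def cfg_step(latents, cond_embeds, uncond_embeds, guidance_scale: float = 4.0):
    # the two forwards share no state, so CFG Parallel can place them on two devices
    eps_cond = denoise(latents, cond_embeds)
    eps_uncond = denoise(latents, uncond_embeds)
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

latents = torch.randn(1, 16, 60, 104)
out = cfg_step(latents, torch.randn(1, 512, 4096), torch.randn(1, 512, 4096))
print(out.shape)
```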
docs/user_guide/examples/offline_inference/lora_inference.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,8 +1,9 @@
-# LoRA-Inference
+# LoRA Inference Examples
 
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/lora_inference>.
 
-This contains examples for using LoRA (Low-Rank Adaptation) adapters with vLLM-omni diffusion models for offline inference.
+
+This directory contains examples for using LoRA (Low-Rank Adaptation) adapters with vLLM-omni diffusion models for offline inference.
 The example uses the `stabilityai/stable-diffusion-3.5-medium` as the default model, but you can replace it with other models in vLLM-omni.
 
 ## Overview
```

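For readers who want to see the mechanism the example wraps, here is a hedged sketch in plain `diffusers` of attaching a LoRA adapter to `stabilityai/stable-diffusion-3.5-medium`; the adapter id is a placeholder, and the vLLM-Omni example script remains the supported path:

```python
# Hedged illustration (plain diffusers, not the vLLM-Omni entrypoint).
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
).to("cuda")
pipe.load_lora_weights("your-org/your-sd3.5-lora")  # placeholder LoRA adapter id

image = pipe(
    "a cozy reading nook in watercolor style",
    num_inference_steps=28,
    guidance_scale=4.5,
).images[0]
image.save("lora_sample.png")
```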
docs/user_guide/examples/offline_inference/qwen3_tts.md

Lines changed: 9 additions & 0 deletions
```diff
@@ -16,6 +16,15 @@ Qwen3 TTS provides multiple task variants for speech generation:
 ## Setup
 Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.
 
+### ROCm Dependencies
+
+You will need to install two additional dependencies: `onnxruntime-rocm` and `sox`.
+
+```
+pip uninstall onnxruntime  # must be removed before installing onnxruntime-rocm
+pip install onnxruntime-rocm sox
+```
+
 ## Quick Start
 
 Run a single sample for a task:
```
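After swapping in `onnxruntime-rocm`, an optional sanity check (this only verifies that the ROCm build is active; it is not required to run TTS):

```python
# Optional check: the ROCm wheel should expose ROCMExecutionProvider,
# which the stock CPU/CUDA onnxruntime wheel does not.
import onnxruntime as ort

providers = ort.get_available_providers()
print(providers)
assert "ROCMExecutionProvider" in providers, (
    "ROCm provider not found - uninstall the stock onnxruntime wheel "
    "before installing onnxruntime-rocm"
)
```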
docs/user_guide/examples/offline_inference/text_to_audio.md

Lines changed: 47 additions & 0 deletions

```diff
@@ -0,0 +1,47 @@
+# Text-To-Audio
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/text_to_audio>.
+
+
+The `stabilityai/stable-audio-open-1.0` pipeline generates audio from text prompts.
+
+## Prerequisites
+
+If you use a gated model (e.g., `stabilityai/stable-audio-open-1.0`), ensure you have access:
+
+1. **Accept Model License**: Visit the model page on Hugging Face (e.g., [stabilityai/stable-audio-open-1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0)) and accept the user agreement.
+2. **Authenticate**: Log in to Hugging Face locally to access the gated model.
+    ```bash
+    huggingface-cli login
+    ```
+
+## Local CLI Usage
+
+```bash
+python text_to_audio.py \
+    --model stabilityai/stable-audio-open-1.0 \
+    --prompt "The sound of a hammer hitting a wooden surface" \
+    --negative_prompt "Low quality" \
+    --seed 42 \
+    --guidance_scale 7.0 \
+    --audio_length 10.0 \
+    --num_inference_steps 100 \
+    --output stable_audio_output.wav
+```
+
+Key arguments:
+
+- `--prompt`: text description (string).
+- `--negative_prompt`: negative prompt for classifier-free guidance.
+- `--seed`: integer seed for deterministic generation.
+- `--guidance_scale`: classifier-free guidance scale.
+- `--audio_length`: audio duration in seconds.
+- `--num_inference_steps`: diffusion sampling steps (more steps = higher quality, slower).
+- `--output`: path to save the generated WAV file.
+
+## Example materials
+
+??? abstract "text_to_audio.py"
+    ``````py
+    --8<-- "examples/offline_inference/text_to_audio/text_to_audio.py"
+    ``````
```

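The new page includes the example script via the snippet marker above. For reference, a hedged sketch of the equivalent plain-`diffusers` flow for `stabilityai/stable-audio-open-1.0` (mirroring the upstream diffusers example; the vLLM-Omni `text_to_audio.py` script is the supported path, and the output file name is arbitrary):

```python
# Hedged reference sketch: plain diffusers, not the vLLM-Omni entrypoint.
import soundfile as sf
import torch
from diffusers import StableAudioPipeline

pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
).to("cuda")

generator = torch.Generator("cuda").manual_seed(42)
audio = pipe(
    "The sound of a hammer hitting a wooden surface",
    negative_prompt="Low quality",
    num_inference_steps=100,
    audio_end_in_s=10.0,
    generator=generator,
).audios

# audios[0] is (channels, samples); transpose for soundfile and use the VAE sample rate
sf.write("stable_audio_output.wav", audio[0].T.float().cpu().numpy(), pipe.vae.sampling_rate)
```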
docs/user_guide/examples/offline_inference/text_to_image.md

Lines changed: 7 additions & 2 deletions
```diff
@@ -51,7 +51,7 @@ if __name__ == "__main__":
 
 For diffusion pipelines, the stage config field `stage_args.[].runtime.max_batch_size` is 1 by default, and the input
 list is sliced into single-item requests before feeding into the diffusion pipeline. For models that do internally support
-batched inputs, you can [modify this configuration](../../../configuration/stage_configs.md) to let the model accept a longer batch of prompts.
+batched inputs, you can [modify this configuration](https://github.com/vllm-project/vllm-omni/tree/main/configuration/stage_configs.md) to let the model accept a longer batch of prompts.
 
 Apart from string prompt, vLLM-Omni also supports dictionary prompts in the same style as vLLM.
 This is useful for models that support negative prompts.
@@ -95,11 +95,16 @@ Key arguments:
 - `--prompt`: text description (string).
 - `--seed`: integer seed for deterministic sampling.
 - `--cfg_scale`: true CFG scale (model-specific guidance strength).
-- `--cfg_parallel_size`: the number of devices to run CFG Parallel. CFG Parallel is valid only if classifier-free guidance is enabled and `cfg_parallel_size` is set to 2.
 - `--num_images_per_prompt`: number of images to generate per prompt (saves as `output`, `output_1`, ...).
 - `--num_inference_steps`: diffusion sampling steps (more steps = higher quality, slower).
 - `--height/--width`: output resolution (defaults 1024x1024).
 - `--output`: path to save the generated PNG.
+- `--vae_use_slicing`: enable VAE slicing for memory optimization.
+- `--vae_use_tiling`: enable VAE tiling for memory optimization.
+- `--cfg_parallel_size`: set it to 2 to enable CFG Parallel. See more examples in the [user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel).
+- `--enable-cpu-offload`: enable CPU offloading for diffusion models.
+
+> ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage.
 
 > ℹ️ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes.
 
```

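The batching note in this file (`runtime.max_batch_size` defaults to 1, so prompt lists are sliced into single-item requests) can be pictured with a small, purely illustrative sketch; the helper name below is hypothetical, not vLLM-Omni code:

```python
# Illustrative sketch of the batching behaviour described above.
from typing import Iterator, List

def slice_into_batches(prompts: List[str], max_batch_size: int = 1) -> Iterator[List[str]]:
    """Yield prompt batches no larger than max_batch_size."""
    for i in range(0, len(prompts), max_batch_size):
        yield prompts[i : i + max_batch_size]

prompts = ["a snowy cabin at dusk", "a neon-lit city street", "a bowl of ramen"]
for batch in slice_into_batches(prompts, max_batch_size=1):
    print(batch)  # each diffusion call sees one prompt unless max_batch_size is raised
```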
docs/user_guide/examples/offline_inference/text_to_video.md

Lines changed: 12 additions & 6 deletions
```diff
@@ -12,10 +12,11 @@ python text_to_video.py \
     --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
     --negative_prompt "<optional quality filter>" \
     --height 480 \
-    --width 640 \
-    --num_frames 32 \
+    --width 832 \
+    --num_frames 33 \
     --guidance_scale 4.0 \
     --guidance_scale_high 3.0 \
+    --flow_shift 12.0 \
     --num_inference_steps 40 \
     --fps 16 \
     --output t2v_out.mp4
@@ -24,14 +25,19 @@ python text_to_video.py \
 Key arguments:
 
 - `--prompt`: text description (string).
-- `--height/--width`: output resolution (defaults 720x1280). Dimensions should align with Wan VAE downsampling (multiples of 8).
+- `--height/--width`: output resolution (defaults 480x832, i.e. 480P). Dimensions should align with Wan VAE downsampling (multiples of 8).
 - `--num_frames`: Number of frames (Wan default is 81).
-- `--guidance_scale` and `--guidance_scale_high`: CFG scale (applied to low/high)..
+- `--guidance_scale` and `--guidance_scale_high`: CFG scale (applied to low/high).
 - `--negative_prompt`: optional list of artifacts to suppress (the PR demo used a long Chinese string).
-- `--cfg_parallel_size`: the number of devices to run CFG Parallel. CFG Parallel is valid only if classifier-free guidance is enabled and `cfg_parallel_size` is set to 2.
-- `--boundary_ratio`: Boundary split ratio for low/high DiT.
+- `--boundary_ratio`: Boundary split ratio for low/high DiT. The default `0.875` uses both transformers for best quality. Setting it to `1.0` loads only the low-noise transformer (saves noticeable memory with good quality, recommended if memory is limited); setting it to `0.0` loads only the high-noise transformer (not recommended, lower quality).
 - `--fps`: frames per second for the saved MP4 (requires `diffusers` export_to_video).
 - `--output`: path to save the generated video.
+- `--vae_use_slicing`: enable VAE slicing for memory optimization.
+- `--vae_use_tiling`: enable VAE tiling for memory optimization.
+- `--cfg_parallel_size`: set it to 2 to enable CFG Parallel. See more examples in the [user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel).
+- `--enable-cpu-offload`: enable CPU offloading for diffusion models.
+
+> ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage.
 
 ## Example materials
 
```

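The expanded `--boundary_ratio` description above can be made concrete with a conceptual sketch of how a two-stage Wan-style pipeline routes denoising steps. The helper below is hypothetical, not the vLLM-Omni implementation, and assumes the usual 1000-step training schedule: timesteps at or above the boundary go to the high-noise transformer, the rest to the low-noise one, so `boundary_ratio=1.0` effectively sends every step to the low-noise model.

```python
# Conceptual routing sketch for a two-expert (high-noise / low-noise) video DiT.
def pick_transformer(timestep: int, boundary_ratio: float = 0.875,
                     num_train_timesteps: int = 1000) -> str:
    boundary = boundary_ratio * num_train_timesteps
    return "high_noise" if timestep >= boundary else "low_noise"

for t in (999, 950, 875, 500, 10):
    print(t, pick_transformer(t))           # default 0.875: early (noisy) steps -> high_noise
print(999, pick_transformer(999, 1.0))      # boundary_ratio=1.0 -> everything low_noise
```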