Commit b325cc9

Merge branch 'verl-project:main' into main

2 parents: f73636c + e3b187a
File tree: 132 files changed, +2403 −4404 lines

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
 /verl/workers/actor/megatron_actor.py @ISEEKYAN @vermouth1992
 /verl/workers/critic/megatron_critic.py @ISEEKYAN @vermouth1992
 /verl/workers/megatron_workers.py @ISEEKYAN @vermouth1992
+/verl/experimental @wuxibin89 @ArronHZG
 
 /tests/single_controller @zw0610 @wuxibin89
 /tests/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 - [ ] Search for similar PRs. Paste at least one query link here: ...
 - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
-  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`
+  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`, `fully_async`, `one_step_off`
   - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
   - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
 - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.

.github/workflows/e2e_ascend.yml

Lines changed: 4 additions & 0 deletions
@@ -126,6 +126,10 @@ jobs:
           ray stop --force
           export PYTHONPATH=$PYTHONPATH:/Megatron-LM
           USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_grpo_mindspeed bash tests/special_npu/run_qwen3_30b_grpo_mindspeed.sh
+      - name: Running the E2E test with fully_async_policy algorithm (FSDP2)
+        run: |
+          ray stop --force
+          bash tests/special_npu/run_fully_async_policy.sh
 
   vlm_rl_job:
     if: github.repository_owner == 'verl-project'

.github/workflows/e2e_one_step_off_policy_ascend.yml

Lines changed: 3 additions & 3 deletions
@@ -68,7 +68,7 @@ on:
       # Entrypoints
       - ".github/workflows/e2e_one_step_off_policy_ascend.yml"
       - "examples/data_preprocess/gsm8k.py"
-      - "tests/special_e2e/run_one_step_off_policy.sh"
+      - "tests/special_npu/run_one_step_off_policy.sh"
 
 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -122,7 +122,7 @@ jobs:
       - name: Running the E2E test with one_step_off_policy algorithm (FSDP2)
        run: |
          ray stop --force
-          bash tests/special_e2e/run_one_step_off_policy.sh
+          bash tests/special_npu/run_one_step_off_policy.sh
 
   # Test Megatron strategy
   e2e_one_step_off_policy_megatron_ascend:
@@ -167,4 +167,4 @@ jobs:
         run: |
           ray stop --force
           export PYTHONPATH=$PYTHONPATH:/Megatron-LM
-          bash tests/special_e2e/run_one_step_off_policy.sh
+          bash tests/special_npu/run_one_step_off_policy.sh

.github/workflows/e2e_sft_llm.yml

Lines changed: 1 addition & 9 deletions
@@ -110,7 +110,7 @@ jobs:
       - name: Prepare gsm8k dataset
         run: |
           ray stop --force
-          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
+          python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
         run: |
           ray stop --force
@@ -123,10 +123,6 @@ jobs:
         run: |
           ray stop --force
           SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
-      - name: Check loss difference between sequence parallel vs. default implementation
-        run: |
-          ray stop --force
-          ENTRYPOINT="tests/special_e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
       - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallism and liger
         run: |
           ray stop --force
@@ -140,10 +136,6 @@ jobs:
           ray stop --force
           LORA_RANK=32 RESUME_MODE=auto TOTAL_TRAIN_STEP=2 bash tests/special_e2e/sft/run_sft.sh
       # TODO: multiturn
-      - name: Prepare gsm8k dataset
-        run: |
-          ray stop --force
-          python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running GSM8K E2E training tests with multiturn and various configs and compare results
         run: |
           bash tests/special_e2e/sft/test_sft_engine_all.sh

.github/workflows/e2e_sft_llm_ascend.yml

Lines changed: 1 addition & 10 deletions
@@ -109,7 +109,7 @@ jobs:
           ln -s /root/.cache/models ~/models
       - name: Prepare gsm8k dataset
         run: |
-          python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
+          python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
       - name: Running GSM8K E2E training tests on 8 NPUs with rmpad using function rm
         run: |
           ray stop --force
@@ -122,10 +122,6 @@ jobs:
         run: |
           ray stop --force
           SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
-      - name: Check loss difference between sequence parallel vs. default implementation
-        run: |
-          ray stop --force
-          ENTRYPOINT="tests/special_e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
       - name: Running GSM8K E2E training tests with LoRA
         run: |
           ray stop --force
@@ -134,11 +130,6 @@ jobs:
         run: |
           ray stop --force
           LORA_RANK=32 RESUME_MODE=auto TOTAL_TRAIN_STEP=2 bash tests/special_e2e/sft/run_sft.sh
-      # TODO: multiturn
-      - name: Prepare gsm8k dataset
-        run: |
-          ray stop --force
-          python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k
       - name: Running GSM8K E2E training tests with multiturn and various configs and compare results
         run: |
           export PYTHONPATH=$PYTHONPATH:/Megatron-LM

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@
 **/playground
 **/wandb
 
+/pyrightconfig.json
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

docker/Dockerfile.stable.vllm

Lines changed: 3 additions & 0 deletions
@@ -32,6 +32,9 @@ RUN pip install torch==2.9.1 torchvision torchaudio --index-url https://download
 RUN sed -i '/nvidia-cudnn-cu12/d' /usr/local/lib/python3.12/dist-packages/torch-2.9.1+cu129.dist-info/METADATA
 RUN pip install --no-deps --force-reinstall nvidia-cudnn-cu12==9.16.0.29
 
+# NOTE: This installs the `vllm` source code in `/vllm`.
+# This might break the (based)pyright type checking. To fix it, add `/vllm` to `extraPaths` in `pyrightconfig.json`.
+# c.f. https://docs.basedpyright.com/latest/configuration/config-files/
 RUN git clone --depth 1 -b v0.12.0 https://github.com/vllm-project/vllm.git && \
     cd vllm && \
     find requirements -name "*.txt" -print0 | xargs -0 sed -i '/torch/d' && \
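The note introduced in this Dockerfile points at (based)pyright's `extraPaths` setting. A minimal `pyrightconfig.json` following that advice might look like the sketch below; this file is not part of the commit, it only illustrates the one setting the comment describes (it also explains the `/pyrightconfig.json` entry added to `.gitignore` in this commit, since such a file is machine-local):

```json
{
  "extraPaths": ["/vllm"]
}
```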

docs/advance/fully_async.md

Lines changed: 0 additions & 24 deletions
@@ -106,9 +106,6 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev
 | `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
 | `async_training.staleness_threshold` | Freshness control |
 | `async_training.partial_rollout` | Whether to perform partial_rollout |
-| `async_training.checkpoint_engine.enable` | Whether to use checkpoint_engine for accelerating, default `True` |
-| `async_training.checkpoint_engine.overlap_broadcast_and_consume` | When use checkpoint_engine, whether to overlap broadcast and load_weights, default `False` |
-| `async_training.checkpoint_engine.device_buffer_size_M` | When use checkpoint_engine, the user-specific bucket size (MB), default `4096` |
 | `async_training.use_trainer_do_validate` | Whether use trainer node to do validate process, default `False` |
 
 **Further Explanation:**
@@ -182,27 +179,6 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev
   mode d
   (async stream pipeline with partial rollout), our implementation approximates `Areal's Decoupled PPO`.
 
-* `async_training.checkpoint_engine.enable`
-
-  Enabling the checkpoint engine generally reduces synchronization time overhead by more than 60% compared to
-  the original per-tensor parameter synchronization method. However, assembling buckets incurs additional
-  temporary GPU memory overhead.
-
-* `async_training.checkpoint_engine.overlap_broadcast_and_consume`
-
-  Enabling pipeline between the broadcast and load_weights parameters will allocate additional GPU memory.
-  Since the main time consumption for parameter synchronization is not in the broadcast and load_weights phases,
-  but in the parameter generation phase (by megatron or FSDP), this option is off by default.
-
-* `async_training.checkpoint_engine.device_buffer_size_M`
-
-  It controls the size of the memory buffer used for synchronization when the checkpoint-engine is enabled.
-  The actual `bucket_size` = `max(device_buffer_size_M, maximum parameter tensor size)`.
-    * When enable `overlap_broadcast_and_consume`, the additional device memory overhead of
-      trainer rank is `3 * bucket_size` and rollout rank is `2 * bucket_size`
-    * When disable `overlap_broadcast_and_consume`, the additional device memory overhead of
-      trainer rank is `2 * bucket_size` and rollout rank is `1 * bucket_size`
-
 * `async_training.use_trainer_do_validate`
 
   It controls whether to use the trainer's `do_validate` method for validation.
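The `device_buffer_size_M` documentation removed above defines `bucket_size = max(device_buffer_size_M, maximum parameter tensor size)` and gives per-rank memory multipliers. As a quick sanity check, that arithmetic can be sketched as follows; `sync_overhead_mb` is a hypothetical helper for illustration, not part of verl:

```python
def sync_overhead_mb(device_buffer_size_m: int,
                     max_tensor_size_mb: int,
                     overlap_broadcast_and_consume: bool) -> tuple[int, int]:
    """Extra device memory (MB) per (trainer rank, rollout rank), per the removed doc.

    bucket_size is the larger of the configured buffer size and the largest
    parameter tensor. Overlapping broadcast with load_weights costs one extra
    bucket on each side (3x/2x vs. 2x/1x).
    """
    bucket_size = max(device_buffer_size_m, max_tensor_size_mb)
    if overlap_broadcast_and_consume:
        return 3 * bucket_size, 2 * bucket_size
    return 2 * bucket_size, 1 * bucket_size
```

With the documented default `device_buffer_size_M = 4096` and no overlap, a trainer rank would need 8192 MB and a rollout rank 4096 MB of temporary buffer space.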

docs/advance/mtp.md

Lines changed: 5 additions & 3 deletions
@@ -2,19 +2,21 @@
 
 **Author**: `https://github.com/meituan-search`
 
-Last updated: 01/30/2026
+Last updated: 02/15/2026
 
 # 1. Scope of Support
 
 Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek series models based on the MTP architecture. The support rules for training and inference engines are as follows:
 
-- **Training Engine**: Only supports the `mbridge + megatron` combination; other training engines are not compatible at this time;
+- **Training Engine**: Only supports the `mbridge/Megatron-Bridge + megatron` combination; other training engines are not compatible at this time;
 
 - **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list;
 
 - **Dependency Versions**:
 
-  - mbridge: Use the specified branch: [https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp](https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp) (will be merged into the main branch in the future);
+  - mbridge: Apply the patches and review suggestions from PR: [#62](https://github.com/ISEEKYAN/mbridge/pull/62) (will be merged into the main branch in the future);
+
+  - Megatron-Bridge: Apply the patches and review suggestions from PR if you want to try out mimo-7B-RL: [#2387](https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/2387) (will be merged into the main branch in the future);
 
   - megatron: Use the latest dev version (commit: [23e092f41ec8bc659020e401ddac9576c1cfed7e](https://github.com/NVIDIA/Megatron-LM/tree/23e092f41ec8bc659020e401ddac9576c1cfed7e)), which supports MTP + CP training methods.
