Commit 5af5eed

Merge branch 'main' into feat/chunked_weight_update

Signed-off-by: jianjunzhong <jianjunzhong@foxmail.com>
2 parents: ec908ea + e3b187a

77 files changed: +538 additions, −2396 deletions

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@
 /verl/workers/actor/megatron_actor.py @ISEEKYAN @vermouth1992
 /verl/workers/critic/megatron_critic.py @ISEEKYAN @vermouth1992
 /verl/workers/megatron_workers.py @ISEEKYAN @vermouth1992
+/verl/experimental @wuxibin89 @ArronHZG
 
 /tests/single_controller @zw0610 @wuxibin89
 /tests/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
 
 - [ ] Search for similar PRs. Paste at least one query link here: ...
 - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
-  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`
+  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`, `fully_async`, `one_step_off`
 - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
 - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
 - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.

.github/workflows/e2e_ascend.yml

Lines changed: 4 additions & 0 deletions

@@ -126,6 +126,10 @@ jobs:
           ray stop --force
           export PYTHONPATH=$PYTHONPATH:/Megatron-LM
           USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_grpo_mindspeed bash tests/special_npu/run_qwen3_30b_grpo_mindspeed.sh
+      - name: Running the E2E test with fully_async_policy algorithm (FSDP2)
+        run: |
+          ray stop --force
+          bash tests/special_npu/run_fully_async_policy.sh
 
   vlm_rl_job:
     if: github.repository_owner == 'verl-project'

.github/workflows/e2e_one_step_off_policy_ascend.yml

Lines changed: 3 additions & 3 deletions

@@ -68,7 +68,7 @@ on:
       # Entrypoints
       - ".github/workflows/e2e_one_step_off_policy_ascend.yml"
       - "examples/data_preprocess/gsm8k.py"
-      - "tests/special_e2e/run_one_step_off_policy.sh"
+      - "tests/special_npu/run_one_step_off_policy.sh"
 
 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -122,7 +122,7 @@ jobs:
       - name: Running the E2E test with one_step_off_policy algorithm (FSDP2)
         run: |
           ray stop --force
-          bash tests/special_e2e/run_one_step_off_policy.sh
+          bash tests/special_npu/run_one_step_off_policy.sh
 
   # Test Megatron strategy
   e2e_one_step_off_policy_megatron_ascend:
@@ -167,4 +167,4 @@ jobs:
         run: |
           ray stop --force
           export PYTHONPATH=$PYTHONPATH:/Megatron-LM
-          bash tests/special_e2e/run_one_step_off_policy.sh
+          bash tests/special_npu/run_one_step_off_policy.sh

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,8 @@
 **/playground
 **/wandb
 
+/pyrightconfig.json
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

docker/Dockerfile.stable.vllm

Lines changed: 3 additions & 0 deletions

@@ -32,6 +32,9 @@ RUN pip install torch==2.9.1 torchvision torchaudio --index-url https://download
 RUN sed -i '/nvidia-cudnn-cu12/d' /usr/local/lib/python3.12/dist-packages/torch-2.9.1+cu129.dist-info/METADATA
 RUN pip install --no-deps --force-reinstall nvidia-cudnn-cu12==9.16.0.29
 
+# NOTE: This installs the `vllm` source code in `/vllm`.
+# This might break the (based)pyright type checking. To fix it, add `/vllm` to `extraPaths` in `pyrightconfig.json`.
+# c.f. https://docs.basedpyright.com/latest/configuration/config-files/
 RUN git clone --depth 1 -b v0.12.0 https://github.com/vllm-project/vllm.git && \
     cd vllm && \
     find requirements -name "*.txt" -print0 | xargs -0 sed -i '/torch/d' && \
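The NOTE in this Dockerfile points at pyright's `extraPaths` option. A minimal `pyrightconfig.json` sketch along those lines (only `extraPaths` comes from the note; placing the file at the repository root is an assumption, per the `.gitignore` entry above):

```json
{
  "extraPaths": ["/vllm"]
}
```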

docs/advance/fully_async.md

Lines changed: 0 additions & 24 deletions

@@ -106,9 +106,6 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev
 | `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
 | `async_training.staleness_threshold` | Freshness control |
 | `async_training.partial_rollout` | Whether to perform partial_rollout |
-| `async_training.checkpoint_engine.enable` | Whether to use checkpoint_engine for accelerating, default `True` |
-| `async_training.checkpoint_engine.overlap_broadcast_and_consume` | When using checkpoint_engine, whether to overlap broadcast and load_weights, default `False` |
-| `async_training.checkpoint_engine.device_buffer_size_M` | When using checkpoint_engine, the user-specified bucket size (MB), default `4096` |
 | `async_training.use_trainer_do_validate` | Whether to use the trainer node for the validation process, default `False` |
 
 **Further Explanation:**
@@ -182,27 +179,6 @@ https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_rev
 mode d
 (async stream pipeline with partial rollout), our implementation approximates `Areal's Decoupled PPO`.
 
-* `async_training.checkpoint_engine.enable`
-
-  Enabling the checkpoint engine generally reduces synchronization time overhead by more than 60% compared to
-  the original per-tensor parameter synchronization method. However, assembling buckets incurs additional
-  temporary GPU memory overhead.
-
-* `async_training.checkpoint_engine.overlap_broadcast_and_consume`
-
-  Enabling the pipeline between the broadcast and load_weights phases will allocate additional GPU memory.
-  Since the main time consumption for parameter synchronization is not in the broadcast and load_weights phases,
-  but in the parameter generation phase (by Megatron or FSDP), this option is off by default.
-
-* `async_training.checkpoint_engine.device_buffer_size_M`
-
-  It controls the size of the memory buffer used for synchronization when the checkpoint engine is enabled.
-  The actual `bucket_size` = `max(device_buffer_size_M, maximum parameter tensor size)`.
-  * When `overlap_broadcast_and_consume` is enabled, the additional device memory overhead of a
-    trainer rank is `3 * bucket_size` and of a rollout rank is `2 * bucket_size`
-  * When `overlap_broadcast_and_consume` is disabled, the additional device memory overhead of a
-    trainer rank is `2 * bucket_size` and of a rollout rank is `1 * bucket_size`
 * `async_training.use_trainer_do_validate`
 
   It controls whether to use the trainer's `do_validate` method for validation.
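The memory accounting in the deleted `device_buffer_size_M` section reduces to a few lines of arithmetic. A hypothetical helper (the function name is made up; the formulas are the ones quoted in the removed lines: `bucket_size = max(device_buffer_size_M, maximum parameter tensor size)`, with 3x/2x trainer and 2x/1x rollout multipliers depending on `overlap_broadcast_and_consume`):

```python
def sync_memory_overhead_mb(device_buffer_size_m: int,
                            max_tensor_size_m: int,
                            overlap_broadcast_and_consume: bool) -> tuple[int, int]:
    """Return (trainer_overhead_mb, rollout_overhead_mb) per the removed doc's formulas."""
    # The effective bucket never shrinks below the largest single parameter tensor.
    bucket_size = max(device_buffer_size_m, max_tensor_size_m)
    if overlap_broadcast_and_consume:
        # Overlapping broadcast and load_weights needs one extra buffer per side.
        return 3 * bucket_size, 2 * bucket_size
    return 2 * bucket_size, 1 * bucket_size

# Default 4096 MB buffer, a 1024 MB largest tensor, no overlap:
print(sync_memory_overhead_mb(4096, 1024, False))  # (8192, 4096)
```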

requirements-npu.txt

Lines changed: 1 addition & 1 deletion

@@ -18,4 +18,4 @@ torchdata
 einops
 qwen_vl_utils
 hf_transfer
-triton-ascend==3.2.0rc4
+triton-ascend==3.2.0

tests/checkpoint_engine/test_correctness_on_gpu.py

Lines changed: 6 additions & 2 deletions

@@ -23,12 +23,14 @@
     split_resource_pool,
 )
 from verl.utils.device import get_device_name
+from verl.utils.ray_utils import auto_await
 from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("rebuild_group", [False, True])
 @pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)])
+@auto_await
 async def test_nccl_checkpoint_engine(
     rebuild_group,
     num_trainer,
@@ -65,7 +67,7 @@ async def test_nccl_checkpoint_engine(
     rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose)
 
     # create checkpoint engine manager
-    checkpoint_manager = CheckpointEngineManager(backend="nccl", trainer=trainer, replicas=replicas)
+    checkpoint_manager = CheckpointEngineManager(config=checkpoint_engine_config, trainer=trainer, replicas=replicas)
     for _ in range(3):
         await checkpoint_manager.update_weights()
         rollout.check_weights()
@@ -77,6 +79,7 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("device", ["cuda", "cpu"])
 @pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)])
+@auto_await
 async def test_nixl_checkpoint_engine(
     num_trainer,
     num_rollout,
@@ -120,7 +123,7 @@ async def test_nixl_checkpoint_engine(
     rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose)
 
     # create checkpoint engine manager
-    checkpoint_manager = CheckpointEngineManager(backend="nixl", trainer=trainer, replicas=replicas)
+    checkpoint_manager = CheckpointEngineManager(config=checkpoint_engine_config, trainer=trainer, replicas=replicas)
     for _ in range(3):
         await checkpoint_manager.update_weights()
         rollout.check_weights()
@@ -132,6 +135,7 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("rebuild_group", [False])
 @pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)])
+@auto_await
 async def test_kimi_checkpoint_engine(
     rebuild_group,
     num_trainer,
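These tests gain an `@auto_await` decorator from `verl.utils.ray_utils`. Its actual semantics live in verl; as a rough illustration of the general pattern only (not verl's implementation), a decorator like this lets a coroutine function run to completion when called from synchronous code, while handing the coroutine back unchanged when an event loop is already running:

```python
import asyncio
import functools

def auto_await_sketch(fn):
    """Illustrative only: run a coroutine function synchronously when no loop is active."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        coro = fn(*args, **kwargs)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running event loop: drive the coroutine to completion ourselves.
            return asyncio.run(coro)
        # Inside a loop (e.g. under pytest-asyncio): return the coroutine to be awaited.
        return coro
    return wrapper

@auto_await_sketch
async def double(x):
    return 2 * x

print(double(21))  # 42 when called with no event loop running
```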

tests/checkpoint_engine/test_correctness_on_npu.py

Lines changed: 3 additions & 1 deletion

@@ -23,12 +23,14 @@
     split_resource_pool,
 )
 from verl.utils.device import get_device_name
+from verl.utils.ray_utils import auto_await
 from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("rebuild_group", [False])
 @pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)])
+@auto_await
 async def test_hccl_checkpoint_engine(
     rebuild_group,
     num_trainer,
@@ -66,7 +68,7 @@ async def test_hccl_checkpoint_engine(
     rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose)
 
     # create checkpoint engine manager
-    checkpoint_manager = CheckpointEngineManager(backend="hccl", trainer=trainer, replicas=replicas)
+    checkpoint_manager = CheckpointEngineManager(config=checkpoint_engine_config, trainer=trainer, replicas=replicas)
     for _ in range(3):
         await checkpoint_manager.update_weights()
         rollout.check_weights()
