From 32ff542f5e2979999ba3e8b969c2e7fb31b4d573 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Mon, 2 Feb 2026 18:57:10 +0800
Subject: [PATCH 01/61] chore: point recipe submodule to fork

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index d5dd7a6aa57..af166615b4a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "recipe"]
 	path = recipe
-	url = https://github.com/verl-project/verl-recipe.git
+	url = https://github.com/khazic/verl-recipe_lao.git

From 27e354b44790a4fa1c38dc7f790dac6f5b2d31e6 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Mon, 2 Feb 2026 19:14:21 +0800
Subject: [PATCH 02/61] feat: add custom Qwen3-30BA3B translate recipe

---
 .../run_sft_qwen3moe_30b_a3b_megatron.sh      | 112 +++++++++++++++++
 .../run_sft_qwen3moe_30b_a3b_megatron_aux.sh  | 115 ++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100644 recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh
 create mode 100644 recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh

diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh
new file mode 100644
index 00000000000..5b0a7ea263d
--- /dev/null
+++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet}
+backend=${BACKEND:-megatron}
+project_name=verl_sft_translate_0109
+RESUME_MODE=disable
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507}
+
+SP_SIZE=${SP_SIZE:-1}
+FSDP_SIZE=${FSDP_SIZE:-64}
+FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}
+
+TP_SIZE=${TP_SIZE:-4}
+PP_SIZE=${PP_SIZE:-1}
+EP_SIZE=${EP_SIZE:-8}
+VPP_SIZE=${VPP_SIZE:-null}
+CP_SIZE=${CP_SIZE:-1}
+
+PAD_MODE=${PAD_MODE:-no_padding}
+USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}
+
+FSDP_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.min_lr_ratio=0.1 \
+    optim.warmup_style=cosine \
+    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+    engine.strategy=${FSDP_STRATEGY} \
+    engine.fsdp_size=${FSDP_SIZE}"
+
+MEGATRON_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=6e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    optim.min_lr=6e-7 \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.expert_model_parallel_size=${EP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE} \
+    engine.use_mbridge=True"
+
+if [ "$backend" = "fsdp" ]; then
+    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
+    echo "Using fsdp engine"
+    exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
+else
+    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
+    echo "Using megatron engine"
+    exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}
+fi
+
+CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}
+NNODES=${WORLD_SIZE:-8}           
+NODE_RANK=${RANK:-0}              
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 
+MASTER_PORT=${MASTER_PORT:-23457} 
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    mkdir -p "${CKPT_HOME}"
+fi
+
+export WANDB_MODE=offline NCCL_DEBUG=WARN
+# Add the ':' separator only when PYTHONPATH is non-empty; an empty entry would put the CWD on sys.path.
+export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/mnt/data/liuchonghan/verl"
+
+torchrun \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    --nproc-per-node=8 \
+    ${ENTRYPOINT} \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=512 \
+    data.max_length=8192 \
+    data.pad_mode=${PAD_MODE} \
+    data.truncation=right \
+    data.use_dynamic_bsz=True \
+    data.max_token_len_per_gpu=49152 \
+    data.messages_key=messages \
+    model.path=$MODEL_ID \
+    model.use_remove_padding=${USE_REMOVE_PADDING} \
+    +model.override_config.output_router_logits=True \
+    +model.override_config.router_dtype="float32" \
+    model.enable_gradient_checkpointing=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=5000 \
+    'trainer.logger=[console]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=2 \
+    trainer.default_local_dir="${CKPT_HOME}" \
+    trainer.resume_mode=${RESUME_MODE} \
+    trainer.max_ckpt_to_keep=3 \
+    'checkpoint.save_contents=[model,optimizer,extra]'
\ No newline at end of file
diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh
new file mode 100644
index 00000000000..bf59deb9bda
--- /dev/null
+++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet}
+backend=${BACKEND:-megatron}
+project_name=verl_sft_translate_0109_aux
+RESUME_MODE=disable
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507}
+
+SP_SIZE=${SP_SIZE:-1}
+FSDP_SIZE=${FSDP_SIZE:-64}
+FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}
+
+TP_SIZE=${TP_SIZE:-4}
+PP_SIZE=${PP_SIZE:-1}
+EP_SIZE=${EP_SIZE:-8}
+VPP_SIZE=${VPP_SIZE:-null}
+CP_SIZE=${CP_SIZE:-1}
+
+PAD_MODE=${PAD_MODE:-no_padding}
+USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}
+
+FSDP_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.min_lr_ratio=0.1 \
+    optim.warmup_style=cosine \
+    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+    engine.strategy=${FSDP_STRATEGY} \
+    engine.fsdp_size=${FSDP_SIZE}"
+
+MEGATRON_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    optim.min_lr=5e-7 \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.expert_model_parallel_size=${EP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE} \
+    engine.use_mbridge=True \
+    +engine.override_transformer_config.moe_aux_loss_coeff=0.01 \
+    +engine.override_transformer_config.moe_z_loss_coeff=0.001 \
+    +engine.override_transformer_config.moe_router_load_balancing_type=aux_loss"
+
+if [ "$backend" = "fsdp" ]; then
+    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
+    echo "Using fsdp engine"
+    exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
+else
+    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
+    echo "Using megatron engine"
+    exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}
+fi
+
+CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}
+NNODES=${WORLD_SIZE:-8}           
+NODE_RANK=${RANK:-0}              
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 
+MASTER_PORT=${MASTER_PORT:-23457} 
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    mkdir -p "${CKPT_HOME}"
+fi
+
+export WANDB_MODE=offline NCCL_DEBUG=WARN
+# Add the ':' separator only when PYTHONPATH is non-empty; an empty entry would put the CWD on sys.path.
+export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/mnt/data/liuchonghan/verl"
+
+torchrun \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    --nproc-per-node=8 \
+    ${ENTRYPOINT} \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=512 \
+    data.max_length=8192 \
+    data.pad_mode=${PAD_MODE} \
+    data.truncation=right \
+    data.use_dynamic_bsz=True \
+    data.max_token_len_per_gpu=49152 \
+    data.messages_key=messages \
+    model.path=$MODEL_ID \
+    model.use_remove_padding=${USE_REMOVE_PADDING} \
+    +model.override_config.output_router_logits=True \
+    +model.override_config.router_dtype="float32" \
+    model.enable_gradient_checkpointing=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=5000 \
+    'trainer.logger=[console]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=2 \
+    trainer.default_local_dir="${CKPT_HOME}" \
+    trainer.resume_mode=${RESUME_MODE} \
+    trainer.max_ckpt_to_keep=3 \
+    'checkpoint.save_contents=[model,optimizer,extra]'
\ No newline at end of file

From 3b2a4564a1c49f389a21ee2116716cf30266d954 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 3 Feb 2026 14:22:44 +0800
Subject: [PATCH 03/61] Add RLVR_ABCDE_dense scripts

---
 .../RLVR_ABCDE_dense/create_dataset.py        | 198 ++++++++++++++++++
 .../RLVR_ABCDE_dense/reward_function.py       |  65 ++++++
 .../RLVR_ABCDE_dense/run_grpo_dlc.sh          |  82 ++++++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh |  92 ++++++++
 4 files changed, 437 insertions(+)
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/create_dataset.py
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/reward_function.py
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh

diff --git a/recipes_custom/RLVR_ABCDE_dense/create_dataset.py b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py
new file mode 100644
index 00000000000..754dacde603
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py
@@ -0,0 +1,198 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Task description:
+Given a random word and a random char, count the number of occurrences of the char in the word.
+
+Create a CoT dataset that splits the word into separate chars, then lists each char and counts the occurrences.
+
+Each word is a "-"-joined sequence of random lowercase letters; no external corpus is used.
+"""
+
+import os.path
+import random
+
+prompt_template = "How many {} are there in word {}?"
+
+
+def generate_random_char():
+    return chr(97 + random.randint(0, 25))
+
+
+def create_prompt_response(min_length=3, max_length=5):
+    """Build one (prompt, chain-of-thought answer) pair for the char-counting task.
+
+    Args:
+        min_length: Smallest number of chars the generated word may have (inclusive).
+        max_length: Largest number of chars the generated word may have (inclusive).
+
+    Returns:
+        A ``(prompt, final_answer)`` tuple; ``final_answer`` holds one comparison line
+        per char followed by a conclusion line carrying the boxed count.
+    """
+    # Draw the word length, then how many positions hold the target char.
+    # The count starts at 1, so the target char always occurs at least once.
+    word_length = random.randint(min_length, max_length)
+    target_count_number = random.randint(1, word_length)
+
+    # Step 1: pick the target char and place it target_count_number times.
+    target_char = generate_random_char()
+    char_lst = [target_char] * target_count_number
+
+    # Step 2: fill the remaining positions with chars other than the target
+    # (redraw on collision so the target count stays exact).
+    for _ in range(word_length - target_count_number):
+        char = generate_random_char()
+        while char == target_char:
+            char = generate_random_char()
+        char_lst.append(char)
+
+    # Step 3: randomly permute the chars.
+    random.shuffle(char_lst)
+
+    word = "-".join(char_lst)
+    prompt = prompt_template.format(target_char, word)
+
+    # Chain of thought: compare each char with the target and tally the matches.
+    final_answer = []
+    number = 0
+    for char in char_lst:
+        if char == target_char:
+            number += 1
+            final_answer.append(f"{char} = {target_char}.")
+        else:
+            final_answer.append(f"{char} != {target_char}.")
+
+    conclusion = f"\\boxed{{{number}}} {target_char} in {word}."
+    final_answer.append(conclusion)
+
+    final_answer = "\n".join(final_answer)
+
+    return prompt, final_answer
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--total_number", type=int, default=10000)
+    parser.add_argument("--min_length", type=int, default=5)
+    parser.add_argument("--max_length", type=int, default=20)
+    parser.add_argument("--data_path", type=str, default="~/data/char_count")
+
+    args = vars(parser.parse_args())
+
+    total_number = args["total_number"]
+    min_length = args["min_length"]
+    max_length = args["max_length"]
+    data_path = args["data_path"]
+    data_path = os.path.expanduser(data_path)
+
+    full_output = []
+    for _ in range(total_number):
+        output = create_prompt_response(min_length=min_length, max_length=max_length)
+        full_output.append(output)
+
+    # random reorder
+    random.shuffle(full_output)
+
+    # split for train and test
+    train_split_len = int(0.9 * len(full_output))
+    train_outputs = full_output[:train_split_len]
+    test_output = full_output[train_split_len:]
+
+    sft_train_dataset = {"messages": []}
+
+    for o in train_outputs:
+        messages = [
+            {"role": "user", "content": o[0]},
+            {"role": "assistant", "content": o[1]},
+        ]
+
+        sft_train_dataset["messages"].append(messages)
+
+    sft_test_dataset = {"messages": []}
+
+    for o in test_output:
+        messages = [
+            {"role": "user", "content": o[0]},
+            {"role": "assistant", "content": o[1]},
+        ]
+        sft_test_dataset["messages"].append(messages)
+
+    import pandas as pd
+
+    sft_train_dataset = pd.DataFrame(data=sft_train_dataset)
+    sft_test_dataset = pd.DataFrame(data=sft_test_dataset)
+
+    folder = os.path.join(data_path, "sft")
+
+    os.makedirs(folder, exist_ok=True)
+
+    sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
+    sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))
+
+    # build RL dataset
+    rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}
+
+    rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}
+
+    from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed
+
+    for o in train_outputs:
+        prompt = o[0]
+        response = o[1]
+        prompt_with_template = [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ]
+
+        rl_train_dataset["prompt"].append(prompt_with_template)
+        rl_train_dataset["data_source"].append("char_count")
+        rl_train_dataset["ability"].append("other")
+        rl_train_dataset["reward_model"].append(
+            {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
+        )
+        rl_train_dataset["extra_info"].append({"response": response})
+
+    for o in test_output:
+        prompt = o[0]
+        response = o[1]
+        prompt_with_template = [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ]
+
+        rl_test_dataset["prompt"].append(prompt_with_template)
+        rl_test_dataset["data_source"].append("char_count")
+        rl_test_dataset["ability"].append("other")
+        rl_test_dataset["reward_model"].append(
+            {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
+        )
+        rl_test_dataset["extra_info"].append({"response": response})
+
+    rl_train_dataset = pd.DataFrame(data=rl_train_dataset)
+    rl_test_dataset = pd.DataFrame(data=rl_test_dataset)
+
+    folder = os.path.join(data_path, "rl")
+
+    os.makedirs(folder, exist_ok=True)
+
+    rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
+    rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))
diff --git a/recipes_custom/RLVR_ABCDE_dense/reward_function.py b/recipes_custom/RLVR_ABCDE_dense/reward_function.py
new file mode 100644
index 00000000000..61fe81bf207
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/reward_function.py
@@ -0,0 +1,65 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Reward function
+"""
+
+import re
+
+DEFAULT_CHOICES = ("A", "B", "C", "D", "E")
+BOXED_PATTERN = re.compile(r"\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}")
+# A cue word is required: when it was optional, search() matched the last letter of the first word.
+CHOICE_PATTERN = re.compile(
+    r"\b(?:answer|option|choice)\b(?:\s+is)?\s*[:=]?\s*\(?([A-Za-z])\b", re.IGNORECASE)
+
+
+def _extract_boxed_answer(text: str) -> str:
+    matches = BOXED_PATTERN.findall(text)
+    return matches[-1] if matches else ""
+
+
+def _normalize_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str:
+    """Return the first standalone letter (upper-cased) that is a valid choice, else "".
+    Letters inside longer words are ignored, so "\\text{C}" yields "C", not the "E" of "TEXT".
+    """
+    standalone = re.findall(r"\b[A-Z]\b", (text or "").upper())
+    return next((ch for ch in standalone if ch in valid_choices), "")
+
+
+def extract_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str:
+    """
+    Extract a single-letter choice, preferring \\boxed{} values but falling back
+    to phrases like "Answer: C" or the first standalone letter.
+    """
+    text = str(text or "")
+    candidate = _normalize_choice(_extract_boxed_answer(text), valid_choices)
+    if candidate:
+        return candidate
+    match = CHOICE_PATTERN.search(text)
+    if match:
+        candidate = _normalize_choice(match.group(1), valid_choices)
+        if candidate:
+            return candidate
+    return _normalize_choice(text, valid_choices)
+
+
+def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None):
+    try:
+        model_choice = extract_choice(solution_str)
+        gold_choice = extract_choice(ground_truth)
+        return 1 if model_choice and gold_choice and model_choice == gold_choice else 0
+    except Exception:
+        print(ground_truth, solution_str)
+        return 0
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
new file mode 100644
index 00000000000..40eead0aa96
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json}
+VAL_FILES=${VAL_FILES:-}
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000}
+PROJECT_NAME=${PROJECT_NAME:-rlvr}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo}
+
+NNODES=${PET_NNODES:-${WORLD_SIZE:-30}}
+NODE_RANK=${PET_NODE_RANK:-${RANK:-0}}
+MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}}
+MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}}
+N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}}
+
+RAY_PORT=${RAY_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT}
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
+echo ">>> Ray 地址: $RAY_ADDRESS"
+
+export WANDB_MODE=offline
+export NCCL_DEBUG=WARN
+
+if [ "$NODE_RANK" -eq 0 ]; then
+  ray start --head \
+    --node-ip-address="$MASTER_ADDR" \
+    --port="$RAY_PORT" \
+    --dashboard-port="$RAY_DASHBOARD_PORT"
+else
+  ray start --address="$RAY_ADDRESS" --block &
+fi
+
+# Give Ray a moment to settle
+sleep 5
+
+python3 $ENTRYPOINT \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$TRAIN_FILES \
+    data.train_batch_size=2048 \
+    data.max_prompt_length=2048 \
+    data.max_response_length=1024 \
+    data.filter_overlong_prompts=False \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_ID \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+    actor_rollout_ref.actor.use_dynamic_bsz=True \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((2048 + 1024)) \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=True \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+    actor_rollout_ref.rollout.n=8 \
+    actor_rollout_ref.rollout.enforce_eager=True \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0.05 \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.val_before_train=False \
+    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=-1 \
+    trainer.test_freq=-1 \
+    trainer.total_epochs=5 \
+    trainer.use_legacy_worker_impl=disable \
+    ray_kwargs.ray_init.address=$RAY_ADDRESS \
+    custom_reward_function.path=./reward_function.py \
+    custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
new file mode 100644
index 00000000000..46b031b8d41
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export VLLM_USE_V1=1
+export VERL_USE_GPT_OSS=0
+export VERL_DISABLE_HARMONY=1
+export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-}
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000}
+PROJECT_NAME=${PROJECT_NAME:-rlvr}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_megatron}
+
+NNODES=${PET_NNODES:-${WORLD_SIZE:-28}}
+NODE_RANK=${PET_NODE_RANK:-${RANK:-0}}
+MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}}
+MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}}
+N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}}
+
+TP_SIZE=${TP_SIZE:-8}
+PP_SIZE=${PP_SIZE:-1}
+
+rollout_mode=${ROLLOUT_MODE:-async}
+USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
+RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+
+RAY_PORT=${RAY_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT}
+
+if [ "$NODE_RANK" -eq 0 ]; then
+  ray start --head \
+    --node-ip-address="$MASTER_ADDR" \
+    --port="$RAY_PORT" \
+    --dashboard-port="$RAY_DASHBOARD_PORT"
+else
+  ray start --address="$RAY_ADDRESS" --block
+  exit 0
+fi
+
+sleep 5
+
+python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$TRAIN_FILES \
+    data.val_files=$TRAIN_FILES \
+    data.val_max_samples=512 \
+    data.return_raw_chat=$RETURN_RAW_CHAT \
+    data.train_batch_size=224 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=1024 \
+    data.filter_overlong_prompts=False \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_ID \
+    actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=224 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=$rollout_mode \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.val_before_train=False \
+    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=100 \
+    trainer.test_freq=100 \
+    trainer.total_epochs=5 \
+    +ray_kwargs.ray_init.address=$RAY_ADDRESS \
+    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \
+    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \
+    custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \
+    custom_reward_function.name=char_count_reward_function

From 8ef1cb89b1a883a2be559c25dc6460ed052bebcd Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 5 Feb 2026 15:00:45 +0800
Subject: [PATCH 04/61] chore: adjust GRPO launch scripts and trainer defaults

- add FSDP GRPO launcher with vLLM rollout settings
- update Megatron launcher to keep workers running and log to W&B
- increase Megatron NCCL timeout to 1200s
- log validation generations by default in PPO trainer
- remove legacy GRPO DLC script
---
 .../RLVR_ABCDE_dense/run_grpo_dlc.sh          |  82 --------------
 .../RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh     | 101 ++++++++++++++++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh |   4 +-
 verl/trainer/config/ppo_megatron_trainer.yaml |   2 +-
 verl/trainer/config/ppo_trainer.yaml          |   2 +-
 5 files changed, 105 insertions(+), 86 deletions(-)
 delete mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
deleted file mode 100644
index 40eead0aa96..00000000000
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-
-ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
-TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json}
-VAL_FILES=${VAL_FILES:-}
-MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000}
-PROJECT_NAME=${PROJECT_NAME:-rlvr}
-EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo}
-
-NNODES=${PET_NNODES:-${WORLD_SIZE:-30}}
-NODE_RANK=${PET_NODE_RANK:-${RANK:-0}}
-MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}}
-MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}}
-N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}}
-
-RAY_PORT=${RAY_PORT:-6379}
-RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
-RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT}
-
-echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
-echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
-echo ">>> Ray 地址: $RAY_ADDRESS"
-
-export WANDB_MODE=offline
-export NCCL_DEBUG=WARN
-
-if [ "$NODE_RANK" -eq 0 ]; then
-  ray start --head \
-    --node-ip-address="$MASTER_ADDR" \
-    --port="$RAY_PORT" \
-    --dashboard-port="$RAY_DASHBOARD_PORT"
-else
-  ray start --address="$RAY_ADDRESS" --block &
-fi
-
-# Give Ray a moment to settle
-sleep 5
-
-python3 $ENTRYPOINT \
-    algorithm.adv_estimator=grpo \
-    data.train_files=$TRAIN_FILES \
-    data.train_batch_size=2048 \
-    data.max_prompt_length=2048 \
-    data.max_response_length=1024 \
-    data.filter_overlong_prompts=False \
-    data.truncation='error' \
-    actor_rollout_ref.model.path=$MODEL_ID \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
-    actor_rollout_ref.actor.use_dynamic_bsz=True \
-    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((2048 + 1024)) \
-    actor_rollout_ref.actor.use_kl_loss=False \
-    actor_rollout_ref.actor.kl_loss_coef=0.0 \
-    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.fsdp_config.param_offload=True \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
-    actor_rollout_ref.rollout.name=vllm \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
-    actor_rollout_ref.rollout.n=8 \
-    actor_rollout_ref.rollout.enforce_eager=True \
-    actor_rollout_ref.ref.fsdp_config.param_offload=True \
-    algorithm.use_kl_in_reward=False \
-    trainer.critic_warmup=0.05 \
-    trainer.logger='["console","tensorboard"]' \
-    trainer.project_name=$PROJECT_NAME \
-    trainer.experiment_name=$EXPERIMENT_NAME \
-    trainer.val_before_train=False \
-    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
-    trainer.nnodes=$NNODES \
-    trainer.save_freq=-1 \
-    trainer.test_freq=-1 \
-    trainer.total_epochs=5 \
-    trainer.use_legacy_worker_impl=disable \
-    ray_kwargs.ray_init.address=$RAY_ADDRESS \
-    custom_reward_function.path=./reward_function.py \
-    custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh
new file mode 100644
index 00000000000..42786ea6300
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export VLLM_USE_V1=1
+export VERL_USE_GPT_OSS=0
+export VERL_DISABLE_HARMONY=1
+export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-}
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000}
+PROJECT_NAME=${PROJECT_NAME:-rlvr}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_fsdp}
+
+NNODES=${PET_NNODES:-${WORLD_SIZE:-28}}
+NODE_RANK=${PET_NODE_RANK:-${RANK:-0}}
+MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}}
+MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}}
+N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}}
+
+FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2}
+FSDP_SIZE=${FSDP_SIZE:-8}
+ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False}
+REF_OFFLOAD=${REF_OFFLOAD:-False}
+CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False}
+
+rollout_mode=${ROLLOUT_MODE:-async}
+USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
+RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+
+RAY_PORT=${RAY_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT}
+
+if [ "$NODE_RANK" -eq 0 ]; then
+  ray start --head \
+    --node-ip-address="$MASTER_ADDR" \
+    --port="$RAY_PORT" \
+    --dashboard-port="$RAY_DASHBOARD_PORT"
+else
+  ray start --address="$RAY_ADDRESS"
+  exit 0
+fi
+
+sleep 5
+
+python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/config \
+    --config-name='ppo_trainer.yaml' \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$TRAIN_FILES \
+    data.val_files=$TRAIN_FILES \
+    data.val_max_samples=512 \
+    data.return_raw_chat=$RETURN_RAW_CHAT \
+    data.train_batch_size=224 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=1024 \
+    data.filter_overlong_prompts=False \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_ID \
+    actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+    actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \
+    actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=224 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=$rollout_mode \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
+    actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    critic.strategy=$FSDP_STRATEGY \
+    critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \
+    critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \
+    critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.val_before_train=False \
+    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=100 \
+    trainer.test_freq=100 \
+    trainer.total_epochs=5 \
+    +ray_kwargs.ray_init.address=$RAY_ADDRESS \
+    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \
+    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \
+    custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \
+    custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
index 46b031b8d41..c3b0bff0457 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
@@ -36,7 +36,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     --port="$RAY_PORT" \
     --dashboard-port="$RAY_DASHBOARD_PORT"
 else
-  ray start --address="$RAY_ADDRESS" --block
+  ray start --address="$RAY_ADDRESS"
   exit 0
 fi
 
@@ -76,7 +76,7 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
-    trainer.logger='["console","tensorboard"]' \
+    trainer.logger='["console","wandb"]' \
     trainer.project_name=$PROJECT_NAME \
     trainer.experiment_name=$EXPERIMENT_NAME \
     trainer.val_before_train=False \
diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml
index 76ba4c57575..3c2505592ad 100644
--- a/verl/trainer/config/ppo_megatron_trainer.yaml
+++ b/verl/trainer/config/ppo_megatron_trainer.yaml
@@ -26,7 +26,7 @@ defaults:
 actor_rollout_ref:
   hybrid_engine: True
 
-  nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
+  nccl_timeout: 1200 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
 
   model:
     override_config:
diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml
index 7489b522fa2..a9cf8e6c650 100644
--- a/verl/trainer/config/ppo_trainer.yaml
+++ b/verl/trainer/config/ppo_trainer.yaml
@@ -141,7 +141,7 @@ trainer:
   logger: ["console", "wandb"]
 
   # Number of generations to log during validation
-  log_val_generations: 0
+  log_val_generations: 10
 
   # Directory for logging rollout data; no dump if null
   rollout_data_dir: null

From 3c3288c2377b97b7c8e3b1897e191a252b1fe8cf Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 5 Feb 2026 16:17:58 +0800
Subject: [PATCH 05/61] feat: add single-node Megatron GRPO launcher

- add single-node 8xGPU Megatron GRPO script with TP/PP=1
- tune batch sizes and validation defaults for single-node runs
- drop VERL_DISABLE_HARMONY from the existing FSDP and Megatron DLC scripts
- point the Megatron DLC script's defaults at the /llm-align checkout and Qwen3-8B
---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh     |  2 -
 .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh | 12 ++-
 .../run_grpo_megatron_single_node.sh          | 74 +++++++++++++++++++
 3 files changed, 79 insertions(+), 9 deletions(-)
 create mode 100755 recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh
index 42786ea6300..6ab8523d75b 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh
@@ -4,7 +4,6 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export VERL_DISABLE_HARMONY=1
 export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-}
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
@@ -96,6 +95,5 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \
-    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \
     custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
index c3b0bff0457..1b0659fc7d0 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh
@@ -4,14 +4,13 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export VERL_DISABLE_HARMONY=1
-export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-}
+export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
-TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json}
-MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000}
-PROJECT_NAME=${PROJECT_NAME:-rlvr}
-EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_megatron}
+TRAIN_FILES=${TRAIN_FILES:-/llm-alignment/liuchonghan/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
+PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron}
 
 NNODES=${PET_NNODES:-${WORLD_SIZE:-28}}
 NODE_RANK=${PET_NODE_RANK:-${RANK:-0}}
@@ -87,6 +86,5 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \
-    +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \
     custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
new file mode 100755
index 00000000000..141cca62476
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export VLLM_USE_V1=1
+export VERL_USE_GPT_OSS=0
+export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
+TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
+PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single}
+
+# Single node, 8 GPUs
+NNODES=1
+NODE_RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=${MASTER_PORT:-23457}
+N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
+
+TP_SIZE=1
+PP_SIZE=1
+
+rollout_mode=${ROLLOUT_MODE:-async}
+USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
+RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+
+python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$TRAIN_FILES \
+    data.val_files=$TRAIN_FILES \
+    data.val_max_samples=2048 \
+    data.return_raw_chat=$RETURN_RAW_CHAT \
+    data.train_batch_size=32 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=1024 \
+    data.filter_overlong_prompts=False \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_ID \
+    actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=$rollout_mode \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.val_before_train=True \
+    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=300 \
+    trainer.test_freq=300 \
+    trainer.total_epochs=5 \
+    +ray_kwargs.ray_init.num_cpus=32 \
+    custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
+    custom_reward_function.name=char_count_reward_function

From c79bebee0b34bbd0f539436e1671f2ac197abe24 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 5 Feb 2026 16:28:14 +0800
Subject: [PATCH 06/61] chore: run single-node GRPO in W&B offline mode

- set WANDB_MODE=offline in single-node Megatron script
- avoid proxy failures during W&B logging
---
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 141cca62476..a91b644c315 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -4,6 +4,7 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
+export WANDB_MODE=offline
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
@@ -12,7 +13,6 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single}
 
-# Single node, 8 GPUs
 NNODES=1
 NODE_RANK=0
 MASTER_ADDR=127.0.0.1

From 56ba579ad720b0e72c9276dc9dd7f38537824b98 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 5 Feb 2026 17:07:27 +0800
Subject: [PATCH 07/61] chore: lower single-node GRPO memory footprint

- reduce batch sizes and sequence lengths for Megatron single-node
- add an FSDP single-node script using the same reduced batch, sequence-length and rollout settings
- keep vLLM utilization low for constrained free memory
---
 .../run_grpo_fsdp_single_node.sh              | 83 +++++++++++++++++++
 .../run_grpo_megatron_single_node.sh          | 14 ++--
 2 files changed, 90 insertions(+), 7 deletions(-)
 create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
new file mode 100644
index 00000000000..1850e7ecf99
--- /dev/null
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export VLLM_USE_V1=1
+export VERL_USE_GPT_OSS=0
+export WANDB_MODE=offline
+export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
+TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
+PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
+EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
+
+NNODES=1
+NODE_RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=${MASTER_PORT:-23457}
+N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
+
+FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2}
+FSDP_SIZE=${FSDP_SIZE:-8}
+ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False}
+REF_OFFLOAD=${REF_OFFLOAD:-False}
+CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False}
+
+rollout_mode=${ROLLOUT_MODE:-async}
+USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
+RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+
+python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
+    --config-name='ppo_trainer.yaml' \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$TRAIN_FILES \
+    data.val_files=$TRAIN_FILES \
+    data.val_max_samples=2048 \
+    data.return_raw_chat=$RETURN_RAW_CHAT \
+    data.train_batch_size=16 \
+    data.max_prompt_length=512 \
+    data.max_response_length=512 \
+    data.filter_overlong_prompts=False \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_ID \
+    actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+    actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \
+    actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=$rollout_mode \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \
+    actor_rollout_ref.rollout.n=4 \
+    actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
+    actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    critic.strategy=$FSDP_STRATEGY \
+    critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \
+    critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \
+    critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.val_before_train=True \
+    trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
+    trainer.nnodes=$NNODES \
+    trainer.save_freq=300 \
+    trainer.test_freq=300 \
+    trainer.total_epochs=5 \
+    +ray_kwargs.ray_init.num_cpus=32 \
+    custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
+    custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index a91b644c315..9ac4d5f94df 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -33,16 +33,16 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_files=$TRAIN_FILES \
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
-    data.train_batch_size=32 \
-    data.max_prompt_length=1024 \
-    data.max_response_length=1024 \
+    data.train_batch_size=16 \
+    data.max_prompt_length=512 \
+    data.max_response_length=512 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
     actor_rollout_ref.model.path=$MODEL_ID \
     actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
     actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.actor.use_kl_loss=False \
@@ -53,8 +53,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \
+    actor_rollout_ref.rollout.n=4 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \

From 8e8deedc99e83aba78e6e51aa9ffa0439b5f0a3c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 5 Feb 2026 17:13:34 +0800
Subject: [PATCH 08/61] chore: tune vLLM rollout memory for single-node

- raise vLLM gpu_memory_utilization to 0.30 for KV cache
- lower rollout.n and cap max batched tokens for stability
- apply settings to both Megatron and FSDP single-node scripts
---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 +++--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh        | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 1850e7ecf99..15bf94ced00 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -58,8 +58,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \
-    actor_rollout_ref.rollout.n=4 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
+    actor_rollout_ref.rollout.n=2 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 9ac4d5f94df..03c472be798 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -53,8 +53,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \
-    actor_rollout_ref.rollout.n=4 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
+    actor_rollout_ref.rollout.n=2 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \

From cfafe22b018aa49ffa5e5097ab3207d016847b56 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:32:23 +0800
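Subject: [PATCH 09/61] Update GRPO scripts for 4-node Ray

- default NNODES to 4 and let NNODES, NODE_RANK and MASTER_ADDR be overridden from the environment
- set rollout.max_model_len=8192 in both the FSDP and Megatron scripts
- the scripts keep their *_single_node.sh file names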
---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh          | 7 ++++---
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh      | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 15bf94ced00..d13f49f6a6a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,9 +13,9 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
 
-NNODES=1
-NODE_RANK=0
-MASTER_ADDR=127.0.0.1
+NNODES=${NNODES:-4}
+NODE_RANK=${NODE_RANK:-0}
+MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 
@@ -61,6 +61,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
     actor_rollout_ref.rollout.n=2 \
     actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
+    actor_rollout_ref.rollout.max_model_len=8192 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 03c472be798..915fda505e9 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,9 +13,9 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single}
 
-NNODES=1
-NODE_RANK=0
-MASTER_ADDR=127.0.0.1
+NNODES=${NNODES:-4}
+NODE_RANK=${NODE_RANK:-0}
+MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 
@@ -56,6 +56,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
     actor_rollout_ref.rollout.n=2 \
     actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
+    actor_rollout_ref.rollout.max_model_len=8192 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \

From 4b005cb58617d73ffc6057f81d98d25933b3e916 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:36:33 +0800
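Subject: [PATCH 10/61] Use Ray address for existing cluster

- add RAY_ADDRESS (default: auto) and pass it as ray_kwargs.ray_init.address
- drop the ray_kwargs.ray_init.num_cpus=32 override that it replaces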
---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh   | 3 ++-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh          | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index d13f49f6a6a..9e4247d4a9a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -28,6 +28,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False}
 rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+RAY_ADDRESS=${RAY_ADDRESS:-auto}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \
@@ -80,6 +81,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.save_freq=300 \
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
-    +ray_kwargs.ray_init.num_cpus=32 \
+    +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 915fda505e9..c4d730e0e72 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -25,6 +25,7 @@ PP_SIZE=1
 rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
+RAY_ADDRESS=${RAY_ADDRESS:-auto}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_megatron_trainer.yaml' \
@@ -71,6 +72,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.save_freq=300 \
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
-    +ray_kwargs.ray_init.num_cpus=32 \
+    +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 787a9eb3f2caf924896f382f0cc97092889196fc Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:38:09 +0800
Subject: [PATCH 11/61] Add Ray runtime_env for code import

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh   | 3 +++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh          | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 9e4247d4a9a..663198e4c5c 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -29,6 +29,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-auto}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \
@@ -82,5 +83,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
+    +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
+    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index c4d730e0e72..08f0dc1287c 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -26,6 +26,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-auto}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_megatron_trainer.yaml' \
@@ -73,5 +74,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
+    +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
+    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 4f360e1673db63dd615e0d76baf4c62c8271bc01 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:42:35 +0800
Subject: [PATCH 12/61] Set socket IFNAME and increase batch size

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 +++-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 663198e4c5c..10ce6d8e2a6 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -6,6 +6,8 @@ export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -38,7 +40,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_files=$TRAIN_FILES \
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
-    data.train_batch_size=16 \
+    data.train_batch_size=64 \
     data.max_prompt_length=512 \
     data.max_response_length=512 \
     data.filter_overlong_prompts=False \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 08f0dc1287c..856ec593c93 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -6,6 +6,8 @@ export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -35,7 +37,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_files=$TRAIN_FILES \
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
-    data.train_batch_size=16 \
+    data.train_batch_size=64 \
     data.max_prompt_length=512 \
     data.max_response_length=512 \
     data.filter_overlong_prompts=False \

From 5acfc8763362e4e94f8e485deb3d7ab90239b307 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:52:46 +0800
Subject: [PATCH 13/61] Propagate env to Ray workers

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 ++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 10ce6d8e2a6..2cda3cc18bd 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -87,5 +87,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 856ec593c93..8083bfc3fec 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -78,5 +78,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From d41a15780ce2062ad2ba57596f60606d96b0235d Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 15:54:33 +0800
Subject: [PATCH 14/61] Quote MASTER_PORT in Ray runtime env

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 2cda3cc18bd..55221bdb32a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -88,7 +88,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 8083bfc3fec..c23b6584bd8 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -79,7 +79,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \

From 6fa835f7462fadaadb792d51e65d7260d31a1a58 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 16:02:11 +0800
Subject: [PATCH 15/61] Add WANDB proxy env vars to RLVR scripts

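---
NOTE(review): this patch commits a proxy URL containing a username and
password as the default value of WANDB_PROXY_URL. The credential should be
rotated and supplied through the environment rather than hard-coded.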
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 +++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh        | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 55221bdb32a..4dca55e595d 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -5,6 +5,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
+export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index c23b6584bd8..44bec63f62e 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -5,6 +5,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
+export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}

From 069746fed6fcc3a954942ed47e6f359e9fda0a9d Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 16:05:43 +0800
Subject: [PATCH 16/61] Remove WANDB proxy envs and keep offline mode

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 -----
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh        | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 4dca55e595d..55221bdb32a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -5,11 +5,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
-export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 44bec63f62e..c23b6584bd8 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -5,11 +5,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=offline
-export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}

From afa0f415668503713ba00ec0748e0329be068906 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 16:15:32 +0800
Subject: [PATCH 17/61] Enable WANDB logging via proxy in RLVR scripts

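---
NOTE(review): this patch re-adds a proxy URL containing a username and
password as the default value of WANDB_PROXY_URL, and also passes it as
command-line overrides while the script runs under `set -x`, so the value is
echoed in the trace and visible in the process arguments. The credential
should be rotated and supplied through the environment rather than hard-coded.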
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh   | 13 ++++++++++++-
 .../run_grpo_megatron_single_node.sh                | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 55221bdb32a..51a9be58c37 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -4,7 +4,12 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export WANDB_MODE=offline
+export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
@@ -91,5 +96,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index c23b6584bd8..605b23d754c 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -4,7 +4,12 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export WANDB_MODE=offline
+export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
@@ -82,5 +87,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 042c4c8bb48ad778f7165785fb9c354366d5e050 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 16:48:32 +0800
Subject: [PATCH 18/61] Increase max prompt length to 2048

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 51a9be58c37..7189bbc30c3 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -46,7 +46,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
-    data.max_prompt_length=512 \
+    data.max_prompt_length=2048 \
     data.max_response_length=512 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 605b23d754c..5862c1d4829 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -43,7 +43,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
-    data.max_prompt_length=512 \
+    data.max_prompt_length=2048 \
     data.max_response_length=512 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \

From 232e77a9e59633892e80e55dc9d3997afe83bc2b Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 17:52:47 +0800
Subject: [PATCH 19/61] Tune RLVR GRPO configs for LR decay and larger rollout
 batches

---
 .../run_grpo_fsdp_single_node.sh              | 21 ++++++++++++-------
 .../run_grpo_megatron_single_node.sh          | 21 ++++++++++++-------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 7189bbc30c3..e14abbb7c64 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -37,6 +37,9 @@ USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-auto}
 RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
+ACTOR_LR=${ACTOR_LR:-1e-6}
+MIN_LR=${MIN_LR:-1e-7}
+LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \
@@ -47,7 +50,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
     data.max_prompt_length=2048 \
-    data.max_response_length=512 \
+    data.max_response_length=2048 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
     actor_rollout_ref.model.path=$MODEL_ID \
@@ -56,24 +59,26 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
+    actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
+    actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
+    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.actor.kl_loss_coef=0.0 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
-    actor_rollout_ref.rollout.n=2 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=81920 \
     actor_rollout_ref.rollout.max_model_len=8192 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
     critic.strategy=$FSDP_STRATEGY \
     critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \
     critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 5862c1d4829..5605525960d 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -34,6 +34,9 @@ USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-auto}
 RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
+ACTOR_LR=${ACTOR_LR:-1e-6}
+MIN_LR=${MIN_LR:-1e-7}
+LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_megatron_trainer.yaml' \
@@ -44,29 +47,31 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
     data.max_prompt_length=2048 \
-    data.max_response_length=512 \
+    data.max_response_length=2048 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
     actor_rollout_ref.model.path=$MODEL_ID \
     actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
+    actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
+    actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
+    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.actor.kl_loss_coef=0.0 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
-    actor_rollout_ref.rollout.n=2 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=4096 \
+    actor_rollout_ref.rollout.n=16 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=81920 \
     actor_rollout_ref.rollout.max_model_len=8192 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
     algorithm.use_kl_in_reward=False \

From 2e10ab5e0a8465d1c9451b69ed96bfa67c38fd0a Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 18:03:52 +0800
Subject: [PATCH 20/61] Align FSDP and Megatron rollout settings

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh           | 6 +++---
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index e14abbb7c64..4a4d096753b 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -72,10 +72,10 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=81920 \
-    actor_rollout_ref.rollout.max_model_len=8192 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+    actor_rollout_ref.rollout.max_model_len=4096 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 5605525960d..b6cb00ed073 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -67,10 +67,10 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=81920 \
-    actor_rollout_ref.rollout.max_model_len=8192 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+    actor_rollout_ref.rollout.max_model_len=4096 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \

From 070f9f64d8c3eb66ab27317de532e89200012402 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 18:09:32 +0800
Subject: [PATCH 21/61] Lower vLLM GPU memory utilization to 0.35

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 4a4d096753b..085340cd9d3 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -72,7 +72,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
     actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
     actor_rollout_ref.rollout.max_model_len=4096 \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index b6cb00ed073..09e0a24834e 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -67,7 +67,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
     actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
     actor_rollout_ref.rollout.max_model_len=4096 \

From a9d04070390578ecfd3c6eede6e7dae93e332874 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 6 Feb 2026 18:19:20 +0800
Subject: [PATCH 22/61] Reduce rollout memory pressure while keeping n=16

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 12 ++++++------
 .../run_grpo_megatron_single_node.sh                 | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 085340cd9d3..1658725fb2f 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -50,7 +50,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
     data.max_prompt_length=2048 \
-    data.max_response_length=2048 \
+    data.max_response_length=1024 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
     actor_rollout_ref.model.path=$MODEL_ID \
@@ -63,22 +63,22 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
     actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
     actor_rollout_ref.actor.ppo_mini_batch_size=64 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.actor.kl_loss_coef=0.0 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
-    actor_rollout_ref.rollout.max_model_len=4096 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
+    actor_rollout_ref.rollout.max_model_len=3072 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     critic.strategy=$FSDP_STRATEGY \
     critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \
     critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 09e0a24834e..2ad36a0bd4f 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -47,7 +47,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.return_raw_chat=$RETURN_RAW_CHAT \
     data.train_batch_size=64 \
     data.max_prompt_length=2048 \
-    data.max_response_length=2048 \
+    data.max_response_length=1024 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
     actor_rollout_ref.model.path=$MODEL_ID \
@@ -56,22 +56,22 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
     actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
     actor_rollout_ref.actor.ppo_mini_batch_size=64 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.actor.kl_loss_coef=0.0 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
     actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
-    actor_rollout_ref.rollout.max_model_len=4096 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
+    actor_rollout_ref.rollout.max_model_len=3072 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
     algorithm.use_kl_in_reward=False \

From 06bf1743dd3979f2a970c378ab21092af26f0278 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 14:42:24 +0800
Subject: [PATCH 23/61] Align FSDP GRPO config and add Qwen3 recipes

---
 .../run_sft_qwen3moe_235b_a22b_megatron.sh    | 113 ++++++++++++++++++
 .../run_sft_qwen3moe_30b_a3b_megatron.sh      | 113 ++++++++++++++++++
 .../run_grpo_fsdp_single_node.sh              |  14 ++-
 .../run_grpo_megatron_single_node.sh          |  15 +--
 4 files changed, 242 insertions(+), 13 deletions(-)
 create mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
 create mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh

diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
new file mode 100644
index 00000000000..39d84d8beab
--- /dev/null
+++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -xeuo pipefail  # launcher for a multi-node torchrun SFT job; trace commands, abort on errors, unset variables, or a failed pipeline stage
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}  # torchrun target; expanded unquoted below so the default splits into "-m" and the module name
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet}  # forwarded as data.train_files
+backend=${BACKEND:-megatron}  # "fsdp" selects FSDP_ENGINE_CONFIG; any other value selects MEGATRON_ENGINE_CONFIG
+project_name=verl_sft_235ba22b_2507  # used in CKPT_HOME and trainer.project_name
+RESUME_MODE=disable  # fixed here (not env-overridable); forwarded as trainer.resume_mode
+MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B}  # forwarded as model.path
+
+SP_SIZE=${SP_SIZE:-1}  # FSDP branch only: engine.ulysses_sequence_parallel_size
+FSDP_SIZE=${FSDP_SIZE:-64}  # FSDP branch only: engine.fsdp_size
+FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}  # FSDP branch only: engine.strategy
+
+TP_SIZE=${TP_SIZE:-4}  # Megatron branch: engine.tensor_model_parallel_size
+PP_SIZE=${PP_SIZE:-1}  # Megatron branch: engine.pipeline_model_parallel_size
+EP_SIZE=${EP_SIZE:-8}  # Megatron branch: engine.expert_model_parallel_size
+VPP_SIZE=${VPP_SIZE:-null}  # only appears in exp_name below; it is not forwarded to the engine config
+CP_SIZE=${CP_SIZE:-1}  # Megatron branch: engine.context_parallel_size
+
+PAD_MODE=${PAD_MODE:-no_padding}  # forwarded as data.pad_mode
+USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}  # forwarded as model.use_remove_padding
+
+FSDP_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.min_lr_ratio=0.1 \
+    optim.warmup_style=cosine \
+    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+    engine.strategy=${FSDP_STRATEGY} \
+    engine.fsdp_size=${FSDP_SIZE}"
+
+MEGATRON_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=6e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    optim.min_lr=6e-7 \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.expert_model_parallel_size=${EP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE} \
+    engine.use_mbridge=True"
+
+if [ "$backend" = "fsdp" ]; then  # pick the override set and experiment name that match the backend
+    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"  # word-split when expanded unquoted in the torchrun call; the inner quotes around [0.9,0.95] only close/reopen the string, so the value holds no quote characters
+    echo "Using fsdp engine"
+    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
+else
+    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"  # same unquoted-expansion convention as the fsdp branch
+    echo "Using megatron engine"
+    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}
+fi
+
+CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}  # forwarded as trainer.default_local_dir
+NNODES=${WORLD_SIZE:-16}           # node count for torchrun --nnodes, read from WORLD_SIZE
+NODE_RANK=${RANK:-0}              # this node's rank for torchrun --node_rank, read from RANK
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} # torchrun --master_addr
+MASTER_PORT=${MASTER_PORT:-23457} # torchrun --master_port
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"  # log this node's rank and the node count
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"  # log the master address and port
+
+if [ "$NODE_RANK" -eq 0 ]; then  # only rank 0 creates the checkpoint directory; presumably CKPT_HOME is on shared storage - TODO confirm
+    mkdir -p "${CKPT_HOME}"
+fi
+
+export WANDB_MODE=offline
+export NCCL_DEBUG=WARN
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao  # appends the repo checkout; NOTE(review): if PYTHONPATH is unset or empty the value starts with ':' (an empty first entry) - confirm this is intended
+
+torchrun \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    --nproc-per-node=8 \
+    ${ENTRYPOINT} \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=256 \
+    data.max_length=1024 \
+    data.pad_mode=${PAD_MODE} \
+    data.truncation=right \
+    data.use_dynamic_bsz=True \
+    data.max_token_len_per_gpu=10240 \
+    data.messages_key=messages \
+    data.ignore_input_ids_mismatch=True \
+    model.path=$MODEL_ID \
+    model.use_remove_padding=${USE_REMOVE_PADDING} \
+    +model.override_config.router_dtype="float16" \
+    model.enable_gradient_checkpointing=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=1000 \
+    'trainer.logger=[console]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=1 \
+    trainer.default_local_dir="${CKPT_HOME}" \
+    trainer.resume_mode=${RESUME_MODE} \
+    trainer.max_ckpt_to_keep=2 \
+    'checkpoint.save_contents=[model,optimizer,extra,hf_model]'
diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh
new file mode 100644
index 00000000000..a45209ffcc1
--- /dev/null
+++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -xeuo pipefail  # launcher for a multi-node torchrun SFT job; trace commands, abort on errors, unset variables, or a failed pipeline stage
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}  # torchrun target; expanded unquoted below so the default splits into "-m" and the module name
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet}  # forwarded as data.train_files
+backend=${BACKEND:-megatron}  # "fsdp" selects FSDP_ENGINE_CONFIG; any other value selects MEGATRON_ENGINE_CONFIG
+project_name=verl_sft_235ba22b_2507  # NOTE(review): this file is named ...30b_a3b... yet project_name, MODEL_ID and exp_name all refer to 235B-A22B - confirm which model is intended
+RESUME_MODE=disable  # fixed here (not env-overridable); forwarded as trainer.resume_mode
+MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B}  # forwarded as model.path
+
+SP_SIZE=${SP_SIZE:-1}  # FSDP branch only: engine.ulysses_sequence_parallel_size
+FSDP_SIZE=${FSDP_SIZE:-64}  # FSDP branch only: engine.fsdp_size
+FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}  # FSDP branch only: engine.strategy
+
+TP_SIZE=${TP_SIZE:-4}  # Megatron branch: engine.tensor_model_parallel_size
+PP_SIZE=${PP_SIZE:-1}  # Megatron branch: engine.pipeline_model_parallel_size
+EP_SIZE=${EP_SIZE:-8}  # Megatron branch: engine.expert_model_parallel_size
+VPP_SIZE=${VPP_SIZE:-null}  # only appears in exp_name below; it is not forwarded to the engine config
+CP_SIZE=${CP_SIZE:-1}  # Megatron branch: engine.context_parallel_size
+
+PAD_MODE=${PAD_MODE:-no_padding}  # forwarded as data.pad_mode
+USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}  # forwarded as model.use_remove_padding
+
+FSDP_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.min_lr_ratio=0.1 \
+    optim.warmup_style=cosine \
+    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+    engine.strategy=${FSDP_STRATEGY} \
+    engine.fsdp_size=${FSDP_SIZE}"
+
+MEGATRON_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=6e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    optim.min_lr=6e-7 \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.expert_model_parallel_size=${EP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE} \
+    engine.use_mbridge=True"
+
+if [ "$backend" = "fsdp" ]; then  # pick the override set and experiment name that match the backend
+    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"  # word-split when expanded unquoted in the torchrun call; the inner quotes around [0.9,0.95] only close/reopen the string, so the value holds no quote characters
+    echo "Using fsdp engine"
+    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
+else
+    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"  # same unquoted-expansion convention as the fsdp branch
+    echo "Using megatron engine"
+    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}
+fi
+
+CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}  # forwarded as trainer.default_local_dir
+NNODES=${WORLD_SIZE:-16}           # node count for torchrun --nnodes, read from WORLD_SIZE
+NODE_RANK=${RANK:-0}              # this node's rank for torchrun --node_rank, read from RANK
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} # torchrun --master_addr
+MASTER_PORT=${MASTER_PORT:-23457} # torchrun --master_port
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"  # log this node's rank and the node count
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"  # log the master address and port
+
+if [ "$NODE_RANK" -eq 0 ]; then  # only rank 0 creates the checkpoint directory; presumably CKPT_HOME is on shared storage - TODO confirm
+    mkdir -p "${CKPT_HOME}"
+fi
+
+export WANDB_MODE=offline
+export NCCL_DEBUG=WARN
+export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao  # appends the repo checkout; NOTE(review): if PYTHONPATH is unset or empty the value starts with ':' (an empty first entry) - confirm this is intended
+
+torchrun \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    --nproc-per-node=8 \
+    ${ENTRYPOINT} \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=512 \
+    data.max_length=4096 \
+    data.pad_mode=${PAD_MODE} \
+    data.truncation=right \
+    data.use_dynamic_bsz=True \
+    data.max_token_len_per_gpu=24576 \
+    data.messages_key=messages \
+    data.ignore_input_ids_mismatch=True \
+    model.path=$MODEL_ID \
+    model.use_remove_padding=${USE_REMOVE_PADDING} \
+    +model.override_config.output_router_logits=True \
+    +model.override_config.router_dtype="float32" \
+    model.enable_gradient_checkpointing=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=1000 \
+    'trainer.logger=[console]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=1 \
+    trainer.default_local_dir="${CKPT_HOME}" \
+    trainer.resume_mode=${RESUME_MODE} \
+    trainer.max_ckpt_to_keep=1 \
+    'checkpoint.save_contents=[model,optimizer,extra,hf_model]'
\ No newline at end of file
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 1658725fb2f..ba41dc99650 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,6 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -22,7 +23,7 @@ EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
 
 NNODES=${NNODES:-4}
 NODE_RANK=${NODE_RANK:-0}
-MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
+MASTER_ADDR=${MASTER_ADDR:-10.178.131.202}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 
@@ -48,8 +49,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_files=$TRAIN_FILES \
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
-    data.train_batch_size=64 \
-    data.max_prompt_length=2048 \
+    data.train_batch_size=32 \
+    data.max_prompt_length=1024 \
     data.max_response_length=1024 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
@@ -62,7 +63,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
     actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
     actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
-    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.actor.kl_loss_coef=0.0 \
@@ -74,8 +75,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
-    actor_rollout_ref.rollout.max_model_len=3072 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=10384 \
+    actor_rollout_ref.rollout.max_model_len=2048 \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
@@ -107,5 +108,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.RAY_TMPDIR=$RAY_TMPDIR \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 2ad36a0bd4f..b8f92c11202 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,6 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -26,7 +27,7 @@ MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 
-TP_SIZE=1
+TP_SIZE=4
 PP_SIZE=1
 
 rollout_mode=${ROLLOUT_MODE:-async}
@@ -45,8 +46,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     data.val_files=$TRAIN_FILES \
     data.val_max_samples=2048 \
     data.return_raw_chat=$RETURN_RAW_CHAT \
-    data.train_batch_size=64 \
-    data.max_prompt_length=2048 \
+    data.train_batch_size=32 \
+    data.max_prompt_length=1024 \
     data.max_response_length=1024 \
     data.filter_overlong_prompts=False \
     data.truncation='error' \
@@ -55,7 +56,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
     actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
     actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
-    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
@@ -69,8 +70,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.mode=$rollout_mode \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
     actor_rollout_ref.rollout.n=16 \
-    actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
-    actor_rollout_ref.rollout.max_model_len=3072 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=10384 \
+    actor_rollout_ref.rollout.max_model_len=2048 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \
     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \
@@ -99,4 +100,4 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
-    custom_reward_function.name=char_count_reward_function
+    custom_reward_function.name=char_count_reward_function
\ No newline at end of file

From 888ece5a1b9334dae5508df5b058eeb222446e44 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 14:47:11 +0800
Subject: [PATCH 24/61] Fix FSDP min_lr override

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index ba41dc99650..15548606d44 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -61,7 +61,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
-    actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
+    +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
     actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \

From 470995b6f072c4751ac5d1849b6e33eccb4d90b8 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 14:49:08 +0800
Subject: [PATCH 25/61] Fix FSDP lr_decay_style override

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 15548606d44..d72aeaab182 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -62,7 +62,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
     +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
-    actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
+    +actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.use_kl_loss=False \

From 3de34930f3ad6d7123e8a79e95aa4aeb640669dd Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:02:33 +0800
Subject: [PATCH 26/61] Align FSDP Ray settings with Megatron

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index d72aeaab182..9690dc85fbb 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -23,7 +23,7 @@ EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
 
 NNODES=${NNODES:-4}
 NODE_RANK=${NODE_RANK:-0}
-MASTER_ADDR=${MASTER_ADDR:-10.178.131.202}
+MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 
@@ -108,6 +108,5 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.RAY_TMPDIR=$RAY_TMPDIR \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 23151041f29e69fbce71187d1cd538c1dcde4224 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:11:25 +0800
Subject: [PATCH 27/61] Unset Ray socket env vars before launch

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh           | 4 ++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh       | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 9690dc85fbb..be54472b495 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -1,6 +1,10 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
 
+unset RAYLET_SOCKET_NAME
+unset PLASMA_STORE_SOCKET_NAME
+unset RAY_SESSION_DIR
+
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index b8f92c11202..9a66a9c033a 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -1,6 +1,10 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
 
+unset RAYLET_SOCKET_NAME
+unset PLASMA_STORE_SOCKET_NAME
+unset RAY_SESSION_DIR
+
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
@@ -100,4 +104,4 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
-    custom_reward_function.name=char_count_reward_function
\ No newline at end of file
+    custom_reward_function.name=char_count_reward_function

From ed816dc54947ce8dcded5830332a686fedbe1dfb Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:23:39 +0800
Subject: [PATCH 28/61] =?UTF-8?q?RLVR=5FABCDE=5Fdense:=20=E5=AF=B9?=
 =?UTF-8?q?=E9=BD=90=20FSDP/Megatron=20=E6=B6=88=E8=9E=8D=E9=85=8D?=
 =?UTF-8?q?=E7=BD=AE=E4=B8=8E=E5=A4=9A=E8=8A=82=E7=82=B9=20checkpoint=20?=
 =?UTF-8?q?=E8=B7=AF=E5=BE=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh          | 7 ++++++-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh      | 6 +++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index be54472b495..dde1d2da2b3 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -24,6 +24,8 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
 MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
+# 与 Megatron 一致：多节点时 checkpoint 需写共享目录，用绝对路径
+DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}
 
 NNODES=${NNODES:-4}
 NODE_RANK=${NODE_RANK:-0}
@@ -45,6 +47,8 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
+# 与 Megatron 一致，消融实验用同一显存占用
+GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \
@@ -77,7 +81,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \
     actor_rollout_ref.rollout.n=16 \
     actor_rollout_ref.rollout.max_num_batched_tokens=10384 \
     actor_rollout_ref.rollout.max_model_len=2048 \
@@ -93,6 +97,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.logger='["console","wandb"]' \
     trainer.project_name=$PROJECT_NAME \
     trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.default_local_dir=$DEFAULT_LOCAL_DIR \
     trainer.val_before_train=True \
     trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
     trainer.nnodes=$NNODES \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 9a66a9c033a..05bce754450 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -24,6 +24,7 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
 MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single}
+DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}
 
 NNODES=${NNODES:-4}
 NODE_RANK=${NODE_RANK:-0}
@@ -42,6 +43,8 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
+# 与 FSDP 一致，消融实验用同一显存占用
+GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_megatron_trainer.yaml' \
@@ -72,7 +75,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.mode=$rollout_mode \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \
     actor_rollout_ref.rollout.n=16 \
     actor_rollout_ref.rollout.max_num_batched_tokens=10384 \
     actor_rollout_ref.rollout.max_model_len=2048 \
@@ -84,6 +87,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.logger='["console","wandb"]' \
     trainer.project_name=$PROJECT_NAME \
     trainer.experiment_name=$EXPERIMENT_NAME \
+    trainer.default_local_dir=$DEFAULT_LOCAL_DIR \
     trainer.val_before_train=True \
     trainer.n_gpus_per_node=$N_GPUS_PER_NODE \
     trainer.nnodes=$NNODES \

From e562bfb9f06da26ee008cc73581700a10b194294 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:27:31 +0800
Subject: [PATCH 29/61] =?UTF-8?q?RLVR:=20ray.init=20=5Ftemp=5Fdir=20?=
 =?UTF-8?q?=E6=8C=87=E5=90=91=20RAY=5FTMPDIR=20=E9=81=BF=E5=85=8D=20/tmp?=
 =?UTF-8?q?=20=E7=A3=81=E7=9B=98=E9=85=8D=E9=A2=9D=E4=B8=8D=E8=B6=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh   | 3 +--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh          | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index dde1d2da2b3..14b30835619 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -24,7 +24,6 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
 MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
 PROJECT_NAME=${PROJECT_NAME:-rlvr_8b}
 EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single}
-# 与 Megatron 一致：多节点时 checkpoint 需写共享目录，用绝对路径
 DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}
 
 NNODES=${NNODES:-4}
@@ -47,7 +46,6 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
-# 与 Megatron 一致，消融实验用同一显存占用
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
@@ -104,6 +102,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.save_freq=300 \
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
+    +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 05bce754450..84af56a5ba6 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -43,7 +43,6 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
-# 与 FSDP 一致，消融实验用同一显存占用
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
@@ -94,6 +93,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.save_freq=300 \
     trainer.test_freq=300 \
     trainer.total_epochs=5 \
+    +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \

From 21c9dd8bc6000a24e7d65ea72e0bd4004c8495b1 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:49:27 +0800
Subject: [PATCH 30/61] Fix ray address and master port in launch scripts

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh         | 4 ----
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh     | 8 ++------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 14b30835619..1f3638d06c7 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -1,10 +1,6 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
 
-unset RAYLET_SOCKET_NAME
-unset PLASMA_STORE_SOCKET_NAME
-unset RAY_SESSION_DIR
-
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 84af56a5ba6..2ec95d226f0 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -1,10 +1,6 @@
 #!/usr/bin/env bash
 set -xeuo pipefail
 
-unset RAYLET_SOCKET_NAME
-unset PLASMA_STORE_SOCKET_NAME
-unset RAY_SESSION_DIR
-
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
@@ -38,7 +34,7 @@ PP_SIZE=1
 rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
-RAY_ADDRESS=${RAY_ADDRESS:-auto}
+RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379}
 RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
@@ -98,7 +94,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \

From 7ab6ed61c8e6a81048dc036468db9f206741dd68 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 15:52:20 +0800
Subject: [PATCH 31/61] Quote MASTER_PORT for Ray runtime env

---
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 2ec95d226f0..dd291d20ab6 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -94,7 +94,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \
+    +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \

From fd018b69ef754d483e3bb2883e6542d49ce2da17 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:01:13 +0800
Subject: [PATCH 32/61] Move RAY_TMPDIR to /hbox2dir/ray_tmp in RLVR launch scripts

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 1f3638d06c7..564e48b20eb 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp
+export RAY_TMPDIR=/hbox2dir/ray_tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index dd291d20ab6..e2a820c0328 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp
+export RAY_TMPDIR=/hbox2dir/ray_tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}

From 52fc39ac2c2a1d3a7b07344f4255ca2eae3ad020 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:03:13 +0800
Subject: [PATCH 33/61] Set explicit Ray address for FSDP launch

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 564e48b20eb..14cc639e75a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -37,7 +37,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False}
 rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
-RAY_ADDRESS=${RAY_ADDRESS:-auto}
+RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379}
 RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}

From 9b26af596fef7ce396097240dbec0774a031c64d Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:03:55 +0800
Subject: [PATCH 34/61] Update FSDP Ray head address

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 14cc639e75a..b27f8ebedcc 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -37,7 +37,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False}
 rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
-RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379}
+RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379}
 RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}

From 11a2cd1aeb74f053b46aee8a59f7589874391f97 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:06:54 +0800
Subject: [PATCH 35/61] Shorten Ray temp and working dir paths

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 ++--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index b27f8ebedcc..8d2e0630fea 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/hbox2dir/ray_tmp
+export RAY_TMPDIR=/hbox2dir/r
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -38,7 +38,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379}
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index e2a820c0328..62f037890b7 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/hbox2dir/ray_tmp
+export RAY_TMPDIR=/hbox2dir/r
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -35,7 +35,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379}
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}

From 34ad8385fc7cd1b50547b9375d9be3a018c81489 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:12:31 +0800
Subject: [PATCH 36/61] Avoid Ray working_dir packaging to shorten IPC paths

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh   | 3 +--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh          | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 8d2e0630fea..2651e9d0a30 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -100,8 +100,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
-    +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
+    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 62f037890b7..6d87f7519d5 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -91,8 +91,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     trainer.total_epochs=5 \
     +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \
     +ray_kwargs.ray_init.address=$RAY_ADDRESS \
-    +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \
+    +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \

From b25034e8f1be61ffe9c628cb8070eba7420964ef Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:16:43 +0800
Subject: [PATCH 37/61] Use user-owned short paths for Ray temp and work dirs

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 +++--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh        | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 2651e9d0a30..7afec051a12 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,7 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/hbox2dir/r
+export RAY_TMPDIR=/llm-align/liuchonghan/r
+export TMPDIR=/llm-align/liuchonghan/tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -38,7 +39,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379}
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 6d87f7519d5..033a13f7da9 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,7 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/hbox2dir/r
+export RAY_TMPDIR=/llm-align/liuchonghan/r
+export TMPDIR=/llm-align/liuchonghan/tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
@@ -35,7 +36,7 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379}
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w}
+RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}

From fff6f0900c039e6df9768c1f829840e87e7d5811 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:25:39 +0800
Subject: [PATCH 38/61] Move Ray temp and TMPDIR to /dev/shm

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 ++--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 7afec051a12..3b2dfff4cf3 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,8 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/llm-align/liuchonghan/r
-export TMPDIR=/llm-align/liuchonghan/tmp
+export RAY_TMPDIR=/dev/shm/ray
+export TMPDIR=/dev/shm/tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 033a13f7da9..b960a5e0906 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,8 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
-export RAY_TMPDIR=/llm-align/liuchonghan/r
-export TMPDIR=/llm-align/liuchonghan/tmp
+export RAY_TMPDIR=/dev/shm/ray
+export TMPDIR=/dev/shm/tmp
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}

From 568690f420634e6d3ca1005a687473d3e6dd3f61 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:28:27 +0800
Subject: [PATCH 39/61] Pass TMPDIR to Ray runtime env

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh     | 1 +
 recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 3b2dfff4cf3..44443beab7e 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -107,6 +107,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index b960a5e0906..14e19b718fc 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -98,6 +98,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \

From d736fa306887763695175c1de8c38ae45acaeef1 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:33:10 +0800
Subject: [PATCH 40/61] Set WANDB_DIR to shared path for Ray workers

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 ++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 44443beab7e..f898c7d3477 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
 export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
 export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
@@ -107,6 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 14e19b718fc..b56b8915a0d 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
 export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
 export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
@@ -98,6 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \

From a7d74602dcf35ed7075a8d34ded48fd3bfdf2a47 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:41:13 +0800
Subject: [PATCH 41/61] Disable Gloo IPv6 in RLVR launch scripts

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 ++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index f898c7d3477..cee7f208e54 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,6 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
+export GLOO_IPV6=${GLOO_IPV6:-0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp
@@ -107,6 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index b56b8915a0d..d0e37bd952c 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,6 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
+export GLOO_IPV6=${GLOO_IPV6:-0}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp
@@ -98,6 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \

From 1e9e40b4aaa62f8b0194867edf6a6358a9616770 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:42:39 +0800
Subject: [PATCH 42/61] Ensure GLOO_IPV6 is passed as string

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index cee7f208e54..7a53ffcaf0a 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -13,7 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
-export GLOO_IPV6=${GLOO_IPV6:-0}
+export GLOO_IPV6=${GLOO_IPV6:-"0"}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index d0e37bd952c..1b70f1ee86b 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,7 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
-export GLOO_IPV6=${GLOO_IPV6:-0}
+export GLOO_IPV6=${GLOO_IPV6:-"0"}
 export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp

From fb012a9e3270aa87a2df76ce36e82c6b0c340140 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 16:44:29 +0800
Subject: [PATCH 43/61] Quote GLOO_IPV6 for Ray runtime env

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 +-
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 7a53ffcaf0a..839b9ab4874 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -108,7 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
-    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 1b70f1ee86b..bd20150c868 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -99,7 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
-    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \
+    +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \

From 23098c06a8c46d283cf63ebf1be74d6c006baf4c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 17:11:30 +0800
Subject: [PATCH 44/61] Fix FSDP optimizer overrides

---
 .../run_grpo_fsdp_single_node.sh              | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 839b9ab4874..90e375dd528 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -41,12 +41,23 @@ rollout_mode=${ROLLOUT_MODE:-async}
 USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True}
 RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379}
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
-LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine}
+LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} # constant|cosine
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
+# FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`).
+# Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided.
+MIN_LR_RATIO=${MIN_LR_RATIO:-}
+if [[ -z "${MIN_LR_RATIO}" ]]; then
+    MIN_LR_RATIO=$(python3 - <<PY
+actor_lr = float("${ACTOR_LR}")
+min_lr = float("${MIN_LR}")
+print(min_lr / actor_lr if actor_lr > 0 else 0.0)
+PY
+)
+fi
+
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \
     algorithm.adv_estimator=grpo \
@@ -66,8 +77,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
-    +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \
-    +actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \
+    +actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \
+    +actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \
     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.use_kl_loss=False \

From 603824deaa18356720001a88185c824157939187 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 17:13:09 +0800
Subject: [PATCH 45/61] Fix Hydra overrides for FSDP optimizer

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh           | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 90e375dd528..f53317e200d 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -43,7 +43,7 @@ RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True}
 RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379}
 ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
-LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} # constant|cosine
+LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine}
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 
 # FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`).
@@ -77,8 +77,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \
     actor_rollout_ref.actor.optim.lr=$ACTOR_LR \
-    +actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \
-    +actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \
+    actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \
     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.use_kl_loss=False \

From b8fba05419af0c9450d2b1ccbc756491bc668a64 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 17:23:43 +0800
Subject: [PATCH 46/61] Pass WANDB_API_KEY to Ray runtime env

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index f53317e200d..ed5cb78c173 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_API_KEY=${WANDB_API_KEY:-}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
 export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
@@ -121,6 +122,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
     +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_API_KEY=$WANDB_API_KEY \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \

From 253fe3fcac9b2d81559da07ed858527c15141983 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 17:40:06 +0800
Subject: [PATCH 47/61] Add JSON-to-parquet converter for VERL SFT

---
 scripts/json_qa_to_verl_sft_parquet.py | 192 +++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 scripts/json_qa_to_verl_sft_parquet.py

diff --git a/scripts/json_qa_to_verl_sft_parquet.py b/scripts/json_qa_to_verl_sft_parquet.py
new file mode 100644
index 00000000000..e475db2e361
--- /dev/null
+++ b/scripts/json_qa_to_verl_sft_parquet.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Convert a QA-style dataset (JSON array or JSONL) into a VERL SFT parquet file.
+
+Input item example:
+  {"question": "...", "response": "..."}
+
+Output schemas:
+- single_turn: columns `question` and `answer` (strings)
+  Use with `verl/trainer/config/sft_trainer.yaml` defaults:
+    data.prompt_key=question
+    data.response_key=answer
+
+- messages: column `messages` (list of {role, content})
+  Use with `verl/trainer/config/sft_trainer_engine.yaml` (MultiTurnSFTDataset):
+    data.messages_key=messages
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from typing import Any, Dict, Iterator, List, Optional
+
+
+def iter_items(path: str) -> Iterator[Dict[str, Any]]:
+    """
+    Iterate items from either:
+    - JSON array file: [ {...}, {...}, ... ]
+    - JSONL file: one JSON object per line
+
+    For huge JSON arrays, install `ijson` to stream:
+      pip install ijson
+    """
+    try:
+        import ijson  # type: ignore
+    except Exception:
+        ijson = None
+
+    with open(path, "rb") as f:
+        # Peek the first non-whitespace byte.
+        first = None
+        while True:
+            b = f.read(1)
+            if not b:
+                break
+            if b not in b" \t\r\n":
+                first = b
+                break
+        f.seek(0)
+
+        if first == b"[":
+            if ijson is None:
+                data = json.load(f)
+                if not isinstance(data, list):
+                    raise ValueError(f"Expected a JSON array in {path}")
+                for obj in data:
+                    if not isinstance(obj, dict):
+                        raise ValueError(f"Expected dict items, got {type(obj)}")
+                    yield obj
+                return
+
+            for obj in ijson.items(f, "item"):
+                if not isinstance(obj, dict):
+                    raise ValueError(f"Expected dict items, got {type(obj)}")
+                yield obj
+            return
+
+        # JSONL fallback
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            if not isinstance(obj, dict):
+                raise ValueError(f"Expected dict items, got {type(obj)}")
+            yield obj
+
+
+def make_row(
+    item: Dict[str, Any],
+    *,
+    input_key: str,
+    output_key: str,
+    out_format: str,
+    system_prompt: Optional[str],
+) -> Dict[str, Any]:
+    q = item.get(input_key)
+    a = item.get(output_key)
+    if q is None or a is None:
+        raise KeyError(f"Missing keys: {input_key!r} / {output_key!r}. Got keys={sorted(item.keys())}")
+    if not isinstance(q, str) or not isinstance(a, str):
+        raise TypeError(f"Expected strings; got {type(q)} / {type(a)}")
+
+    if out_format == "single_turn":
+        return {"question": q, "answer": a}
+
+    if out_format == "messages":
+        messages: List[Dict[str, str]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": q})
+        messages.append({"role": "assistant", "content": a})
+        return {"messages": messages}
+
+    raise ValueError(f"Unknown out_format: {out_format}")
+
+
+def write_parquet(
+    *,
+    input_path: str,
+    output_path: str,
+    input_key: str,
+    output_key: str,
+    out_format: str,
+    system_prompt: Optional[str],
+    batch_size: int,
+) -> int:
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    if out_format == "single_turn":
+        schema = pa.schema([("question", pa.string()), ("answer", pa.string())])
+    else:
+        msg_struct = pa.struct([("role", pa.string()), ("content", pa.string())])
+        schema = pa.schema([("messages", pa.list_(msg_struct))])
+
+    writer: Optional[pq.ParquetWriter] = None
+    buf: List[Dict[str, Any]] = []
+    total = 0
+
+    def flush() -> None:
+        nonlocal writer, buf, total
+        if not buf:
+            return
+        table = pa.Table.from_pylist(buf, schema=schema)
+        if writer is None:
+            writer = pq.ParquetWriter(output_path, schema=schema, compression="zstd")
+        writer.write_table(table)
+        total += len(buf)
+        buf = []
+
+    try:
+        for it in iter_items(input_path):
+            buf.append(
+                make_row(
+                    it,
+                    input_key=input_key,
+                    output_key=output_key,
+                    out_format=out_format,
+                    system_prompt=system_prompt,
+                )
+            )
+            if len(buf) >= batch_size:
+                flush()
+        flush()
+    finally:
+        if writer is not None:
+            writer.close()
+
+    return total
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    ap.add_argument("--input", required=True, help="Input JSON/JSONL path")
+    ap.add_argument("--output", required=True, help="Output parquet path")
+    ap.add_argument("--input_key", default="question", help="Field name for prompt text")
+    ap.add_argument("--output_key", default="response", help="Field name for response text")
+    ap.add_argument("--format", dest="out_format", choices=["single_turn", "messages"], default="single_turn")
+    ap.add_argument("--system_prompt", default=None, help="Optional system prompt (messages format only)")
+    ap.add_argument("--batch_size", type=int, default=4096, help="Write batch size")
+    args = ap.parse_args()
+
+    n = write_parquet(
+        input_path=args.input,
+        output_path=args.output,
+        input_key=args.input_key,
+        output_key=args.output_key,
+        out_format=args.out_format,
+        system_prompt=args.system_prompt,
+        batch_size=args.batch_size,
+    )
+    print(f"[OK] Wrote {n} rows -> {args.output}")
+
+
+if __name__ == "__main__":
+    main()
+

From 4b96b3b63676112bb698157b3a3ce1e419e50b13 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 17:49:26 +0800
Subject: [PATCH 48/61] Tune FSDP rollout weight-sync bucket

---
 .../run_grpo_fsdp_single_node.sh              |   2 +
 scripts/json_qa_to_verl_sft_parquet.py        | 192 ------------------
 2 files changed, 2 insertions(+), 192 deletions(-)
 delete mode 100644 scripts/json_qa_to_verl_sft_parquet.py

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index ed5cb78c173..da3598b1f6e 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -46,6 +46,7 @@ ACTOR_LR=${ACTOR_LR:-1e-6}
 MIN_LR=${MIN_LR:-1e-7}
 LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine}
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
+UPDATE_WEIGHTS_BUCKET_MB=${UPDATE_WEIGHTS_BUCKET_MB:-4096}
 
 # FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`).
 # Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided.
@@ -94,6 +95,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     actor_rollout_ref.rollout.n=16 \
     actor_rollout_ref.rollout.max_num_batched_tokens=10384 \
     actor_rollout_ref.rollout.max_model_len=2048 \
+    actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=$UPDATE_WEIGHTS_BUCKET_MB \
     actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \
     actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
diff --git a/scripts/json_qa_to_verl_sft_parquet.py b/scripts/json_qa_to_verl_sft_parquet.py
deleted file mode 100644
index e475db2e361..00000000000
--- a/scripts/json_qa_to_verl_sft_parquet.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#!/usr/bin/env python3
-"""
-Convert a QA-style dataset (JSON array or JSONL) into a VERL SFT parquet file.
-
-Input item example:
-  {"question": "...", "response": "..."}
-
-Output schemas:
-- single_turn: columns `question` and `answer` (strings)
-  Use with `verl/trainer/config/sft_trainer.yaml` defaults:
-    data.prompt_key=question
-    data.response_key=answer
-
-- messages: column `messages` (list of {role, content})
-  Use with `verl/trainer/config/sft_trainer_engine.yaml` (MultiTurnSFTDataset):
-    data.messages_key=messages
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-from typing import Any, Dict, Iterator, List, Optional
-
-
-def iter_items(path: str) -> Iterator[Dict[str, Any]]:
-    """
-    Iterate items from either:
-    - JSON array file: [ {...}, {...}, ... ]
-    - JSONL file: one JSON object per line
-
-    For huge JSON arrays, install `ijson` to stream:
-      pip install ijson
-    """
-    try:
-        import ijson  # type: ignore
-    except Exception:
-        ijson = None
-
-    with open(path, "rb") as f:
-        # Peek the first non-whitespace byte.
-        first = None
-        while True:
-            b = f.read(1)
-            if not b:
-                break
-            if b not in b" \t\r\n":
-                first = b
-                break
-        f.seek(0)
-
-        if first == b"[":
-            if ijson is None:
-                data = json.load(f)
-                if not isinstance(data, list):
-                    raise ValueError(f"Expected a JSON array in {path}")
-                for obj in data:
-                    if not isinstance(obj, dict):
-                        raise ValueError(f"Expected dict items, got {type(obj)}")
-                    yield obj
-                return
-
-            for obj in ijson.items(f, "item"):
-                if not isinstance(obj, dict):
-                    raise ValueError(f"Expected dict items, got {type(obj)}")
-                yield obj
-            return
-
-        # JSONL fallback
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            obj = json.loads(line)
-            if not isinstance(obj, dict):
-                raise ValueError(f"Expected dict items, got {type(obj)}")
-            yield obj
-
-
-def make_row(
-    item: Dict[str, Any],
-    *,
-    input_key: str,
-    output_key: str,
-    out_format: str,
-    system_prompt: Optional[str],
-) -> Dict[str, Any]:
-    q = item.get(input_key)
-    a = item.get(output_key)
-    if q is None or a is None:
-        raise KeyError(f"Missing keys: {input_key!r} / {output_key!r}. Got keys={sorted(item.keys())}")
-    if not isinstance(q, str) or not isinstance(a, str):
-        raise TypeError(f"Expected strings; got {type(q)} / {type(a)}")
-
-    if out_format == "single_turn":
-        return {"question": q, "answer": a}
-
-    if out_format == "messages":
-        messages: List[Dict[str, str]] = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": q})
-        messages.append({"role": "assistant", "content": a})
-        return {"messages": messages}
-
-    raise ValueError(f"Unknown out_format: {out_format}")
-
-
-def write_parquet(
-    *,
-    input_path: str,
-    output_path: str,
-    input_key: str,
-    output_key: str,
-    out_format: str,
-    system_prompt: Optional[str],
-    batch_size: int,
-) -> int:
-    import pyarrow as pa
-    import pyarrow.parquet as pq
-
-    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
-
-    if out_format == "single_turn":
-        schema = pa.schema([("question", pa.string()), ("answer", pa.string())])
-    else:
-        msg_struct = pa.struct([("role", pa.string()), ("content", pa.string())])
-        schema = pa.schema([("messages", pa.list_(msg_struct))])
-
-    writer: Optional[pq.ParquetWriter] = None
-    buf: List[Dict[str, Any]] = []
-    total = 0
-
-    def flush() -> None:
-        nonlocal writer, buf, total
-        if not buf:
-            return
-        table = pa.Table.from_pylist(buf, schema=schema)
-        if writer is None:
-            writer = pq.ParquetWriter(output_path, schema=schema, compression="zstd")
-        writer.write_table(table)
-        total += len(buf)
-        buf = []
-
-    try:
-        for it in iter_items(input_path):
-            buf.append(
-                make_row(
-                    it,
-                    input_key=input_key,
-                    output_key=output_key,
-                    out_format=out_format,
-                    system_prompt=system_prompt,
-                )
-            )
-            if len(buf) >= batch_size:
-                flush()
-        flush()
-    finally:
-        if writer is not None:
-            writer.close()
-
-    return total
-
-
-def main() -> None:
-    ap = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    ap.add_argument("--input", required=True, help="Input JSON/JSONL path")
-    ap.add_argument("--output", required=True, help="Output parquet path")
-    ap.add_argument("--input_key", default="question", help="Field name for prompt text")
-    ap.add_argument("--output_key", default="response", help="Field name for response text")
-    ap.add_argument("--format", dest="out_format", choices=["single_turn", "messages"], default="single_turn")
-    ap.add_argument("--system_prompt", default=None, help="Optional system prompt (messages format only)")
-    ap.add_argument("--batch_size", type=int, default=4096, help="Write batch size")
-    args = ap.parse_args()
-
-    n = write_parquet(
-        input_path=args.input,
-        output_path=args.output,
-        input_key=args.input_key,
-        output_key=args.output_key,
-        out_format=args.out_format,
-        system_prompt=args.system_prompt,
-        batch_size=args.batch_size,
-    )
-    print(f"[OK] Wrote {n} rows -> {args.output}")
-
-
-if __name__ == "__main__":
-    main()
-

From 2be47a4eedb4f8cfc72e3f7b59ad68995df0941c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 18:19:00 +0800
Subject: [PATCH 49/61] Propagate proxy and tmp dirs to Ray env

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh           | 6 ++++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh       | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index da3598b1f6e..ec65374c59e 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -12,6 +12,8 @@ export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
 export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
 export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
+export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
+export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -19,6 +21,8 @@ export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp
 
+mkdir -p "$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR"
+
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
 MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
@@ -132,5 +136,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index bd20150c868..97eeb9a7785 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -11,6 +11,8 @@ export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
 export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
 export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
+export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
+export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -18,6 +20,8 @@ export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
 export RAY_TMPDIR=/dev/shm/ray
 export TMPDIR=/dev/shm/tmp
 
+mkdir -p "$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR"
+
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"}
 TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
 MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B}
@@ -108,5 +112,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 093ed14f9f36267a571cdec923e6f68bb3a17b5c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 18:35:41 +0800
Subject: [PATCH 50/61] Fix SFT Megatron lr scheduler steps

---
 .../run_sft_qwen3moe_235b_a22b_megatron.sh    | 47 +++++++++++++++++--
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
index 39d84d8beab..ed6b21b5546 100644
--- a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
+++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
@@ -2,11 +2,14 @@
 set -xeuo pipefail
 
 ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
-TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet}
+TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
 backend=${BACKEND:-megatron}
 project_name=verl_sft_235ba22b_2507
 RESUME_MODE=disable
-MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B}
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-235B-A22B-Instruct-2507}
+TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}
+TOTAL_TRAINING_STEPS=${TOTAL_TRAINING_STEPS:-}
 
 SP_SIZE=${SP_SIZE:-1}
 FSDP_SIZE=${FSDP_SIZE:-64}
@@ -80,6 +83,39 @@ export NCCL_DEBUG=WARN
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao
 
+if [[ -z "${TOTAL_TRAINING_STEPS}" ]]; then
+    # Megatron's OptimizerParamScheduler asserts `lr_decay_steps > 0`.
+    # VERL SFT derives total steps from `len(train_dataloader)`, which can be 0/unknown with some samplers
+    # (e.g. dynamic-bsz). Provide a safe positive estimate based on parquet row count.
+    TOTAL_TRAINING_STEPS=$(python3 - <<'PY'
+import math
+import os
+
+train_files = os.environ.get("TRAIN_FILES", "")
+batch_size = int(os.environ.get("TRAIN_BATCH_SIZE", "256"))
+epochs = int(os.environ.get("TOTAL_EPOCHS", "1"))
+
+rows = None
+try:
+    import pyarrow.parquet as pq
+
+    rows = pq.ParquetFile(train_files).metadata.num_rows
+except Exception:
+    rows = None
+
+if rows is None:
+    steps = 1000 * max(1, epochs)
+else:
+    steps_per_epoch = max(1, math.ceil(rows / max(1, batch_size)))
+    steps = steps_per_epoch * max(1, epochs)
+
+print(steps)
+PY
+)
+fi
+
+echo ">>> SFT steps: total_epochs=${TOTAL_EPOCHS}, train_batch_size=${TRAIN_BATCH_SIZE}, total_training_steps=${TOTAL_TRAINING_STEPS}"
+
 torchrun \
     --nnodes=${NNODES} \
     --node_rank=${NODE_RANK} \
@@ -88,7 +124,7 @@ torchrun \
     --nproc-per-node=8 \
     ${ENTRYPOINT} \
     data.train_files="${TRAIN_FILES}" \
-    data.train_batch_size=256 \
+    data.train_batch_size=${TRAIN_BATCH_SIZE} \
     data.max_length=1024 \
     data.pad_mode=${PAD_MODE} \
     data.truncation=right \
@@ -102,11 +138,12 @@ torchrun \
     model.enable_gradient_checkpointing=True \
     ${ENGINE_CONFIG} \
     trainer.test_freq=-1 \
-    trainer.save_freq=1000 \
+    trainer.save_freq=2000 \
     'trainer.logger=[console]' \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
-    trainer.total_epochs=1 \
+    trainer.total_epochs=${TOTAL_EPOCHS} \
+    trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \
     trainer.default_local_dir="${CKPT_HOME}" \
     trainer.resume_mode=${RESUME_MODE} \
     trainer.max_ckpt_to_keep=2 \

From cba9e5ea9d15b29dc0f3a26112217003cd007a71 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 19:06:11 +0800
Subject: [PATCH 51/61] Add NO_PROXY for internal traffic

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 ++++
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index ec65374c59e..2862547f79d 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -14,6 +14,8 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
 export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
+export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
+export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -138,5 +140,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 97eeb9a7785..5996c52b710 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -13,6 +13,8 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
 export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
 export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
+export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
+export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -114,5 +116,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \
+    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 3cee17dbca58c251155e6710a27b957a0e932c90 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 10 Feb 2026 19:08:30 +0800
Subject: [PATCH 52/61] Quote NO_PROXY for Hydra overrides

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh  | 4 ++--
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 2862547f79d..f153c0698f6 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -140,7 +140,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 5996c52b710..9967d82d4c0 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -116,7 +116,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
     custom_reward_function.name=char_count_reward_function

From 70616b261770cdef6b95de4221ca7f08c4fb0fba Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 10:35:11 +0800
Subject: [PATCH 53/61] Force proxy env vars for Ray workers

---
 .../run_grpo_fsdp_single_node.sh                | 17 ++++++++++-------
 .../run_grpo_megatron_single_node.sh            | 17 ++++++++++-------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index f153c0698f6..524e5caa10c 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -8,14 +8,17 @@ export WANDB_MODE=${WANDB_MODE:-online}
 export WANDB_API_KEY=${WANDB_API_KEY:-}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
-export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
-export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
+# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`).
+export HTTP_PROXY="$WANDB_PROXY_URL"
+export HTTPS_PROXY="$WANDB_PROXY_URL"
+export http_proxy="$WANDB_PROXY_URL"
+export https_proxy="$WANDB_PROXY_URL"
+export ALL_PROXY="$WANDB_PROXY_URL"
+export all_proxy="$WANDB_PROXY_URL"
+
+# Ensure internal traffic never goes through the proxy.
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
-export no_proxy=${no_proxy:-"$NO_PROXY"}
+export no_proxy="$NO_PROXY"
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 9967d82d4c0..2baa5f24c5d 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -7,14 +7,17 @@ export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-online}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
-export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL}
-export all_proxy=${all_proxy:-$WANDB_PROXY_URL}
+# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`).
+export HTTP_PROXY="$WANDB_PROXY_URL"
+export HTTPS_PROXY="$WANDB_PROXY_URL"
+export http_proxy="$WANDB_PROXY_URL"
+export https_proxy="$WANDB_PROXY_URL"
+export ALL_PROXY="$WANDB_PROXY_URL"
+export all_proxy="$WANDB_PROXY_URL"
+
+# Ensure internal traffic never goes through the proxy.
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
-export no_proxy=${no_proxy:-"$NO_PROXY"}
+export no_proxy="$NO_PROXY"
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}

From 2cc92e97944b908c387b4760ff39183174241a6d Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 10:52:30 +0800
Subject: [PATCH 54/61] recipes: drop ALL_PROXY from GRPO scripts

---
 .../run_grpo_fsdp_single_node.sh                | 17 +++++------------
 .../run_grpo_megatron_single_node.sh            | 17 +++++------------
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 524e5caa10c..63f83e8d0ca 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -8,17 +8,12 @@ export WANDB_MODE=${WANDB_MODE:-online}
 export WANDB_API_KEY=${WANDB_API_KEY:-}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`).
-export HTTP_PROXY="$WANDB_PROXY_URL"
-export HTTPS_PROXY="$WANDB_PROXY_URL"
-export http_proxy="$WANDB_PROXY_URL"
-export https_proxy="$WANDB_PROXY_URL"
-export ALL_PROXY="$WANDB_PROXY_URL"
-export all_proxy="$WANDB_PROXY_URL"
-
-# Ensure internal traffic never goes through the proxy.
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
-export no_proxy="$NO_PROXY"
+export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -141,8 +136,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 2baa5f24c5d..0a28cc6b5be 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -7,17 +7,12 @@ export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-online}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
 export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`).
-export HTTP_PROXY="$WANDB_PROXY_URL"
-export HTTPS_PROXY="$WANDB_PROXY_URL"
-export http_proxy="$WANDB_PROXY_URL"
-export https_proxy="$WANDB_PROXY_URL"
-export ALL_PROXY="$WANDB_PROXY_URL"
-export all_proxy="$WANDB_PROXY_URL"
-
-# Ensure internal traffic never goes through the proxy.
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
+export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
+export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
+export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
-export no_proxy="$NO_PROXY"
+export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
 export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export GLOO_IPV6=${GLOO_IPV6:-"0"}
@@ -117,8 +112,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
     +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \
     +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \

From 2105fb4b53e9e4aef0d0c084625385fa364c58d4 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 11:00:20 +0800
Subject: [PATCH 55/61] recipes: add _dlc suffix to SFT scripts; drop 30b_a3b script under Qwen3-235BA22B-2507

---
 ...un_sft_qwen3moe_235b_a22b_megatron_dlc.sh} |   0
 .../run_sft_qwen3moe_30b_a3b_megatron.sh      | 113 ------------------
 ..._sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh} |   0
 ... run_sft_qwen3moe_30b_a3b_megatron_dlc.sh} |   0
 4 files changed, 113 deletions(-)
 rename recipes_custom/{Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh => Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh} (100%)
 delete mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh
 rename recipes_custom/Qwen3-30BA3B-translate/{run_sft_qwen3moe_30b_a3b_megatron_aux.sh => run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh} (100%)
 rename recipes_custom/Qwen3-30BA3B-translate/{run_sft_qwen3moe_30b_a3b_megatron.sh => run_sft_qwen3moe_30b_a3b_megatron_dlc.sh} (100%)

diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh
similarity index 100%
rename from recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh
rename to recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh
diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh
deleted file mode 100644
index a45209ffcc1..00000000000
--- a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env bash
-set -xeuo pipefail
-
-ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
-TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet}
-backend=${BACKEND:-megatron}
-project_name=verl_sft_235ba22b_2507
-RESUME_MODE=disable
-MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B}
-
-SP_SIZE=${SP_SIZE:-1}
-FSDP_SIZE=${FSDP_SIZE:-64}
-FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}
-
-TP_SIZE=${TP_SIZE:-4}
-PP_SIZE=${PP_SIZE:-1}
-EP_SIZE=${EP_SIZE:-8}
-VPP_SIZE=${VPP_SIZE:-null}
-CP_SIZE=${CP_SIZE:-1}
-
-PAD_MODE=${PAD_MODE:-no_padding}
-USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}
-
-FSDP_ENGINE_CONFIG="
-    engine=${backend} \
-    optim=${backend} \
-    optim.lr=5e-6 \
-    optim.lr_warmup_steps_ratio=0.05 \
-    optim.weight_decay=0.1 \
-    optim.betas="[0.9,0.95]" \
-    optim.clip_grad=1.0 \
-    optim.min_lr_ratio=0.1 \
-    optim.warmup_style=cosine \
-    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
-    engine.strategy=${FSDP_STRATEGY} \
-    engine.fsdp_size=${FSDP_SIZE}"
-
-MEGATRON_ENGINE_CONFIG="
-    engine=${backend} \
-    optim=${backend} \
-    optim.lr=6e-6 \
-    optim.lr_warmup_steps_ratio=0.05 \
-    optim.weight_decay=0.1 \
-    optim.betas="[0.9,0.95]" \
-    optim.clip_grad=1.0 \
-    optim.lr_warmup_init=0 \
-    optim.lr_decay_style=cosine \
-    optim.min_lr=6e-7 \
-    engine.tensor_model_parallel_size=${TP_SIZE} \
-    engine.pipeline_model_parallel_size=${PP_SIZE} \
-    engine.expert_model_parallel_size=${EP_SIZE} \
-    engine.context_parallel_size=${CP_SIZE} \
-    engine.use_mbridge=True"
-
-if [ "$backend" = "fsdp" ]; then
-    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
-    echo "Using fsdp engine"
-    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
-else
-    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
-    echo "Using megatron engine"
-    exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}
-fi
-
-CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}
-NNODES=${WORLD_SIZE:-16}           
-NODE_RANK=${RANK:-0}              
-MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 
-MASTER_PORT=${MASTER_PORT:-23457} 
-
-echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
-echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
-
-if [ "$NODE_RANK" -eq 0 ]; then
-    mkdir -p "${CKPT_HOME}"
-fi
-
-export WANDB_MODE=offline
-export NCCL_DEBUG=WARN
-export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao
-
-torchrun \
-    --nnodes=${NNODES} \
-    --node_rank=${NODE_RANK} \
-    --master_addr=${MASTER_ADDR} \
-    --master_port=${MASTER_PORT} \
-    --nproc-per-node=8 \
-    ${ENTRYPOINT} \
-    data.train_files="${TRAIN_FILES}" \
-    data.train_batch_size=512 \
-    data.max_length=4096 \
-    data.pad_mode=${PAD_MODE} \
-    data.truncation=right \
-    data.use_dynamic_bsz=True \
-    data.max_token_len_per_gpu=24576 \
-    data.messages_key=messages \
-    data.ignore_input_ids_mismatch=True \
-    model.path=$MODEL_ID \
-    model.use_remove_padding=${USE_REMOVE_PADDING} \
-    +model.override_config.output_router_logits=True \
-    +model.override_config.router_dtype="float32" \
-    model.enable_gradient_checkpointing=True \
-    ${ENGINE_CONFIG} \
-    trainer.test_freq=-1 \
-    trainer.save_freq=1000 \
-    'trainer.logger=[console]' \
-    trainer.project_name="${project_name}" \
-    trainer.experiment_name="${exp_name}" \
-    trainer.total_epochs=1 \
-    trainer.default_local_dir="${CKPT_HOME}" \
-    trainer.resume_mode=${RESUME_MODE} \
-    trainer.max_ckpt_to_keep=1 \
-    'checkpoint.save_contents=[model,optimizer,extra,hf_model]'
\ No newline at end of file
diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh
similarity index 100%
rename from recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh
rename to recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh
diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh
similarity index 100%
rename from recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh
rename to recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh

From 86c552933f2f1ff8acfe132fc59048a1bf502294 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 11:36:52 +0800
Subject: [PATCH 56/61] recipes: disable proxy and use wandb offline for GRPO

---
 .../run_grpo_fsdp_single_node.sh              | 20 +++++++++----------
 .../run_grpo_megatron_single_node.sh          | 20 +++++++++----------
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 63f83e8d0ca..6a24c62af97 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -4,14 +4,12 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_MODE=${WANDB_MODE:-offline}
 export WANDB_API_KEY=${WANDB_API_KEY:-}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
-export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
+# Proxy is disabled by default. If you need it temporarily, set env vars
+# outside this script and remove the unsets below.
+unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
 export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
@@ -131,11 +129,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_API_KEY=$WANDB_API_KEY \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
-    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 0a28cc6b5be..7eefae2aca0 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -4,13 +4,11 @@ set -xeuo pipefail
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
-export WANDB_MODE=${WANDB_MODE:-online}
+export WANDB_MODE=${WANDB_MODE:-offline}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
-export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'}
-export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL}
-export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL}
-export http_proxy=${http_proxy:-$WANDB_PROXY_URL}
-export https_proxy=${https_proxy:-$WANDB_PROXY_URL}
+# Proxy is disabled by default. If you need it temporarily, set env vars
+# outside this script and remove the unsets below.
+unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
 export no_proxy=${no_proxy:-"$NO_PROXY"}
 export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-}
@@ -107,11 +105,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \
     +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \
     +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \
-    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \
-    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \
-    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \
-    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \
+    +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \
+    +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \
     +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \
     custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \

From 8908e254c44a5600d49faf2b3ff8330597de3776 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 12:03:17 +0800
Subject: [PATCH 57/61] recipes: remove proxy-disable comments from GRPO scripts

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 2 --
 .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh           | 2 --
 2 files changed, 4 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 6a24c62af97..843b5c72127 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -7,8 +7,6 @@ export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-offline}
 export WANDB_API_KEY=${WANDB_API_KEY:-}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
-# Proxy is disabled by default. If you need it temporarily, set env vars
-# outside this script and remove the unsets below.
 unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
 export no_proxy=${no_proxy:-"$NO_PROXY"}
diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
index 7eefae2aca0..e427bc7aefd 100755
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh
@@ -6,8 +6,6 @@ export VLLM_USE_V1=1
 export VERL_USE_GPT_OSS=0
 export WANDB_MODE=${WANDB_MODE:-offline}
 export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb}
-# Proxy is disabled by default. If you need it temporarily, set env vars
-# outside this script and remove the unsets below.
 unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy
 export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"}
 export no_proxy=${no_proxy:-"$NO_PROXY"}

From 435467f04c834dcf0558d4d82260c327f5e0e7d2 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 11 Feb 2026 16:44:20 +0800
Subject: [PATCH 58/61] recipes: set FSDP MASTER_ADDR default

---
 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 843b5c72127..869f5ca231d 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -28,7 +28,8 @@ DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJ
 
 NNODES=${NNODES:-4}
 NODE_RANK=${NODE_RANK:-0}
-MASTER_ADDR=${MASTER_ADDR:-10.178.170.212}
+# FSDP cluster: Ray head + torch master live on the FSDP master node by default.
+MASTER_ADDR=${MASTER_ADDR:-10.178.131.202}
 MASTER_PORT=${MASTER_PORT:-23457}
 N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8}
 

From 6e4ce3dcf20ae3393d985a77d388ef303b4361e4 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 26 Feb 2026 14:54:41 +0800
Subject: [PATCH 59/61] chore: update custom training recipes

---
 recipe                                        |   2 +-
 .../run_sft_qwen2.5_72b_megatron_dlc.sh       | 110 ++++++++++++++++++
 ...run_sft_qwen3moe_235b_a22b_megatron_dlc.sh |  35 ------
 3 files changed, 111 insertions(+), 36 deletions(-)
 create mode 100644 recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh

diff --git a/recipe b/recipe
index 3490a22a0a3..21892b92769 160000
--- a/recipe
+++ b/recipe
@@ -1 +1 @@
-Subproject commit 3490a22a0a3adeb7e4787fe70b1060b642efbae4
+Subproject commit 21892b9276936efab5375c3f6b8415e472ef7118
diff --git a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh
new file mode 100644
index 00000000000..6691d574949
--- /dev/null
+++ b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+
+ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet}
+TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
+backend=${BACKEND:-megatron}
+project_name=verl_sft_qwen2.5_72b
+RESUME_MODE=disable
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen2.5-72B-A064}
+TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}
+
+SP_SIZE=${SP_SIZE:-1}
+FSDP_SIZE=${FSDP_SIZE:-64}
+FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}
+
+TP_SIZE=${TP_SIZE:-8}
+PP_SIZE=${PP_SIZE:-1}
+CP_SIZE=${CP_SIZE:-1}
+
+PAD_MODE=${PAD_MODE:-no_padding}
+USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}
+
+FSDP_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=5e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.min_lr_ratio=0.1 \
+    optim.warmup_style=cosine \
+    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+    engine.strategy=${FSDP_STRATEGY} \
+    engine.fsdp_size=${FSDP_SIZE}"
+
+MEGATRON_ENGINE_CONFIG="
+    engine=${backend} \
+    optim=${backend} \
+    optim.lr=6e-6 \
+    optim.lr_warmup_steps_ratio=0.05 \
+    optim.weight_decay=0.1 \
+    optim.betas="[0.9,0.95]" \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    optim.min_lr=6e-7 \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE}"
+
+if [ "$backend" = "fsdp" ]; then
+    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
+    echo "Using fsdp engine"
+    exp_name=qwen2.5-72b-dense-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}
+else
+    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
+    echo "Using megatron engine"
+    exp_name=qwen2.5-72b-dense-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}
+fi
+
+CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}
+NNODES=${WORLD_SIZE:-16}           
+NODE_RANK=${RANK:-0}              
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 
+MASTER_PORT=${MASTER_PORT:-23457} 
+
+echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
+echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    mkdir -p "${CKPT_HOME}"
+fi
+
+export WANDB_MODE=offline
+export NCCL_DEBUG=WARN
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao
+
+torchrun \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    --nproc-per-node=8 \
+    ${ENTRYPOINT} \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=${TRAIN_BATCH_SIZE} \
+    data.max_length=2048 \
+    data.pad_mode=${PAD_MODE} \
+    data.truncation=right \
+    data.use_dynamic_bsz=True \
+    data.max_token_len_per_gpu=4096 \
+    data.messages_key=messages \
+    data.ignore_input_ids_mismatch=True \
+    model.path=$MODEL_ID \
+    model.use_remove_padding=${USE_REMOVE_PADDING} \
+    model.enable_gradient_checkpointing=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=2000 \
+    'trainer.logger=[console]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=${TOTAL_EPOCHS} \
+    trainer.default_local_dir="${CKPT_HOME}" \
+    trainer.resume_mode=${RESUME_MODE} \
+    trainer.max_ckpt_to_keep=2 \
+    'checkpoint.save_contents=[model,optimizer,extra,hf_model]'
diff --git a/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh
index ed6b21b5546..c86d4516133 100644
--- a/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh
+++ b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh
@@ -9,7 +9,6 @@ project_name=verl_sft_235ba22b_2507
 RESUME_MODE=disable
 MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-235B-A22B-Instruct-2507}
 TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}
-TOTAL_TRAINING_STEPS=${TOTAL_TRAINING_STEPS:-}
 
 SP_SIZE=${SP_SIZE:-1}
 FSDP_SIZE=${FSDP_SIZE:-64}
@@ -83,39 +82,6 @@ export NCCL_DEBUG=WARN
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao
 
-if [[ -z "${TOTAL_TRAINING_STEPS}" ]]; then
-    # Megatron's OptimizerParamScheduler asserts `lr_decay_steps > 0`.
-    # VERL SFT derives total steps from `len(train_dataloader)`, which can be 0/unknown with some samplers
-    # (e.g. dynamic-bsz). Provide a safe positive estimate based on parquet row count.
-    TOTAL_TRAINING_STEPS=$(python3 - <<'PY'
-import math
-import os
-
-train_files = os.environ.get("TRAIN_FILES", "")
-batch_size = int(os.environ.get("TRAIN_BATCH_SIZE", "256"))
-epochs = int(os.environ.get("TOTAL_EPOCHS", "1"))
-
-rows = None
-try:
-    import pyarrow.parquet as pq
-
-    rows = pq.ParquetFile(train_files).metadata.num_rows
-except Exception:
-    rows = None
-
-if rows is None:
-    steps = 1000 * max(1, epochs)
-else:
-    steps_per_epoch = max(1, math.ceil(rows / max(1, batch_size)))
-    steps = steps_per_epoch * max(1, epochs)
-
-print(steps)
-PY
-)
-fi
-
-echo ">>> SFT steps: total_epochs=${TOTAL_EPOCHS}, train_batch_size=${TRAIN_BATCH_SIZE}, total_training_steps=${TOTAL_TRAINING_STEPS}"
-
 torchrun \
     --nnodes=${NNODES} \
     --node_rank=${NODE_RANK} \
@@ -143,7 +109,6 @@ torchrun \
     trainer.project_name="${project_name}" \
     trainer.experiment_name="${exp_name}" \
     trainer.total_epochs=${TOTAL_EPOCHS} \
-    trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \
     trainer.default_local_dir="${CKPT_HOME}" \
     trainer.resume_mode=${RESUME_MODE} \
     trainer.max_ckpt_to_keep=2 \

From c3890f822908b5ddb40451448ed86dfc5b7d4bab Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 26 Feb 2026 16:42:16 +0800
Subject: [PATCH 60/61] chore: update grpo single node script

---
 .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh    | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
index 869f5ca231d..a80107339bf 100644
--- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh
@@ -49,17 +49,7 @@ LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine}
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35}
 UPDATE_WEIGHTS_BUCKET_MB=${UPDATE_WEIGHTS_BUCKET_MB:-4096}
 
-# FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`).
-# Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided.
-MIN_LR_RATIO=${MIN_LR_RATIO:-}
-if [[ -z "${MIN_LR_RATIO}" ]]; then
-    MIN_LR_RATIO=$(python3 - <<PY
-actor_lr = float("${ACTOR_LR}")
-min_lr = float("${MIN_LR}")
-print(min_lr / actor_lr if actor_lr > 0 else 0.0)
-PY
-)
-fi
+MIN_LR_RATIO=${MIN_LR_RATIO:-0.1}  # minimum LR as a fraction of ACTOR_LR (FSDP optimizer takes min_lr_ratio, not min_lr); env-overridable, defaults to 0.1
 
 python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \
     --config-name='ppo_trainer.yaml' \

From 529e576b1451ddf857a4aa9eef79e1382e73815f Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Fri, 27 Feb 2026 10:59:13 +0800
Subject: [PATCH 61/61] chore: clean formatting in qwen2.5 72b sft run script

---
 .../run_sft_qwen2.5_72b_megatron_dlc.sh                | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh
index 6691d574949..37108817085 100644
--- a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh
+++ b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh
@@ -6,7 +6,7 @@ TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_me
 TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
 backend=${BACKEND:-megatron}
 project_name=verl_sft_qwen2.5_72b
-RESUME_MODE=disable
+RESUME_MODE=disable # alternative value: auto (presumably resumes from the latest checkpoint in CKPT_HOME; confirm against the trainer config)
 MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen2.5-72B-A064}
 TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}
 
@@ -61,10 +61,10 @@ else
 fi
 
 CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}
-NNODES=${WORLD_SIZE:-16}           
-NODE_RANK=${RANK:-0}              
-MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 
-MASTER_PORT=${MASTER_PORT:-23457} 
+NNODES=${WORLD_SIZE:-16}
+NODE_RANK=${RANK:-0}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+MASTER_PORT=${MASTER_PORT:-23457}
 
 echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES"
 echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"