From 32ff542f5e2979999ba3e8b969c2e7fb31b4d573 Mon Sep 17 00:00:00 2001 From: khazic Date: Mon, 2 Feb 2026 18:57:10 +0800 Subject: [PATCH 01/61] chore: point recipe submodule to fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index d5dd7a6aa57..af166615b4a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "recipe"] path = recipe - url = https://github.com/verl-project/verl-recipe.git + url = https://github.com/khazic/verl-recipe_lao.git From 27e354b44790a4fa1c38dc7f790dac6f5b2d31e6 Mon Sep 17 00:00:00 2001 From: khazic Date: Mon, 2 Feb 2026 19:14:21 +0800 Subject: [PATCH 02/61] feat: add custom Qwen3-30BA3B translate recipe --- .../run_sft_qwen3moe_30b_a3b_megatron.sh | 112 +++++++++++++++++ .../run_sft_qwen3moe_30b_a3b_megatron_aux.sh | 115 ++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh create mode 100644 recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh new file mode 100644 index 00000000000..5b0a7ea263d --- /dev/null +++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_translate_0109 +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + 
+PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-8} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + 
${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=512 \ + data.max_length=8192 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=49152 \ + data.messages_key=messages \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.output_router_logits=True \ + +model.override_config.router_dtype="float32" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=5000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=2 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=3 \ + 'checkpoint.save_contents=[model,optimizer,extra]' \ No newline at end of file diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh new file mode 100644 index 00000000000..bf59deb9bda --- /dev/null +++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_translate_0109_aux +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + 
optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=5e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True \ + +engine.override_transformer_config.moe_aux_loss_coeff=0.01 \ + +engine.override_transformer_config.moe_z_loss_coeff=0.001 \ + +engine.override_transformer_config.moe_router_load_balancing_type=aux_loss" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-8} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + 
--master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=512 \ + data.max_length=8192 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=49152 \ + data.messages_key=messages \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.output_router_logits=True \ + +model.override_config.router_dtype="float32" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=5000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=2 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=3 \ + 'checkpoint.save_contents=[model,optimizer,extra]' \ No newline at end of file From 3b2a4564a1c49f389a21ee2116716cf30266d954 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 3 Feb 2026 14:22:44 +0800 Subject: [PATCH 03/61] Add RLVR_ABCDE_dense scripts --- .../RLVR_ABCDE_dense/create_dataset.py | 198 ++++++++++++++++++ .../RLVR_ABCDE_dense/reward_function.py | 65 ++++++ .../RLVR_ABCDE_dense/run_grpo_dlc.sh | 82 ++++++++ .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh | 92 ++++++++ 4 files changed, 437 insertions(+) create mode 100644 recipes_custom/RLVR_ABCDE_dense/create_dataset.py create mode 100644 recipes_custom/RLVR_ABCDE_dense/reward_function.py create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh diff --git a/recipes_custom/RLVR_ABCDE_dense/create_dataset.py b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py new file mode 100644 index 00000000000..754dacde603 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py @@ -0,0 +1,198 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Task description: +Given a random word and a random char, count the number of occurrence of char in the word. + +Create CoT dataset that split the word into separate char. Then list the char and count the occurrence. + +The word set comes from shakespeare +""" + +import os.path +import random + +prompt_template = "How many {} are there in word {}?" + + +def generate_random_char(): + return chr(97 + random.randint(0, 25)) + + +def create_prompt_response(min_length=3, max_length=5): + # randomly generate a length + word_length = random.randint(min_length, max_length) + # randomly generate a target count number. 
This makes the target number + target_count_number = random.randint(1, word_length) + + char_lst = [] + # generate the word + # step 1: generate the target word + target_char = generate_random_char() + + for _ in range(target_count_number): + char_lst.append(target_char) + + # step 2: generate other words + for _ in range(word_length - target_count_number): + while True: + char = generate_random_char() + if char != target_char: + char_lst.append(char) + break + + # step 3: random permute char_lst + random.shuffle(char_lst) + + word = "-".join(char_lst) + + prompt = prompt_template.format(target_char, word) + final_answer = [] + + # cot + number = 0 + for i, char in enumerate(char_lst): + cot = f"{char}" + if char != target_char: + cot += " != " + else: + cot += " = " + number += 1 + cot += f"{target_char}." + + final_answer.append(cot) + + conclusion = f"\\boxed{{{number}}} {target_char} in {word}." + + final_answer.append(conclusion) + + final_answer = "\n".join(final_answer) + + return prompt, final_answer + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--total_number", type=int, default=10000) + parser.add_argument("--min_length", type=int, default=5) + parser.add_argument("--max_length", type=int, default=20) + parser.add_argument("--data_path", type=str, default="~/data/char_count") + + args = vars(parser.parse_args()) + + total_number = args["total_number"] + min_length = args["min_length"] + max_length = args["max_length"] + data_path = args["data_path"] + data_path = os.path.expanduser(data_path) + + full_output = [] + for _ in range(total_number): + output = create_prompt_response(min_length=min_length, max_length=max_length) + full_output.append(output) + + # random reorder + random.shuffle(full_output) + + # split for train and test + train_split_len = int(0.9 * len(full_output)) + train_outputs = full_output[:train_split_len] + test_output = full_output[train_split_len:] + + 
sft_train_dataset = {"messages": []} + + for o in train_outputs: + messages = [ + {"role": "user", "content": o[0]}, + {"role": "assistant", "content": o[1]}, + ] + + sft_train_dataset["messages"].append(messages) + + sft_test_dataset = {"messages": []} + + for o in test_output: + messages = [ + {"role": "user", "content": o[0]}, + {"role": "assistant", "content": o[1]}, + ] + sft_test_dataset["messages"].append(messages) + + import pandas as pd + + sft_train_dataset = pd.DataFrame(data=sft_train_dataset) + sft_test_dataset = pd.DataFrame(data=sft_test_dataset) + + folder = os.path.join(data_path, "sft") + + os.makedirs(folder, exist_ok=True) + + sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) + sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) + + # build RL dataset + rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} + + rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} + + from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed + + for o in train_outputs: + prompt = o[0] + response = o[1] + prompt_with_template = [ + { + "role": "user", + "content": prompt, + } + ] + + rl_train_dataset["prompt"].append(prompt_with_template) + rl_train_dataset["data_source"].append("char_count") + rl_train_dataset["ability"].append("other") + rl_train_dataset["reward_model"].append( + {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))} + ) + rl_train_dataset["extra_info"].append({"response": response}) + + for o in test_output: + prompt = o[0] + response = o[1] + prompt_with_template = [ + { + "role": "user", + "content": prompt, + } + ] + + rl_test_dataset["prompt"].append(prompt_with_template) + rl_test_dataset["data_source"].append("char_count") + rl_test_dataset["ability"].append("other") + rl_test_dataset["reward_model"].append( + {"style": "rule", "ground_truth": 
remove_boxed(last_boxed_only_string(response))} + ) + rl_test_dataset["extra_info"].append({"response": response}) + + rl_train_dataset = pd.DataFrame(data=rl_train_dataset) + rl_test_dataset = pd.DataFrame(data=rl_test_dataset) + + folder = os.path.join(data_path, "rl") + + os.makedirs(folder, exist_ok=True) + + rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) + rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) diff --git a/recipes_custom/RLVR_ABCDE_dense/reward_function.py b/recipes_custom/RLVR_ABCDE_dense/reward_function.py new file mode 100644 index 00000000000..61fe81bf207 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/reward_function.py @@ -0,0 +1,65 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Reward function +""" + +import re + +DEFAULT_CHOICES = ("A", "B", "C", "D", "E") +BOXED_PATTERN = re.compile(r"\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}") +CHOICE_PATTERN = re.compile( + r"(?:answer|option|choice)?\s*[:=]?\s*([A-Za-z])\b", re.IGNORECASE +) + + +def _extract_boxed_answer(text: str) -> str: + matches = BOXED_PATTERN.findall(text) + return matches[-1] if matches else "" + + +def _normalize_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str: + text = (text or "").strip().upper() + for char in text: + if char in valid_choices: + return char + return "" + + +def extract_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str: + """ + Extract a single-letter choice, preferring \\boxed{} values but falling back + to phrases like "Answer: C" or the first standalone letter. + """ + text = str(text or "") + candidate = _normalize_choice(_extract_boxed_answer(text), valid_choices) + if candidate: + return candidate + match = CHOICE_PATTERN.search(text) + if match: + candidate = _normalize_choice(match.group(1), valid_choices) + if candidate: + return candidate + return _normalize_choice(text, valid_choices) + + +def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None): + try: + model_choice = extract_choice(solution_str) + gold_choice = extract_choice(ground_truth) + return 1 if model_choice and gold_choice and model_choice == gold_choice else 0 + except Exception: + print(ground_truth, solution_str) + return 0 diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh new file mode 100644 index 00000000000..40eead0aa96 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -xeuo pipefail + + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} +VAL_FILES=${VAL_FILES:-} 
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} +PROJECT_NAME=${PROJECT_NAME:-rlvr} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo} + +NNODES=${PET_NNODES:-${WORLD_SIZE:-30}} +NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} +MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} +MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} +N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} + +RAY_PORT=${RAY_PORT:-6379} +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} +RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" +echo ">>> Ray 地址: $RAY_ADDRESS" + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN + +if [ "$NODE_RANK" -eq 0 ]; then + ray start --head \ + --node-ip-address="$MASTER_ADDR" \ + --port="$RAY_PORT" \ + --dashboard-port="$RAY_DASHBOARD_PORT" +else + ray start --address="$RAY_ADDRESS" --block & +fi + +# Give Ray a moment to settle +sleep 5 + +python3 $ENTRYPOINT \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.train_batch_size=2048 \ + data.max_prompt_length=2048 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((2048 + 1024)) \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.enforce_eager=True \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0.05 \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=-1 \ + trainer.test_freq=-1 \ + trainer.total_epochs=5 \ + trainer.use_legacy_worker_impl=disable \ + ray_kwargs.ray_init.address=$RAY_ADDRESS \ + custom_reward_function.path=./reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh new file mode 100644 index 00000000000..46b031b8d41 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export VERL_DISABLE_HARMONY=1 +export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} +PROJECT_NAME=${PROJECT_NAME:-rlvr} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_megatron} + +NNODES=${PET_NNODES:-${WORLD_SIZE:-28}} +NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} +MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} +MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} +N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} + +TP_SIZE=${TP_SIZE:-8} 
+PP_SIZE=${PP_SIZE:-1} + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +RAY_PORT=${RAY_PORT:-6379} +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} +RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} + +if [ "$NODE_RANK" -eq 0 ]; then + ray start --head \ + --node-ip-address="$MASTER_ADDR" \ + --port="$RAY_PORT" \ + --dashboard-port="$RAY_DASHBOARD_PORT" +else + ray start --address="$RAY_ADDRESS" --block + exit 0 +fi + +sleep 5 + +python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=512 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=224 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=224 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + 
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=100 \ + trainer.test_freq=100 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \ + custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \ + custom_reward_function.name=char_count_reward_function From 8ef1cb89b1a883a2be559c25dc6460ed052bebcd Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 5 Feb 2026 15:00:45 +0800 Subject: [PATCH 04/61] chore: adjust GRPO launch scripts and trainer defaults - add FSDP GRPO launcher with vLLM rollout settings - update Megatron launcher to keep workers running and log to W&B - increase Megatron NCCL timeout to 1200s - log validation generations by default in PPO trainer - remove legacy GRPO DLC script --- .../RLVR_ABCDE_dense/run_grpo_dlc.sh | 82 -------------- .../RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh | 101 ++++++++++++++++++ .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh | 4 +- verl/trainer/config/ppo_megatron_trainer.yaml | 2 +- verl/trainer/config/ppo_trainer.yaml | 2 +- 5 files changed, 105 insertions(+), 86 deletions(-) delete mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh deleted file mode 100644 index 40eead0aa96..00000000000 --- 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_dlc.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - - -ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} -TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} -VAL_FILES=${VAL_FILES:-} -MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} -PROJECT_NAME=${PROJECT_NAME:-rlvr} -EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo} - -NNODES=${PET_NNODES:-${WORLD_SIZE:-30}} -NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} -MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} -MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} -N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} - -RAY_PORT=${RAY_PORT:-6379} -RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} -RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} - -echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" -echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" -echo ">>> Ray 地址: $RAY_ADDRESS" - -export WANDB_MODE=offline -export NCCL_DEBUG=WARN - -if [ "$NODE_RANK" -eq 0 ]; then - ray start --head \ - --node-ip-address="$MASTER_ADDR" \ - --port="$RAY_PORT" \ - --dashboard-port="$RAY_DASHBOARD_PORT" -else - ray start --address="$RAY_ADDRESS" --block & -fi - -# Give Ray a moment to settle -sleep 5 - -python3 $ENTRYPOINT \ - algorithm.adv_estimator=grpo \ - data.train_files=$TRAIN_FILES \ - data.train_batch_size=2048 \ - data.max_prompt_length=2048 \ - data.max_response_length=1024 \ - data.filter_overlong_prompts=False \ - data.truncation='error' \ - actor_rollout_ref.model.path=$MODEL_ID \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=128 \ - actor_rollout_ref.actor.use_dynamic_bsz=True \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((2048 + 1024)) \ - actor_rollout_ref.actor.use_kl_loss=False \ - actor_rollout_ref.actor.kl_loss_coef=0.0 \ - 
actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.fsdp_config.param_offload=True \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.rollout.enforce_eager=True \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0.05 \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name=$PROJECT_NAME \ - trainer.experiment_name=$EXPERIMENT_NAME \ - trainer.val_before_train=False \ - trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ - trainer.nnodes=$NNODES \ - trainer.save_freq=-1 \ - trainer.test_freq=-1 \ - trainer.total_epochs=5 \ - trainer.use_legacy_worker_impl=disable \ - ray_kwargs.ray_init.address=$RAY_ADDRESS \ - custom_reward_function.path=./reward_function.py \ - custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh new file mode 100644 index 00000000000..42786ea6300 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export VERL_DISABLE_HARMONY=1 +export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} +PROJECT_NAME=${PROJECT_NAME:-rlvr} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_fsdp} + +NNODES=${PET_NNODES:-${WORLD_SIZE:-28}} 
+NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} +MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} +MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} +N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} + +FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2} +FSDP_SIZE=${FSDP_SIZE:-8} +ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False} +REF_OFFLOAD=${REF_OFFLOAD:-False} +CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +RAY_PORT=${RAY_PORT:-6379} +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} +RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} + +if [ "$NODE_RANK" -eq 0 ]; then + ray start --head \ + --node-ip-address="$MASTER_ADDR" \ + --port="$RAY_PORT" \ + --dashboard-port="$RAY_DASHBOARD_PORT" +else + ray start --address="$RAY_ADDRESS" + exit 0 +fi + +sleep 5 + +python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=512 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=224 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=224 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + 
actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + critic.strategy=$FSDP_STRATEGY \ + critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ + critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ + critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=100 \ + trainer.test_freq=100 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \ + custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh index 46b031b8d41..c3b0bff0457 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh @@ -36,7 +36,7 @@ if [ "$NODE_RANK" -eq 0 ]; then --port="$RAY_PORT" \ --dashboard-port="$RAY_DASHBOARD_PORT" else - ray start --address="$RAY_ADDRESS" --block + ray start 
--address="$RAY_ADDRESS" exit 0 fi @@ -76,7 +76,7 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ - trainer.logger='["console","tensorboard"]' \ + trainer.logger='["console","wandb"]' \ trainer.project_name=$PROJECT_NAME \ trainer.experiment_name=$EXPERIMENT_NAME \ trainer.val_before_train=False \ diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 76ba4c57575..3c2505592ad 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -26,7 +26,7 @@ defaults: actor_rollout_ref: hybrid_engine: True - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron + nccl_timeout: 1200 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron model: override_config: diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index 7489b522fa2..a9cf8e6c650 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -141,7 +141,7 @@ trainer: logger: ["console", "wandb"] # Number of generations to log during validation - log_val_generations: 0 + log_val_generations: 10 # Directory for logging rollout data; no dump if null rollout_data_dir: null From 3c3288c2377b97b7c8e3b1897e191a252b1fe8cf Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 5 Feb 2026 16:17:58 +0800 Subject: [PATCH 05/61] feat: add single-node Megatron GRPO launcher - add single-node 8xGPU Megatron GRPO script with TP/PP=1 - tune batch sizes and validation defaults for single-node runs - update existing GRPO launch scripts to match latest paths/settings --- 
.../RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh | 2 - .../RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh | 12 ++- .../run_grpo_megatron_single_node.sh | 74 +++++++++++++++++++ 3 files changed, 79 insertions(+), 9 deletions(-) create mode 100755 recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh index 42786ea6300..6ab8523d75b 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh @@ -4,7 +4,6 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 -export VERL_DISABLE_HARMONY=1 export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-} ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} @@ -96,6 +95,5 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co trainer.total_epochs=5 \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ - +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \ custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh index c3b0bff0457..1b0659fc7d0 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh @@ -4,14 +4,13 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 -export VERL_DISABLE_HARMONY=1 -export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-} +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} 
-TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} -MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} -PROJECT_NAME=${PROJECT_NAME:-rlvr} -EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_megatron} +TRAIN_FILES=${TRAIN_FILES:-/llm-alignment/liuchonghan/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron} NNODES=${PET_NNODES:-${WORLD_SIZE:-28}} NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} @@ -87,6 +86,5 @@ python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/co trainer.total_epochs=5 \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ - +ray_kwargs.ray_init.runtime_env.env_vars.VERL_DISABLE_HARMONY='"1"' \ custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/rlvr_72b/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh new file mode 100755 index 00000000000..141cca62476 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single} + +# Single node, 8 GPUs +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=127.0.0.1 +MASTER_PORT=${MASTER_PORT:-23457} +N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} + +TP_SIZE=1 +PP_SIZE=1 + 
+rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=2048 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=32 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + 
trainer.nnodes=$NNODES \ + trainer.save_freq=300 \ + trainer.test_freq=300 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.num_cpus=32 \ + custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function From c79bebee0b34bbd0f539436e1671f2ac197abe24 Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 5 Feb 2026 16:28:14 +0800 Subject: [PATCH 06/61] chore: run single-node GRPO in W&B offline mode - set WANDB_MODE=offline in single-node Megatron script - avoid proxy failures during W&B logging --- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 141cca62476..a91b644c315 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -4,6 +4,7 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 +export WANDB_MODE=offline export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} @@ -12,7 +13,6 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single} -# Single node, 8 GPUs NNODES=1 NODE_RANK=0 MASTER_ADDR=127.0.0.1 From 56ba579ad720b0e72c9276dc9dd7f38537824b98 Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 5 Feb 2026 17:07:27 +0800 Subject: [PATCH 07/61] chore: lower single-node GRPO memory footprint - reduce batch sizes and sequence lengths for Megatron single-node - align FSDP single-node script with safer rollout settings - keep vLLM utilization low for constrained free memory --- .../run_grpo_fsdp_single_node.sh | 83 +++++++++++++++++++ 
.../run_grpo_megatron_single_node.sh | 14 ++-- 2 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh new file mode 100644 index 00000000000..1850e7ecf99 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export WANDB_MODE=offline +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} + +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=127.0.0.1 +MASTER_PORT=${MASTER_PORT:-23457} +N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} + +FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2} +FSDP_SIZE=${FSDP_SIZE:-8} +ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False} +REF_OFFLOAD=${REF_OFFLOAD:-False} +CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=2048 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=16 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + 
actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \ + actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + critic.strategy=$FSDP_STRATEGY \ + critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ + critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ + critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=300 \ + trainer.test_freq=300 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.num_cpus=32 \ + custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index a91b644c315..9ac4d5f94df 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -33,16 +33,16 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_files=$TRAIN_FILES \ data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ - data.train_batch_size=32 \ - data.max_prompt_length=1024 \ - data.max_response_length=1024 \ + data.train_batch_size=16 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ data.filter_overlong_prompts=False \ data.truncation='error' \ actor_rollout_ref.model.path=$MODEL_ID \ actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=32 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.actor.use_kl_loss=False \ @@ -53,8 +53,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \ + actor_rollout_ref.rollout.n=4 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ From 8e8deedc99e83aba78e6e51aa9ffa0439b5f0a3c Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 5 Feb 2026 17:13:34 
+0800 Subject: [PATCH 08/61] chore: tune vLLM rollout memory for single-node - raise vLLM gpu_memory_utilization to 0.30 for KV cache - lower rollout.n and cap max batched tokens for stability - apply settings to both Megatron and FSDP single-node scripts --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 +++-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 1850e7ecf99..15bf94ced00 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -58,8 +58,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \ - actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ + actor_rollout_ref.rollout.n=2 \ + actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 9ac4d5f94df..03c472be798 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -53,8 +53,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - 
actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \ - actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ + actor_rollout_ref.rollout.n=2 \ + actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ From cfafe22b018aa49ffa5e5097ab3207d016847b56 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:32:23 +0800 Subject: [PATCH 09/61] Update GRPO scripts for 4-node Ray --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 7 ++++--- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 15bf94ced00..d13f49f6a6a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,9 +13,9 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} -NNODES=1 -NODE_RANK=0 -MASTER_ADDR=127.0.0.1 +NNODES=${NNODES:-4} +NODE_RANK=${NODE_RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} @@ -61,6 +61,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ actor_rollout_ref.rollout.n=2 \ actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ + actor_rollout_ref.rollout.max_model_len=8192 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ diff --git 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 03c472be798..915fda505e9 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,9 +13,9 @@ MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single} -NNODES=1 -NODE_RANK=0 -MASTER_ADDR=127.0.0.1 +NNODES=${NNODES:-4} +NODE_RANK=${NODE_RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} @@ -56,6 +56,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ actor_rollout_ref.rollout.n=2 \ actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ + actor_rollout_ref.rollout.max_model_len=8192 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ From 4b005cb58617d73ffc6057f81d98d25933b3e916 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:36:33 +0800 Subject: [PATCH 10/61] Use Ray address for existing cluster --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 ++- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index d13f49f6a6a..9e4247d4a9a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -28,6 +28,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} rollout_mode=${ROLLOUT_MODE:-async} 
USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} +RAY_ADDRESS=${RAY_ADDRESS:-auto} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ @@ -80,6 +81,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.save_freq=300 \ trainer.test_freq=300 \ trainer.total_epochs=5 \ - +ray_kwargs.ray_init.num_cpus=32 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 915fda505e9..c4d730e0e72 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -25,6 +25,7 @@ PP_SIZE=1 rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} +RAY_ADDRESS=${RAY_ADDRESS:-auto} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_megatron_trainer.yaml' \ @@ -71,6 +72,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.save_freq=300 \ trainer.test_freq=300 \ trainer.total_epochs=5 \ - +ray_kwargs.ray_init.num_cpus=32 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 787a9eb3f2caf924896f382f0cc97092889196fc Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:38:09 +0800 Subject: [PATCH 11/61] Add Ray runtime_env for code import --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 +++ 
.../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 9e4247d4a9a..663198e4c5c 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -29,6 +29,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-auto} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ @@ -82,5 +83,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.test_freq=300 \ trainer.total_epochs=5 \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index c4d730e0e72..08f0dc1287c 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -26,6 +26,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-auto} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_megatron_trainer.yaml' \ @@ -73,5 +74,7 
@@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.test_freq=300 \ trainer.total_epochs=5 \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 4f360e1673db63dd615e0d76baf4c62c8271bc01 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:42:35 +0800 Subject: [PATCH 12/61] Set socket IFNAME and increase batch size --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 +++- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 663198e4c5c..10ce6d8e2a6 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -6,6 +6,8 @@ export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -38,7 +40,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_files=$TRAIN_FILES \ data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ - data.train_batch_size=16 \ + data.train_batch_size=64 \ data.max_prompt_length=512 \ data.max_response_length=512 \ data.filter_overlong_prompts=False \ diff --git 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 08f0dc1287c..856ec593c93 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -6,6 +6,8 @@ export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -35,7 +37,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_files=$TRAIN_FILES \ data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ - data.train_batch_size=16 \ + data.train_batch_size=64 \ data.max_prompt_length=512 \ data.max_response_length=512 \ data.filter_overlong_prompts=False \ From 5acfc8763362e4e94f8e485deb3d7ab90239b307 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:52:46 +0800 Subject: [PATCH 13/61] Propagate env to Ray workers and adjust batch --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 10ce6d8e2a6..2cda3cc18bd 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -87,5 +87,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ 
+ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ + +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 856ec593c93..8083bfc3fec 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -78,5 +78,9 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ + +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From d41a15780ce2062ad2ba57596f60606d96b0235d Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 15:54:33 +0800 Subject: [PATCH 14/61] Quote MASTER_PORT in Ray runtime env --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 
2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 2cda3cc18bd..55221bdb32a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -88,7 +88,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ - +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 8083bfc3fec..c23b6584bd8 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -79,7 +79,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ - +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ 
+ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ From 6fa835f7462fadaadb792d51e65d7260d31a1a58 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 16:02:11 +0800 Subject: [PATCH 15/61] Add WANDB proxy env vars to RLVR scripts --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 +++++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 55221bdb32a..4dca55e595d 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -5,6 +5,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline +export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} +export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index c23b6584bd8..44bec63f62e 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -5,6 +5,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline +export 
WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} +export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} From 069746fed6fcc3a954942ed47e6f359e9fda0a9d Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 16:05:43 +0800 Subject: [PATCH 16/61] Remove WANDB proxy envs and keep offline mode --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 ----- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 5 ----- 2 files changed, 10 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 4dca55e595d..55221bdb32a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -5,11 +5,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline -export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 44bec63f62e..c23b6584bd8 100755 --- 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -5,11 +5,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=offline -export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} From afa0f415668503713ba00ec0748e0329be068906 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 16:15:32 +0800 Subject: [PATCH 17/61] Enable WANDB logging via proxy in RLVR scripts --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 13 ++++++++++++- .../run_grpo_megatron_single_node.sh | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 55221bdb32a..51a9be58c37 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -4,7 +4,12 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 -export WANDB_MODE=offline +export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} +export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export 
PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} @@ -91,5 +96,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index c23b6584bd8..605b23d754c 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -4,7 +4,12 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 -export WANDB_MODE=offline +export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} +export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} 
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} @@ -82,5 +87,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 042c4c8bb48ad778f7165785fb9c354366d5e050 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 16:48:32 +0800 Subject: [PATCH 18/61] Increase max prompt length to 2048 --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 51a9be58c37..7189bbc30c3 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -46,7 +46,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ data.train_batch_size=64 \ - data.max_prompt_length=512 \ + data.max_prompt_length=2048 \ data.max_response_length=512 
\ data.filter_overlong_prompts=False \ data.truncation='error' \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 605b23d754c..5862c1d4829 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -43,7 +43,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ data.train_batch_size=64 \ - data.max_prompt_length=512 \ + data.max_prompt_length=2048 \ data.max_response_length=512 \ data.filter_overlong_prompts=False \ data.truncation='error' \ From 232e77a9e59633892e80e55dc9d3997afe83bc2b Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 17:52:47 +0800 Subject: [PATCH 19/61] Tune RLVR GRPO configs for LR decay and larger rollout batches --- .../run_grpo_fsdp_single_node.sh | 21 ++++++++++++------- .../run_grpo_megatron_single_node.sh | 21 ++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 7189bbc30c3..e14abbb7c64 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -37,6 +37,9 @@ USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-auto} RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} +ACTOR_LR=${ACTOR_LR:-1e-6} +MIN_LR=${MIN_LR:-1e-7} +LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ @@ -47,7 +50,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.return_raw_chat=$RETURN_RAW_CHAT \ 
data.train_batch_size=64 \ data.max_prompt_length=2048 \ - data.max_response_length=512 \ + data.max_response_length=2048 \ data.filter_overlong_prompts=False \ data.truncation='error' \ actor_rollout_ref.model.path=$MODEL_ID \ @@ -56,24 +59,26 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=16 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ + actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ + actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.kl_loss_coef=0.0 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ - actor_rollout_ref.rollout.n=2 \ - actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.max_num_batched_tokens=81920 \ actor_rollout_ref.rollout.max_model_len=8192 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ critic.strategy=$FSDP_STRATEGY \ 
critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 5862c1d4829..5605525960d 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -34,6 +34,9 @@ USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-auto} RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} +ACTOR_LR=${ACTOR_LR:-1e-6} +MIN_LR=${MIN_LR:-1e-7} +LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_megatron_trainer.yaml' \ @@ -44,29 +47,31 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.return_raw_chat=$RETURN_RAW_CHAT \ data.train_batch_size=64 \ data.max_prompt_length=2048 \ - data.max_response_length=512 \ + data.max_response_length=2048 \ data.filter_overlong_prompts=False \ data.truncation='error' \ actor_rollout_ref.model.path=$MODEL_ID \ actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=16 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ + actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ + actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.kl_loss_coef=0.0 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ 
actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ - actor_rollout_ref.rollout.n=2 \ - actor_rollout_ref.rollout.max_num_batched_tokens=4096 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.max_num_batched_tokens=81920 \ actor_rollout_ref.rollout.max_model_len=8192 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ algorithm.use_kl_in_reward=False \ From 2e10ab5e0a8465d1c9451b69ed96bfa67c38fd0a Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 18:03:52 +0800 Subject: [PATCH 20/61] Align FSDP and Megatron rollout settings --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 6 +++--- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index e14abbb7c64..4a4d096753b 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -72,10 +72,10 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ actor_rollout_ref.rollout.n=16 \ - 
actor_rollout_ref.rollout.max_num_batched_tokens=81920 \ - actor_rollout_ref.rollout.max_model_len=8192 \ + actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ + actor_rollout_ref.rollout.max_model_len=4096 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 5605525960d..b6cb00ed073 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -67,10 +67,10 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.30 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.max_num_batched_tokens=81920 \ - actor_rollout_ref.rollout.max_model_len=8192 \ + actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ + actor_rollout_ref.rollout.max_model_len=4096 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ From 070f9f64d8c3eb66ab27317de532e89200012402 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 18:09:32 +0800 Subject: [PATCH 21/61] Lower vLLM GPU memory utilization to 0.35 --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 4a4d096753b..085340cd9d3 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -72,7 +72,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ actor_rollout_ref.rollout.max_model_len=4096 \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index b6cb00ed073..09e0a24834e 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -67,7 +67,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ actor_rollout_ref.rollout.max_model_len=4096 \ From a9d04070390578ecfd3c6eede6e7dae93e332874 Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 6 Feb 2026 18:19:20 +0800 Subject: [PATCH 22/61] Reduce rollout memory pressure while keeping n=16 --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 12 ++++++------ .../run_grpo_megatron_single_node.sh | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 085340cd9d3..1658725fb2f 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -50,7 +50,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.return_raw_chat=$RETURN_RAW_CHAT \ data.train_batch_size=64 \ data.max_prompt_length=2048 \ - data.max_response_length=2048 \ + data.max_response_length=1024 \ data.filter_overlong_prompts=False \ data.truncation='error' \ actor_rollout_ref.model.path=$MODEL_ID \ @@ -63,22 +63,22 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.kl_loss_coef=0.0 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ - actor_rollout_ref.rollout.max_model_len=4096 \ + actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ + actor_rollout_ref.rollout.max_model_len=3072 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ 
critic.strategy=$FSDP_STRATEGY \ critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 09e0a24834e..2ad36a0bd4f 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -47,7 +47,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.return_raw_chat=$RETURN_RAW_CHAT \ data.train_batch_size=64 \ data.max_prompt_length=2048 \ - data.max_response_length=2048 \ + data.max_response_length=1024 \ data.filter_overlong_prompts=False \ data.truncation='error' \ actor_rollout_ref.model.path=$MODEL_ID \ @@ -56,22 +56,22 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.kl_loss_coef=0.0 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ - 
actor_rollout_ref.rollout.max_model_len=4096 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ + actor_rollout_ref.rollout.max_model_len=3072 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ algorithm.use_kl_in_reward=False \ From 06bf1743dd3979f2a970c378ab21092af26f0278 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 14:42:24 +0800 Subject: [PATCH 23/61] Align FSDP GRPO config and add Qwen3 recipes --- .../run_sft_qwen3moe_235b_a22b_megatron.sh | 113 ++++++++++++++++++ .../run_sft_qwen3moe_30b_a3b_megatron.sh | 113 ++++++++++++++++++ .../run_grpo_fsdp_single_node.sh | 14 ++- .../run_grpo_megatron_single_node.sh | 15 +-- 4 files changed, 242 insertions(+), 13 deletions(-) create mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh create mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh new file mode 100644 index 00000000000..39d84d8beab --- /dev/null +++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_235ba22b_2507 +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + 
+PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + 
--master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=256 \ + data.max_length=1024 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=10240 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.router_dtype="float16" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=1000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=1 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=2 \ + 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh new file mode 100644 index 00000000000..a45209ffcc1 --- /dev/null +++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_235ba22b_2507 +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + 
optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=512 \ + data.max_length=4096 \ + 
data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=24576 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.output_router_logits=True \ + +model.override_config.router_dtype="float32" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=1000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=1 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=1 \ + 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' \ No newline at end of file diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 1658725fb2f..ba41dc99650 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,6 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} +export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -22,7 +23,7 @@ EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} NNODES=${NNODES:-4} NODE_RANK=${NODE_RANK:-0} -MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} +MASTER_ADDR=${MASTER_ADDR:-10.178.131.202} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} @@ -48,8 +49,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_files=$TRAIN_FILES \ 
data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ - data.train_batch_size=64 \ - data.max_prompt_length=2048 \ + data.train_batch_size=32 \ + data.max_prompt_length=1024 \ data.max_response_length=1024 \ data.filter_overlong_prompts=False \ data.truncation='error' \ @@ -62,7 +63,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=False \ actor_rollout_ref.actor.kl_loss_coef=0.0 \ @@ -74,8 +75,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ - actor_rollout_ref.rollout.max_model_len=3072 \ + actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ + actor_rollout_ref.rollout.max_model_len=2048 \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ @@ -107,5 +108,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.RAY_TMPDIR=$RAY_TMPDIR \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 2ad36a0bd4f..b8f92c11202 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,6 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} +export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -26,7 +27,7 @@ MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} -TP_SIZE=1 +TP_SIZE=4 PP_SIZE=1 rollout_mode=${ROLLOUT_MODE:-async} @@ -45,8 +46,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c data.val_files=$TRAIN_FILES \ data.val_max_samples=2048 \ data.return_raw_chat=$RETURN_RAW_CHAT \ - data.train_batch_size=64 \ - data.max_prompt_length=2048 \ + data.train_batch_size=32 \ + data.max_prompt_length=1024 \ data.max_response_length=1024 \ data.filter_overlong_prompts=False \ data.truncation='error' \ @@ -55,7 +56,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ @@ -69,8 +70,8 @@ python3 $ENTRYPOINT 
--config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.mode=$rollout_mode \ actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ actor_rollout_ref.rollout.n=16 \ - actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ - actor_rollout_ref.rollout.max_model_len=3072 \ + actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ + actor_rollout_ref.rollout.max_model_len=2048 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ @@ -99,4 +100,4 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ - custom_reward_function.name=char_count_reward_function + custom_reward_function.name=char_count_reward_function \ No newline at end of file From 888ece5a1b9334dae5508df5b058eeb222446e44 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 14:47:11 +0800 Subject: [PATCH 24/61] Fix FSDP min_lr override --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index ba41dc99650..15548606d44 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -61,7 +61,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ - 
actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ + +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ From 470995b6f072c4751ac5d1849b6e33eccb4d90b8 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 14:49:08 +0800 Subject: [PATCH 25/61] Fix FSDP lr_decay_style override --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 15548606d44..d72aeaab182 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -62,7 +62,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ - actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ + +actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=False \ From 3de34930f3ad6d7123e8a79e95aa4aeb640669dd Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:02:33 +0800 Subject: [PATCH 26/61] Align FSDP Ray settings with Megatron --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index d72aeaab182..9690dc85fbb 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -23,7 +23,7 @@ EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} NNODES=${NNODES:-4} NODE_RANK=${NODE_RANK:-0} -MASTER_ADDR=${MASTER_ADDR:-10.178.131.202} +MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} @@ -108,6 +108,5 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.RAY_TMPDIR=$RAY_TMPDIR \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 23151041f29e69fbce71187d1cd538c1dcde4224 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:11:25 +0800 Subject: [PATCH 27/61] Unset Ray socket env vars before launch --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 9690dc85fbb..be54472b495 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -1,6 +1,10 @@ #!/usr/bin/env bash set -xeuo pipefail +unset RAYLET_SOCKET_NAME +unset PLASMA_STORE_SOCKET_NAME +unset RAY_SESSION_DIR + export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index b8f92c11202..9a66a9c033a 100755 --- 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -1,6 +1,10 @@ #!/usr/bin/env bash set -xeuo pipefail +unset RAYLET_SOCKET_NAME +unset PLASMA_STORE_SOCKET_NAME +unset RAY_SESSION_DIR + export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 @@ -100,4 +104,4 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ - custom_reward_function.name=char_count_reward_function \ No newline at end of file + custom_reward_function.name=char_count_reward_function From ed816dc54947ce8dcded5830332a686fedbe1dfb Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:23:39 +0800 Subject: [PATCH 28/61] =?UTF-8?q?RLVR=5FABCDE=5Fdense:=20=E5=AF=B9?= =?UTF-8?q?=E9=BD=90=20FSDP/Megatron=20=E6=B6=88=E8=9E=8D=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E4=B8=8E=E5=A4=9A=E8=8A=82=E7=82=B9=20checkpoint=20?= =?UTF-8?q?=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Cursor --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 7 ++++++- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index be54472b495..dde1d2da2b3 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -24,6 +24,8 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} 
PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} +# 与 Megatron 一致:多节点时 checkpoint 需写共享目录,用绝对路径 +DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}} NNODES=${NNODES:-4} NODE_RANK=${NODE_RANK:-0} @@ -45,6 +47,8 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} +# 与 Megatron 一致,消融实验用同一显存占用 +GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ @@ -77,7 +81,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ + actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ actor_rollout_ref.rollout.max_model_len=2048 \ @@ -93,6 +97,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.logger='["console","wandb"]' \ trainer.project_name=$PROJECT_NAME \ trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.default_local_dir=$DEFAULT_LOCAL_DIR \ trainer.val_before_train=True \ trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ trainer.nnodes=$NNODES \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 9a66a9c033a..05bce754450 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -24,6 +24,7 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} 
MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single} +DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}} NNODES=${NNODES:-4} NODE_RANK=${NODE_RANK:-0} @@ -42,6 +43,8 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} +# 与 FSDP 一致,消融实验用同一显存占用 +GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_megatron_trainer.yaml' \ @@ -72,7 +75,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.mode=$rollout_mode \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ + actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ actor_rollout_ref.rollout.max_model_len=2048 \ @@ -84,6 +87,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.logger='["console","wandb"]' \ trainer.project_name=$PROJECT_NAME \ trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.default_local_dir=$DEFAULT_LOCAL_DIR \ trainer.val_before_train=True \ trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ trainer.nnodes=$NNODES \ From e562bfb9f06da26ee008cc73581700a10b194294 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:27:31 +0800 Subject: [PATCH 29/61] =?UTF-8?q?RLVR:=20ray.init=20=5Ftemp=5Fdir=20?= =?UTF-8?q?=E6=8C=87=E5=90=91=20RAY=5FTMPDIR=20=E9=81=BF=E5=85=8D=20/tmp?= =?UTF-8?q?=20=E7=A3=81=E7=9B=98=E9=85=8D=E9=A2=9D=E4=B8=8D=E8=B6=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Cursor --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 +-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index dde1d2da2b3..14b30835619 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -24,7 +24,6 @@ TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} -# 与 Megatron 一致:多节点时 checkpoint 需写共享目录,用绝对路径 DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}} NNODES=${NNODES:-4} @@ -47,7 +46,6 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} -# 与 Megatron 一致,消融实验用同一显存占用 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ @@ -104,6 +102,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.save_freq=300 \ trainer.test_freq=300 \ trainer.total_epochs=5 \ + +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 05bce754450..84af56a5ba6 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -43,7 +43,6 @@ RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} -# 与 FSDP 一致,消融实验用同一显存占用 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ @@ -94,6 +93,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.save_freq=300 \ trainer.test_freq=300 \ trainer.total_epochs=5 \ + +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ From 21c9dd8bc6000a24e7d65ea72e0bd4004c8495b1 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:49:27 +0800 Subject: [PATCH 30/61] Fix ray address and master port in launch scripts --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ---- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 8 ++------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 14b30835619..1f3638d06c7 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -1,10 +1,6 @@ #!/usr/bin/env bash set -xeuo pipefail -unset RAYLET_SOCKET_NAME -unset PLASMA_STORE_SOCKET_NAME -unset RAY_SESSION_DIR - export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 84af56a5ba6..2ec95d226f0 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -1,10 +1,6 @@ #!/usr/bin/env bash set -xeuo pipefail -unset RAYLET_SOCKET_NAME -unset PLASMA_STORE_SOCKET_NAME -unset RAY_SESSION_DIR - export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 @@ -38,7 +34,7 @@ PP_SIZE=1 rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} -RAY_ADDRESS=${RAY_ADDRESS:-auto} +RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} @@ -98,7 +94,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ - +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ From 7ab6ed61c8e6a81048dc036468db9f206741dd68 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 15:52:20 +0800 Subject: [PATCH 31/61] Fix Ray env var types for master port --- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 2ec95d226f0..dd291d20ab6 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -94,7 +94,7 @@ python3 $ENTRYPOINT 
--config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ - +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=$MASTER_PORT \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ From fd018b69ef754d483e3bb2883e6542d49ce2da17 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:01:13 +0800 Subject: [PATCH 32/61] Update RLVR launch scripts --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 1f3638d06c7..564e48b20eb 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp +export RAY_TMPDIR=/hbox2dir/ray_tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index dd291d20ab6..e2a820c0328 100755 --- 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/llm-align/liuchonghan/ray_tmp +export RAY_TMPDIR=/hbox2dir/ray_tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} From 52fc39ac2c2a1d3a7b07344f4255ca2eae3ad020 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:03:13 +0800 Subject: [PATCH 33/61] Set explicit Ray address for FSDP launch --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 564e48b20eb..14cc639e75a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -37,7 +37,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} -RAY_ADDRESS=${RAY_ADDRESS:-auto} +RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} From 9b26af596fef7ce396097240dbec0774a031c64d Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:03:55 +0800 Subject: [PATCH 34/61] Update FSDP Ray head address --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 14cc639e75a..b27f8ebedcc 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -37,7 +37,7 @@ CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} -RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} +RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} From 11a2cd1aeb74f053b46aee8a59f7589874391f97 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:06:54 +0800 Subject: [PATCH 35/61] Shorten Ray temp and working dir paths --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index b27f8ebedcc..8d2e0630fea 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/hbox2dir/ray_tmp +export RAY_TMPDIR=/hbox2dir/r ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -38,7 +38,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} 
-RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index e2a820c0328..62f037890b7 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,7 +13,7 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/hbox2dir/ray_tmp +export RAY_TMPDIR=/hbox2dir/r ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -35,7 +35,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} -RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/verl_lao} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} From 34ad8385fc7cd1b50547b9375d9be3a018c81489 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:12:31 +0800 Subject: [PATCH 36/61] Avoid Ray working_dir packaging to shorten IPC paths --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 +-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 8d2e0630fea..2651e9d0a30 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh 
+++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -100,8 +100,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.total_epochs=5 \ +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ - +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ - +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 62f037890b7..6d87f7519d5 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -91,8 +91,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c trainer.total_epochs=5 \ +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ +ray_kwargs.ray_init.address=$RAY_ADDRESS \ - +ray_kwargs.ray_init.runtime_env.working_dir=$RAY_WORKING_DIR \ - +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=$RAY_WORKING_DIR:${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ From b25034e8f1be61ffe9c628cb8070eba7420964ef Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:16:43 +0800 Subject: [PATCH 37/61] Use user-owned short paths for Ray temp and work dirs --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 5 
+++-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 2651e9d0a30..7afec051a12 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,7 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/hbox2dir/r +export RAY_TMPDIR=/llm-align/liuchonghan/r +export TMPDIR=/llm-align/liuchonghan/tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -38,7 +39,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} -RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 6d87f7519d5..033a13f7da9 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,7 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/hbox2dir/r +export RAY_TMPDIR=/llm-align/liuchonghan/r +export TMPDIR=/llm-align/liuchonghan/tmp 
ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} @@ -35,7 +36,7 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} -RAY_WORKING_DIR=${RAY_WORKING_DIR:-/hbox2dir/w} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} From fff6f0900c039e6df9768c1f829840e87e7d5811 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:25:39 +0800 Subject: [PATCH 38/61] Move Ray temp and TMPDIR to /dev/shm --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 7afec051a12..3b2dfff4cf3 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,8 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/llm-align/liuchonghan/r -export TMPDIR=/llm-align/liuchonghan/tmp +export RAY_TMPDIR=/dev/shm/ray +export TMPDIR=/dev/shm/tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 033a13f7da9..b960a5e0906 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,8 +13,8 @@ export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} -export RAY_TMPDIR=/llm-align/liuchonghan/r -export TMPDIR=/llm-align/liuchonghan/tmp +export RAY_TMPDIR=/dev/shm/ray +export TMPDIR=/dev/shm/tmp ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} From 568690f420634e6d3ca1005a687473d3e6dd3f61 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:28:27 +0800 Subject: [PATCH 39/61] Pass TMPDIR to Ray runtime env --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 1 + recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 3b2dfff4cf3..44443beab7e 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -107,6 +107,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh 
index b960a5e0906..14e19b718fc 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -98,6 +98,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ From d736fa306887763695175c1de8c38ae45acaeef1 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:33:10 +0800 Subject: [PATCH 40/61] Set WANDB_DIR to shared path for Ray workers --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 ++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 44443beab7e..f898c7d3477 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} @@ -107,6 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c 
+ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 14e19b718fc..b56b8915a0d 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} @@ -98,6 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ From a7d74602dcf35ed7075a8d34ded48fd3bfdf2a47 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:41:13 +0800 Subject: [PATCH 41/61] Disable Gloo 
IPv6 in RLVR launch scripts --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 ++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index f898c7d3477..cee7f208e54 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,6 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export GLOO_IPV6=${GLOO_IPV6:-0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp @@ -107,6 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index b56b8915a0d..d0e37bd952c 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,6 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} 
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export GLOO_IPV6=${GLOO_IPV6:-0} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp @@ -98,6 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ From 1e9e40b4aaa62f8b0194867edf6a6358a9616770 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:42:39 +0800 Subject: [PATCH 42/61] Ensure GLOO_IPV6 is passed as string --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index cee7f208e54..7a53ffcaf0a 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -13,7 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} -export GLOO_IPV6=${GLOO_IPV6:-0} +export GLOO_IPV6=${GLOO_IPV6:-"0"} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index d0e37bd952c..1b70f1ee86b 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,7 +13,7 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} -export GLOO_IPV6=${GLOO_IPV6:-0} +export GLOO_IPV6=${GLOO_IPV6:-"0"} export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp From fb012a9e3270aa87a2df76ce36e82c6b0c340140 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 16:44:29 +0800 Subject: [PATCH 43/61] Quote GLOO_IPV6 for Ray runtime env --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 +- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 7a53ffcaf0a..839b9ab4874 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -108,7 +108,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ - +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ 
+ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 1b70f1ee86b..bd20150c868 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -99,7 +99,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ - +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=$GLOO_IPV6 \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ From 23098c06a8c46d283cf63ebf1be74d6c006baf4c Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 17:11:30 +0800 Subject: [PATCH 44/61] Fix FSDP optimizer overrides --- .../run_grpo_fsdp_single_node.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 839b9ab4874..90e375dd528 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -41,12 +41,23 @@ rollout_mode=${ROLLOUT_MODE:-async} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} -RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w} ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} -LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} 
+LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} # constant|cosine GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} +# FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`). +# Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided. +MIN_LR_RATIO=${MIN_LR_RATIO:-} +if [[ -z "${MIN_LR_RATIO}" ]]; then + MIN_LR_RATIO=$(python3 - <<PY +lr = float("${ACTOR_LR}") +min_lr = float("${MIN_LR}") +print(min_lr / lr if lr > 0 else 0.0) +PY +) +fi + python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ algorithm.adv_estimator=grpo \ @@ -66,8 +77,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ - +actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ - +actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ + +actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \ + +actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=False \ From 603824deaa18356720001a88185c824157939187 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 17:13:09 +0800 Subject: [PATCH 45/61] Fix Hydra overrides for FSDP optimizer --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 90e375dd528..f53317e200d 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -43,7 +43,7 @@ RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} ACTOR_LR=${ACTOR_LR:-1e-6}
MIN_LR=${MIN_LR:-1e-7} -LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} # constant|cosine +LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} # FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`). @@ -77,8 +77,8 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ - +actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \ - +actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \ + actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \ + actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.use_kl_loss=False \ From b8fba05419af0c9450d2b1ccbc756491bc668a64 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 17:23:43 +0800 Subject: [PATCH 46/61] Pass WANDB_API_KEY to Ray runtime env --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index f53317e200d..ed5cb78c173 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -5,6 +5,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_API_KEY=${WANDB_API_KEY:-} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} @@ -121,6 
+122,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_API_KEY=$WANDB_API_KEY \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ From 253fe3fcac9b2d81559da07ed858527c15141983 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 17:40:06 +0800 Subject: [PATCH 47/61] Add JSON-to-parquet converter for VERL SFT --- scripts/json_qa_to_verl_sft_parquet.py | 192 +++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 scripts/json_qa_to_verl_sft_parquet.py diff --git a/scripts/json_qa_to_verl_sft_parquet.py b/scripts/json_qa_to_verl_sft_parquet.py new file mode 100644 index 00000000000..e475db2e361 --- /dev/null +++ b/scripts/json_qa_to_verl_sft_parquet.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +""" +Convert a QA-style dataset (JSON array or JSONL) into a VERL SFT parquet file. + +Input item example: + {"question": "...", "response": "..."} + +Output schemas: +- single_turn: columns `question` and `answer` (strings) + Use with `verl/trainer/config/sft_trainer.yaml` defaults: + data.prompt_key=question + data.response_key=answer + +- messages: column `messages` (list of {role, content}) + Use with `verl/trainer/config/sft_trainer_engine.yaml` (MultiTurnSFTDataset): + data.messages_key=messages +""" + +from __future__ import annotations + +import argparse +import json +import os +from typing import Any, Dict, Iterator, List, Optional + + +def iter_items(path: str) -> Iterator[Dict[str, Any]]: + """ + Iterate items from either: + - JSON array file: [ {...}, {...}, ... 
] + - JSONL file: one JSON object per line + + For huge JSON arrays, install `ijson` to stream: + pip install ijson + """ + try: + import ijson # type: ignore + except Exception: + ijson = None + + with open(path, "rb") as f: + # Peek the first non-whitespace byte. + first = None + while True: + b = f.read(1) + if not b: + break + if b not in b" \t\r\n": + first = b + break + f.seek(0) + + if first == b"[": + if ijson is None: + data = json.load(f) + if not isinstance(data, list): + raise ValueError(f"Expected a JSON array in {path}") + for obj in data: + if not isinstance(obj, dict): + raise ValueError(f"Expected dict items, got {type(obj)}") + yield obj + return + + for obj in ijson.items(f, "item"): + if not isinstance(obj, dict): + raise ValueError(f"Expected dict items, got {type(obj)}") + yield obj + return + + # JSONL fallback + for line in f: + line = line.strip() + if not line: + continue + obj = json.loads(line) + if not isinstance(obj, dict): + raise ValueError(f"Expected dict items, got {type(obj)}") + yield obj + + +def make_row( + item: Dict[str, Any], + *, + input_key: str, + output_key: str, + out_format: str, + system_prompt: Optional[str], +) -> Dict[str, Any]: + q = item.get(input_key) + a = item.get(output_key) + if q is None or a is None: + raise KeyError(f"Missing keys: {input_key!r} / {output_key!r}. 
Got keys={sorted(item.keys())}") + if not isinstance(q, str) or not isinstance(a, str): + raise TypeError(f"Expected strings; got {type(q)} / {type(a)}") + + if out_format == "single_turn": + return {"question": q, "answer": a} + + if out_format == "messages": + messages: List[Dict[str, str]] = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": q}) + messages.append({"role": "assistant", "content": a}) + return {"messages": messages} + + raise ValueError(f"Unknown out_format: {out_format}") + + +def write_parquet( + *, + input_path: str, + output_path: str, + input_key: str, + output_key: str, + out_format: str, + system_prompt: Optional[str], + batch_size: int, +) -> int: + import pyarrow as pa + import pyarrow.parquet as pq + + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + + if out_format == "single_turn": + schema = pa.schema([("question", pa.string()), ("answer", pa.string())]) + else: + msg_struct = pa.struct([("role", pa.string()), ("content", pa.string())]) + schema = pa.schema([("messages", pa.list_(msg_struct))]) + + writer: Optional[pq.ParquetWriter] = None + buf: List[Dict[str, Any]] = [] + total = 0 + + def flush() -> None: + nonlocal writer, buf, total + if not buf: + return + table = pa.Table.from_pylist(buf, schema=schema) + if writer is None: + writer = pq.ParquetWriter(output_path, schema=schema, compression="zstd") + writer.write_table(table) + total += len(buf) + buf = [] + + try: + for it in iter_items(input_path): + buf.append( + make_row( + it, + input_key=input_key, + output_key=output_key, + out_format=out_format, + system_prompt=system_prompt, + ) + ) + if len(buf) >= batch_size: + flush() + flush() + finally: + if writer is not None: + writer.close() + + return total + + +def main() -> None: + ap = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + ap.add_argument("--input", required=True, help="Input 
JSON/JSONL path") + ap.add_argument("--output", required=True, help="Output parquet path") + ap.add_argument("--input_key", default="question", help="Field name for prompt text") + ap.add_argument("--output_key", default="response", help="Field name for response text") + ap.add_argument("--format", dest="out_format", choices=["single_turn", "messages"], default="single_turn") + ap.add_argument("--system_prompt", default=None, help="Optional system prompt (messages format only)") + ap.add_argument("--batch_size", type=int, default=4096, help="Write batch size") + args = ap.parse_args() + + n = write_parquet( + input_path=args.input, + output_path=args.output, + input_key=args.input_key, + output_key=args.output_key, + out_format=args.out_format, + system_prompt=args.system_prompt, + batch_size=args.batch_size, + ) + print(f"[OK] Wrote {n} rows -> {args.output}") + + +if __name__ == "__main__": + main() + From 4b96b3b63676112bb698157b3a3ce1e419e50b13 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 17:49:26 +0800 Subject: [PATCH 48/61] Tune FSDP rollout weight-sync bucket --- .../run_grpo_fsdp_single_node.sh | 2 + scripts/json_qa_to_verl_sft_parquet.py | 192 ------------------ 2 files changed, 2 insertions(+), 192 deletions(-) delete mode 100644 scripts/json_qa_to_verl_sft_parquet.py diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index ed5cb78c173..da3598b1f6e 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -46,6 +46,7 @@ ACTOR_LR=${ACTOR_LR:-1e-6} MIN_LR=${MIN_LR:-1e-7} LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} +UPDATE_WEIGHTS_BUCKET_MB=${UPDATE_WEIGHTS_BUCKET_MB:-4096} # FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`). 
# Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided. @@ -94,6 +95,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ actor_rollout_ref.rollout.max_model_len=2048 \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=$UPDATE_WEIGHTS_BUCKET_MB \ actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ diff --git a/scripts/json_qa_to_verl_sft_parquet.py b/scripts/json_qa_to_verl_sft_parquet.py deleted file mode 100644 index e475db2e361..00000000000 --- a/scripts/json_qa_to_verl_sft_parquet.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert a QA-style dataset (JSON array or JSONL) into a VERL SFT parquet file. - -Input item example: - {"question": "...", "response": "..."} - -Output schemas: -- single_turn: columns `question` and `answer` (strings) - Use with `verl/trainer/config/sft_trainer.yaml` defaults: - data.prompt_key=question - data.response_key=answer - -- messages: column `messages` (list of {role, content}) - Use with `verl/trainer/config/sft_trainer_engine.yaml` (MultiTurnSFTDataset): - data.messages_key=messages -""" - -from __future__ import annotations - -import argparse -import json -import os -from typing import Any, Dict, Iterator, List, Optional - - -def iter_items(path: str) -> Iterator[Dict[str, Any]]: - """ - Iterate items from either: - - JSON array file: [ {...}, {...}, ... ] - - JSONL file: one JSON object per line - - For huge JSON arrays, install `ijson` to stream: - pip install ijson - """ - try: - import ijson # type: ignore - except Exception: - ijson = None - - with open(path, "rb") as f: - # Peek the first non-whitespace byte. 
- first = None - while True: - b = f.read(1) - if not b: - break - if b not in b" \t\r\n": - first = b - break - f.seek(0) - - if first == b"[": - if ijson is None: - data = json.load(f) - if not isinstance(data, list): - raise ValueError(f"Expected a JSON array in {path}") - for obj in data: - if not isinstance(obj, dict): - raise ValueError(f"Expected dict items, got {type(obj)}") - yield obj - return - - for obj in ijson.items(f, "item"): - if not isinstance(obj, dict): - raise ValueError(f"Expected dict items, got {type(obj)}") - yield obj - return - - # JSONL fallback - for line in f: - line = line.strip() - if not line: - continue - obj = json.loads(line) - if not isinstance(obj, dict): - raise ValueError(f"Expected dict items, got {type(obj)}") - yield obj - - -def make_row( - item: Dict[str, Any], - *, - input_key: str, - output_key: str, - out_format: str, - system_prompt: Optional[str], -) -> Dict[str, Any]: - q = item.get(input_key) - a = item.get(output_key) - if q is None or a is None: - raise KeyError(f"Missing keys: {input_key!r} / {output_key!r}. 
Got keys={sorted(item.keys())}") - if not isinstance(q, str) or not isinstance(a, str): - raise TypeError(f"Expected strings; got {type(q)} / {type(a)}") - - if out_format == "single_turn": - return {"question": q, "answer": a} - - if out_format == "messages": - messages: List[Dict[str, str]] = [] - if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - messages.append({"role": "user", "content": q}) - messages.append({"role": "assistant", "content": a}) - return {"messages": messages} - - raise ValueError(f"Unknown out_format: {out_format}") - - -def write_parquet( - *, - input_path: str, - output_path: str, - input_key: str, - output_key: str, - out_format: str, - system_prompt: Optional[str], - batch_size: int, -) -> int: - import pyarrow as pa - import pyarrow.parquet as pq - - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) - - if out_format == "single_turn": - schema = pa.schema([("question", pa.string()), ("answer", pa.string())]) - else: - msg_struct = pa.struct([("role", pa.string()), ("content", pa.string())]) - schema = pa.schema([("messages", pa.list_(msg_struct))]) - - writer: Optional[pq.ParquetWriter] = None - buf: List[Dict[str, Any]] = [] - total = 0 - - def flush() -> None: - nonlocal writer, buf, total - if not buf: - return - table = pa.Table.from_pylist(buf, schema=schema) - if writer is None: - writer = pq.ParquetWriter(output_path, schema=schema, compression="zstd") - writer.write_table(table) - total += len(buf) - buf = [] - - try: - for it in iter_items(input_path): - buf.append( - make_row( - it, - input_key=input_key, - output_key=output_key, - out_format=out_format, - system_prompt=system_prompt, - ) - ) - if len(buf) >= batch_size: - flush() - flush() - finally: - if writer is not None: - writer.close() - - return total - - -def main() -> None: - ap = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - ap.add_argument("--input", required=True, help="Input 
JSON/JSONL path") - ap.add_argument("--output", required=True, help="Output parquet path") - ap.add_argument("--input_key", default="question", help="Field name for prompt text") - ap.add_argument("--output_key", default="response", help="Field name for response text") - ap.add_argument("--format", dest="out_format", choices=["single_turn", "messages"], default="single_turn") - ap.add_argument("--system_prompt", default=None, help="Optional system prompt (messages format only)") - ap.add_argument("--batch_size", type=int, default=4096, help="Write batch size") - args = ap.parse_args() - - n = write_parquet( - input_path=args.input, - output_path=args.output, - input_key=args.input_key, - output_key=args.output_key, - out_format=args.out_format, - system_prompt=args.system_prompt, - batch_size=args.batch_size, - ) - print(f"[OK] Wrote {n} rows -> {args.output}") - - -if __name__ == "__main__": - main() - From 2be47a4eedb4f8cfc72e3f7b59ad68995df0941c Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 18:19:00 +0800 Subject: [PATCH 49/61] Propagate proxy and tmp dirs to Ray env --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 6 ++++++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index da3598b1f6e..ec65374c59e 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -12,6 +12,8 @@ export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} +export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} +export all_proxy=${all_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export 
GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -19,6 +21,8 @@ export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp +mkdir -p "$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR" + ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} @@ -132,5 +136,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index bd20150c868..97eeb9a7785 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -11,6 +11,8 @@ export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} +export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} +export all_proxy=${all_proxy:-$WANDB_PROXY_URL} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -18,6 +20,8 @@ export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} export RAY_TMPDIR=/dev/shm/ray export TMPDIR=/dev/shm/tmp +mkdir -p 
"$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR" + ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} @@ -108,5 +112,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 093ed14f9f36267a571cdec923e6f68bb3a17b5c Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 18:35:41 +0800 Subject: [PATCH 50/61] Fix SFT Megatron lr scheduler steps --- .../run_sft_qwen3moe_235b_a22b_megatron.sh | 47 +++++++++++++++++-- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh index 39d84d8beab..ed6b21b5546 100644 --- a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh +++ b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh @@ -2,11 +2,14 @@ set -xeuo pipefail ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} -TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet} +TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256} backend=${BACKEND:-megatron} project_name=verl_sft_235ba22b_2507 RESUME_MODE=disable -MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B} 
+MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-235B-A22B-Instruct-2507} +TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} +TOTAL_TRAINING_STEPS=${TOTAL_TRAINING_STEPS:-} SP_SIZE=${SP_SIZE:-1} FSDP_SIZE=${FSDP_SIZE:-64} @@ -80,6 +83,39 @@ export NCCL_DEBUG=WARN export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao +if [[ -z "${TOTAL_TRAINING_STEPS}" ]]; then + # Megatron's OptimizerParamScheduler asserts `lr_decay_steps > 0`. + # VERL SFT derives total steps from `len(train_dataloader)`, which can be 0/unknown with some samplers + # (e.g. dynamic-bsz). Provide a safe positive estimate based on parquet row count. + TOTAL_TRAINING_STEPS=$(python3 - <<'PY' +import math +import os + +train_files = os.environ.get("TRAIN_FILES", "") +batch_size = int(os.environ.get("TRAIN_BATCH_SIZE", "256")) +epochs = int(os.environ.get("TOTAL_EPOCHS", "1")) + +rows = None +try: + import pyarrow.parquet as pq + + rows = pq.ParquetFile(train_files).metadata.num_rows +except Exception: + rows = None + +if rows is None: + steps = 1000 * max(1, epochs) +else: + steps_per_epoch = max(1, math.ceil(rows / max(1, batch_size))) + steps = steps_per_epoch * max(1, epochs) + +print(steps) +PY +) +fi + +echo ">>> SFT steps: total_epochs=${TOTAL_EPOCHS}, train_batch_size=${TRAIN_BATCH_SIZE}, total_training_steps=${TOTAL_TRAINING_STEPS}" + torchrun \ --nnodes=${NNODES} \ --node_rank=${NODE_RANK} \ @@ -88,7 +124,7 @@ torchrun \ --nproc-per-node=8 \ ${ENTRYPOINT} \ data.train_files="${TRAIN_FILES}" \ - data.train_batch_size=256 \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ data.max_length=1024 \ data.pad_mode=${PAD_MODE} \ data.truncation=right \ @@ -102,11 +138,12 @@ torchrun \ model.enable_gradient_checkpointing=True \ ${ENGINE_CONFIG} \ trainer.test_freq=-1 \ - trainer.save_freq=1000 \ + trainer.save_freq=2000 \ 'trainer.logger=[console]' \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ - trainer.total_epochs=1 \ 
+ trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \ trainer.default_local_dir="${CKPT_HOME}" \ trainer.resume_mode=${RESUME_MODE} \ trainer.max_ckpt_to_keep=2 \ From cba9e5ea9d15b29dc0f3a26112217003cd007a71 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 19:06:11 +0800 Subject: [PATCH 51/61] Add NO_PROXY for internal traffic --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++++ .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index ec65374c59e..2862547f79d 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -14,6 +14,8 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} export all_proxy=${all_proxy:-$WANDB_PROXY_URL} +export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} +export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -138,5 +140,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ 
custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 97eeb9a7785..5996c52b710 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -13,6 +13,8 @@ export http_proxy=${http_proxy:-$WANDB_PROXY_URL} export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} export all_proxy=${all_proxy:-$WANDB_PROXY_URL} +export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} +export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -114,5 +116,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function From 3cee17dbca58c251155e6710a27b957a0e932c90 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 10 Feb 2026 19:08:30 +0800 Subject: [PATCH 52/61] Quote NO_PROXY for Hydra overrides --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 4 ++-- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 2862547f79d..f153c0698f6 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -140,7 +140,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 5996c52b710..9967d82d4c0 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -116,7 +116,7 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=$NO_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=$no_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ 
custom_reward_function.name=char_count_reward_function From 70616b261770cdef6b95de4221ca7f08c4fb0fba Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 10:35:11 +0800 Subject: [PATCH 53/61] Force proxy env vars for Ray workers --- .../run_grpo_fsdp_single_node.sh | 17 ++++++++++------- .../run_grpo_megatron_single_node.sh | 17 ++++++++++------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index f153c0698f6..524e5caa10c 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -8,14 +8,17 @@ export WANDB_MODE=${WANDB_MODE:-online} export WANDB_API_KEY=${WANDB_API_KEY:-} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} -export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} -export all_proxy=${all_proxy:-$WANDB_PROXY_URL} +# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`). +export HTTP_PROXY="$WANDB_PROXY_URL" +export HTTPS_PROXY="$WANDB_PROXY_URL" +export http_proxy="$WANDB_PROXY_URL" +export https_proxy="$WANDB_PROXY_URL" +export ALL_PROXY="$WANDB_PROXY_URL" +export all_proxy="$WANDB_PROXY_URL" + +# Ensure internal traffic never goes through the proxy. 
export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} -export no_proxy=${no_proxy:-"$NO_PROXY"} +export no_proxy="$NO_PROXY" export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 9967d82d4c0..2baa5f24c5d 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -7,14 +7,17 @@ export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-online} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} -export ALL_PROXY=${ALL_PROXY:-$WANDB_PROXY_URL} -export all_proxy=${all_proxy:-$WANDB_PROXY_URL} +# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`). +export HTTP_PROXY="$WANDB_PROXY_URL" +export HTTPS_PROXY="$WANDB_PROXY_URL" +export http_proxy="$WANDB_PROXY_URL" +export https_proxy="$WANDB_PROXY_URL" +export ALL_PROXY="$WANDB_PROXY_URL" +export all_proxy="$WANDB_PROXY_URL" + +# Ensure internal traffic never goes through the proxy. 
export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} -export no_proxy=${no_proxy:-"$NO_PROXY"} +export no_proxy="$NO_PROXY" export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} From 2cc92e97944b908c387b4760ff39183174241a6d Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 10:52:30 +0800 Subject: [PATCH 54/61] recipes: drop ALL_PROXY from GRPO scripts --- .../run_grpo_fsdp_single_node.sh | 17 +++++------------ .../run_grpo_megatron_single_node.sh | 17 +++++------------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 524e5caa10c..63f83e8d0ca 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -8,17 +8,12 @@ export WANDB_MODE=${WANDB_MODE:-online} export WANDB_API_KEY=${WANDB_API_KEY:-} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`). -export HTTP_PROXY="$WANDB_PROXY_URL" -export HTTPS_PROXY="$WANDB_PROXY_URL" -export http_proxy="$WANDB_PROXY_URL" -export https_proxy="$WANDB_PROXY_URL" -export ALL_PROXY="$WANDB_PROXY_URL" -export all_proxy="$WANDB_PROXY_URL" - -# Ensure internal traffic never goes through the proxy. 
+export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} -export no_proxy="$NO_PROXY" +export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -141,8 +136,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 2baa5f24c5d..0a28cc6b5be 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -7,17 +7,12 @@ export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-online} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -# Force proxy vars for this job (base images often preset `http_proxy` / `no_proxy`). 
-export HTTP_PROXY="$WANDB_PROXY_URL" -export HTTPS_PROXY="$WANDB_PROXY_URL" -export http_proxy="$WANDB_PROXY_URL" -export https_proxy="$WANDB_PROXY_URL" -export ALL_PROXY="$WANDB_PROXY_URL" -export all_proxy="$WANDB_PROXY_URL" - -# Ensure internal traffic never goes through the proxy. +export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} +export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} +export http_proxy=${http_proxy:-$WANDB_PROXY_URL} +export https_proxy=${https_proxy:-$WANDB_PROXY_URL} export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} -export no_proxy="$NO_PROXY" +export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} export GLOO_IPV6=${GLOO_IPV6:-"0"} @@ -117,8 +112,6 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.ALL_PROXY=$ALL_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.all_proxy=$all_proxy \ +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ From 2105fb4b53e9e4aef0d0c084625385fa364c58d4 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 11:00:20 +0800 Subject: [PATCH 55/61] debug --- ...un_sft_qwen3moe_235b_a22b_megatron_dlc.sh} | 0 .../run_sft_qwen3moe_30b_a3b_megatron.sh | 113 ------------------ ..._sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh} | 0 ... 
run_sft_qwen3moe_30b_a3b_megatron_dlc.sh} | 0 4 files changed, 113 deletions(-) rename recipes_custom/{Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh => Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh} (100%) delete mode 100644 recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh rename recipes_custom/Qwen3-30BA3B-translate/{run_sft_qwen3moe_30b_a3b_megatron_aux.sh => run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh} (100%) rename recipes_custom/Qwen3-30BA3B-translate/{run_sft_qwen3moe_30b_a3b_megatron.sh => run_sft_qwen3moe_30b_a3b_megatron_dlc.sh} (100%) diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh similarity index 100% rename from recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_235b_a22b_megatron.sh rename to recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh diff --git a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh deleted file mode 100644 index a45209ffcc1..00000000000 --- a/recipes_custom/Qwen3-235BA22B-2507/run_sft_qwen3moe_30b_a3b_megatron.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} -TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data_verl.parquet} -backend=${BACKEND:-megatron} -project_name=verl_sft_235ba22b_2507 -RESUME_MODE=disable -MODEL_ID=${MODEL_ID:-/mnt/data/open_models/Qwen3/Qwen3-235B-A22B} - -SP_SIZE=${SP_SIZE:-1} -FSDP_SIZE=${FSDP_SIZE:-64} -FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} - -TP_SIZE=${TP_SIZE:-4} -PP_SIZE=${PP_SIZE:-1} -EP_SIZE=${EP_SIZE:-8} -VPP_SIZE=${VPP_SIZE:-null} -CP_SIZE=${CP_SIZE:-1} - -PAD_MODE=${PAD_MODE:-no_padding} -USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} - -FSDP_ENGINE_CONFIG=" - engine=${backend} \ 
- optim=${backend} \ - optim.lr=5e-6 \ - optim.lr_warmup_steps_ratio=0.05 \ - optim.weight_decay=0.1 \ - optim.betas="[0.9,0.95]" \ - optim.clip_grad=1.0 \ - optim.min_lr_ratio=0.1 \ - optim.warmup_style=cosine \ - engine.ulysses_sequence_parallel_size=${SP_SIZE} \ - engine.strategy=${FSDP_STRATEGY} \ - engine.fsdp_size=${FSDP_SIZE}" - -MEGATRON_ENGINE_CONFIG=" - engine=${backend} \ - optim=${backend} \ - optim.lr=6e-6 \ - optim.lr_warmup_steps_ratio=0.05 \ - optim.weight_decay=0.1 \ - optim.betas="[0.9,0.95]" \ - optim.clip_grad=1.0 \ - optim.lr_warmup_init=0 \ - optim.lr_decay_style=cosine \ - optim.min_lr=6e-7 \ - engine.tensor_model_parallel_size=${TP_SIZE} \ - engine.pipeline_model_parallel_size=${PP_SIZE} \ - engine.expert_model_parallel_size=${EP_SIZE} \ - engine.context_parallel_size=${CP_SIZE} \ - engine.use_mbridge=True" - -if [ "$backend" = "fsdp" ]; then - ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" - echo "Using fsdp engine" - exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} -else - ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" - echo "Using megatron engine" - exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} -fi - -CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} -NNODES=${WORLD_SIZE:-16} -NODE_RANK=${RANK:-0} -MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -MASTER_PORT=${MASTER_PORT:-23457} - -echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" -echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" - -if [ "$NODE_RANK" -eq 0 ]; then - mkdir -p "${CKPT_HOME}" -fi - -export WANDB_MODE=offline -export NCCL_DEBUG=WARN -export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao - -torchrun \ - --nnodes=${NNODES} \ - --node_rank=${NODE_RANK} \ - --master_addr=${MASTER_ADDR} \ - --master_port=${MASTER_PORT} \ - --nproc-per-node=8 \ - ${ENTRYPOINT} \ - data.train_files="${TRAIN_FILES}" \ - data.train_batch_size=512 \ - data.max_length=4096 \ - 
data.pad_mode=${PAD_MODE} \ - data.truncation=right \ - data.use_dynamic_bsz=True \ - data.max_token_len_per_gpu=24576 \ - data.messages_key=messages \ - data.ignore_input_ids_mismatch=True \ - model.path=$MODEL_ID \ - model.use_remove_padding=${USE_REMOVE_PADDING} \ - +model.override_config.output_router_logits=True \ - +model.override_config.router_dtype="float32" \ - model.enable_gradient_checkpointing=True \ - ${ENGINE_CONFIG} \ - trainer.test_freq=-1 \ - trainer.save_freq=1000 \ - 'trainer.logger=[console]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.total_epochs=1 \ - trainer.default_local_dir="${CKPT_HOME}" \ - trainer.resume_mode=${RESUME_MODE} \ - trainer.max_ckpt_to_keep=1 \ - 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' \ No newline at end of file diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh similarity index 100% rename from recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux.sh rename to recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh similarity index 100% rename from recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron.sh rename to recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh From 86c552933f2f1ff8acfe132fc59048a1bf502294 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 11:36:52 +0800 Subject: [PATCH 56/61] recipes: disable proxy and use wandb offline for GRPO --- .../run_grpo_fsdp_single_node.sh | 20 +++++++++---------- .../run_grpo_megatron_single_node.sh | 20 +++++++++---------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git 
a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 63f83e8d0ca..6a24c62af97 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -4,14 +4,12 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 -export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_MODE=${WANDB_MODE:-offline} export WANDB_API_KEY=${WANDB_API_KEY:-} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} -export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} +# Proxy is disabled by default. If you need it temporarily, set env vars +# outside this script and remove the unsets below. 
+unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} @@ -131,11 +129,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_API_KEY=$WANDB_API_KEY \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ - +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ - +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 0a28cc6b5be..7eefae2aca0 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -4,13 +4,11 @@ set -xeuo pipefail export CUDA_DEVICE_MAX_CONNECTIONS=1 export VLLM_USE_V1=1 export 
VERL_USE_GPT_OSS=0 -export WANDB_MODE=${WANDB_MODE:-online} +export WANDB_MODE=${WANDB_MODE:-offline} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} -export WANDB_PROXY_URL=${WANDB_PROXY_URL:-'http://yuhaiqiang:%7Bs%23fwCGAdJTQnFyE@proxy.ops.qihoo.net:8000'} -export HTTP_PROXY=${HTTP_PROXY:-$WANDB_PROXY_URL} -export HTTPS_PROXY=${HTTPS_PROXY:-$WANDB_PROXY_URL} -export http_proxy=${http_proxy:-$WANDB_PROXY_URL} -export https_proxy=${https_proxy:-$WANDB_PROXY_URL} +# Proxy is disabled by default. If you need it temporarily, set env vars +# outside this script and remove the unsets below. +unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} export no_proxy=${no_proxy:-"$NO_PROXY"} export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} @@ -107,11 +105,11 @@ python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/c +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ - +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=$WANDB_PROXY_URL \ - +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=$HTTP_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=$HTTPS_PROXY \ - +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=$http_proxy \ - +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=$https_proxy \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \ +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ 
+ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ From 8908e254c44a5600d49faf2b3ff8330597de3776 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 12:03:17 +0800 Subject: [PATCH 57/61] k --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 2 -- .../RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh | 2 -- 2 files changed, 4 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 6a24c62af97..843b5c72127 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -7,8 +7,6 @@ export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-offline} export WANDB_API_KEY=${WANDB_API_KEY:-} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} -# Proxy is disabled by default. If you need it temporarily, set env vars -# outside this script and remove the unsets below. unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} export no_proxy=${no_proxy:-"$NO_PROXY"} diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh index 7eefae2aca0..e427bc7aefd 100755 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -6,8 +6,6 @@ export VLLM_USE_V1=1 export VERL_USE_GPT_OSS=0 export WANDB_MODE=${WANDB_MODE:-offline} export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} -# Proxy is disabled by default. If you need it temporarily, set env vars -# outside this script and remove the unsets below. 
unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} export no_proxy=${no_proxy:-"$NO_PROXY"} From 435467f04c834dcf0558d4d82260c327f5e0e7d2 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 11 Feb 2026 16:44:20 +0800 Subject: [PATCH 58/61] recipes: set FSDP MASTER_ADDR default --- recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 843b5c72127..869f5ca231d 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -28,7 +28,8 @@ DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJ NNODES=${NNODES:-4} NODE_RANK=${NODE_RANK:-0} -MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} +# FSDP cluster: Ray head + torch master live on the FSDP master node by default. 
+MASTER_ADDR=${MASTER_ADDR:-10.178.131.202} MASTER_PORT=${MASTER_PORT:-23457} N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} From 6e4ce3dcf20ae3393d985a77d388ef303b4361e4 Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 26 Feb 2026 14:54:41 +0800 Subject: [PATCH 59/61] chore: update custom training recipes --- recipe | 2 +- .../run_sft_qwen2.5_72b_megatron_dlc.sh | 110 ++++++++++++++++++ ...run_sft_qwen3moe_235b_a22b_megatron_dlc.sh | 35 ------ 3 files changed, 111 insertions(+), 36 deletions(-) create mode 100644 recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh diff --git a/recipe b/recipe index 3490a22a0a3..21892b92769 160000 --- a/recipe +++ b/recipe @@ -1 +1 @@ -Subproject commit 3490a22a0a3adeb7e4787fe70b1060b642efbae4 +Subproject commit 21892b9276936efab5375c3f6b8415e472ef7118 diff --git a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh new file mode 100644 index 00000000000..6691d574949 --- /dev/null +++ b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet} +TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256} +backend=${BACKEND:-megatron} +project_name=verl_sft_qwen2.5_72b +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen2.5-72B-A064} +TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-8} +PP_SIZE=${PP_SIZE:-1} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + 
optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.context_parallel_size=${CP_SIZE}" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=qwen2.5-72b-dense-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=qwen2.5-72b-dense-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_length=2048 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=4096 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + 
model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=2000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=2 \ + 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' diff --git a/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh index ed6b21b5546..c86d4516133 100644 --- a/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh +++ b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh @@ -9,7 +9,6 @@ project_name=verl_sft_235ba22b_2507 RESUME_MODE=disable MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-235B-A22B-Instruct-2507} TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} -TOTAL_TRAINING_STEPS=${TOTAL_TRAINING_STEPS:-} SP_SIZE=${SP_SIZE:-1} FSDP_SIZE=${FSDP_SIZE:-64} @@ -83,39 +82,6 @@ export NCCL_DEBUG=WARN export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao -if [[ -z "${TOTAL_TRAINING_STEPS}" ]]; then - # Megatron's OptimizerParamScheduler asserts `lr_decay_steps > 0`. - # VERL SFT derives total steps from `len(train_dataloader)`, which can be 0/unknown with some samplers - # (e.g. dynamic-bsz). Provide a safe positive estimate based on parquet row count. 
- TOTAL_TRAINING_STEPS=$(python3 - <<'PY' -import math -import os - -train_files = os.environ.get("TRAIN_FILES", "") -batch_size = int(os.environ.get("TRAIN_BATCH_SIZE", "256")) -epochs = int(os.environ.get("TOTAL_EPOCHS", "1")) - -rows = None -try: - import pyarrow.parquet as pq - - rows = pq.ParquetFile(train_files).metadata.num_rows -except Exception: - rows = None - -if rows is None: - steps = 1000 * max(1, epochs) -else: - steps_per_epoch = max(1, math.ceil(rows / max(1, batch_size))) - steps = steps_per_epoch * max(1, epochs) - -print(steps) -PY -) -fi - -echo ">>> SFT steps: total_epochs=${TOTAL_EPOCHS}, train_batch_size=${TRAIN_BATCH_SIZE}, total_training_steps=${TOTAL_TRAINING_STEPS}" - torchrun \ --nnodes=${NNODES} \ --node_rank=${NODE_RANK} \ @@ -143,7 +109,6 @@ torchrun \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ trainer.total_epochs=${TOTAL_EPOCHS} \ - trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \ trainer.default_local_dir="${CKPT_HOME}" \ trainer.resume_mode=${RESUME_MODE} \ trainer.max_ckpt_to_keep=2 \ From c3890f822908b5ddb40451448ed86dfc5b7d4bab Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 26 Feb 2026 16:42:16 +0800 Subject: [PATCH 60/61] chore: update grpo single node script --- .../RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh index 869f5ca231d..a80107339bf 100644 --- a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -49,17 +49,7 @@ LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} UPDATE_WEIGHTS_BUCKET_MB=${UPDATE_WEIGHTS_BUCKET_MB:-4096} -# FSDP optimizer uses `min_lr_ratio` (not `min_lr`) and `lr_scheduler_type` (not `lr_decay_style`). 
-# Default to MIN_LR / ACTOR_LR when MIN_LR_RATIO is not explicitly provided. -MIN_LR_RATIO=${MIN_LR_RATIO:-} -if [[ -z "${MIN_LR_RATIO}" ]]; then - MIN_LR_RATIO=$(python3 - < 0 else 0.0) -PY -) -fi +MIN_LR_RATIO=${MIN_LR_RATIO:-0.1} python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ --config-name='ppo_trainer.yaml' \ From 529e576b1451ddf857a4aa9eef79e1382e73815f Mon Sep 17 00:00:00 2001 From: khazic Date: Fri, 27 Feb 2026 10:59:13 +0800 Subject: [PATCH 61/61] chore: clean formatting in qwen2.5 72b sft run script --- .../run_sft_qwen2.5_72b_megatron_dlc.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh index 6691d574949..37108817085 100644 --- a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh +++ b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh @@ -6,7 +6,7 @@ TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_me TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256} backend=${BACKEND:-megatron} project_name=verl_sft_qwen2.5_72b -RESUME_MODE=disable +RESUME_MODE=disable # auto MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen2.5-72B-A064} TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} @@ -61,10 +61,10 @@ else fi CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} -NNODES=${WORLD_SIZE:-16} -NODE_RANK=${RANK:-0} -MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -MASTER_PORT=${MASTER_PORT:-23457} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT"