diff --git a/.gitmodules b/.gitmodules index d5dd7a6aa57..af166615b4a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "recipe"] path = recipe - url = https://github.com/verl-project/verl-recipe.git + url = https://github.com/khazic/verl-recipe_lao.git diff --git a/recipe b/recipe index 3490a22a0a3..21892b92769 160000 --- a/recipe +++ b/recipe @@ -1 +1 @@ -Subproject commit 3490a22a0a3adeb7e4787fe70b1060b642efbae4 +Subproject commit 21892b9276936efab5375c3f6b8415e472ef7118 diff --git a/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh new file mode 100644 index 00000000000..37108817085 --- /dev/null +++ b/recipes_custom/Qwen2.5-72B-sft/run_sft_qwen2.5_72b_megatron_dlc.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet} +TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256} +backend=${BACKEND:-megatron} +project_name=verl_sft_qwen2.5_72b +RESUME_MODE=disable # auto +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen2.5-72B-A064} +TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-8} +PP_SIZE=${PP_SIZE:-1} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + 
optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.context_parallel_size=${CP_SIZE}" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=qwen2.5-72b-dense-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=qwen2.5-72b-dense-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_length=2048 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=4096 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=2000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + 
trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=2 \ + 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' diff --git a/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh new file mode 100644 index 00000000000..c86d4516133 --- /dev/null +++ b/recipes_custom/Qwen3-235BA22B-2507-sft/run_sft_qwen3moe_235b_a22b_megatron_dlc.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/235b_dataset/merged_sft_with_messages.parquet} +TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256} +backend=${BACKEND:-megatron} +project_name=verl_sft_235ba22b_2507 +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-235B-A22B-Instruct-2507} +TOTAL_EPOCHS=${TOTAL_EPOCHS:-2} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + 
engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-235b-a22b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-16} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl_lao + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_length=1024 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=10240 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.router_dtype="float16" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=2000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ 
+ trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=2 \ + 'checkpoint.save_contents=[model,optimizer,extra,hf_model]' diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh new file mode 100644 index 00000000000..bf59deb9bda --- /dev/null +++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_aux_dlc.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_translate_0109_aux +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=5e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + 
engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True \ + +engine.override_transformer_config.moe_aux_loss_coeff=0.01 \ + +engine.override_transformer_config.moe_z_loss_coeff=0.001 \ + +engine.override_transformer_config.moe_router_load_balancing_type=aux_loss" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-8} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=512 \ + data.max_length=8192 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=49152 \ + data.messages_key=messages \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.output_router_logits=True \ + +model.override_config.router_dtype="float32" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=5000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + 
trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=2 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=3 \ + 'checkpoint.save_contents=[model,optimizer,extra]' \ No newline at end of file diff --git a/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh new file mode 100644 index 00000000000..5b0a7ea263d --- /dev/null +++ b/recipes_custom/Qwen3-30BA3B-translate/run_sft_qwen3moe_30b_a3b_megatron_dlc.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} +TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/translate_parquet/train_data.parquet} +backend=${BACKEND:-megatron} +project_name=verl_sft_translate_0109 +RESUME_MODE=disable +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/Qwen3-30B-A3B-Instruct-2507} + +SP_SIZE=${SP_SIZE:-1} +FSDP_SIZE=${FSDP_SIZE:-64} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-4} +PP_SIZE=${PP_SIZE:-1} +EP_SIZE=${EP_SIZE:-8} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=5e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + +MEGATRON_ENGINE_CONFIG=" + engine=${backend} \ + optim=${backend} \ + optim.lr=6e-6 \ + optim.lr_warmup_steps_ratio=0.05 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=6e-7 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + 
engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.expert_model_parallel_size=${EP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE} +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=nvidia-qwen3-30b-moe-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-ep${EP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE} +fi + +CKPT_HOME=${CKPT_HOME:-/mnt/data/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}} +NNODES=${WORLD_SIZE:-8} +NODE_RANK=${RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +MASTER_PORT=${MASTER_PORT:-23457} + +echo ">>> 节点信息: RANK $NODE_RANK / WORLD_SIZE $NNODES" +echo ">>> 通信信息: MASTER $MASTER_ADDR : $MASTER_PORT" + +if [ "$NODE_RANK" -eq 0 ]; then + mkdir -p "${CKPT_HOME}" +fi + +export WANDB_MODE=offline +export NCCL_DEBUG=WARN +export PYTHONPATH=${PYTHONPATH:-}:/mnt/data/liuchonghan/verl + +torchrun \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nproc-per-node=8 \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=512 \ + data.max_length=8192 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=right \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=49152 \ + data.messages_key=messages \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + +model.override_config.output_router_logits=True \ + +model.override_config.router_dtype="float32" \ + model.enable_gradient_checkpointing=True \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=5000 \ + 'trainer.logger=[console]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=2 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + 
trainer.max_ckpt_to_keep=3 \ + 'checkpoint.save_contents=[model,optimizer,extra]' \ No newline at end of file diff --git a/recipes_custom/RLVR_ABCDE_dense/create_dataset.py b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py new file mode 100644 index 00000000000..754dacde603 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/create_dataset.py @@ -0,0 +1,198 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Task description: +Given a random word and a random char, count the number of occurrence of char in the word. + +Create CoT dataset that split the word into separate char. Then list the char and count the occurrence. + +The word set comes from shakespeare +""" + +import os.path +import random + +prompt_template = "How many {} are there in word {}?" + + +def generate_random_char(): + return chr(97 + random.randint(0, 25)) + + +def create_prompt_response(min_length=3, max_length=5): + # randomly generate a length + word_length = random.randint(min_length, max_length) + # randomly generate a target count number. 
This makes the target number + target_count_number = random.randint(1, word_length) + + char_lst = [] + # generate the word + # step 1: generate the target word + target_char = generate_random_char() + + for _ in range(target_count_number): + char_lst.append(target_char) + + # step 2: generate other words + for _ in range(word_length - target_count_number): + while True: + char = generate_random_char() + if char != target_char: + char_lst.append(char) + break + + # step 3: random permute char_lst + random.shuffle(char_lst) + + word = "-".join(char_lst) + + prompt = prompt_template.format(target_char, word) + final_answer = [] + + # cot + number = 0 + for i, char in enumerate(char_lst): + cot = f"{char}" + if char != target_char: + cot += " != " + else: + cot += " = " + number += 1 + cot += f"{target_char}." + + final_answer.append(cot) + + conclusion = f"\\boxed{{{number}}} {target_char} in {word}." + + final_answer.append(conclusion) + + final_answer = "\n".join(final_answer) + + return prompt, final_answer + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--total_number", type=int, default=10000) + parser.add_argument("--min_length", type=int, default=5) + parser.add_argument("--max_length", type=int, default=20) + parser.add_argument("--data_path", type=str, default="~/data/char_count") + + args = vars(parser.parse_args()) + + total_number = args["total_number"] + min_length = args["min_length"] + max_length = args["max_length"] + data_path = args["data_path"] + data_path = os.path.expanduser(data_path) + + full_output = [] + for _ in range(total_number): + output = create_prompt_response(min_length=min_length, max_length=max_length) + full_output.append(output) + + # random reorder + random.shuffle(full_output) + + # split for train and test + train_split_len = int(0.9 * len(full_output)) + train_outputs = full_output[:train_split_len] + test_output = full_output[train_split_len:] + + 
sft_train_dataset = {"messages": []} + + for o in train_outputs: + messages = [ + {"role": "user", "content": o[0]}, + {"role": "assistant", "content": o[1]}, + ] + + sft_train_dataset["messages"].append(messages) + + sft_test_dataset = {"messages": []} + + for o in test_output: + messages = [ + {"role": "user", "content": o[0]}, + {"role": "assistant", "content": o[1]}, + ] + sft_test_dataset["messages"].append(messages) + + import pandas as pd + + sft_train_dataset = pd.DataFrame(data=sft_train_dataset) + sft_test_dataset = pd.DataFrame(data=sft_test_dataset) + + folder = os.path.join(data_path, "sft") + + os.makedirs(folder, exist_ok=True) + + sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) + sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) + + # build RL dataset + rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} + + rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []} + + from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed + + for o in train_outputs: + prompt = o[0] + response = o[1] + prompt_with_template = [ + { + "role": "user", + "content": prompt, + } + ] + + rl_train_dataset["prompt"].append(prompt_with_template) + rl_train_dataset["data_source"].append("char_count") + rl_train_dataset["ability"].append("other") + rl_train_dataset["reward_model"].append( + {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))} + ) + rl_train_dataset["extra_info"].append({"response": response}) + + for o in test_output: + prompt = o[0] + response = o[1] + prompt_with_template = [ + { + "role": "user", + "content": prompt, + } + ] + + rl_test_dataset["prompt"].append(prompt_with_template) + rl_test_dataset["data_source"].append("char_count") + rl_test_dataset["ability"].append("other") + rl_test_dataset["reward_model"].append( + {"style": "rule", "ground_truth": 
remove_boxed(last_boxed_only_string(response))} + ) + rl_test_dataset["extra_info"].append({"response": response}) + + rl_train_dataset = pd.DataFrame(data=rl_train_dataset) + rl_test_dataset = pd.DataFrame(data=rl_test_dataset) + + folder = os.path.join(data_path, "rl") + + os.makedirs(folder, exist_ok=True) + + rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet")) + rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet")) diff --git a/recipes_custom/RLVR_ABCDE_dense/reward_function.py b/recipes_custom/RLVR_ABCDE_dense/reward_function.py new file mode 100644 index 00000000000..61fe81bf207 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/reward_function.py @@ -0,0 +1,65 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Reward function +""" + +import re + +DEFAULT_CHOICES = ("A", "B", "C", "D", "E") +BOXED_PATTERN = re.compile(r"\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}") +CHOICE_PATTERN = re.compile( + r"(?:answer|option|choice)?\s*[:=]?\s*([A-Za-z])\b", re.IGNORECASE +) + + +def _extract_boxed_answer(text: str) -> str: + matches = BOXED_PATTERN.findall(text) + return matches[-1] if matches else "" + + +def _normalize_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str: + text = (text or "").strip().upper() + for char in text: + if char in valid_choices: + return char + return "" + + +def extract_choice(text: str, valid_choices=DEFAULT_CHOICES) -> str: + """ + Extract a single-letter choice, preferring \\boxed{} values but falling back + to phrases like "Answer: C" or the first standalone letter. + """ + text = str(text or "") + candidate = _normalize_choice(_extract_boxed_answer(text), valid_choices) + if candidate: + return candidate + match = CHOICE_PATTERN.search(text) + if match: + candidate = _normalize_choice(match.group(1), valid_choices) + if candidate: + return candidate + return _normalize_choice(text, valid_choices) + + +def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None): + try: + model_choice = extract_choice(solution_str) + gold_choice = extract_choice(ground_truth) + return 1 if model_choice and gold_choice and model_choice == gold_choice else 0 + except Exception: + print(ground_truth, solution_str) + return 0 diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh new file mode 100644 index 00000000000..6ab8523d75b --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_dlc.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export PYTHONPATH=/mnt/data/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} 
+TRAIN_FILES=${TRAIN_FILES:-/mnt/data/liuchonghan/vmlu_dataset/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/mnt/data/liuchonghan/75_0129_ckpt3000} +PROJECT_NAME=${PROJECT_NAME:-rlvr} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_72b_grpo_fsdp} + +NNODES=${PET_NNODES:-${WORLD_SIZE:-28}} +NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} +MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} +MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} +N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} + +FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2} +FSDP_SIZE=${FSDP_SIZE:-8} +ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False} +REF_OFFLOAD=${REF_OFFLOAD:-False} +CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +RAY_PORT=${RAY_PORT:-6379} +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} +RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} + +if [ "$NODE_RANK" -eq 0 ]; then + ray start --head \ + --node-ip-address="$MASTER_ADDR" \ + --port="$RAY_PORT" \ + --dashboard-port="$RAY_DASHBOARD_PORT" +else + ray start --address="$RAY_ADDRESS" + exit 0 +fi + +sleep 5 + +python3 $ENTRYPOINT --config-path=/mnt/data/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=512 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=224 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ + 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=224 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + critic.strategy=$FSDP_STRATEGY \ + critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ + critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ + critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=100 \ + trainer.test_freq=100 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ + custom_reward_function.path=/mnt/data/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh new file mode 100644 index 00000000000..a80107339bf --- /dev/null +++ 
b/recipes_custom/RLVR_ABCDE_dense/run_grpo_fsdp_single_node.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export WANDB_MODE=${WANDB_MODE:-offline} +export WANDB_API_KEY=${WANDB_API_KEY:-} +export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} +unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy +export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} +export no_proxy=${no_proxy:-"$NO_PROXY"} +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export GLOO_IPV6=${GLOO_IPV6:-"0"} +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} +export RAY_TMPDIR=/dev/shm/ray +export TMPDIR=/dev/shm/tmp + +mkdir -p "$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR" + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_fsdp_single} +DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}} + +NNODES=${NNODES:-4} +NODE_RANK=${NODE_RANK:-0} +# FSDP cluster: Ray head + torch master live on the FSDP master node by default. 
+MASTER_ADDR=${MASTER_ADDR:-10.178.131.202} +MASTER_PORT=${MASTER_PORT:-23457} +N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} + +FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp2} +FSDP_SIZE=${FSDP_SIZE:-8} +ACTOR_OFFLOAD=${ACTOR_OFFLOAD:-False} +REF_OFFLOAD=${REF_OFFLOAD:-False} +CRITIC_OFFLOAD=${CRITIC_OFFLOAD:-False} + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} +RAY_ADDRESS=${RAY_ADDRESS:-10.178.131.202:6379} +ACTOR_LR=${ACTOR_LR:-1e-6} +MIN_LR=${MIN_LR:-1e-7} +LR_SCHEDULER_TYPE=${LR_SCHEDULER_TYPE:-cosine} +GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} +UPDATE_WEIGHTS_BUCKET_MB=${UPDATE_WEIGHTS_BUCKET_MB:-4096} + +MIN_LR_RATIO=${MIN_LR_RATIO:-0.1} + +python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=2048 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=32 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.actor.fsdp_config.param_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=$ACTOR_OFFLOAD \ + actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ + actor_rollout_ref.actor.optim.min_lr_ratio=$MIN_LR_RATIO \ + actor_rollout_ref.actor.optim.lr_scheduler_type=$LR_SCHEDULER_TYPE \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + 
actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ + actor_rollout_ref.rollout.max_model_len=2048 \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=$UPDATE_WEIGHTS_BUCKET_MB \ + actor_rollout_ref.ref.fsdp_config.fsdp_size=$FSDP_SIZE \ + actor_rollout_ref.ref.fsdp_config.param_offload=$REF_OFFLOAD \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + critic.strategy=$FSDP_STRATEGY \ + critic.model.fsdp_config.fsdp_size=$FSDP_SIZE \ + critic.model.fsdp_config.param_offload=$CRITIC_OFFLOAD \ + critic.model.fsdp_config.optimizer_offload=$CRITIC_OFFLOAD \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.default_local_dir=$DEFAULT_LOCAL_DIR \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=300 \ + trainer.test_freq=300 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"$MASTER_PORT\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \ + 
+ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_API_KEY=$WANDB_API_KEY \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ + custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh new file mode 100644 index 00000000000..1b0659fc7d0 --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_dlc.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json} +MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron} + +NNODES=${PET_NNODES:-${WORLD_SIZE:-28}} +NODE_RANK=${PET_NODE_RANK:-${RANK:-0}} +MASTER_ADDR=${PET_MASTER_ADDR:-${MASTER_ADDR:-"127.0.0.1"}} +MASTER_PORT=${PET_MASTER_PORT:-${MASTER_PORT:-23457}} +N_GPUS_PER_NODE=${PET_NPROC_PER_NODE:-${NPROC_PER_NODE:-${N_GPUS_PER_NODE:-8}}} + +TP_SIZE=${TP_SIZE:-8} +PP_SIZE=${PP_SIZE:-1} +
+rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} + +RAY_PORT=${RAY_PORT:-6379} +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} +RAY_ADDRESS=${RAY_ADDRESS:-$MASTER_ADDR:$RAY_PORT} + +if [ "$NODE_RANK" -eq 0 ]; then + ray start --head \ + --node-ip-address="$MASTER_ADDR" \ + --port="$RAY_PORT" \ + --dashboard-port="$RAY_DASHBOARD_PORT" +else + ray start --address="$RAY_ADDRESS" + exit 0 +fi + +sleep 5 + +python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=512 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=224 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=224 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ +
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=100 \ + trainer.test_freq=100 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_GPT_OSS='"0"' \ + custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh new file mode 100755 index 00000000000..e427bc7aefd --- /dev/null +++ b/recipes_custom/RLVR_ABCDE_dense/run_grpo_megatron_single_node.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VERL_USE_GPT_OSS=0 +export WANDB_MODE=${WANDB_MODE:-offline} +export WANDB_DIR=${WANDB_DIR:-/llm-align/liuchonghan/wandb} +unset WANDB_PROXY_URL HTTP_PROXY HTTPS_PROXY http_proxy https_proxy ALL_PROXY all_proxy +export NO_PROXY=${NO_PROXY:-"localhost,127.0.0.1,::1,10.,172.16.,172.17.,172.18.,172.19.,192.168.,.svc,.cluster.local,.hbox-aigc.svc"} +export no_proxy=${no_proxy:-"$NO_PROXY"} +export PYTHONPATH=/llm-align/liuchonghan/verl_lao:${PYTHONPATH:-} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0} +export GLOO_IPV6=${GLOO_IPV6:-"0"} +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0} +export RAY_TMPDIR=/dev/shm/ray +export TMPDIR=/dev/shm/tmp + +mkdir -p "$WANDB_DIR" "$RAY_TMPDIR" "$TMPDIR" + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.main_ppo"} +TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/all_data_merged_rlhf.json}
+MODEL_ID=${MODEL_ID:-/llm-align/liuchonghan/Qwen3-8B} +PROJECT_NAME=${PROJECT_NAME:-rlvr_8b} +EXPERIMENT_NAME=${EXPERIMENT_NAME:-rlvr_8b_grpo_megatron_single} +DEFAULT_LOCAL_DIR=${DEFAULT_LOCAL_DIR:-/llm-align/liuchonghan/checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}} + +NNODES=${NNODES:-4} +NODE_RANK=${NODE_RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-10.178.170.212} +MASTER_PORT=${MASTER_PORT:-23457} +N_GPUS_PER_NODE=${N_GPUS_PER_NODE:-8} + +TP_SIZE=4 +PP_SIZE=1 + +rollout_mode=${ROLLOUT_MODE:-async} +USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-True} +RETURN_RAW_CHAT=${RETURN_RAW_CHAT:-True} +RAY_ADDRESS=${RAY_ADDRESS:-10.178.170.212:6379} +RAY_WORKING_DIR=${RAY_WORKING_DIR:-/llm-align/liuchonghan/w} +ACTOR_LR=${ACTOR_LR:-1e-6} +MIN_LR=${MIN_LR:-1e-7} +LR_DECAY_STYLE=${LR_DECAY_STYLE:-cosine} +GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.35} + +python3 $ENTRYPOINT --config-path=/llm-align/liuchonghan/verl_lao/verl/trainer/config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_FILES \ + data.val_files=$TRAIN_FILES \ + data.val_max_samples=2048 \ + data.return_raw_chat=$RETURN_RAW_CHAT \ + data.train_batch_size=32 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + actor_rollout_ref.model.path=$MODEL_ID \ + actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \ + actor_rollout_ref.actor.optim.lr=$ACTOR_LR \ + actor_rollout_ref.actor.optim.min_lr=$MIN_LR \ + actor_rollout_ref.actor.optim.lr_decay_style=$LR_DECAY_STYLE \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + 
actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.gpu_memory_utilization=$GPU_MEMORY_UTILIZATION \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.max_num_batched_tokens=10384 \ + actor_rollout_ref.rollout.max_model_len=2048 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP_SIZE \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP_SIZE \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.default_local_dir=$DEFAULT_LOCAL_DIR \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=$N_GPUS_PER_NODE \ + trainer.nnodes=$NNODES \ + trainer.save_freq=300 \ + trainer.test_freq=300 \ + trainer.total_epochs=5 \ + +ray_kwargs.ray_init._temp_dir=$RAY_TMPDIR \ + +ray_kwargs.ray_init.address=$RAY_ADDRESS \ + +ray_kwargs.ray_init.runtime_env.env_vars.PYTHONPATH=${PYTHONPATH:-} \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_ADDR=$MASTER_ADDR \ + +ray_kwargs.ray_init.runtime_env.env_vars.MASTER_PORT=\"${MASTER_PORT}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \ + +ray_kwargs.ray_init.runtime_env.env_vars.GLOO_IPV6=\"${GLOO_IPV6}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_MODE=$WANDB_MODE \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_DIR=$WANDB_DIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.TMPDIR=$TMPDIR \ + +ray_kwargs.ray_init.runtime_env.env_vars.WANDB_PROXY_URL=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.HTTP_PROXY=\"\" \ + 
+ray_kwargs.ray_init.runtime_env.env_vars.HTTPS_PROXY=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.http_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.https_proxy=\"\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.NO_PROXY=\"${NO_PROXY}\" \ + +ray_kwargs.ray_init.runtime_env.env_vars.no_proxy=\"${no_proxy}\" \ + custom_reward_function.path=/llm-align/liuchonghan/verl_lao/recipes_custom/RLVR_ABCDE_dense/reward_function.py \ + custom_reward_function.name=char_count_reward_function diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 17dddd60dc6..acb3e155fa9 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -26,7 +26,7 @@ defaults: actor_rollout_ref: hybrid_engine: True - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron + nccl_timeout: 1200 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron model: override_config: diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index fd9b59862ae..2117fd2302d 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -133,7 +133,7 @@ trainer: logger: ["console", "wandb"] # Number of generations to log during validation - log_val_generations: 0 + log_val_generations: 10 # Directory for logging rollout data; no dump if null rollout_data_dir: null