#!/usr/bin/env bash
# Qwen3.5-397B-A17B SFT with Megatron backend + mbridge
#
# Requirements:
#   - 128+ GPUs (80GB each, e.g. 16x8 H100/H200)
#   - Docker: verlai/verl:vllm015 (or equivalent)
#   - Additional packages on top of the base image:
#       pip install --upgrade transformers
#       pip install flash-linear-attention
#       pip install -U git+https://github.com/ISEEKYAN/mbridge.git
#   - Megatron-LM dev branch with Qwen3.5 GDN support
#
# Qwen3.5 architecture notes:
#   Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
#   NOT support packed sequences (THD format) in Megatron-LM. Therefore:
#     - engine.use_remove_padding=False (forces bshd compute format)
#     - model.use_remove_padding=True  (keeps NestedTensor in data pipeline)
#     - data.use_dynamic_bsz=False     (required for bshd mode)
#
#   Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD
#   format will be supported and engine.use_remove_padding can be set to True
#   for better performance.
#
# Tested parallelism config (128 GPUs / 16 nodes):
#   TP=2 PP=4 EP=32 CP=1

set -xeuo pipefail  # -x trace, -e exit on error, -u unset vars fatal, pipefail

# ============================================================
# Distributed
# ============================================================
# All values are overridable from the environment by the launcher.
NUM_GPUS=${NUM_GPUS:-8}               # GPUs per node (torchrun --nproc_per_node)
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
NNODES=${NNODES:-16}                  # defaults: 16 nodes x 8 GPUs = 128 ranks
NODE_RANK=${NODE_RANK:-0}             # unique per node, 0..NNODES-1

# ============================================================
# Data
# ============================================================
DATASET_DIR=${DATASET_DIR:-~/dataset}
TRAIN_FILES=${TRAIN_FILES:-${DATASET_DIR}/train.parquet}

# ============================================================
# Model
# ============================================================
MODEL_PATH=${MODEL_PATH:-Qwen/Qwen3.5-397B-A17B}  # HF hub id or local path

# ============================================================
# Parallelism
# ============================================================
TP_SIZE=${TP_SIZE:-2}       # tensor model parallel
PP_SIZE=${PP_SIZE:-4}       # pipeline model parallel
VPP_SIZE=${VPP_SIZE:-null}  # literal "null" is passed through to Hydra (no virtual PP)
CP_SIZE=${CP_SIZE:-1}       # context parallel
EP_SIZE=${EP_SIZE:-32}      # expert model parallel (MoE)
ETP_SIZE=${ETP_SIZE:-1}     # expert tensor parallel

# ============================================================
# Training
# ============================================================
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-128}  # global batch size (samples/step)
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}    # micro batch per GPU
MAX_LENGTH=${MAX_LENGTH:-2048}             # max sequence length in tokens
LR=${LR:-2e-5}
MIN_LR=${MIN_LR:-2e-6}                     # cosine decay floor (see optim.lr_decay_style)
DTYPE=${DTYPE:-bfloat16}

BACKEND=megatron
RESUME_MODE=${RESUME_MODE:-disable}

project_name=verl_sft_qwen3_5
exp_name=qwen3_5-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-ep${EP_SIZE}
# NOTE: lower-case ckpts_home is intentional and also env-overridable.
ckpts_home=${ckpts_home:-~/verl/checkpoints/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"
# ============================================================
# Engine config
# ============================================================
# A single flat string of Hydra overrides. It is deliberately expanded
# UNQUOTED at the torchrun call site so it word-splits into individual
# arguments; none of the values below contain spaces, and the embedded
# quote characters (e.g. in optim.betas) reach Hydra verbatim by design.
# "+key=value" appends a key that is not part of the base config schema.
#
# Key Qwen3.5 settings:
#   engine.use_remove_padding=False - GDN requires bshd format (no THD)
#   engine.vanilla_mbridge=True - use mbridge (not megatron-bridge)
# Memory notes:
#   - optimizer state fully offloaded to CPU (offload_fraction=1) with
#     overlapped D2H/H2D transfers, to fit 397B parameter states
#   - full uniform activation recompute (1 layer per chunk) trades compute
#     for activation memory
ENGINE_CONFIG="\
    engine=${BACKEND} \
    optim=${BACKEND} \
    optim.lr=${LR} \
    optim.min_lr=${MIN_LR} \
    optim.lr_warmup_steps=10 \
    optim.weight_decay=0.1 \
    optim.betas='[0.9,0.95]' \
    optim.clip_grad=1.0 \
    optim.lr_warmup_init=0 \
    optim.lr_decay_style=cosine \
    +optim.override_optimizer_config.optimizer_offload_fraction=1 \
    +optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
    +optim.override_optimizer_config.use_precision_aware_optimizer=True \
    +optim.override_optimizer_config.optimizer_cpu_offload=True \
    engine.tensor_model_parallel_size=${TP_SIZE} \
    engine.pipeline_model_parallel_size=${PP_SIZE} \
    engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
    engine.context_parallel_size=${CP_SIZE} \
    engine.expert_model_parallel_size=${EP_SIZE} \
    engine.expert_tensor_parallel_size=${ETP_SIZE} \
    engine.use_mbridge=True \
    engine.vanilla_mbridge=True \
    engine.dtype=${DTYPE} \
    engine.use_remove_padding=False \
    engine.override_transformer_config.attention_backend=auto \
    +engine.override_transformer_config.recompute_method=uniform \
    +engine.override_transformer_config.recompute_granularity=full \
    +engine.override_transformer_config.recompute_num_layers=1"

# ============================================================
# Launch
# ============================================================
# Quote every single-value expansion so paths/values containing spaces or
# glob characters cannot word-split (SC2086). ${ENGINE_CONFIG} is the one
# intentional exception: it is a space-separated list of Hydra overrides
# that MUST word-split into individual arguments.
# shellcheck disable=SC2086
torchrun \
    --nproc_per_node="${NUM_GPUS}" \
    --nnodes="${NNODES}" \
    --node_rank="${NODE_RANK}" \
    --master_addr="${MASTER_ADDR}" \
    --master_port="${MASTER_PORT}" \
    -m verl.trainer.sft_trainer \
    data.train_files="${TRAIN_FILES}" \
    data.train_batch_size="${TRAIN_BATCH_SIZE}" \
    data.micro_batch_size_per_gpu="${MICRO_BATCH_SIZE}" \
    data.max_length="${MAX_LENGTH}" \
    data.pad_mode=no_padding \
    data.truncation=error \
    data.use_dynamic_bsz=False \
    data.max_token_len_per_gpu="${MAX_LENGTH}" \
    data.messages_key=messages \
    model.path="${MODEL_PATH}" \
    model.use_remove_padding=True \
    model.trust_remote_code=True \
    ${ENGINE_CONFIG} \
    trainer.test_freq=-1 \
    trainer.save_freq=500 \
    trainer.logger="['console']" \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs=1 \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.resume_mode="${RESUME_MODE}"