-
Notifications
You must be signed in to change notification settings - Fork 3.3k
[perf, trtllm] feat: Add Nsight support for rollout server mode (trtllm) #5391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
3ed6ba3
c4ad789
06ea68c
579fa2f
7dbe100
e129e9e
0deda9b
26a17c8
988588a
1c7b9bd
afbcfea
1ee1fef
1192d50
b9b57d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| # Print each command (after expansion) before it is executed. | ||
| set -x | ||
|
|
||
| # Clean all slurm / MPI / PMIx env to avoid pmix mismatch error | ||
| # Every environment variable whose name starts with PMI_, PMIX_, MPI_, OMPI_ | ||
| # or SLURM_ is unset. awk splits each `env` line on "=" and prints only the | ||
| # variable name, which is then handed to `unset`. | ||
| for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do | ||
| unset "$v" | ||
| done | ||
|
|
||
| # NOTE(review): presumably turns off Ray's de-duplication of repeated log | ||
| # lines so every worker's output is shown - confirm against the Ray docs. | ||
| export RAY_DEDUP_LOGS=0 | ||
|
|
||
| # ----- | ||
| # Config | ||
| # ----- | ||
| # TP: first positional argument, default 4; passed below as the rollout | ||
| #     tensor_model_parallel_size. | ||
| # PROJECT_NAME: taken from the environment when set, otherwise the default. | ||
| # EXP_NAME: "-${EXP_NAME_SUFFIX}" is appended only when EXP_NAME_SUFFIX is | ||
| #     set and non-empty. | ||
| TP=${1:-4} | ||
| PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"} | ||
| EXP_NAME=trtllm-qwen2-7b-tp${TP}-8gpus${EXP_NAME_SUFFIX:+"-"}${EXP_NAME_SUFFIX} | ||
|
|
||
| # MAX_BATCH_SIZE is passed below as rollout.max_num_seqs: 1024 when TP is 4, | ||
| # 384 for any other value. "$TP" is quoted so a user-supplied argument is | ||
| # never word-split or glob-expanded inside the test. | ||
| if [ "$TP" -eq 4 ]; then | ||
| MAX_BATCH_SIZE=1024 | ||
| else | ||
| MAX_BATCH_SIZE=384 | ||
| fi | ||
|
|
||
| # ----- | ||
| # Data | ||
| # ----- | ||
| # DATADIR and MODEL_PATH may be overridden from the environment; otherwise | ||
| # they default to ./data under the current directory and the | ||
| # Qwen/Qwen2-7B-Instruct model identifier. | ||
| DATADIR=${DATADIR:-$PWD/data} | ||
| MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-7B-Instruct"} | ||
|
|
||
| # Train/test parquet files for the two datasets, all located under DATADIR. | ||
| GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet | ||
| GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet | ||
| MATH_TRAIN_PATH=${DATADIR}/math/train.parquet | ||
| MATH_TEST_PATH=${DATADIR}/math/test.parquet | ||
|
|
||
| # Bracketed, single-quoted lists of the paths above; passed below as | ||
| # data.train_files and data.val_files. | ||
| TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']" | ||
| TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']" | ||
|
|
||
| # ----- | ||
| # Launch | ||
| # ----- | ||
| # Runs verl.trainer.main_ppo with the GRPO advantage estimator, the trtllm | ||
| # rollout in async mode, and nsys selected as the global profiler tool. | ||
| # - Variable expansions are double-quoted so a value containing whitespace | ||
| #   (e.g. MODEL_PATH or EXP_NAME_SUFFIX) stays a single argument. | ||
| # - Bracketed list values are single-quoted so the shell never treats them | ||
| #   as pathname-expansion patterns (matches the quoting of profiler steps). | ||
| # - Positional arguments after the first ("${@:2}") are appended verbatim to | ||
| #   the command line. | ||
| python3 -m verl.trainer.main_ppo \ | ||
| algorithm.adv_estimator=grpo \ | ||
| algorithm.rollout_correction.rollout_is_threshold=2.0 \ | ||
| data.train_files="$TRAIN_FILES" \ | ||
| data.val_files="$TEST_FILES" \ | ||
| data.train_batch_size=1024 \ | ||
| data.max_prompt_length=2048 \ | ||
| data.max_response_length=1024 \ | ||
| data.return_raw_chat=True \ | ||
| data.filter_overlong_prompts=True \ | ||
| data.truncation='error' \ | ||
| actor_rollout_ref.hybrid_engine=True \ | ||
| actor_rollout_ref.model.path="${MODEL_PATH}" \ | ||
| actor_rollout_ref.actor.optim.lr=1e-6 \ | ||
| actor_rollout_ref.model.use_remove_padding=True \ | ||
| actor_rollout_ref.actor.ppo_mini_batch_size=256 \ | ||
| actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.actor.use_kl_loss=True \ | ||
| actor_rollout_ref.actor.kl_loss_coef=0.001 \ | ||
| actor_rollout_ref.actor.kl_loss_type=low_var_kl \ | ||
| actor_rollout_ref.actor.entropy_coeff=0 \ | ||
| actor_rollout_ref.model.enable_gradient_checkpointing=True \ | ||
| actor_rollout_ref.actor.fsdp_config.param_offload=False \ | ||
| actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ | ||
| actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.rollout.tensor_model_parallel_size="${TP}" \ | ||
| actor_rollout_ref.rollout.name=trtllm \ | ||
| actor_rollout_ref.rollout.mode="async" \ | ||
| actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ | ||
| actor_rollout_ref.rollout.n=5 \ | ||
| actor_rollout_ref.rollout.max_num_seqs="${MAX_BATCH_SIZE}" \ | ||
| actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \ | ||
| actor_rollout_ref.rollout.calculate_log_probs=True \ | ||
| actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.ref.fsdp_config.param_offload=True \ | ||
| actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \ | ||
| algorithm.use_kl_in_reward=False \ | ||
| trainer.critic_warmup=0 \ | ||
| trainer.logger='["console"]' \ | ||
| trainer.project_name="${PROJECT_NAME}" \ | ||
| trainer.experiment_name="${EXP_NAME}" \ | ||
| trainer.n_gpus_per_node=8 \ | ||
| trainer.nnodes=2 \ | ||
| trainer.save_freq=-1 \ | ||
| trainer.test_freq=5 \ | ||
| trainer.resume_mode=disable \ | ||
| trainer.total_epochs=15 \ | ||
| trainer.val_before_train=False \ | ||
| trainer.total_training_steps=6 \ | ||
| global_profiler.tool=nsys \ | ||
| global_profiler.steps='[2,3,5]' \ | ||
| global_profiler.profile_continuous_steps=True \ | ||
| global_profiler.global_tool_config.nsys.discrete=False \ | ||
| global_profiler.global_tool_config.nsys.worker_nsight_options.capture-range-end='repeat-shutdown:2' \ | ||
| actor_rollout_ref.actor.profiler.enable=True \ | ||
| actor_rollout_ref.actor.profiler.all_ranks=False \ | ||
| actor_rollout_ref.actor.profiler.ranks='[0,2]' \ | ||
| actor_rollout_ref.rollout.profiler.enable=True \ | ||
| actor_rollout_ref.rollout.profiler.all_replicas=False \ | ||
| actor_rollout_ref.rollout.profiler.replicas='[0,2]' \ | ||
| actor_rollout_ref.rollout.profiler.all_ranks=False \ | ||
| actor_rollout_ref.rollout.profiler.ranks='[0,2]' \ | ||
| "${@:2}" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -316,6 +316,12 @@ profiler: | |
| # Whether to enable profiling on rollout | ||
| enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} | ||
|
|
||
| # Whether to profile all replicas. | ||
| all_replicas: false | ||
|
||
|
|
||
| # The replicas that will be profiled. [] or [0,1,...] | ||
| replicas: [] | ||
|
|
||
| # Whether to profile all ranks. | ||
| all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `export` commands only set environment variables for the current `RUN` layer and will not persist in the final container image's environment. This means `LD_LIBRARY_PATH` will not include the `nvshmem` library path at runtime, which can lead to "library not found" errors when the application tries to load shared libraries from that path. You should use the `ENV` instruction to set environment variables that are required at runtime to ensure they are available to the container's processes.