@@ -4,7 +4,7 @@ conda activate verl
 export PATH=$CONDA_PREFIX/bin:$PATH
 export NCCL_P2P_DISABLE=1
 export CUDA_DEVICE_ORDER=PCI_BUS_ID
-export CUDA_VISIBLE_DEVICES=5,6,7,8
+export CUDA_VISIBLE_DEVICES=3,4
 export DATA_PATH=$PWD/../verlData
 export HF_HOME=$DATA_PATH
 export VLLM_CACHE_DIR=$DATA_PATH/vllm_cache
@@ -17,17 +17,17 @@ ROLLOUT_NAME="vllm" # sglang or vllm
 
 FAMILY="Qwen"
 STUDENT_MODEL=Qwen2.5-0.5B
-TEACHER_MODEL=Qwen2.5-3B-Instruct
+TEACHER_MODEL=Qwen2.5-7B-Instruct
 
 USE_POLICY_GRADIENT=False
-DISTILLATION_LOSS_MODE="k3"
+# DISTILLATION_LOSS_MODE="k3"
 DISTILLATION_LOSS_MODE="forward_kl_topk"
 
-DISTILLATION_LOSS_MODE="k1"
-USE_POLICY_GRADIENT=True
+# USE_POLICY_GRADIENT=True
+# DISTILLATION_LOSS_MODE="k1"
 
 DISTILLATION_LOSS_MAX_CLAMP=10.0
-DISTILLATION_LOG_PROB_MIN_CLAMP=null
+DISTILLATION_LOG_PROB_MIN_CLAMP=-10.0
 
 PROJECT_NAME='verl_on_policy_distillation_example_gsm8k'
 EXP_NAME="${FAMILY}/student-${STUDENT_MODEL}/teacher-${TEACHER_MODEL}/loss-${DISTILLATION_LOSS_MODE}-pg-${USE_POLICY_GRADIENT}-maxclamp-${DISTILLATION_LOSS_MAX_CLAMP}-logprobminclamp-${DISTILLATION_LOG_PROB_MIN_CLAMP}"
@@ -41,7 +41,7 @@ USE_DYNAMIC_BSZ=False
 
 STUDENT_WORLD_SIZE=2
 
-TEACHER_RESOURCE_POOL=True
+TEACHER_RESOURCE_POOL=False
 TEACHER_WORLD_SIZE=2
 
 ENFORCE_EAGER=False # true for faster debugging
@@ -122,7 +122,7 @@ ALGORITHM=(
 )
 
 TRAINER=(
-    trainer.logger='["console"]'
+    trainer.logger='["console","wandb"]'
     trainer.project_name=$PROJECT_NAME
     trainer.experiment_name=$EXP_NAME
     trainer.n_gpus_per_node=$STUDENT_WORLD_SIZE