Skip to content

Commit b3f63c6

Browse files
committed
Merge remote-tracking branch 'origin/main' into main-0212
2 parents 4c03e9b + 92c454a commit b3f63c6

File tree

5 files changed

+33
-10
lines changed

5 files changed

+33
-10
lines changed

verl/experimental/reward_loop/reward_loop.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
def migrate_legacy_reward_impl(config):
3939
"""
4040
Migrate the legacy reward model implementation to the new one.
41-
This is a temporary fix. A more robust one will be added.
4241
"""
4342
# 1. reward workers migration
4443
# config.reward_model.num_workers -> config.reward.num_workers
@@ -49,7 +48,7 @@ def migrate_legacy_reward_impl(config):
4948
# config.reward_model.reward_manager -> config.reward.reward_manager
5049
if config.reward_model.reward_manager is not None:
5150
config.reward.reward_manager.name = config.reward_model.reward_manager
52-
if config.reward_model.get("reward_loop_source") is not None:
51+
if config.reward_model.reward_loop_source is not None:
5352
config.reward.reward_manager.source = config.reward_model.reward_loop_source
5453
config.reward.reward_manager.module.path = config.reward_model.reward_loop_module_path
5554
config.reward.reward_manager.module.name = config.reward_model.reward_loop_class_name
@@ -64,19 +63,29 @@ def migrate_legacy_reward_impl(config):
6463
for key in ["enable", "enable_resource_pool", "n_gpus_per_node", "nnodes"]:
6564
if config.reward_model.get(key) is not None:
6665
config.reward.reward_model[key] = config.reward_model[key]
67-
# for dapo reward kwargs
66+
if config.reward_model.model.path is not None:
67+
config.reward.reward_model.model_path = config.reward_model.model.path
68+
# config.reward_model.reward_kwargs -> config.reward.reward_kwargs (for dapo algo)
6869
if config.reward_model.get("reward_kwargs") is not None:
69-
with open_dict(config.reward.reward_model):
70-
config.reward.reward_model["reward_kwargs"] = config.reward_model["reward_kwargs"]
70+
with open_dict(config.reward):
71+
config.reward["reward_kwargs"] = config.reward_model["reward_kwargs"]
72+
# config.reward_model.rollout -> config.reward.reward_model.rollout
7173
legacy_rollout = config.reward_model.rollout
72-
if not all(v is None for v in legacy_rollout.values()):
73-
config.reward.reward_model.rollout = legacy_rollout
74+
for key in legacy_rollout.keys():
75+
if legacy_rollout[key] is not None:
76+
config.reward.reward_model.rollout[key] = legacy_rollout[key]
7477

7578
# 5. sandbox_fusion migration
7679
# config.sandbox_fusion -> reward.sandbox_fusion
7780
if not all(v is None for v in config.sandbox_fusion.values()):
7881
config.reward.sandbox_fusion = config.sandbox_fusion
7982

83+
# 6. delete legacy config from configs
84+
with open_dict(config):
85+
del config.reward_model
86+
del config.custom_reward_function
87+
del config.sandbox_fusion
88+
8089
return config
8190

8291

@@ -222,12 +231,10 @@ async def compute_score_disrm(self, data: DataProto) -> dict:
222231
engine_name = self.config.reward.reward_model.rollout.name
223232
model_name = self.config.reward.reward_model.model_path
224233
if engine_name == "vllm":
225-
# TODO (dyy): the "activation" has been changed to "use_activation" in vllm 0.11.2
226234
payloads = {
227235
"model": model_name,
228236
"input": disrm_prompt,
229-
"activation": False,
230-
# "add_special_tokens": False, # vllm >= 0.11.2
237+
"use_activation": False,
231238
}
232239
output = await self._post_request(payloads, "classify")
233240
rm_score = output["data"][-1]["probs"][-1]

verl/trainer/config/_generated_ppo_megatron_trainer.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,10 @@ reward_model:
585585
reward_loop_source: null
586586
reward_loop_module_path: null
587587
reward_loop_class_name: null
588+
model:
589+
path: null
590+
external_lib: null
591+
trust_remote_code: null
588592
rollout:
589593
name: null
590594
dtype: null

verl/trainer/config/_generated_ppo_trainer.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,10 @@ reward_model:
505505
reward_loop_source: null
506506
reward_loop_module_path: null
507507
reward_loop_class_name: null
508+
model:
509+
path: null
510+
external_lib: null
511+
trust_remote_code: null
508512
rollout:
509513
name: null
510514
dtype: null

verl/trainer/config/_generated_ppo_veomni_trainer.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,10 @@ reward_model:
500500
reward_loop_source: null
501501
reward_loop_module_path: null
502502
reward_loop_class_name: null
503+
model:
504+
path: null
505+
external_lib: null
506+
trust_remote_code: null
503507
rollout:
504508
name: null
505509
dtype: null

verl/trainer/config/legacy_reward_impl.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ reward_model:
1212
reward_loop_source: null
1313
reward_loop_module_path: null
1414
reward_loop_class_name: null
15+
model:
16+
path: null
17+
external_lib: null
18+
trust_remote_code: null
1519
rollout:
1620
name: null
1721
dtype: null

0 commit comments

Comments (0)