3838def migrate_legacy_reward_impl (config ):
3939 """
4040 Migrate the legacy reward model implementation to the new one.
41- This is a temporary fix. A more robust one will be added.
4241 """
4342 # 1. reward workers migration
4443 # config.reward_model.num_workers -> config.reward.num_workers
@@ -49,7 +48,7 @@ def migrate_legacy_reward_impl(config):
4948 # config.reward_model.reward_manager -> config.reward.reward_manager
5049 if config .reward_model .reward_manager is not None :
5150 config .reward .reward_manager .name = config .reward_model .reward_manager
52- if config .reward_model .get ( " reward_loop_source" ) is not None :
51+ if config .reward_model .reward_loop_source is not None :
5352 config .reward .reward_manager .source = config .reward_model .reward_loop_source
5453 config .reward .reward_manager .module .path = config .reward_model .reward_loop_module_path
5554 config .reward .reward_manager .module .name = config .reward_model .reward_loop_class_name
@@ -64,19 +63,29 @@ def migrate_legacy_reward_impl(config):
6463 for key in ["enable" , "enable_resource_pool" , "n_gpus_per_node" , "nnodes" ]:
6564 if config .reward_model .get (key ) is not None :
6665 config .reward .reward_model [key ] = config .reward_model [key ]
67- # for dapo reward kwargs
66+ if config .reward_model .model .path is not None :
67+ config .reward .reward_model .model_path = config .reward_model .model .path
68+ # config.reward_model.reward_kwargs -> config.reward.reward_kwargs (used by the DAPO algorithm)
6869 if config .reward_model .get ("reward_kwargs" ) is not None :
69- with open_dict (config .reward .reward_model ):
70- config .reward .reward_model ["reward_kwargs" ] = config .reward_model ["reward_kwargs" ]
70+ with open_dict (config .reward ):
71+ config .reward ["reward_kwargs" ] = config .reward_model ["reward_kwargs" ]
72+ # config.reward_model.rollout -> config.reward.reward_model.rollout
7173 legacy_rollout = config .reward_model .rollout
72- if not all (v is None for v in legacy_rollout .values ()):
73- config .reward .reward_model .rollout = legacy_rollout
74+ for key in legacy_rollout .keys ():
75+ if legacy_rollout [key ] is not None :
76+ config .reward .reward_model .rollout [key ] = legacy_rollout [key ]
7477
7578 # 5. sandbox_fusion migration
7679 # config.sandbox_fusion -> config.reward.sandbox_fusion
7780 if not all (v is None for v in config .sandbox_fusion .values ()):
7881 config .reward .sandbox_fusion = config .sandbox_fusion
7982
83+ # 6. delete the legacy config sections (reward_model, custom_reward_function, sandbox_fusion) from config
84+ with open_dict (config ):
85+ del config .reward_model
86+ del config .custom_reward_function
87+ del config .sandbox_fusion
88+
8089 return config
8190
8291
@@ -222,12 +231,10 @@ async def compute_score_disrm(self, data: DataProto) -> dict:
222231 engine_name = self .config .reward .reward_model .rollout .name
223232 model_name = self .config .reward .reward_model .model_path
224233 if engine_name == "vllm" :
225- # TODO (dyy): the "activation" has been changed to "use_activation" in vllm 0.11.2
226234 payloads = {
227235 "model" : model_name ,
228236 "input" : disrm_prompt ,
229- "activation" : False ,
230- # "add_special_tokens": False, # vllm >= 0.11.2
237+ "use_activation" : False ,
231238 }
232239 output = await self ._post_request (payloads , "classify" )
233240 rm_score = output ["data" ][- 1 ]["probs" ][- 1 ]
0 commit comments