diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index ee6176775e0..d8cf336bdb1 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -353,10 +353,14 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False) ) else: + # Note that we have to pass None to the images and videos if there are no new images / videos + # to stay compatible with downstream image processing logic! + images = new_images_this_turn if new_images_this_turn else None + videos = None response_ids = await self.apply_chat_template( add_messages, - images=new_images_this_turn, # Using local variable - videos=None, + images=images, + videos=videos, remove_system_prompt=True, )