Ruff

JacobHelwig · JacobHelwig · commit 5383cb23db7b · 2026-02-25T12:54:29.000-06:00
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
@@ -478,7 +478,9 @@ async def generate_sequences(self, batch: DataProto) -> DataProto:
             )
         outputs = await asyncio.gather(*tasks)
 
-        output = self._postprocess(outputs, input_non_tensor_batch=batch.non_tensor_batch, validate=batch.meta_info.get("validate", False))
+        output = self._postprocess(
+            outputs, input_non_tensor_batch=batch.non_tensor_batch, validate=batch.meta_info.get("validate", False)
+        )
         return output
 
     async def _run_agent_loop(
@@ -736,10 +738,18 @@ async def _compute_score(self, output, prompts, responses, attention_mask, input
     async def _compute_teacher_logprobs(self, output, prompt_ids, response_ids, validate):
         """Compute teacher logprobs for single sample."""
         if self.distillation_enabled and not validate:
-            data = DataProto(batch=TensorDict({"prompt_ids": torch.tensor([prompt_ids]), "response_ids": torch.tensor([response_ids])}, batch_size=1))
+            data = DataProto(
+                batch=TensorDict(
+                    {"prompt_ids": torch.tensor([prompt_ids]), "response_ids": torch.tensor([response_ids])},
+                    batch_size=1,
+                )
+            )
             selected_teacher_loop_worker_handle = random.choice(self.teacher_loop_worker_handles)
             result = await selected_teacher_loop_worker_handle.compute_logprobs.remote(data)
-            response_ids, response_logprobs = result["response_ids"], result["response_logprobs"] # (1, S, K), S=sequence length, K=topk/1
+            response_ids, response_logprobs = (
+                result["response_ids"],
+                result["response_logprobs"],
+            )  # (1, S, K), S=sequence length, K=topk/1
 
             pad_size = self.config.actor_rollout_ref.rollout.response_length - response_ids.shape[1]
             padding = (0, 0, 0, pad_size)  # pad the sequence dimension
@@ -976,7 +986,9 @@ def _init_agent_loop_workers(self):
                     scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
                         node_id=node_id, soft=True
                     ),
-                ).remote(self.config, self.server_handles, self.reward_loop_worker_handles, self.teacher_loop_worker_handles)
+                ).remote(
+                    self.config, self.server_handles, self.reward_loop_worker_handles, self.teacher_loop_worker_handles
+                )
             )
 
     def generate_sequences(self, prompts: DataProto) -> DataProto:
diff --git a/verl/experimental/teacher_loop/__init__.py b/verl/experimental/teacher_loop/__init__.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .teacher_loop import TeacherLoopManager, TeacherLoopWorker
-from .teacher_loop import TeacherModelManager
+from .teacher_loop import TeacherLoopManager, TeacherLoopWorker, TeacherModelManager
 
 __all__ = ["TeacherModelManager", "TeacherLoopWorker", "TeacherLoopManager"]
diff --git a/verl/experimental/teacher_loop/teacher_loop.py b/verl/experimental/teacher_loop/teacher_loop.py
@@ -20,21 +20,16 @@
 import numpy as np
 import ray
 import torch
-import torch.nn.functional as F
-from omegaconf import DictConfig, open_dict
+from omegaconf import DictConfig
 from tensordict import TensorDict
 
 from verl.protocol import DataProto
 from verl.single_controller.ray.base import RayResourcePool
-from verl.trainer.ppo.reward import load_reward_manager
-from verl.utils import hf_tokenizer
-from verl.utils.fs import copy_to_local
+from verl.trainer.distillation.losses import DistillationLossSettings, get_distillation_loss_settings
 from verl.utils.config import omega_conf_to_dataclass
-
-from .teacher_model import TeacherModelManager
 from verl.workers.config import DistillationConfig, DistillationLossConfig
 
-from verl.trainer.distillation.losses import get_distillation_loss_settings, DistillationLossSettings
+from .teacher_model import TeacherModelManager
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
@@ -52,7 +47,9 @@ def __init__(self, config: DictConfig, teacher_router_address: str = None):
         self.config = config
         self.distillation_config: DistillationConfig = self.config.distillation
         self.distillation_loss_config: DistillationLossConfig = self.distillation_config.distillation_loss
-        self.distillation_loss_settings: DistillationLossSettings = get_distillation_loss_settings(self.distillation_loss_config.loss_mode)
+        self.distillation_loss_settings: DistillationLossSettings = get_distillation_loss_settings(
+            self.distillation_loss_config.loss_mode
+        )
         self.teacher_router_address = teacher_router_address
         # # Serialize teacher requests per actor to reduce pressure on the teacher vLLM router/backend.
         # self._request_semaphore = asyncio.Semaphore(1)
@@ -112,16 +109,16 @@ async def _post_request(self, payload: dict, endpoint: str, max_retries: int = 1
             raise last_exception
 
     async def _compute_logprobs(self, data: DataProto) -> dict:
-        prompt_ids = data.batch['prompt_ids']
-        response_ids = data.batch['response_ids']
+        prompt_ids = data.batch["prompt_ids"]
+        response_ids = data.batch["response_ids"]
         input_ids = torch.cat([prompt_ids, response_ids], dim=1).squeeze(0).tolist()
         engine_name = self.config.distillation.teacher_model.inference.name
         model_name = self.config.distillation.teacher_model.model_path
         if engine_name == "vllm":
             if self.distillation_loss_settings.use_topk:
                 num_logprobs = topk = self.distillation_loss_config.topk
             else:
-                num_logprobs = 0 # only the sampled logprob    
+                num_logprobs = 0  # only the sampled logprob
             payloads = {
                 "model": model_name,
                 "prompt": input_ids,
@@ -140,7 +137,7 @@ async def _compute_logprobs(self, data: DataProto) -> dict:
             for logprobs_dict in response_logprob_dicts:
                 if num_logprobs == 0:
                     token_id_str = list(logprobs_dict.keys())[0]
-                    logprob = logprobs_dict[token_id_str]['logprob']
+                    logprob = logprobs_dict[token_id_str]["logprob"]
                     response_logprobs_ls.append([logprob])
                     response_ids_ls.append([int(token_id_str)])
                 else:
@@ -149,18 +146,22 @@ async def _compute_logprobs(self, data: DataProto) -> dict:
                     # We get either top-k logprobs or top-k plus the sampled logprob (if sampled token is not in top-k)
                     assert len(logprobs_dict) in [topk, topk + 1], len(logprobs_dict)
                     for token_id_str, token_dict in logprobs_dict.items():
-                        if token_dict['rank'] > topk:
-                            continue # the sampled token is not in the top-k
-                        rank = token_dict['rank']
-                        logprob = token_dict['logprob']
+                        if token_dict["rank"] > topk:
+                            continue  # the sampled token is not in the top-k
+                        rank = token_dict["rank"]
+                        logprob = token_dict["logprob"]
                         response_ids[rank - 1] = int(token_id_str)
                         response_logprobs[rank - 1] = logprob
                     response_logprobs_ls.append(response_logprobs)
                     response_ids_ls.append(response_ids)
-            logprobs_dtype = torch.bfloat16 if self.distillation_config.teacher_model.inference.dtype == "bfloat16" else torch.float32
+            logprobs_dtype = (
+                torch.bfloat16
+                if self.distillation_config.teacher_model.inference.dtype == "bfloat16"
+                else torch.float32
+            )
             response_logprobs = torch.tensor(response_logprobs_ls, dtype=logprobs_dtype).unsqueeze(0)
             response_ids = torch.tensor(response_ids_ls, dtype=torch.long).unsqueeze(0)
-            
+
         elif engine_name == "sglang":
             raise ValueError("SGLang backend does not support distillation currently.")
             payloads = {
@@ -200,7 +201,9 @@ class TeacherLoopManager:
 
     def __init__(self, config: DictConfig, teacher_resource_pool: RayResourcePool = None):
         self.config = config
-        self.distillation_config: DistillationConfig = omega_conf_to_dataclass(self.config.distillation) # to dataclass for the post init to handle top-k and engine kwargs
+        self.distillation_config: DistillationConfig = omega_conf_to_dataclass(
+            self.config.distillation
+        )  # to dataclass for the post init to handle top-k and engine kwargs
         self.teacher_model_manager = TeacherModelManager(self.distillation_config.teacher_model, teacher_resource_pool)
         self.teacher_router_address = self.teacher_model_manager.get_router_address()
 
@@ -266,6 +269,7 @@ def compute_teacher_logprobs(self, data: DataProto) -> DataProto:
 
     def _run_all(self, tasks: list[asyncio.Task]):
         raise NotImplementedError("TODO:RM")
+
         async def run_all():
             return await asyncio.gather(*tasks)
 
diff --git a/verl/experimental/teacher_loop/teacher_model.py b/verl/experimental/teacher_loop/teacher_model.py
@@ -17,7 +17,7 @@
 import os
 
 from verl.single_controller.ray.base import RayResourcePool, split_resource_pool
-from verl.workers.config import HFModelConfig, DistillationTeacherModelConfig
+from verl.workers.config import DistillationTeacherModelConfig, HFModelConfig
 from verl.workers.rollout.replica import get_rollout_replica_class
 
 logger = logging.getLogger(__file__)
@@ -91,6 +91,7 @@ def _initialize_router(self):
         worker_urls = [f"http://{server_address}" for server_address in self.server_addresses]
 
         from ..reward_loop.router.naive_router import launch_router_process
+
         self.router_address, _ = launch_router_process(worker_urls=worker_urls)
 
     def get_router_address(self):
diff --git a/verl/trainer/distillation/fsdp/losses.py b/verl/trainer/distillation/fsdp/losses.py
@@ -27,6 +27,7 @@ def kl_divergence(log_q: torch.Tensor, log_p: torch.Tensor) -> torch.Tensor:
     kld = p * (log_p - log_q)
     return kld.sum(dim=-1)
 
+
 def compute_forward_kl_topk(
     student_logits: torch.Tensor,
     teacher_topk_log_probs: torch.Tensor,
diff --git a/verl/trainer/distillation/losses.py b/verl/trainer/distillation/losses.py
@@ -22,7 +22,7 @@
 from verl.trainer.distillation.types import DistillationLossInputs
 from verl.trainer.ppo.core_algos import agg_loss, kl_penalty
 from verl.utils.metric import AggregationType, Metric
-from verl.workers.config import DistillationConfig, DistillationLossConfig, ActorConfig
+from verl.workers.config import ActorConfig, DistillationConfig, DistillationLossConfig
 
 DistillationLossFn = Callable[
     [
@@ -56,8 +56,7 @@ def __post_init__(self):
         self.names = [self.names] if isinstance(self.names, str) else self.names
         if sum([self.use_topk, self.use_estimator]) > 1:
             raise ValueError(
-                f"Expected only one of use_estimator, use_topk, but got "
-                f"{self.use_estimator=}, {self.use_topk=}."
+                f"Expected only one of use_estimator, use_topk, but got {self.use_estimator=}, {self.use_topk=}."
             )
 
 
@@ -206,7 +205,11 @@ def compute_forward_kl_topk(
         teacher_topk_ids=teacher_topk_ids,
         config=distillation_config,
     )
-    distillation_losses, student_mass, teacher_mass = outputs["distillation_losses"], outputs["student_mass"], outputs["teacher_mass"]
+    distillation_losses, student_mass, teacher_mass = (
+        outputs["distillation_losses"],
+        outputs["student_mass"],
+        outputs["teacher_mass"],
+    )
 
     # Log amount of mass in the top-k log probabilities for both student and teacher.
     student_mass = student_mass[response_mask]
diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py
@@ -242,7 +242,9 @@ def init_resource_pool_mgr(self, config):
             if distillation_config.teacher_model.nnodes <= 0:
                 raise ValueError("config.distillation.teacher_model.nnodes must be greater than 0")
 
-            teacher_pool = [distillation_config.teacher_model.n_gpus_per_node] * distillation_config.teacher_model.nnodes
+            teacher_pool = [
+                distillation_config.teacher_model.n_gpus_per_node
+            ] * distillation_config.teacher_model.nnodes
             resource_pool_spec["teacher_pool"] = teacher_pool
 
         from verl.trainer.ppo.ray_trainer import ResourcePoolManager
@@ -274,7 +276,6 @@ def add_teacher_model_resource_pool(self, config):
             else:
                 self.mapping[Role.TeacherModel] = "global_pool"
 
-
     def add_ref_policy_worker(self, config, ref_policy_cls):
         """Add reference policy worker if KL loss or KL reward is used."""
         from verl.trainer.ppo.ray_trainer import Role
@@ -462,4 +463,4 @@ def create_rl_sampler(data_config, dataset):
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
@@ -38,7 +38,7 @@
 from verl.experimental.dataset.sampler import AbstractCurriculumSampler
 from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
 from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup, ResourcePoolManager
-from verl.single_controller.ray.base import create_colocated_worker_cls, split_resource_pool
+from verl.single_controller.ray.base import create_colocated_worker_cls
 from verl.trainer.config import AlgoConfig
 from verl.trainer.ppo import core_algos
 from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
@@ -54,9 +54,9 @@
     Role,
     WorkerType,
     need_critic,
-    need_teacher_policy,
     need_reference_policy,
     need_reward_model,
+    need_teacher_policy,
 )
 from verl.utils import tensordict_utils as tu
 from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
@@ -744,7 +744,6 @@ def init_workers(self):
             )
             self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
 
-
         # initialize WorkerGroup
         # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
         # you should not use `create_colocated_worker_cls`.
@@ -1150,7 +1149,6 @@ def _compute_ref_log_prob(self, batch: DataProto) -> DataProto:
 
         return ref_log_prob
 
-
     def _compute_old_log_prob(self, batch: DataProto):
         if self.use_legacy_worker_impl == "disable":
             # TODO: remove step 1, 2, 4 after we make the whole training tensordict and padding free
diff --git a/verl/utils/config.py b/verl/utils/config.py
@@ -199,4 +199,4 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
 
         get_vllm_max_lora_rank(lora_rank)
 
-    print("[validate_config] All configuration checks passed successfully!")
+    print("[validate_config] All configuration checks passed successfully!")
diff --git a/verl/utils/stages.py b/verl/utils/stages.py
@@ -20,4 +20,4 @@ class Stage(Enum):
 
     OLD_LOG_PROB = "old_log_prob"
     REF_LOG_PROB = "ref_log_prob"
-    ACTOR_UPDATE = "actor_update"
+    ACTOR_UPDATE = "actor_update"
diff --git a/verl/workers/config/__init__.py b/verl/workers/config/__init__.py
@@ -15,12 +15,12 @@
 from . import actor, critic, engine, model, optimizer, reward, rollout
 from .actor import *  # noqa: F401
 from .critic import *  # noqa: F401
+from .distillation import *  # noqa: F401
 from .engine import *  # noqa: F401
 from .model import *  # noqa: F401
 from .optimizer import *  # noqa: F401
 from .reward import *  # noqa: F401
 from .rollout import *  # noqa: F401
-from .distillation import *  # noqa: F401
 
 __all__ = (
     actor.__all__
diff --git a/verl/workers/config/distillation.py b/verl/workers/config/distillation.py
@@ -18,7 +18,6 @@
 from typing import Optional
 
 from verl.base_config import BaseConfig
-from verl.trainer.config.config import ModuleConfig
 
 from .rollout import RolloutConfig
 
@@ -27,6 +26,7 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
+
 @dataclass
 class DistillationLossConfig(BaseConfig):
     """Configuration for distillation loss settings.
@@ -66,7 +66,7 @@ def __post_init__(self):
 @dataclass
 class DistillationTeacherModelConfig(BaseConfig):
     """Configuration for on-policy distillation teacher.
-    
+
     enable_resource_pool (bool):
         Whether to enable separate resource pool for teacher model(s).
     n_gpus_per_node (int):
@@ -78,6 +78,7 @@ class DistillationTeacherModelConfig(BaseConfig):
     inference (RolloutConfig):
         Rollout configuration for the teacher model inference during distillation.
     """
+
     _mutable_fields = BaseConfig._mutable_fields
 
     enable_resource_pool: bool = False
@@ -87,7 +88,6 @@ class DistillationTeacherModelConfig(BaseConfig):
     inference: RolloutConfig = field(default_factory=RolloutConfig)
 
 
-
 @dataclass
 class DistillationConfig(BaseConfig):
     """Configuration for on-policy distillation.
@@ -109,7 +109,6 @@ class DistillationConfig(BaseConfig):
     teacher_model: DistillationTeacherModelConfig = field(default_factory=DistillationTeacherModelConfig)
     distillation_loss: DistillationLossConfig = field(default_factory=DistillationLossConfig)
 
-
     def __post_init__(self):
         engine_name = self.teacher_model.inference.name
         engine_kwargs = self.teacher_model.inference.engine_kwargs
@@ -129,4 +128,6 @@ def __post_init__(self):
                     )
                 engine_kwargs["vllm"] = vllm_engine_kwargs
             case _:
-                raise NotImplementedError(f"DistillationTeacherModelConfig does not support inference engine {engine_name}")
+                raise NotImplementedError(
+                    f"DistillationTeacherModelConfig does not support inference engine {engine_name}"
+                )
diff --git a/verl/workers/engine/fsdp/transformer_impl.py b/verl/workers/engine/fsdp/transformer_impl.py
@@ -33,7 +33,7 @@
 import verl.utils.torch_functional as verl_F
 from verl.models.transformers.monkey_patch import apply_monkey_patch
 from verl.trainer.config import CheckpointConfig
-from verl.trainer.distillation import prepare_student_distillation_inputs, is_distillation_enabled
+from verl.trainer.distillation import is_distillation_enabled, prepare_student_distillation_inputs
 from verl.utils import tensordict_utils as tu
 from verl.utils.activation_offload import enable_activation_offloading
 from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager
@@ -915,9 +915,7 @@ def prepare_model_outputs(self, output, output_args, micro_batch: TensorDict):
             if use_fused_kernels:
                 # temperature is singleton
                 if self.distillation_enabled:
-                    raise NotImplementedError(
-                        "Distillation with fused kernels is not supported yet"
-                    )
+                    raise NotImplementedError("Distillation with fused kernels is not supported yet")
                 log_probs = output.log_probs.squeeze(0)  # (total_nnz,)
                 entropy_rmpad = output.entropy.squeeze(0)  # (total_nnz,)
             else:
@@ -984,9 +982,7 @@ def prepare_model_outputs(self, output, output_args, micro_batch: TensorDict):
                 log_probs = output.log_probs[:, -response_length - 1 : -1]
                 entropy = output.entropy[:, -response_length - 1 : -1]  # (bsz, response_length)
                 if self.distillation_enabled:
-                    raise NotImplementedError(
-                        "Distillation with fused kernels is not supported yet"
-                    )
+                    raise NotImplementedError("Distillation with fused kernels is not supported yet")
             else:
                 logits = output.logits  # (bsz, response_length, vocab_size)
                 temperature = output_args["temperature"]  # (bsz,)
diff --git a/verl/workers/engine_workers.py b/verl/workers/engine_workers.py
diff --git a/verl/workers/rollout/replica.py b/verl/workers/rollout/replica.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -199,4 +199,4 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):` |
| `199` | `199` | |
| `200` | `200` | `get_vllm_max_lora_rank(lora_rank)` |
| `201` | `201` | |
| `202` | | `- print("[validate_config] All configuration checks passed successfully!")` |
| | `202` | `+ print("[validate_config] All configuration checks passed successfully!")` |