 import functools
+import hashlib
 import os
 from abc import ABC, abstractmethod
 from contextlib import ContextDecorator, nullcontext
 from dataclasses import astuple, dataclass, field
 from fnmatch import fnmatchcase
 from typing import Callable, Literal, Mapping, Optional

+import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from tqdm.auto import tqdm
 from transformers import PreTrainedModel

-from bergson.config import AttentionConfig, IndexConfig
+from bergson.config import AttentionConfig, HessianConfig, IndexConfig
 from bergson.data import pad_and_tensor
 from bergson.gradients import (
     GradientProcessor,
     LayerAdapter,
 )
 from bergson.utils.logger import get_logger
 from bergson.utils.peft import set_peft_enabled
-from bergson.utils.utils import create_projection_matrix


 @dataclass
@@ -78,6 +79,7 @@ class HookCollectorBase(ContextDecorator, ABC):
     Optional configuration specifying how to split up the attention module gradients
     into per-head gradients. See also bergson.config.AttentionConfig.
     """
+    logger = get_logger("HookCollectorBase", level="INFO")

     def __post_init__(
         self,
@@ -256,6 +258,28 @@ def projection(
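+        # Cache the matrix on the processor so later calls with the same key
+        # reuse it instead of regenerating.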
         self.processor._projection_matrices[key] = A
         return A

+    def with_batch(self, valid_mask: Tensor | None = None) -> "HookCollectorBase":
+        """
+        Set the current valid mask before entering the context.
+
+        This allows hooks to access the valid mask during forward/backward
+        passes.
+        Usage:
+            with collector.with_batch(valid_mask):
+                # forward/backward pass
+                # hooks can access self._current_valid_mask
+
+        Args:
+            valid_mask: Optional boolean tensor of shape [batch_size, seq_len]
+                indicating which positions have valid labels for loss computation.
+
+        Returns:
+            self, for use as a context manager.
+        """
+        self._current_valid_mask = valid_mask
+        return self
+
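+    # Note: with_batch() only stores per-batch state; hook registration and
+    # removal still happen in __enter__/__exit__ below. Returning self lets
+    # both compose in one statement: `with collector.with_batch(mask): ...`.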
     def __enter__(self):
         """Register forward and backward hooks on all target modules."""
         for name in self.target_info:
@@ -484,15 +508,23 @@ def run_with_collector_hooks(
         ):
             batch = self.data[indices]

+            # Compute padded tensors and valid_mask before entering the context
+            x, y, valid_mask = pad_and_tensor(
+                batch["input_ids"],
+                labels=batch.get("labels"),
+                device=self.model.device,
+            )
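+            # valid_mask.sum() counts label positions, so total_processed now
+            # tracks supervised tokens rather than whole sequences.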
+            total_processed += valid_mask.sum()
+
             with (
-                self.collector,
+                self.collector.with_batch(valid_mask),
                 (
                     record_function(f"step_{step}")
                     if self.cfg.profile
                     else nullcontext()
                 ),
             ):
-                losses = self.forward_backward(self.model, batch)
+                losses = self.forward_backward(self.model, x, y, batch)

                 # TODO: currently builder also calls torch.cuda.synchronize
                 torch.cuda.synchronize() if torch.cuda.is_available() else None
@@ -503,11 +535,17 @@ def run_with_collector_hooks(
             step += 1

             self.collector.process_batch(indices, losses=losses)
-            total_processed += len(indices)

         self.collector.teardown()
+
         if dist.is_initialized():
             dist.all_reduce(total_processed, op=dist.ReduceOp.SUM)
+
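+        # After the all_reduce, every rank holds the global token count;
+        # only rank 0 persists it to disk.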
+        if self.rank == 0:
+            torch.save(
+                total_processed,
+                os.path.join(self.cfg.partial_run_path, "total_processed.pt"),
+            )
         self.logger.info(f"Total processed: {total_processed.item()}")


@@ -523,18 +561,17 @@ def fwd_bwd_factory(cfg: IndexConfig) -> Callable:
     summed loss.

     Returns:
-        A callable fwd_bwd(model, batch) -> Tensor that performs a forward pass and
-        backward pass, returning the per-sample losses.
-        The batch must contain "input_ids" and optionally "labels" and "advantage".
+        A callable fwd_bwd(model, x, y, batch) -> Tensor that performs a forward
+        and backward pass, returning the per-sample losses, where:
+            model: the model to run forward/backward on;
+            x: padded input token ids of shape [batch_size, seq_len];
+            y: padded labels of shape [batch_size, seq_len], -100 at padding;
+            batch: the original batch dict, used only for "advantage" if present.
         Returns a tensor of shape [batch_size] with one loss value per sample.
     """

-    def fwd_bwd(model, batch):
-        x, y = pad_and_tensor(
-            batch["input_ids"],  # type: ignore
-            labels=batch.get("labels"),  # type: ignore
-            device=model.device,
-        )
+    def fwd_bwd(model, x: Tensor, y: Tensor, batch: dict):
         logits = model(x).logits[:, :-1]
         masks = y[:, 1:] != -100
         denoms = (
@@ -571,3 +608,68 @@ def fwd_bwd(model, batch):
         return losses

     return fwd_bwd
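+# A minimal usage sketch of the new calling convention (mirroring
+# run_with_collector_hooks above; `cfg`, `model`, `batch`, and `collector`
+# are assumed to be in scope):
+#
+#     fwd_bwd = fwd_bwd_factory(cfg)
+#     x, y, valid_mask = pad_and_tensor(
+#         batch["input_ids"], labels=batch.get("labels"), device=model.device
+#     )
+#     with collector.with_batch(valid_mask):
+#         losses = fwd_bwd(model, x, y, batch)  # shape [batch_size]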
+
+
+def fwd_bwd_hessian_factory(cfg: HessianConfig) -> Callable:
+    """Build a fwd_bwd(model, x, y, batch) callable for Hessian accumulation.
+
+    If cfg.use_dataset_labels is True, the loss is taken against the dataset
+    labels y; otherwise labels are sampled from the model's own distribution.
+    """
+
+    def fwd_bwd_hessian(model, x: Tensor, y: Tensor, batch: dict):
+        logits = model(x).logits[:, :-1]
+        masks = y[:, 1:] != -100
+        denoms = masks.sum(dim=1, dtype=model.dtype)
+
+        if cfg.use_dataset_labels:
+            losses = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                y[:, 1:].flatten(),
+                reduction="none",
+            ).reshape_as(y[:, 1:])
+        else:
+            # Sample one label per position from the model's own softmax.
+            with torch.no_grad():
+                probs = F.softmax(logits, dim=-1)
+                sampled_tokens = torch.multinomial(
+                    probs.reshape(-1, probs.size(-1)),
+                    num_samples=1,
+                    replacement=True,
+                ).reshape_as(y[:, 1:])
+            losses = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                sampled_tokens.flatten(),
+                reduction="none",
+            ).reshape_as(y[:, 1:])
+
+        # Zero out padded positions (sampled labels are real token ids even
+        # there) and average per sample, matching fwd_bwd above.
+        losses = (losses * masks).sum(1) / denoms
+
+        losses.sum().backward()
+        model.zero_grad()
+
+        return losses
+
+    return fwd_bwd_hessian
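+# Design note: sampling labels from the model's own softmax makes the expected
+# outer product of the resulting gradients an estimate of the Fisher
+# information, a standard positive semi-definite surrogate for the Hessian;
+# use_dataset_labels=True yields the "empirical Fisher" variant instead.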
+
+
+def create_projection_matrix(
+    identifier: str,
+    m: int,
+    n: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    projection_type: Literal["normal", "rademacher"] = "normal",
+) -> Tensor:
+    """Create a projection matrix deterministically based on identifier and side."""
+    # Seed the PRNG with the name of the layer and what "side" we are projecting
+    message = bytes(identifier, "utf-8")
+    digest = hashlib.md5(message).digest()
+    seed = int.from_bytes(digest, byteorder="big") % (2**63 - 1)
+
+    if projection_type == "normal":
+        prng = torch.Generator(device).manual_seed(seed)
+        A = torch.randn(m, n, device=device, dtype=dtype, generator=prng)
+    elif projection_type == "rademacher":
+        # Draw m * n random bits, then map {0, 1} -> {-1, +1}.
+        numpy_rng = np.random.Generator(np.random.PCG64(seed))
+        random_bytes = numpy_rng.bytes((m * n + 7) // 8)
+        random_bytes = np.frombuffer(random_bytes, dtype=np.uint8)
+        A = np.unpackbits(random_bytes)[: m * n].reshape((m, n))
+        A = torch.from_numpy(A).to(device, dtype=dtype)
+        A = A.add_(-0.5).mul_(2)
+    else:
+        raise ValueError(f"Unknown projection type: {projection_type}")
+    A /= A.norm(dim=1, keepdim=True)
+    return A
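+# Example (shapes illustrative): the same identifier always yields the same
+# matrix, so a projection can be regenerated on demand instead of stored.
+#
+#     A = create_projection_matrix(
+#         "model.layers.0.mlp/left", 16, 768, torch.float32, torch.device("cpu")
+#     )
+#     B = create_projection_matrix(
+#         "model.layers.0.mlp/left", 16, 768, torch.float32, torch.device("cpu")
+#     )
+#     assert torch.equal(A, B)  # deterministic in the identifier
+#     assert torch.allclose(A.norm(dim=1), torch.ones(16))  # unit-norm rows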