Huber loss implementation (#127)

huttarichard · web-flow · commit 3130dda65756 · 2025-04-01T12:17:42.000+02:00
diff --git a/losses/losses.go b/losses/losses.go
@@ -155,3 +155,60 @@ func SPG(logPropActions []mat.Tensor, logProbTargets []mat.Tensor) mat.Tensor {
 	}
 	return ag.Neg(loss)
 }
+
+// Huber computes the Huber loss between the input x and the target y. With d = x - y and
+// δ = delta, each element contributes a quadratic penalty for small differences and a
+// linear one for large differences, which reduces the effect of outliers:
+//
+//	Huber(d) = 0.5 * d^2             if |d| ≤ δ
+//	Huber(d) = δ * (|d| - 0.5 * δ)   otherwise
+//
+// The element-wise losses are then reduced with ag.ReduceMean when reduceMean is true and
+// with ag.ReduceSum otherwise. The value of x must be a mat.Matrix: it is used to create
+// the scalar constants and the all-ones matrix that δ is expanded over.
+func Huber(x, y mat.Tensor, delta float64, reduceMean bool) mat.Tensor {
+	// |d|, where d = x - y.
+	residual := ag.Abs(ag.Sub(x, y))
+
+	// Constants are created from x's underlying matrix. δ is expanded over an all-ones
+	// matrix (OnesLike) so that ag.Min receives two operands of matching dimensions.
+	factory := x.Value().(mat.Matrix)
+	threshold := ag.ProdScalar(factory.OnesLike(), factory.NewScalar(delta))
+
+	// Split |d| into the part capped at δ and the excess above it, so that a single
+	// expression without branching covers both regimes:
+	//   |d| ≤ δ: capped = |d|, excess = 0      => 0.5 * d^2
+	//   |d| > δ: capped = δ, excess = |d| - δ  => 0.5 * δ^2 + δ * (|d| - δ)
+	capped := ag.Min(residual, threshold)
+	excess := ag.Sub(residual, capped)
+
+	// Quadratic term on the capped part plus linear term on the excess; in the second
+	// regime the sum simplifies to δ * (|d| - 0.5 * δ), matching the definition above.
+	quadratic := ag.ProdScalar(ag.Square(capped), factory.NewScalar(0.5))
+	linear := ag.Prod(threshold, excess)
+	perElement := ag.Add(quadratic, linear)
+
+	// Collapse the element-wise losses into a single value: the sum by default, the
+	// mean when requested.
+	if !reduceMean {
+		return ag.ReduceSum(perElement)
+	}
+	return ag.ReduceMean(perElement)
+}
+
+// HuberSeq applies Huber (with reduceMean disabled) to every pair (predicted[i], target[i])
+// and adds the results. When reduceMean is true the total is divided by len(predicted),
+// i.e. the number of pairs. predicted must be non-empty and target at least as long.
+func HuberSeq(predicted, target []mat.Tensor, delta float64, reduceMean bool) mat.Tensor {
+	// Seed the running total with the first pair, then fold in the remaining ones.
+	total := Huber(predicted[0], target[0], delta, false)
+	for i, p := range predicted[1:] {
+		total = ag.Add(total, Huber(p, target[i+1], delta, false))
+	}
+	if !reduceMean {
+		return total
+	}
+	// The divisor is a scalar created from the total's own matrix value.
+	pairs := total.Value().(mat.Matrix).NewScalar(float64(len(predicted)))
+	return ag.DivScalar(total, pairs)
+}
diff --git a/losses/losses_test.go b/losses/losses_test.go
@@ -190,3 +190,41 @@ func assertScalarEqualApprox[T float.DType](t *testing.T, expected T, actual mat
 	v := float.ValueOf[T](actual.Item())
 	assert.InDelta(t, expected, v, 1.0e-06)
 }
+
+func TestHuberLoss(t *testing.T) {
+	t.Run("float32", func(t *testing.T) { testHuberLoss[float32](t, 1.0e-6) })  // float32 subtest, tolerance 1e-6
+	t.Run("float64", func(t *testing.T) { testHuberLoss[float64](t, 1.0e-12) }) // float64 subtest, tolerance 1e-12
+}
+
+func testHuberLoss[T float.DType](t *testing.T, tol T) {
+	// 1) Setup input, target, delta
+	x := mat.NewDense[T](mat.WithBacking([]T{0.0, 2.5, 4.0}), mat.WithGrad(true))
+	y := mat.NewDense[T](mat.WithBacking([]T{0.0, 1.0, 2.0}))
+	delta := 1.0
+
+	// 2) Compute Huber loss with reduceMean = false
+	loss := Huber(x, y, delta, false)
+
+	// Sum of the "Huber" terms across 3 elements => ~2.5
+	assert.InDelta(t, 2.5, loss.Value().Item().F64(), float64(tol))
+
+	// 3) Backward
+	ag.Backward(loss)
+
+	// For this example:
+	// d = [0, 1.5, 2.0]; gradient = sign(d)*delta if |d|>delta else d
+	// => [0, 1.0, 1.0]
+	assert.InDeltaSlice(t, []T{0.0, 1.0, 1.0}, x.Grad().Data(), float64(tol))
+
+	// 4) Test again with reduceMean = true
+	x2 := mat.NewDense[T](mat.WithBacking([]T{0.0, 2.5, 4.0}), mat.WithGrad(true))
+	y2 := mat.NewDense[T](mat.WithBacking([]T{0.0, 1.0, 2.0}))
+	loss2 := Huber(x2, y2, delta, true)
+
+	// The total is 2.5 for 3 elements => 2.5 / 3 = ~0.8333
+	assert.InDelta(t, 0.8333333333333333, loss2.Value().Item().F64(), float64(tol))
+
+	ag.Backward(loss2)
+	// The gradient is the same shape but divided by 3 => [0, ~0.3333, ~0.3333]
+	assert.InDeltaSlice(t, []T{0.0, 0.3333333333333333, 0.3333333333333333}, x2.Grad().Data(), float64(tol))
+}