diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ed1589cfc..d576057003 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,6 +276,7 @@ if (SKBUILD) set (ENABLE_idiff OFF) set (ENABLE_igrep OFF) set (ENABLE_iinfo OFF) + set (ENABLE_hwy_test OFF) set (ENABLE_testtex OFF) set (ENABLE_iv OFF) endif () @@ -286,6 +287,7 @@ if (OIIO_BUILD_TOOLS AND NOT BUILD_OIIOUTIL_ONLY) add_subdirectory (src/idiff) add_subdirectory (src/igrep) add_subdirectory (src/iinfo) + add_subdirectory (src/hwy_test) add_subdirectory (src/maketx) add_subdirectory (src/oiiotool) add_subdirectory (src/testtex) diff --git a/docs/dev/Architecture.md b/docs/dev/Architecture.md index 5e52bf4143..72f81d8907 100644 --- a/docs/dev/Architecture.md +++ b/docs/dev/Architecture.md @@ -117,6 +117,10 @@ objects. These algorithms include simple operations like copying, resizing, and compositing images, as well as more complex operations like color conversions, resizing, filtering, etc. +Some performance-critical `ImageBufAlgo` implementations have SIMD-accelerated +paths using Google Highway. For implementation details and guidance for adding +new kernels, see `docs/dev/ImageBufAlgo_Highway.md`. + ## Image caching: TextureSystem and ImageCache There are situations where ImageBuf is still not the right abstraction, diff --git a/docs/dev/ImageBufAlgo_Highway.md b/docs/dev/ImageBufAlgo_Highway.md new file mode 100644 index 0000000000..960766978c --- /dev/null +++ b/docs/dev/ImageBufAlgo_Highway.md @@ -0,0 +1,264 @@ +ImageBufAlgo Highway (hwy) Implementation Guide +============================================== + +This document explains how OpenImageIO uses Google Highway (hwy) to accelerate +selected `ImageBufAlgo` operations, and how to add or modify kernels in a way +that preserves OIIO semantics while keeping the code maintainable. + +This is a developer-facing document about the implementation structure in +`src/libOpenImageIO/`. It does not describe the public API behavior of the +algorithms. + + +Goals and non-goals +------------------- + +Goals: +- Make the hwy-backed code paths easy to read and easy to extend. +- Centralize repetitive boilerplate (type conversion, tails, ROI pointer math). +- Preserve OIIO's numeric semantics (normalized integer model). +- Keep scalar fallbacks as the source of truth for tricky layout cases. + +Non-goals: +- Explain Highway itself. Refer to the upstream Highway documentation. +- Guarantee that every ImageBufAlgo op has a hwy implementation. + + +Where the code lives +-------------------- + +Core helpers: +- `src/libOpenImageIO/imagebufalgo_hwy_pvt.h` + +Typical hwy call sites: +- `src/libOpenImageIO/imagebufalgo_addsub.cpp` +- `src/libOpenImageIO/imagebufalgo_muldiv.cpp` +- `src/libOpenImageIO/imagebufalgo_mad.cpp` +- `src/libOpenImageIO/imagebufalgo_pixelmath.cpp` +- `src/libOpenImageIO/imagebufalgo_xform.cpp` (some ops are hwy-accelerated) + + +Enabling and gating the hwy path +------------------------------- + +The hwy path is only used when: +- Highway usage is enabled at runtime (`OIIO::pvt::enable_hwy`). +- The relevant `ImageBuf` objects have local pixel storage (`localpixels()` is + non-null), meaning the data is in process memory rather than accessed through + an `ImageCache` tile abstraction. +- The operation can be safely expressed as contiguous streams of pixels/channels + for the hot path, or the code falls back to a scalar implementation for + strided/non-contiguous layouts. 
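In code form, the typed dispatcher for `add` in `imagebufalgo_addsub.cpp` applies these conditions roughly like this (simplified sketch; the real dispatcher also routes matching integral types to a native-integer kernel, described later in this document):

```cpp
template<class Rtype, class Atype, class Btype>
static bool
add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         int nthreads)
{
    // Take the hwy path only when it is enabled and every buffer owns
    // local pixel memory; otherwise use the iterator-based scalar path.
    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
        && B.localpixels())
        return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
    return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
}
```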
+ +The common gating pattern looks like: +- In a typed `*_impl` dispatcher: check `OIIO::pvt::enable_hwy` and `localpixels` + and then call a `*_impl_hwy` function; otherwise call `*_impl_scalar`. + +Important: the hwy path is an optimization. Correctness must not depend on hwy. + + +OIIO numeric semantics: why we promote to float +---------------------------------------------- + +OIIO treats integer image pixels as normalized values: +- Unsigned integers represent [0, 1]. +- Signed integers represent approximately [-1, 1] with clamping for INT_MIN. + +Therefore, most pixel math must be performed in float (or double) space, even +when the stored data is integer. This is why the hwy layer uses the +"LoadPromote/Operate/DemoteStore" pattern. + +For additional discussion (and pitfalls of saturating integer arithmetic), see: +- `HIGHWAY_SATURATING_ANALYSIS.md` + + +The core pattern: LoadPromote -> RunHwy* -> DemoteStore +------------------------------------------------------- + +The helper header `imagebufalgo_hwy_pvt.h` defines the reusable building blocks: + +1) Computation type selection + - `SimdMathType` selects `float` for most types, and `double` only when + the destination type is `double`. + + Rationale: + - Float math is significantly faster on many targets. + - For OIIO, integer images are normalized to [0,1] (or ~[-1,1]), so float + precision is sufficient for typical image processing workloads. + +2) Load and promote (with normalization) + - `LoadPromote(d, ptr)` and `LoadPromoteN(d, ptr, count)` load values and + normalize integer ranges into the computation space. + + Rationale: + - Consolidates all normalization and conversion logic in one place. + - Prevents subtle drift where each operation re-implements integer scaling. + - Ensures tail handling ("N" variants) is correct and consistent. + +3) Demote and store (with denormalization/clamp/round) + - `DemoteStore(d, ptr, v)` and `DemoteStoreN(d, ptr, v, count)` reverse the + normalization and store results in the destination pixel type. + + Rationale: + - Centralizes rounding and clamping behavior for all destination types. + - Ensures output matches OIIO scalar semantics. + +4) Generic kernel runners (streaming arrays) + - `RunHwyUnaryCmd`, `RunHwyCmd` (binary), `RunHwyTernaryCmd` + - These are the primary entry points for most hwy kernels. + + Rationale: + - Encapsulates lane iteration and tail processing once. + - The call sites only provide the per-lane math lambda, not the boilerplate. + + +Native integer runners: when they are valid +------------------------------------------- + +Some operations are "scale-invariant" under OIIO's normalized integer model. +For example, for unsigned integer add: +- `(a/max + b/max)` in float space, then clamped to [0,1], then scaled by max + matches saturated integer add `SaturatedAdd(a, b)` for the same bit depth. + +For those cases, `imagebufalgo_hwy_pvt.h` provides: +- `RunHwyUnaryNativeInt` +- `RunHwyBinaryNativeInt` + +These should only be used when all of the following are true: +- The operation is known to be scale-invariant under the normalization model. +- Input and output types are the same integral type. +- The operation does not depend on mixed types or float-range behavior. + +Rationale: +- Avoids promotion/demotion overhead and can be materially faster. +- Must be opt-in and explicit, because many operations are NOT compatible with + raw integer arithmetic (e.g. multiplication, division, pow). 
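As a small self-contained illustration (not part of the library) of why saturated add qualifies but multiply does not, consider `uint8` values under the normalization model:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main()
{
    const std::uint8_t a = 200, b = 100;

    // add: normalize, add, clamp to [0,1], denormalize back to uint8
    float add_norm    = std::min(a / 255.0f + b / 255.0f, 1.0f);
    int add_via_float = int(std::lround(add_norm * 255.0f));  // 255
    int add_saturated = std::min(a + b, 255);                 // 255
    assert(add_via_float == add_saturated);  // identical: scale-invariant

    // mul: (a/255)*(b/255)*255 == a*b/255, which is NOT the raw integer
    // product, so a "native" integer multiply would give the wrong answer.
    float mul_norm    = (a / 255.0f) * (b / 255.0f);
    int mul_via_float = int(std::lround(mul_norm * 255.0f));  // 78
    assert(mul_via_float != (a * b) % 256);  // wrapped raw product is 32
    return 0;
}
```

The same reasoning rules out division and `pow` for the native-integer runners.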
+ + +Local pixel pointer helpers: reducing boilerplate safely +------------------------------------------------------- + +Most hwy call sites need repeated pointer and stride computations: +- Pixel size in bytes. +- Scanline size in bytes. +- Base pointer to local pixels. +- Per-row pointer for a given ROI and scanline. +- Per-pixel pointer for non-contiguous fallbacks. + +To centralize that, `imagebufalgo_hwy_pvt.h` defines: +- `HwyPixels(ImageBuf&)` and `HwyPixels(const ImageBuf&)` + returning a small view (`HwyLocalPixelsView`) with: + - base pointer (`std::byte*` / `const std::byte*`) + - `pixel_bytes`, `scanline_bytes` + - `xbegin`, `ybegin`, `nchannels` +- `RoiNChannels(roi)` for `roi.chend - roi.chbegin` +- `ChannelsContiguous(view, nchannels)`: + true only when the pixel stride exactly equals `nchannels * sizeof(T)` +- `PixelBase(view, x, y)`, `ChannelPtr(view, x, y, ch)` +- `RoiRowPtr(view, y, roi)` for the start of the ROI row at `roi.xbegin` and + `roi.chbegin`. + +Rationale: +- Avoids duplicating fragile byte-offset math across many ops. +- Makes it visually obvious what the code is doing: "get row pointer" vs + "compute offset by hand." +- Makes non-contiguous fallback paths less error-prone by reusing the same + pointer computations. + +Important: these helpers are only valid for `ImageBuf` instances with local +pixels (`localpixels()` non-null). The call sites must check that before using +them. + + +Contiguous fast path vs non-contiguous fallback +----------------------------------------------- + +Most operations implement two paths: + +1) Contiguous fast path: + - Used when pixels are tightly packed for the ROI's channel range. + - The operation is executed as a 1D stream of length: + `roi.width() * (roi.chend - roi.chbegin)` + - Uses `RunHwy*Cmd` (or native-int runner) and benefits from: + - fewer branches + - fewer pointer computations + - auto tail handling + +2) Non-contiguous fallback: + - Used when pixels have padding, unusual strides, or channel subsets that do + not form a dense stream. + - Typically loops pixel-by-pixel and channel-by-channel. + - May still use the `ChannelPtr` helpers to compute correct addresses. + +Rationale: +- The contiguous path is where SIMD delivers large gains. +- Trying to SIMD-optimize arbitrary strided layouts often increases complexity + and risk for marginal benefit. Keeping a scalar fallback preserves + correctness and maintainability. + + +How to add a new hwy kernel +--------------------------- + +Step 1: Choose the kernel shape +- Unary: `R = f(A)` -> use `RunHwyUnaryCmd` +- Binary: `R = f(A, B)` -> use `RunHwyCmd` +- Ternary: `R = f(A, B, C)` -> use `RunHwyTernaryCmd` + +Step 2: Decide if a native-int fast path is valid +- Only for scale-invariant ops and same-type integral inputs/outputs. +- Use `RunHwyUnaryNativeInt` / `RunHwyBinaryNativeInt` when safe. +- Otherwise, always use the promote/demote runners. + +Step 3: Implement the hwy body with a contig check +Typical structure inside `*_impl_hwy`: +- Acquire views once: + - `auto Rv = HwyPixels(R);` + - `auto Av = HwyPixels(A);` etc. +- In the parallel callback: + - compute `nchannels = RoiNChannels(roi)` + - compute `contig = ChannelsContiguous<...>(...)` for each image + - for each scanline y: + - `Rtype* r_row = RoiRowPtr(Rv, y, roi);` + - `const Atype* a_row = RoiRowPtr(Av, y, roi);` etc. 
+ - if contig: call `RunHwy*` with `n = roi.width() * nchannels` + - else: fall back per pixel, per channel + +Step 4: Keep the scalar path as the reference +- The scalar implementation should remain correct for all layouts and types. +- The hwy path should match scalar results for supported cases. + + +Design rationale summary +------------------------ + +This design intentionally separates concerns: +- Type conversion and normalization are centralized (`LoadPromote`, + `DemoteStore`). +- SIMD lane iteration and tail handling are centralized (`RunHwy*` runners). +- Image address computations are centralized (`HwyPixels`, `RoiRowPtr`, + `ChannelPtr`). +- Operation-specific code is reduced to short lambdas expressing the math. + +This makes the hwy layer: +- Easier to maintain: fewer places to fix bugs when semantics change. +- Easier to extend: adding an op mostly means writing the math lambda and the + dispatch glue. +- Safer: correctness for unusual layouts remains in scalar fallbacks. + + +Notes on `half` +--------------- + +The hwy conversion helpers handle `half` by converting through +`hwy::float16_t`. This currently assumes the underlying `half` representation +is compatible with how Highway loads/stores 16-bit floats. + +If this assumption is revisited in the future, it should be changed as a +separate, explicit correctness/performance project. + + + + + + diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake index 12467ae6b6..34084f1743 100644 --- a/src/cmake/externalpackages.cmake +++ b/src/cmake/externalpackages.cmake @@ -225,6 +225,9 @@ if (USE_QT AND OPENGL_FOUND) endif () +# Google Highway for SIMD +checked_find_package (hwy REQUIRED) + # Tessil/robin-map checked_find_package (Robinmap REQUIRED VERSION_MIN 1.2.0 diff --git a/src/doc/imagebufalgo.rst b/src/doc/imagebufalgo.rst index b013ce0d20..200417accc 100644 --- a/src/doc/imagebufalgo.rst +++ b/src/doc/imagebufalgo.rst @@ -152,6 +152,68 @@ the computation without spawning additional threads, which might tend to crowd out the other application threads. +SIMD Performance and Data Types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many ImageBufAlgo operations use SIMD (Single Instruction, Multiple Data) +optimizations powered by the Google Highway library to achieve significant +performance improvements, particularly for integer image formats. + +**Integer Type Optimizations:** + +OpenImageIO treats all integer images as normalized Standard Dynamic Range +(SDR) data: + +* Unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) are + normalized to the [0.0, 1.0] range: ``float_value = int_value / max_value`` +* Signed integers (``int8``, ``int16``, ``int32``, ``int64``) are normalized + to approximately the [-1.0, 1.0] range: ``float_value = int_value / max_value`` + +Most ImageBufAlgo operations convert integer data to float, perform the +operation, and convert back. Highway SIMD provides 3-5x speedup for these +operations compared to scalar code. + +**Scale-Invariant Operations:** + +Certain operations are *scale-invariant*, meaning they produce identical +results whether performed on raw integers or normalized floats. 
For these +operations, OpenImageIO uses native integer SIMD paths that avoid float +conversion entirely, achieving 6-12x speedup (2-3x faster than the float +promotion path): + +* ``add``, ``sub`` (with saturation) +* ``min``, ``max`` +* ``abs``, ``absdiff`` + +These optimizations automatically activate when all input and output images +have matching integer types (e.g., all ``uint8``). When types differ or when +mixing integer and float images, the standard float promotion path is used. + +**Controlling SIMD Optimizations:** + +Highway SIMD is enabled by default. To disable it globally:: + + OIIO::attribute("enable_hwy", 0); + +Or via environment variable:: + + export OPENIMAGEIO_ENABLE_HWY=0 + +This is primarily useful for debugging or performance comparison. In normal +use, the optimizations should remain enabled for best performance. + +**Performance Expectations:** + +Typical speedups with Highway SIMD (compared to scalar code): + +* Float operations: 3-5x faster +* Integer operations (with float conversion): 3-5x faster +* Integer scale-invariant operations (native int): 6-12x faster +* Half-float operations: 3-5x faster + +Actual performance depends on the specific operation, image size, data types, +and hardware capabilities (AVX2, AVX-512, ARM NEON, etc.). + .. _sec-iba-patterns: diff --git a/src/doc/imageioapi.rst b/src/doc/imageioapi.rst index d2d6b192b4..dca5a66da5 100644 --- a/src/doc/imageioapi.rst +++ b/src/doc/imageioapi.rst @@ -397,16 +397,36 @@ inside the source code. line, but not the full human-readable command line. (This was added in OpenImageIO 2.5.11.) +.. cpp:var:: OPENIMAGEIO_ENABLE_HWY + + Controls whether to use Google Highway SIMD library optimizations for + ImageBufAlgo operations. If set to "1" (the default), Highway SIMD + optimizations will be enabled for supported operations, providing + significant performance improvements (typically 3-12x faster) on integer + image types. If set to "0", these optimizations will be disabled and fall + back to scalar implementations. + + This can also be controlled at runtime via:: + + OIIO::attribute("enable_hwy", 1); // enable (default) + OIIO::attribute("enable_hwy", 0); // disable + + Note: Highway SIMD optimizations are particularly beneficial for integer + image formats (uint8, uint16, int8, int16, uint32, int32, etc.) and provide + additional speedup for scale-invariant operations (add, sub, min, max, + absdiff) that can operate directly on integer data without float conversion. + (This was added in OpenImageIO 3.1.) + .. cpp:var:: OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH - Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. - If set to "1", all directories under ``PATH`` will be added to the DLL load + Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. + If set to "1", all directories under ``PATH`` will be added to the DLL load path before attempting to import the OpenImageIO module. (This was added in OpenImageIO 3.0.3.0) - Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" - Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` - environment variable (added in OpenImageIO 2.4.0/2.3.18). + Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" + Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` + environment variable (added in OpenImageIO 2.4.0/2.3.18). 
- In other words, to reproduce the default Python-module-loading behavior of + In other words, to reproduce the default Python-module-loading behavior of earlier versions of OIIO, set ``OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1``. diff --git a/src/hwy_test/CMakeLists.txt b/src/hwy_test/CMakeLists.txt new file mode 100644 index 0000000000..735b88def5 --- /dev/null +++ b/src/hwy_test/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright Contributors to the OpenImageIO project. +# SPDX-License-Identifier: Apache-2.0 +# https://github.com/AcademySoftwareFoundation/OpenImageIO + +fancy_add_executable (LINK_LIBRARIES OpenImageIO) diff --git a/src/hwy_test/hwy_test.cpp b/src/hwy_test/hwy_test.cpp new file mode 100644 index 0000000000..4fbf51af32 --- /dev/null +++ b/src/hwy_test/hwy_test.cpp @@ -0,0 +1,1096 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +/// Benchmark Highway SIMD vs Scalar implementations +/// Compares performance by toggling OIIO::attribute("enable_hwy", 0/1) + +#include +#include +#include + +#include +#include +#include + +using namespace OIIO; + +struct BenchResult { + double scalar_ms; + double simd_ms; + double speedup; +}; + +struct TestResult { + const char* type_name; + BenchResult bench; + float max_error; + bool pass; +}; + +// Run a benchmark function multiple times and return average time in milliseconds +template +double +benchmark_ms(Func&& func, int iterations = 100, int warmup = 5) +{ + // Warmup + for (int i = 0; i < warmup; ++i) { + func(); + } + + Timer timer; + for (int i = 0; i < iterations; ++i) { + func(); + } + return timer() * 1000.0 / iterations; // Convert to ms +} + +// Compare two ImageBufs and return max error between them +// Tolerance accounts for rounding differences between SIMD (round-to-nearest) +// and scalar (truncate) conversions. 
For uint8: 1/255 = 0.004, uint16: 1/65535 = 0.00002 +float +verify_match(const ImageBuf& scalar_result, const ImageBuf& simd_result, + float tolerance = 0.005f) +{ + auto comp = ImageBufAlgo::compare(scalar_result, simd_result, tolerance, + tolerance); + return comp.maxerror; +} + +// Benchmark add operation +BenchResult +bench_add(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + // Scalar version + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::add(R, A, B); }, + iterations); + + // SIMD version + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::add(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark sub operation +BenchResult +bench_sub(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::sub(R, A, B); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::sub(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark mul operation +BenchResult +bench_mul(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::mul(R, A, B); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::mul(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark pow operation +BenchResult +bench_pow(const ImageBuf& A, cspan exponent, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::pow(R, A, exponent); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::pow(R, A, exponent); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark rangecompress operation +BenchResult +bench_rangecompress(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::rangecompress(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::rangecompress(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark rangeexpand operation +BenchResult +bench_rangeexpand(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::rangeexpand(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::rangeexpand(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark premult operation +BenchResult +bench_premult(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::premult(R, A); }, + 
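                                   // timed iteration count; warmup repetitions are handled inside benchmark_ms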
iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::premult(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark unpremult operation +BenchResult +bench_unpremult(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::unpremult(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::unpremult(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark resample operation +BenchResult +bench_resample(const ImageBuf& A, int new_width, int new_height, + int iterations = 50) +{ + BenchResult result; + ImageSpec newspec = A.spec(); + newspec.width = new_width; + newspec.height = new_height; + + // Scalar version - ensure proper allocation + ImageBuf R_scalar(newspec); + ImageBufAlgo::zero(R_scalar); // Ensure buffer is allocated! + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::resample(R_scalar, A); }, + iterations); + + // SIMD version + ImageBuf R_simd(newspec); + ImageBufAlgo::zero(R_simd); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::resample(R_simd, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + + // Validate results - check for differences + auto comp = ImageBufAlgo::compare(R_scalar, R_simd, 0.001f, 0.001f); + if (comp.maxerror > 0.001f) { + printf(" \033[33m[INFO] max error: %.6f at (%d, %d, c%d)\033[0m\n", + comp.maxerror, comp.maxx, comp.maxy, comp.maxc); + + // Print actual pixel values at the error location + std::vector scalar_pixel(R_scalar.nchannels()); + std::vector simd_pixel(R_simd.nchannels()); + R_scalar.getpixel(comp.maxx, comp.maxy, scalar_pixel.data()); + R_simd.getpixel(comp.maxx, comp.maxy, simd_pixel.data()); + printf(" Scalar ch%d: %.6f, SIMD ch%d: %.6f, diff: %.6f\n", + comp.maxc, scalar_pixel[comp.maxc], comp.maxc, + simd_pixel[comp.maxc], + std::abs(scalar_pixel[comp.maxc] - simd_pixel[comp.maxc])); + } + + return result; +} + +// Print results +void +print_result(const char* type_name, const BenchResult& result) +{ + const char* color = result.speedup > 1.0 ? "\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-10s | %10.2f | %10.2f | %s%6.2fx%s\n", type_name, + result.scalar_ms, result.simd_ms, color, result.speedup, reset); +} + +void +print_header() +{ + printf("%-10s | %10s | %10s | %-8s\n", "Type", "Scalar(ms)", "SIMD(ms)", + "Speedup"); + printf("----------------------------------------------------\n"); +} + +void +print_test_results(const char* test_name, + const std::vector& results) +{ + printf("\n[ %s ]\n", test_name); + printf("-----------------------------------------------\n"); + + // Print all timing results + for (const auto& r : results) { + const char* color = r.bench.speedup > 1.0 ? "\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-10s | %10.2f | %10.2f | %s%6.2fx%s\n", r.type_name, + r.bench.scalar_ms, r.bench.simd_ms, color, r.bench.speedup, + reset); + } + + // Print separator + printf("-----------------------------------------------\n"); + + // Print all PASS/FAIL results + for (const auto& r : results) { + const char* color = r.pass ? 
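                                       // ANSI escape codes: green for PASS, red for FAIL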
"\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-6s: %s%s%s (max_err: %.6f)\n", r.type_name, color, + r.pass ? "PASS" : "FAIL", reset, r.max_error); + } +} + +// Get appropriate file extension for type +const char* +get_extension(TypeDesc format) +{ + if (format == TypeDesc::HALF) + return ".exr"; + return ".tif"; +} + +// Save image with appropriate format +void +save_image(const ImageBuf& buf, const char* basename, const char* type_name) +{ + char filename[256]; + snprintf(filename, sizeof(filename), "%s_%s%s", basename, type_name, + get_extension(buf.spec().format)); + if (!buf.write(filename)) { + printf(" Warning: Failed to save %s\n", filename); + } +} + +// Create test images +ImageBuf +create_test_image(int width, int height, int nchannels, TypeDesc format) +{ + ImageSpec spec(width, height, nchannels, format); + ImageBuf buf(spec); + + // Create a gradient to ensure meaningful resampling + std::vector tl(nchannels), tr(nchannels), bl(nchannels), + br(nchannels); + for (int c = 0; c < nchannels; ++c) { + tl[c] = 0.0f; + tr[c] = 1.0f; + bl[c] = 0.5f; + br[c] = 0.0f; + if (c % 2 == 1) { // Vary channels + tl[c] = 1.0f; + tr[c] = 0.0f; + bl[c] = 0.0f; + br[c] = 1.0f; + } + } + ImageBufAlgo::fill(buf, tl, tr, bl, br); + return buf; +} + +ImageBuf +create_checkerboard_image(int width, int height, int nchannels, TypeDesc format, + int checker_size = 64) +{ + ImageSpec spec(width, height, nchannels, format); + ImageBuf buf(spec); + + // Fill with checkerboard pattern + ImageBufAlgo::checker(buf, checker_size, checker_size, nchannels, + { 0.1f, 0.1f, 0.1f }, { 0.9f, 0.9f, 0.9f }, 0, 0, 0); + return buf; +} + +ImageBuf +create_rgba_image(int width, int height, TypeDesc format) +{ + ImageSpec spec(width, height, 4, format); + spec.alpha_channel = 3; + ImageBuf buf(spec); + // Fill with semi-transparent colors + ImageBufAlgo::fill(buf, { 0.8f, 0.6f, 0.4f, 0.7f }); + return buf; +} + +int +main(int argc, char* argv[]) +{ + // Default parameters + int width = 2048; + int height = 2048; + int iterations = 20; + + // Parse command line args + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--size") == 0 && i + 1 < argc) { + if (sscanf(argv[++i], "%dx%d", &width, &height) != 2) { + fprintf(stderr, + "Invalid size format. 
Use WxH (e.g., 2048x2048)\n"); + return 1; + } + } else if (strcmp(argv[i], "--iterations") == 0 && i + 1 < argc) { + iterations = atoi(argv[++i]); + } else if (strcmp(argv[i], "--help") == 0) { + printf("Usage: %s [options]\n", argv[0]); + printf("Options:\n"); + printf(" --size WxH Image size (default: 2048x2048)\n"); + printf(" --iterations N Number of iterations (default: 20)\n"); + printf(" --help Show this help\n"); + return 0; + } + } + + printf("Highway SIMD Benchmark\n"); + printf("======================\n"); + printf("Image size: %dx%d\n", width, height); + printf("Iterations: %d\n", iterations); + + // Verify enable_hwy attribute works + int hwy_enabled = 0; + OIIO::getattribute("enable_hwy", hwy_enabled); + printf("Initial enable_hwy: %d\n", hwy_enabled); + + // Test types + struct TestConfig { + const char* name; + TypeDesc format; + }; + + std::vector configs = { + { "uint8", TypeDesc::UINT8 }, { "uint16", TypeDesc::UINT16 }, + { "uint32", TypeDesc::UINT32 }, { "float", TypeDesc::FLOAT }, + { "half", TypeDesc::HALF }, { "double", TypeDesc::DOUBLE }, + }; + + // Add + std::vector add_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_add(A, B, iterations); + + // Verify: compute scalar and SIMD results + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::add(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::add(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + add_results.push_back(result); + + save_image(A, "src_A", cfg.name); + save_image(B, "src_B", cfg.name); + save_image(R_simd, "result_add", cfg.name); + } + print_test_results("Add", add_results); + + // Sub + std::vector sub_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_sub(A, B, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::sub(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::sub(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + sub_results.push_back(result); + + save_image(R_simd, "result_sub", cfg.name); + } + print_test_results("Sub", sub_results); + + // Mul + std::vector mul_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_mul(A, B, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::mul(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::mul(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + mul_results.push_back(result); + + save_image(R_simd, "result_mul", cfg.name); + } + print_test_results("Mul", mul_results); + + // Pow + std::vector pow_results; + float exponent_vals[] = { 2.2f, 2.2f, 2.2f }; + for (const auto& cfg : configs) { + 
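        // Benchmark pow with a per-channel exponent of 2.2 (a typical gamma value)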
ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_pow(A, exponent_vals, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::pow(R_scalar, A, exponent_vals); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::pow(R_simd, A, exponent_vals); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + pow_results.push_back(result); + + save_image(R_simd, "result_pow", cfg.name); + } + print_test_results("Pow", pow_results); + + + // Div + std::vector div_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_div = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::div(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::div(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_div(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::div(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::div(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + div_results.push_back(result); + + save_image(R_simd, "result_div", cfg.name); + } + print_test_results("Div", div_results); + + // Min + std::vector min_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_min = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::min(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::min(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_min(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::min(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::min(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + min_results.push_back(result); + + save_image(R_simd, "result_min", cfg.name); + } + print_test_results("Min", min_results); + + // Max + std::vector max_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_max = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::max(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::max(R, A, B); }, + iters); + result.speedup = result.scalar_ms / 
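                             // speedup > 1.0 means the SIMD path is faster than scalar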
result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_max(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::max(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::max(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + max_results.push_back(result); + + save_image(R_simd, "result_max", cfg.name); + } + print_test_results("Max", max_results); + + // Abs + std::vector abs_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_abs = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::abs(R, A); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::abs(R, A); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_abs(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::abs(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::abs(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + abs_results.push_back(result); + + save_image(R_simd, "result_abs", cfg.name); + } + print_test_results("Abs", abs_results); + + // Absdiff + std::vector absdiff_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_absdiff = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::absdiff(R, A, B); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::absdiff(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_absdiff(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::absdiff(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::absdiff(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + absdiff_results.push_back(result); + + save_image(R_simd, "result_absdiff", cfg.name); + } + print_test_results("Absdiff", absdiff_results); + + // MAD + std::vector mad_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf C = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_mad = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::mad(R, A, B, C); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::mad(R, A, B, C); }, iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = 
cfg.name; + result.bench = bench_mad(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::mad(R_scalar, A, B, C); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::mad(R_simd, A, B, C); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + mad_results.push_back(result); + + save_image(R_simd, "result_mad", cfg.name); + } + print_test_results("MAD", mad_results); + + // Clamp + std::vector clamp_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_clamp = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::clamp(R, A, 0.1f, 0.9f); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::clamp(R, A, 0.1f, 0.9f); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_clamp(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::clamp(R_scalar, A, 0.1f, 0.9f); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::clamp(R_simd, A, 0.1f, 0.9f); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + clamp_results.push_back(result); + + save_image(R_simd, "result_clamp", cfg.name); + } + print_test_results("Clamp", clamp_results); + + // RangeCompress + std::vector rangecompress_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_rangecompress(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::rangecompress(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::rangecompress(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + rangecompress_results.push_back(result); + + save_image(R_simd, "result_rangecompress", cfg.name); + } + print_test_results("RangeCompress", rangecompress_results); + + // RangeExpand + std::vector rangeexpand_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_rangeexpand(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::rangeexpand(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::rangeexpand(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + rangeexpand_results.push_back(result); + + save_image(R_simd, "result_rangeexpand", cfg.name); + } + print_test_results("RangeExpand", rangeexpand_results); + + // Premult + std::vector premult_results; + for (const auto& cfg : configs) { + ImageBuf A = create_rgba_image(width, height, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_premult(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::premult(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::premult(R_simd, A); + + 
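        // Compare scalar vs SIMD premult outputs; verify_match's tolerance absorbs integer rounding differences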
result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + premult_results.push_back(result); + + save_image(A, "src_RGBA", cfg.name); + save_image(R_simd, "result_premult", cfg.name); + } + print_test_results("Premult", premult_results); + + // Unpremult + std::vector unpremult_results; + for (const auto& cfg : configs) { + ImageBuf A = create_rgba_image(width, height, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_unpremult(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::unpremult(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::unpremult(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + unpremult_results.push_back(result); + + save_image(R_simd, "result_unpremult", cfg.name); + } + print_test_results("Unpremult", unpremult_results); + + + // Invert + std::vector invert_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_invert = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::invert(R, A); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::invert(R, A); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_invert(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::invert(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::invert(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + invert_results.push_back(result); + + save_image(R_simd, "result_invert", cfg.name); + } + print_test_results("Invert", invert_results); + + // Contrast Remap (linear stretch) + std::vector contrast_remap_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + // Linear stretch: remap [0.2, 0.8] -> [0.0, 1.0] + float black_vals[] = { 0.2f, 0.2f, 0.2f }; + float white_vals[] = { 0.8f, 0.8f, 0.8f }; + + auto bench_contrast = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms( + [&]() { + ImageBufAlgo::contrast_remap(R, A, black_vals, white_vals); + }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms( + [&]() { + ImageBufAlgo::contrast_remap(R, A, black_vals, white_vals); + }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_contrast(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::contrast_remap(R_scalar, A, black_vals, white_vals); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::contrast_remap(R_simd, A, black_vals, white_vals); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + contrast_remap_results.push_back(result); + + save_image(R_simd, "result_contrast_remap", cfg.name); + } + print_test_results("Contrast Remap (linear)", 
contrast_remap_results); + + // Resample 75% + printf("\n[ Resample 75%% ]\n"); + //print_header(); + int resample_iters = std::max(1, iterations / 2); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width * 3 / 4; + newspec.height = height * 3 / 4; + + // Create separate buffers for scalar and SIMD + ImageBuf R_scalar(newspec); + ImageBuf R_simd(newspec); + ImageBufAlgo::zero(R_scalar); + ImageBufAlgo::zero(R_simd); + + print_result(cfg.name, bench_resample(A, width * 3 / 4, height * 3 / 4, + resample_iters)); + + // Save both scalar and SIMD results for comparison + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::resample(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R_simd, A); + + save_image(A, "src_checkerboard", cfg.name); + save_image(R_scalar, "result_resample75_scalar", cfg.name); + save_image(R_simd, "result_resample75_simd", cfg.name); + } + + // Resample 50% + printf("\n[ Resample 50%% ]\n"); + //print_header(); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 2; + newspec.height = height / 2; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 2, height / 2, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample50", cfg.name); + } + + // Resample 25% + printf("\n[ Resample 25%% ]\n"); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 4; + newspec.height = height / 4; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 4, height / 4, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample25", cfg.name); + } + + // Resample 16.66% + printf("\n[ Resample 16.66%% ]\n"); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 6; + newspec.height = height / 6; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 6, height / 6, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample25", cfg.name); + } + + print_header(); + + printf("\nBenchmark complete!\n"); + printf("Note: Speedup > 1.0x means SIMD is faster (shown in green)\n"); + printf(" Speedup < 1.0x means scalar is faster (shown in red)\n"); + + return 0; +} diff --git a/src/include/imageio_pvt.h b/src/include/imageio_pvt.h index 273375cd77..13d5c06140 100644 --- a/src/include/imageio_pvt.h +++ b/src/include/imageio_pvt.h @@ -43,6 +43,7 @@ extern int oiio_log_times; extern int openexr_core; extern int jpeg_com_attributes; extern int png_linear_premult; +extern int enable_hwy; extern int limit_channels; extern int limit_imagesize_MB; extern int imagebuf_print_uncaught_errors; diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt index f2459b2d32..7e8dadd1ad 100644 --- a/src/libOpenImageIO/CMakeLists.txt +++ b/src/libOpenImageIO/CMakeLists.txt @@ -165,6 +165,8 @@ target_link_libraries (OpenImageIO $ 
${BZIP2_LIBRARIES} ZLIB::ZLIB + hwy::hwy + hwy::hwy_contrib ${CMAKE_DL_LIBS} ) diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp index c7a4d83e9c..9cf1c61183 100644 --- a/src/libOpenImageIO/imagebufalgo_addsub.cpp +++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp @@ -18,6 +18,8 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" + #include "imageio_pvt.h" @@ -26,8 +28,8 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -44,7 +46,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -58,6 +61,288 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer add using SaturatedAdd (scale-invariant, no float conversion) +template +static bool +add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated add - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedAdd(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating add in scalar + int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c]; + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (T)sum; + } else { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (sum < std::numeric_limits::min()) + ? 
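                                       // clamp at the low end of T's range (mirrors SaturatedAdd's behavior)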
std::numeric_limits::min() + : (T)sum; + } + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Process whole line as one vector stream + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Add(a, b); + }); + } else { + // Process pixel by pixel (scalar fallback for strided channels) + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + + static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant add when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return add_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return add_impl_hwy(R, A, B, roi, nthreads); + } + return add_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return add_impl_hwy(R, A, b, roi, nthreads); + return add_impl_scalar(R, A, b, roi, nthreads); +} + +// Native integer sub using SaturatedSub (scale-invariant, no float conversion) +template +static bool +sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const 
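        // The vector fast path requires tightly packed channels in R, A, and B.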
bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated sub - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedSub(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating sub in scalar + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (a_ptr[c] > b_ptr[c]) + ? (a_ptr[c] - b_ptr[c]) + : T(0); + } else { + int64_t diff = (int64_t)a_ptr[c] + - (int64_t)b_ptr[c]; + r_ptr[c] = (diff > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (diff < std::numeric_limits::min()) + ? std::numeric_limits::min() + : (T)diff; + } + } + } + } + } + }); + return true; +} + +template +static bool +sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Sub(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + - static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant sub when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return sub_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return sub_impl_hwy(R, A, B, roi, nthreads); + } + return sub_impl_scalar(R, A, B, roi, nthreads); +} + static bool add_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -155,8 +440,8 @@ ImageBufAlgo::add(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, 
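                                  // scalar reference path: iterator-based, correct for any storage layout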
nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h new file mode 100644 index 0000000000..fe4c9b0d8a --- /dev/null +++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h @@ -0,0 +1,970 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +OIIO_NAMESPACE_BEGIN + +// Alias for Highway's namespace for convenience +namespace hn = hwy::HWY_NAMESPACE; + +// ----------------------------------------------------------------------- +// ImageBuf local pixel helpers (header-only) +// ----------------------------------------------------------------------- + +template struct HwyLocalPixelsView { + ByteT* base = nullptr; + size_t pixel_bytes = 0; + size_t scanline_bytes = 0; + int xbegin = 0; + int ybegin = 0; + int nchannels = 0; +}; + +inline HwyLocalPixelsView +HwyPixels(ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline HwyLocalPixelsView +HwyPixels(const ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline int +RoiNChannels(const ROI& roi) noexcept +{ + return roi.chend - roi.chbegin; +} + +template +inline bool +ChannelsContiguous(const HwyLocalPixelsView& v, int nchannels) noexcept +{ + return size_t(nchannels) * sizeof(T) == v.pixel_bytes; +} + +template +inline ByteT* +PixelBase(const HwyLocalPixelsView& v, int x, int y) noexcept +{ + return v.base + size_t(y - v.ybegin) * v.scanline_bytes + + size_t(x - v.xbegin) * v.pixel_bytes; +} + +template +inline std::conditional_t, const T*, T*> +ChannelPtr(const HwyLocalPixelsView& v, int x, int y, int ch) noexcept +{ + using RetT = std::conditional_t, const T, T>; + return reinterpret_cast(PixelBase(v, x, y) + size_t(ch) * sizeof(T)); +} + +template +inline std::conditional_t, const T*, T*> +RoiRowPtr(const HwyLocalPixelsView& v, int y, const ROI& roi) noexcept +{ + return ChannelPtr(v, roi.xbegin, y, roi.chbegin); +} + +// ----------------------------------------------------------------------- +// Type Traits +// ----------------------------------------------------------------------- + +/// Determine the appropriate SIMD math type for a given result type. +/// Promotes smaller types to float, keeps double as double. +/// Note: uint32_t uses float (not double) for image processing performance. +/// In OIIO, uint32 images are normalized to 0-1 range like uint8/uint16, +/// so float precision (24-bit mantissa) is sufficient and much faster than double. +template struct SimdMathType { + using type = float; +}; +template<> struct SimdMathType { + using type = double; +}; + +// ----------------------------------------------------------------------- +// Load and Promote +// ----------------------------------------------------------------------- + +/// Load and promote source data to target SIMD type. +/// Handles type conversions from various source formats (uint8_t, int8_t, uint16_t, +/// int16_t, uint32_t, int32_t, uint64_t, int64_t, half, float, double) to the +/// target SIMD computation type. 
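+/// Integer sources are normalized on load (unsigned to [0,1], signed to
+/// roughly [-1,1]); for example, a uint8 value of 255 loads as 1.0f and
+/// 128 loads as approximately 0.502f.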
+/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @return SIMD vector with promoted values +template +inline auto +LoadPromote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::Load(d, ptr); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::Load(d16, reinterpret_cast(ptr)); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::Load(d_u8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::Load(d_i8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::Load(d_u16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::Load(d_i16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::Load(d_u32, ptr); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::Load(d_i32, ptr); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + // Note: Precision loss expected for large values (>24 bits) + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::Load(d_u64, ptr); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::Load(d_i64, ptr); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +/// Load and promote partial source data to target SIMD type. 
+/// Same as LoadPromote but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @param count Number of elements to load (must be <= lane count) +/// @return SIMD vector with promoted values (undefined in unused lanes) +template +inline auto +LoadPromoteN(D d, const SrcT* ptr, size_t count) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::LoadN(d, ptr, count); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::LoadN(d16, reinterpret_cast(ptr), count); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::LoadN(d_u8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::LoadN(d_i8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::LoadN(d_u16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::LoadN(d_i16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::LoadN(d_u32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::LoadN(d_i32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. 
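+        // (INT32_MIN * (1/2147483647) is about -1.0000000005 before the clamp.)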
+ auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::LoadN(d_u64, ptr, count); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::LoadN(d_i64, ptr, count); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +// ----------------------------------------------------------------------- +// Demote and Store +// ----------------------------------------------------------------------- + +/// Demote SIMD values and store to destination type. +/// Handles type conversions from SIMD computation type (float/double) back to +/// various destination formats with proper rounding and clamping for integer types. +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +template +inline void +DemoteStore(D d, DstT* ptr, VecT v) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::Store(v, d, ptr); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::Store(v16, d16, reinterpret_cast(ptr)); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::Store(v_u8, d_u8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -128-127 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
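+        // e.g. -2.6 + (-0.5) = -3.1 truncates to -3, and 2.6 + 0.5 = 3.1
+        // truncates to 3, so both signs round to the nearest integer.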
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::Store(v_i8, d_i8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::Store(v_u16, d_u16, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -32768-32767 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::Store(v_i16, d_i16, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::Store(v_u32, d_u32, ptr); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] to int32 range + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -2147483648-2147483647 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::Store(v_i32, d_i32, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + // Note: Precision loss expected (float has only 24-bit mantissa) + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::Store(v_u64, d_u64, ptr); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::Store(v_i64, d_i64, ptr); + } +} + +/// Demote and store partial SIMD values to destination type. +/// Same as DemoteStore but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +/// @param count Number of elements to store (must be <= lane count) +template +inline void +DemoteStoreN(D d, DstT* ptr, VecT v, size_t count) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::StoreN(v, d, ptr, count); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::StoreN(v16, d16, reinterpret_cast(ptr), count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::StoreN(v_u8, d_u8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-128, 127] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::StoreN(v_i8, d_i8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::StoreN(v_u16, d_u16, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-32768, 32767] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::StoreN(v_i16, d_i16, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::StoreN(v_u32, d_u32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] range to [-2147483648, 2147483647] range + VecD v_val = (VecD)v; + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::StoreN(v_i32, d_i32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::StoreN(v_u64, d_u64, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::StoreN(v_i64, d_i64, ptr, count); + } +} + +// ----------------------------------------------------------------------- +// Native Integer Kernel Runners (No Type Conversion) +// ----------------------------------------------------------------------- + +/// Execute a unary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like abs, where int_op(a) == denorm(float_op(norm(a))). +/// Much faster than promotion path - operates directly on integer SIMD vectors. +/// @param r Destination array (same type as source) +/// @param a Source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector) and returning result vector +/// Example: [](auto d, auto va) { return hn::Abs(va); } +template +inline void +RunHwyUnaryNativeInt(T* r, const T* a, size_t n, OpFunc op) +{ + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = hn::Load(d, a + x); + auto res = op(d, va); + hn::Store(res, d, r + x); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = hn::LoadN(d, a + x, remaining); + auto res = op(d, va); + hn::StoreN(res, d, r + x, remaining); + } +} + +/// Execute a binary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like saturated add, min, max, where: +/// int_op(a, b) == denorm(float_op(norm(a), norm(b))). +/// Much faster than promotion path - no conversion overhead. 
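+/// (Concretely, for uint8 add: 200 + 100 saturates to 255, and the float path
+/// clamps 200/255 + 100/255 to 1.0, which denormalizes back to 255.)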
+/// @param r Destination array (same type as sources)
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result
+///           Example: [](auto d, auto va, auto vb) { return hn::SaturatedAdd(va, vb); }
+template<typename T, typename OpFunc>
+inline void
+RunHwyBinaryNativeInt(T* r, const T* a, const T* b, size_t n, OpFunc op)
+{
+    const hn::ScalableTag<T> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = hn::Load(d, a + x);
+        auto vb  = hn::Load(d, b + x);
+        auto res = op(d, va, vb);
+        hn::Store(res, d, r + x);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = hn::LoadN(d, a + x, remaining);
+        auto vb  = hn::LoadN(d, b + x, remaining);
+        auto res = op(d, va, vb);
+        hn::StoreN(res, d, r + x, remaining);
+    }
+}
+
+// -----------------------------------------------------------------------
+// Generic Kernel Runners (With Type Conversion)
+// -----------------------------------------------------------------------
+
+/// Execute a unary SIMD operation on an array.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
+/// @param r Destination array
+/// @param a Source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector) and returning result vector
+///           Example: [](auto d, auto va) { return hn::Sqrt(va); }
+template<typename Rtype, typename Atype, typename OpFunc>
+inline void
+RunHwyUnaryCmd(Rtype* r, const Atype* a, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = LoadPromote(d, a + x);
+        auto res = op(d, va);
+        DemoteStore(d, r + x, res);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = LoadPromoteN(d, a + x, remaining);
+        auto res = op(d, va);
+        DemoteStoreN(d, r + x, res, remaining);
+    }
+}
+
+/// Execute a binary SIMD operation on two arrays.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
+/// @param r Destination array
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result
+///           Example: [](auto d, auto va, auto vb) { return hn::Add(va, vb); }
+template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
+inline void
+RunHwyCmd(Rtype* r, const Atype* a, const Btype* b, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = LoadPromote(d, a + x);
+        auto vb  = LoadPromote(d, b + x);
+        auto res = op(d, va, vb);
+        DemoteStore(d, r + x, res);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = LoadPromoteN(d, a + x, remaining);
+        auto vb  = LoadPromoteN(d, b + x, remaining);
+        auto res = op(d, va, vb);
+        DemoteStoreN(d, r + x, res, remaining);
+    }
+}
+
+/// Execute a ternary SIMD operation on three arrays.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
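+/// All three source arrays share one element type (ABCtype); see
+/// mad_impl_hwy in imagebufalgo_mad.cpp for a call site using hn::MulAdd.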
+/// @param r Destination array +/// @param a First source array +/// @param b Second source array +/// @param c Third source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector_a, vector_b, vector_c) and returning result +/// Example: [](auto d, auto va, auto vb, auto vc) { return hn::MulAdd(va, vb, vc); } +template +inline void +RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c, + size_t n, OpFunc op) +{ + using MathT = typename SimdMathType::type; + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = LoadPromote(d, a + x); + auto vb = LoadPromote(d, b + x); + auto vc = LoadPromote(d, c + x); + auto res = op(d, va, vb, vc); + DemoteStore(d, r + x, res); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = LoadPromoteN(d, a + x, remaining); + auto vb = LoadPromoteN(d, b + x, remaining); + auto vc = LoadPromoteN(d, c + x, remaining); + auto res = op(d, va, vb, vc); + DemoteStoreN(d, r + x, res, remaining); + } +} + +// ----------------------------------------------------------------------- +// Interleaved Channel Load/Store Helpers +// ----------------------------------------------------------------------- + +/// Load 4 interleaved channels (RGBA) with type promotion. +/// For matching types, uses Highway's native LoadInterleaved4. +/// For type promotion, loads and manually deinterleaves. +/// @param d Highway descriptor tag for the target SIMD type +/// @param ptr Pointer to interleaved RGBA data (R0,G0,B0,A0,R1,G1,B1,A1,...) +/// @return Tuple of (R, G, B, A) SIMD vectors in promoted type +template +inline auto +LoadInterleaved4Promote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + using Vec = hn::Vec; + + if constexpr (std::is_same_v) { + // No promotion needed - use Highway's optimized LoadInterleaved4 + Vec r, g, b, a; + hn::LoadInterleaved4(d, ptr, r, g, b, a); + return std::make_tuple(r, g, b, a); + } else if constexpr (std::is_same_v) { + // Special handling for half type - convert through hwy::float16_t + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + + // Load interleaved half data as float16_t + hn::Vec r16, g16, b16, a16; + hn::LoadInterleaved4(d16, reinterpret_cast(ptr), r16, g16, + b16, a16); + + // Promote to computation type + Vec r_vec = hn::PromoteTo(d, r16); + Vec g_vec = hn::PromoteTo(d, g16); + Vec b_vec = hn::PromoteTo(d, b16); + Vec a_vec = hn::PromoteTo(d, a16); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } else { + // Generic type promotion - deinterleave manually with normalization + const size_t N = hn::Lanes(d); + SrcT r_src[hn::MaxLanes(d)]; + SrcT g_src[hn::MaxLanes(d)]; + SrcT b_src[hn::MaxLanes(d)]; + SrcT a_src[hn::MaxLanes(d)]; + + for (size_t i = 0; i < N; ++i) { + r_src[i] = ptr[i * 4 + 0]; + g_src[i] = ptr[i * 4 + 1]; + b_src[i] = ptr[i * 4 + 2]; + a_src[i] = ptr[i * 4 + 3]; + } + + // Use LoadPromote for proper normalization of integer types + auto r_vec = LoadPromote(d, r_src); + auto g_vec = LoadPromote(d, g_src); + auto b_vec = LoadPromote(d, b_src); + auto a_vec = LoadPromote(d, a_src); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } +} + +/// Store 4 interleaved channels (RGBA) with type demotion. +/// For matching types, uses Highway's native StoreInterleaved4. +/// For type demotion, manually interleaves and stores. 
+/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination interleaved RGBA data +/// @param r Red channel SIMD vector +/// @param g Green channel SIMD vector +/// @param b Blue channel SIMD vector +/// @param a Alpha channel SIMD vector +template +inline void +StoreInterleaved4Demote(D d, DstT* ptr, VecT r, VecT g, VecT b, VecT a) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + // No demotion needed - use Highway's optimized StoreInterleaved4 + hn::StoreInterleaved4(r, g, b, a, d, ptr); + } else if constexpr (std::is_same_v) { + // Special handling for half type - convert through hwy::float16_t + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + + // Demote to float16_t + auto r16 = hn::DemoteTo(d16, r); + auto g16 = hn::DemoteTo(d16, g); + auto b16 = hn::DemoteTo(d16, b); + auto a16 = hn::DemoteTo(d16, a); + + // Store interleaved float16_t data + hn::StoreInterleaved4(r16, g16, b16, a16, d16, + reinterpret_cast(ptr)); + } else { + // Generic type demotion - use DemoteStore for each channel then interleave + const size_t N = hn::Lanes(d); + + // Temporary arrays for demoted values + DstT r_demoted[hn::MaxLanes(d)]; + DstT g_demoted[hn::MaxLanes(d)]; + DstT b_demoted[hn::MaxLanes(d)]; + DstT a_demoted[hn::MaxLanes(d)]; + + // Use DemoteStoreN to properly denormalize integer types + DemoteStoreN(d, r_demoted, r, N); + DemoteStoreN(d, g_demoted, g, N); + DemoteStoreN(d, b_demoted, b, N); + DemoteStoreN(d, a_demoted, a, N); + + // Interleave the demoted values + for (size_t i = 0; i < N; ++i) { + ptr[i * 4 + 0] = r_demoted[i]; + ptr[i * 4 + 1] = g_demoted[i]; + ptr[i * 4 + 2] = b_demoted[i]; + ptr[i * 4 + 3] = a_demoted[i]; + } + } +} + +// ----------------------------------------------------------------------- +// Rangecompress/Rangeexpand SIMD Kernels +// ----------------------------------------------------------------------- + +/// Apply rangecompress formula to a SIMD vector. +/// Formula (courtesy Sony Pictures Imageworks): +/// if (|x| <= 0.18) return x +/// else return copysign(a + b * log(c * |x| + 1), x) +/// where a = -0.545768857, b = 0.183516696, c = 284.357788 +/// @param d Highway descriptor tag +/// @param x Input SIMD vector +/// @return Compressed SIMD vector +template +inline auto +rangecompress_simd(D d, VecT x) +{ + using T = typename D::T; + + // Constants from Sony Pictures Imageworks + constexpr T x1 = static_cast(0.18); + constexpr T a = static_cast(-0.54576885700225830078); + constexpr T b = static_cast(0.18351669609546661377); + constexpr T c = static_cast(284.3577880859375); + + auto abs_x = hn::Abs(x); + auto mask_passthrough = hn::Le(abs_x, hn::Set(d, x1)); + + // compressed = a + b * log(c * |x| + 1.0) + auto c_vec = hn::Set(d, c); + auto one = hn::Set(d, static_cast(1.0)); + auto temp = hn::MulAdd(c_vec, abs_x, one); // c * |x| + 1.0 + auto log_val = hn::Log(d, temp); + auto b_vec = hn::Set(d, b); + auto a_vec = hn::Set(d, a); + auto compressed = hn::MulAdd(b_vec, log_val, a_vec); // a + b * log + + // Apply sign of original x + auto result = hn::CopySign(compressed, x); + + // If |x| <= x1, return x; else return compressed + return hn::IfThenElse(mask_passthrough, x, result); +} + +/// Apply rangeexpand formula to a SIMD vector (inverse of rangecompress). 
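+/// Applying rangecompress and then rangeexpand should reproduce the input to
+/// within floating-point rounding error.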
+/// Formula: +/// if (|y| <= 0.18) return y +/// else x = exp((|y| - a) / b); x = (x - 1) / c +/// if x < 0.18 then x = (-x_intermediate - 1) / c +/// return copysign(x, y) +/// @param d Highway descriptor tag +/// @param y Input SIMD vector (compressed values) +/// @return Expanded SIMD vector +template +inline auto +rangeexpand_simd(D d, VecT y) +{ + using T = typename D::T; + + // Constants (same as rangecompress) + constexpr T x1 = static_cast(0.18); + constexpr T a = static_cast(-0.54576885700225830078); + constexpr T b = static_cast(0.18351669609546661377); + constexpr T c = static_cast(284.3577880859375); + + auto abs_y = hn::Abs(y); + auto mask_passthrough = hn::Le(abs_y, hn::Set(d, x1)); + + // x_intermediate = exp((|y| - a) / b) + auto a_vec = hn::Set(d, a); + auto b_vec = hn::Set(d, b); + auto intermediate = hn::Div(hn::Sub(abs_y, a_vec), b_vec); // (|y| - a) / b + auto x_intermediate = hn::Exp(d, intermediate); + + // x = (x_intermediate - 1.0) / c + auto one = hn::Set(d, static_cast(1.0)); + auto c_vec = hn::Set(d, c); + auto x = hn::Div(hn::Sub(x_intermediate, one), c_vec); + + // If x < x1, use alternate solution: (-x_intermediate - 1.0) / c + auto mask_alternate = hn::Lt(x, hn::Set(d, x1)); + auto x_alternate = hn::Div(hn::Sub(hn::Neg(x_intermediate), one), c_vec); + x = hn::IfThenElse(mask_alternate, x_alternate, x); + + // Apply sign of input y + auto result = hn::CopySign(x, y); + + return hn::IfThenElse(mask_passthrough, y, result); +} + +OIIO_NAMESPACE_END diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp index 5707fcd6ac..74ca2b03d7 100644 --- a/src/libOpenImageIO/imagebufalgo_mad.cpp +++ b/src/libOpenImageIO/imagebufalgo_mad.cpp @@ -12,6 +12,7 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" #include "imageio_pvt.h" @@ -21,67 +22,88 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, - ROI roi, int nthreads) +mad_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) +{ + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + ImageBuf::Iterator r(R, roi); + ImageBuf::ConstIterator a(A, roi); + ImageBuf::ConstIterator b(B, roi); + ImageBuf::ConstIterator c(C, roi); + for (; !r.done(); ++r, ++a, ++b, ++c) { + for (int ch = roi.chbegin; ch < roi.chend; ++ch) + r[ch] = a[ch] * b[ch] + c[ch]; + } + }); + return true; +} + + + +template +static bool +mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) { + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + auto Cv = HwyPixels(C); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { - if ((std::is_same::value - || std::is_same::value) - && (std::is_same::value - || std::is_same::value) - // && R.localpixels() // has to be, because it's writable - && A.localpixels() && B.localpixels() - && C.localpixels() - // && R.contains_roi(roi) // has to be, because IBAPrep - && A.contains_roi(roi) && B.contains_roi(roi) && C.contains_roi(roi) - && roi.chbegin == 0 && roi.chend == R.nchannels() - && roi.chend == A.nchannels() && roi.chend == B.nchannels() - && roi.chend == C.nchannels()) { - // Special case when all inputs are either float or half, with in- - // memory contiguous data and we're operating on the full channel - // range: skip iterators: For these circumstances, we can operate on - // the raw memory very efficiently. 
Otherwise, we will need the - // magic of the the Iterators (and pay the price). - int nxvalues = roi.width() * R.nchannels(); - for (int z = roi.zbegin; z < roi.zend; ++z) - for (int y = roi.ybegin; y < roi.yend; ++y) { - Rtype* rraw = (Rtype*)R.pixeladdr(roi.xbegin, y, z); - const ABCtype* araw - = (const ABCtype*)A.pixeladdr(roi.xbegin, y, z); - const ABCtype* braw - = (const ABCtype*)B.pixeladdr(roi.xbegin, y, z); - const ABCtype* craw - = (const ABCtype*)C.pixeladdr(roi.xbegin, y, z); - OIIO_DASSERT(araw && braw && craw); - // The straightforward loop auto-vectorizes very well, - // there's no benefit to using explicit SIMD here. - for (int x = 0; x < nxvalues; ++x) - rraw[x] = araw[x] * braw[x] + craw[x]; - // But if you did want to explicitly vectorize, this is - // how it would look: - // int simdend = nxvalues & (~3); // how many float4's? - // for (int x = 0; x < simdend; x += 4) { - // simd::float4 a_simd(araw+x), b_simd(braw+x), c_simd(craw+x); - // simd::float4 r_simd = a_simd * b_simd + c_simd; - // r_simd.store (rraw+x); - // } - // for (int x = simdend; x < nxvalues; ++x) - // rraw[x] = araw[x] * braw[x] + craw[x]; + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels) + && ChannelsContiguous(Cv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const ABCtype* a_row = RoiRowPtr(Av, y, roi); + const ABCtype* b_row = RoiRowPtr(Bv, y, roi); + const ABCtype* c_row = RoiRowPtr(Cv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + // Use Highway SIMD for a*b+c (fused multiply-add) + RunHwyTernaryCmd(r_row, a_row, b_row, c_row, n, + [](auto d, auto a, auto b, + auto c) { + return hn::MulAdd(a, b, c); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const ABCtype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const ABCtype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + const ABCtype* c_ptr = ChannelPtr(Cv, x, y, + roi.chbegin); + for (int ch = 0; ch < nchannels; ++ch) { + r_ptr[ch] = static_cast( + static_cast(a_ptr[ch]) + * static_cast(b_ptr[ch]) + + static_cast(c_ptr[ch])); + } } - } else { - ImageBuf::Iterator r(R, roi); - ImageBuf::ConstIterator a(A, roi); - ImageBuf::ConstIterator b(B, roi); - ImageBuf::ConstIterator c(C, roi); - for (; !r.done(); ++r, ++a, ++b, ++c) { - for (int ch = roi.chbegin; ch < roi.chend; ++ch) - r[ch] = a[ch] * b[ch] + c[ch]; } } }); return true; } +template +static bool +mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, + ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels() && C.localpixels()) + return mad_impl_hwy(R, A, B, C, roi, nthreads); + return mad_impl_scalar(R, A, B, C, roi, nthreads); +} + template @@ -232,11 +254,73 @@ ImageBufAlgo::mad(Image_or_Const A, Image_or_Const B, Image_or_Const C, ROI roi, +// Highway SIMD implementation for invert: 1 - x +template +static bool +invert_impl_hwy(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels); + + for (int y = roi.ybegin; 
y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyUnaryCmd( + r_row, a_row, n, [](auto d, auto va) { + auto one = hn::Set(d, static_cast(1.0)); + return hn::Sub(one, va); + }); + } else { + // Non-contiguous fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + 1.0f - static_cast(a_ptr[c])); + } + } + } + } + }); + return true; +} + + +// Dispatcher for invert +template +static bool +invert_impl(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return invert_impl_hwy(R, A, roi, nthreads); + + // Scalar fallback: use mad(A, -1.0, 1.0) + return ImageBufAlgo::mad(R, A, -1.0, 1.0, roi, nthreads); +} + + bool ImageBufAlgo::invert(ImageBuf& dst, const ImageBuf& A, ROI roi, int nthreads) { - // Calculate invert as simply 1-A == A*(-1)+1 - return mad(dst, A, -1.0, 1.0, roi, nthreads); + OIIO::pvt::LoggedTimer logtime("IBA::invert"); + if (!IBAprep(roi, &dst, &A)) + return false; + bool ok; + OIIO_DISPATCH_COMMON_TYPES2(ok, "invert", invert_impl, dst.spec().format, + A.spec().format, dst, A, roi, nthreads); + return ok; } diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp index 4fa1a6cba0..8c5c4582e8 100644 --- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp +++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp @@ -12,6 +12,7 @@ #include +#include "imagebufalgo_hwy_pvt.h" #include #include #include @@ -86,8 +87,8 @@ ImageBufAlgo::scale(const ImageBuf& A, const ImageBuf& B, KWArgs options, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -104,7 +105,8 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator a(A, roi); @@ -117,6 +119,100 @@ mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Mul(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, 
+ roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + * static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] * (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return mul_impl_hwy(R, A, B, roi, nthreads); + return mul_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return mul_impl_hwy(R, A, b, roi, nthreads); + return mul_impl_scalar(R, A, b, roi, nthreads); +} + static bool mul_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -198,8 +294,8 @@ ImageBufAlgo::mul(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +div_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -216,6 +312,69 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, +template +static bool +div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd( + r_row, a_row, b_row, n, [](auto d, auto a, auto b) { + // Check for zero division: if b == 0, return 0 + auto zero = hn::Zero(d); + auto mask = hn::Eq(b, zero); + return hn::IfThenElse(mask, zero, hn::Div(a, b)); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + float v = static_cast(b_ptr[c]); + r_ptr[c] = (v == 0.0f) + ? 
static_cast(0.0f) + : static_cast( + static_cast(a_ptr[c]) / v); + } + } + } + } + }); + return true; +} + +template +static bool +div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return div_impl_hwy(R, A, B, roi, nthreads); + return div_impl_scalar(R, A, B, roi, nthreads); +} + + + bool ImageBufAlgo::div(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) diff --git a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp index 04daeaad27..1be1acc9ad 100644 --- a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp +++ b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp @@ -19,6 +19,7 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" #include "imageio_pvt.h" @@ -27,8 +28,8 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +min_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -45,7 +46,8 @@ min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +min_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -59,6 +61,155 @@ min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer min (scale-invariant, no float conversion) +template +static bool +min_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer min - much faster than float conversion! 
+ size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Min(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = std::min(a_ptr[c], b_ptr[c]); + } + } + } + } + }); + return true; +} + +template +static bool +min_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Min(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::min(static_cast(a_ptr[c]), + static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +min_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::min((SimdType)a_ptr[c], + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant min when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return min_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return min_impl_hwy(R, A, B, roi, nthreads); + } + return min_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return min_impl_hwy(R, A, b, roi, nthreads); + return min_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::min(ImageBuf& dst, 
Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -124,8 +275,8 @@ ImageBufAlgo::min(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +max_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -142,7 +293,8 @@ max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +max_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -156,6 +308,155 @@ max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer max (scale-invariant, no float conversion) +template +static bool +max_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer max - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Max(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = std::max(a_ptr[c], b_ptr[c]); + } + } + } + } + }); + return true; +} + +template +static bool +max_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Max(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::max(static_cast(a_ptr[c]), + static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +max_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + 
auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::max((SimdType)a_ptr[c], + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant max when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return max_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return max_impl_hwy(R, A, B, roi, nthreads); + } + return max_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return max_impl_hwy(R, A, b, roi, nthreads); + return max_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::max(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -221,8 +522,8 @@ ImageBufAlgo::max(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, - bool clampalpha01, ROI roi, int nthreads) +clamp_scalar(ImageBuf& dst, const ImageBuf& src, const float* min, + const float* max, bool clampalpha01, ROI roi, int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator s(src, roi); @@ -240,6 +541,105 @@ clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, } +template +static bool +clamp_hwy(ImageBuf& dst, const ImageBuf& src, const float* min_vals, + const float* max_vals, bool clampalpha01, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Dv = HwyPixels(dst); + auto Sv = HwyPixels(src); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Dv, nchannels) + && ChannelsContiguous(Sv, nchannels); + + // Set up Highway + const hn::ScalableTag d; + size_t lanes = hn::Lanes(d); + + // Pre-compute min/max pattern repeated to fill vector lanes + // Pattern: [min[0], min[1], ..., min[nch-1], min[0], min[1], ...] 
+ MathT min_pattern[hn::MaxLanes(d)]; + MathT max_pattern[hn::MaxLanes(d)]; + for (size_t i = 0; i < lanes; ++i) { + int ch = static_cast(i % nchannels); + min_pattern[i] = static_cast(min_vals[roi.chbegin + ch]); + max_pattern[i] = static_cast(max_vals[roi.chbegin + ch]); + } + auto v_min = hn::Load(d, min_pattern); + auto v_max = hn::Load(d, max_pattern); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Dtype* d_row = RoiRowPtr(Dv, y, roi); + const Stype* s_row = RoiRowPtr(Sv, y, roi); + + if (contig) { + size_t total = static_cast(roi.width()) * nchannels; + size_t x = 0; + + // Process full vectors (pattern wraps correctly even when lanes % nchannels != 0) + if (nchannels > 0) { + for (; x + lanes <= total; x += lanes) { + auto va = LoadPromote(d, s_row + x); + auto res = hn::Clamp(va, v_min, v_max); + DemoteStore(d, d_row + x, res); + } + } + + // Handle remaining values with partial vector load/store + if (x < total) { + size_t remaining = total - x; + auto va = LoadPromoteN(d, s_row + x, remaining); + auto res = hn::Clamp(va, v_min, v_max); + DemoteStoreN(d, d_row + x, res, remaining); + } + } else { + // Non-contiguous: scalar fallback per pixel + for (int x = roi.xbegin; x < roi.xend; ++x) { + Dtype* d_ptr = reinterpret_cast( + PixelBase(Dv, x, y)); + const Stype* s_ptr = reinterpret_cast( + PixelBase(Sv, x, y)); + for (int c = roi.chbegin; c < roi.chend; ++c) { + d_ptr[c] = static_cast( + OIIO::clamp(static_cast(s_ptr[c]), + min_vals[c], max_vals[c])); + } + } + } + } + + // Handle clampalpha01 separately (clamp alpha to [0,1]) + int a = src.spec().alpha_channel; + if (clampalpha01 && a >= roi.chbegin && a < roi.chend) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Dtype* d_ptr = reinterpret_cast( + PixelBase(Dv, x, y)); + d_ptr[a] = static_cast( + OIIO::clamp(static_cast(d_ptr[a]), 0.0f, + 1.0f)); + } + } + } + }); + return true; +} + + +template +static bool +clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, + bool clampalpha01, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels()) + return clamp_hwy(dst, src, min, max, clampalpha01, roi, nthreads); + return clamp_scalar(dst, src, min, max, clampalpha01, roi, nthreads); +} + + bool ImageBufAlgo::clamp(ImageBuf& dst, const ImageBuf& src, cspan min, @@ -277,8 +677,8 @@ ImageBufAlgo::clamp(const ImageBuf& src, cspan min, cspan max, template static bool -absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +absdiff_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -294,8 +694,8 @@ absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, - int nthreads) +absdiff_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -309,6 +709,170 @@ absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, +// Native integer absdiff (scale-invariant, no float conversion) +template +static bool +absdiff_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + 
const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer absdiff - much faster than float conversion! + // AbsDiff(a,b) = |a - b| = max(a,b) - min(a,b) + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt( + r_row, a_row, b_row, n, [](auto d, auto a, auto b) { + if constexpr (std::is_unsigned_v) { + return hn::Sub(hn::Max(a, b), hn::Min(a, b)); + } else { + return hn::Abs(hn::SaturatedSub(a, b)); + } + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (a_ptr[c] > b_ptr[c]) + ? (a_ptr[c] - b_ptr[c]) + : (b_ptr[c] - a_ptr[c]); + } else { + int64_t diff = (int64_t)a_ptr[c] + - (int64_t)b_ptr[c]; + r_ptr[c] = (T)std::abs(diff); + } + } + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Abs( + hn::Sub(a, b)); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::abs(static_cast(a_ptr[c]) + - static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::abs((SimdType)a_ptr[c] + - (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() 
+ && B.localpixels()) { + // Use native integer path for scale-invariant absdiff when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return absdiff_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return absdiff_impl_hwy(R, A, B, roi, nthreads); + } + return absdiff_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return absdiff_impl_hwy(R, A, b, roi, nthreads); + return absdiff_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::absdiff(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -396,7 +960,8 @@ ImageBufAlgo::abs(const ImageBuf& A, ROI roi, int nthreads) template static bool -pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +pow_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator a(A, roi); @@ -408,6 +973,100 @@ pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) } + +template +static bool +pow_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + bool scalar_pow = (b.size() == 1); + float p_val = b[0]; + + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + + if (contig && scalar_pow) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyUnaryCmd( + r_row, a_row, n, [p_val](auto d, auto va) { + auto vpow = hn::Set(d, static_cast(p_val)); + return hn::Exp(d, hn::Mul(vpow, hn::Log(d, va))); + }); + } else { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v ? 1.0f / 65535.0f + : std::is_same_v ? 1.0f / 32767.0f + : std::is_same_v ? 1.0f / 255.0f + : std::is_same_v ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 
127.0f + : 255.0f) + : 1.0f; + + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + using SimdType + = std::conditional_t, + double, float>; + SimdType normalized = static_cast(a_ptr[c]) + * norm_factor; + SimdType result + = pow(normalized, + static_cast(b[roi.chbegin + c])); + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[c] = static_cast(result * denorm_factor + + 0.5f); + } else { + r_ptr[c] = static_cast(result + * denorm_factor); + } + } + } + } + } + }); + return true; +} + +template +static bool +pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return pow_impl_hwy(R, A, b, roi, nthreads); + return pow_impl_scalar(R, A, b, roi, nthreads); +} + + bool ImageBufAlgo::pow(ImageBuf& dst, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -604,11 +1263,179 @@ rangeexpand(float y) +template +static bool +rangecompress_hwy(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Luma weights + constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f; + + // Check if luma mode is viable + bool can_use_luma + = useluma && roi.nchannels() >= 3 + && !(alpha_channel >= roi.chbegin && alpha_channel < roi.chbegin + 3) + && !(z_channel >= roi.chbegin && z_channel < roi.chbegin + 3); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && !can_use_luma && alpha_channel < 0 && z_channel < 0) { + // Per-channel mode with no alpha/z to skip: process all channels + size_t n = static_cast(roi.width()) * nchannels; + RunHwyUnaryCmd( + reinterpret_cast(r_row), + reinterpret_cast(a_row), n, + [](auto d, auto va) { return rangecompress_simd(d, va); }); + } else if (contig && can_use_luma && nchannels >= 3) { + // Luma mode: process RGB with luma-based scaling + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load RGB for N pixels + auto r_vec = LoadPromote(d, a_ptr + x * nchannels + 0); + auto g_vec = LoadPromote(d, a_ptr + x * nchannels + 1); + auto b_vec = LoadPromote(d, a_ptr + x * nchannels + 2); + + // Compute luma: 0.21264*R + 0.71517*G + 0.07219*B + auto luma = hn::MulAdd( + 
hn::Set(d, static_cast(wr)), r_vec, + hn::MulAdd(hn::Set(d, static_cast(wg)), g_vec, + hn::Mul(hn::Set(d, static_cast(wb)), + b_vec))); + + // Compress luma + auto compressed_luma = rangecompress_simd(d, luma); + + // Compute scale = compressed_luma / luma (avoid div by zero) + auto zero = hn::Set(d, static_cast(0.0)); + auto is_zero = hn::Eq(luma, zero); + auto safe_luma = hn::IfThenElse( + is_zero, hn::Set(d, static_cast(1.0)), luma); + auto scale = hn::Div(compressed_luma, safe_luma); + scale = hn::IfThenElse(is_zero, zero, scale); + + // Apply scale to RGB + r_vec = hn::Mul(r_vec, scale); + g_vec = hn::Mul(g_vec, scale); + b_vec = hn::Mul(b_vec, scale); + + // Store RGB + DemoteStore(d, r_ptr + x * nchannels + 0, r_vec); + DemoteStore(d, r_ptr + x * nchannels + 1, g_vec); + DemoteStore(d, r_ptr + x * nchannels + 2, b_vec); + + // Copy remaining channels (alpha, etc.) - scalar + for (size_t i = 0; + i < N && x + static_cast(i) < roi.width(); ++i) { + for (int c = 3; c < nchannels; ++c) { + r_ptr[(x + static_cast(i)) * nchannels + c] + = a_ptr[(x + static_cast(i)) * nchannels + + c]; + } + } + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + float r = static_cast(a_ptr[x * nchannels + 0]); + float g = static_cast(a_ptr[x * nchannels + 1]); + float b = static_cast(a_ptr[x * nchannels + 2]); + float luma = wr * r + wg * g + wb * b; + float scale = luma > 0.0f ? rangecompress(luma) / luma + : 0.0f; + r_ptr[x * nchannels + 0] = static_cast(r * scale); + r_ptr[x * nchannels + 1] = static_cast(g * scale); + r_ptr[x * nchannels + 2] = static_cast(b * scale); + for (int c = 3; c < nchannels; ++c) { + r_ptr[x * nchannels + c] = a_ptr[x * nchannels + c]; + } + } + } else { + // Fallback: scalar per-pixel processing with channel skipping + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + if (can_use_luma) { + float r_val = static_cast(a_pixel[0]); + float g_val = static_cast(a_pixel[1]); + float b_val = static_cast(a_pixel[2]); + float luma = wr * r_val + wg * g_val + wb * b_val; + float scale = luma > 0.0f ? 
rangecompress(luma) / luma + : 0.0f; + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * scale); + } + } else { + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast(rangecompress( + static_cast(a_pixel[c]))); + } + } + } + } + } + }); + return true; +} + + + template static bool rangecompress_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return rangecompress_hwy(R, A, useluma, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& Aspec(A.spec()); int alpha_channel = Aspec.alpha_channel; @@ -672,11 +1499,178 @@ rangecompress_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, +template +static bool +rangeexpand_hwy(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Luma weights + constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f; + + // Check if luma mode is viable + bool can_use_luma + = useluma && roi.nchannels() >= 3 + && !(alpha_channel >= roi.chbegin && alpha_channel < roi.chbegin + 3) + && !(z_channel >= roi.chbegin && z_channel < roi.chbegin + 3); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && !can_use_luma && alpha_channel < 0 && z_channel < 0) { + // Per-channel mode with no alpha/z to skip: process all channels + size_t n = static_cast(roi.width()) * nchannels; + RunHwyUnaryCmd( + reinterpret_cast(r_row), + reinterpret_cast(a_row), n, + [](auto d, auto va) { return rangeexpand_simd(d, va); }); + } else if (contig && can_use_luma && nchannels >= 3) { + // Luma mode: process RGB with luma-based scaling + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load RGB for N pixels + auto r_vec = LoadPromote(d, a_ptr + x * nchannels + 0); + auto g_vec = LoadPromote(d, a_ptr + x * nchannels + 1); + auto b_vec = LoadPromote(d, a_ptr + x * nchannels + 2); + + // Compute luma: 
0.21264*R + 0.71517*G + 0.07219*B + auto luma = hn::MulAdd( + hn::Set(d, static_cast(wr)), r_vec, + hn::MulAdd(hn::Set(d, static_cast(wg)), g_vec, + hn::Mul(hn::Set(d, static_cast(wb)), + b_vec))); + + // Expand luma + auto expanded_luma = rangeexpand_simd(d, luma); + + // Compute scale = expanded_luma / luma (avoid div by zero) + auto zero = hn::Set(d, static_cast(0.0)); + auto is_zero = hn::Eq(luma, zero); + auto safe_luma = hn::IfThenElse( + is_zero, hn::Set(d, static_cast(1.0)), luma); + auto scale = hn::Div(expanded_luma, safe_luma); + scale = hn::IfThenElse(is_zero, zero, scale); + + // Apply scale to RGB + r_vec = hn::Mul(r_vec, scale); + g_vec = hn::Mul(g_vec, scale); + b_vec = hn::Mul(b_vec, scale); + + // Store RGB + DemoteStore(d, r_ptr + x * nchannels + 0, r_vec); + DemoteStore(d, r_ptr + x * nchannels + 1, g_vec); + DemoteStore(d, r_ptr + x * nchannels + 2, b_vec); + + // Copy remaining channels (alpha, etc.) - scalar + for (size_t i = 0; + i < N && x + static_cast(i) < roi.width(); ++i) { + for (int c = 3; c < nchannels; ++c) { + r_ptr[(x + static_cast(i)) * nchannels + c] + = a_ptr[(x + static_cast(i)) * nchannels + + c]; + } + } + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + float r = static_cast(a_ptr[x * nchannels + 0]); + float g = static_cast(a_ptr[x * nchannels + 1]); + float b = static_cast(a_ptr[x * nchannels + 2]); + float luma = wr * r + wg * g + wb * b; + float scale = luma > 0.0f ? rangeexpand(luma) / luma : 0.0f; + r_ptr[x * nchannels + 0] = static_cast(r * scale); + r_ptr[x * nchannels + 1] = static_cast(g * scale); + r_ptr[x * nchannels + 2] = static_cast(b * scale); + for (int c = 3; c < nchannels; ++c) { + r_ptr[x * nchannels + c] = a_ptr[x * nchannels + c]; + } + } + } else { + // Fallback: scalar per-pixel processing with channel skipping + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + if (can_use_luma) { + float r_val = static_cast(a_pixel[0]); + float g_val = static_cast(a_pixel[1]); + float b_val = static_cast(a_pixel[2]); + float luma = wr * r_val + wg * g_val + wb * b_val; + float scale = luma > 0.0f ? 
rangeexpand(luma) / luma + : 0.0f; + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * scale); + } + } else { + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast(rangeexpand( + static_cast(a_pixel[c]))); + } + } + } + } + } + }); + return true; +} + + + template static bool rangeexpand_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return rangeexpand_hwy(R, A, useluma, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& Aspec(A.spec()); int alpha_channel = Aspec.alpha_channel; @@ -800,6 +1794,12 @@ template static bool unpremult_(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return unpremult_hwy(R, A, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { int alpha_channel = A.spec().alpha_channel; int z_channel = A.spec().z_channel; @@ -873,11 +1873,389 @@ ImageBufAlgo::unpremult(const ImageBuf& src, ROI roi, int nthreads) +template +static bool +premult_hwy(ImageBuf& R, const ImageBuf& A, bool preserve_alpha0, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Check if we can use the RGBA fast path + bool can_use_rgba_simd = (nchannels == 4 && alpha_channel == 3 + && z_channel < 0 && roi.chbegin == 0); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && can_use_rgba_simd) { + // RGBA fast path: interleaved load/store + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load N RGBA pixels + auto [r_vec, g_vec, b_vec, a_vec] + = LoadInterleaved4Promote( + d, a_ptr + x * 4); + + // Premultiply: RGB *= A + if (preserve_alpha0) { + auto zero = hn::Set(d, static_cast(0.0)); + auto one = hn::Set(d, 
static_cast(1.0)); + auto is_zero = hn::Eq(a_vec, zero); + auto is_one = hn::Eq(a_vec, one); + auto skip_mask = hn::Or(is_zero, is_one); + + r_vec = hn::IfThenElse(skip_mask, r_vec, + hn::Mul(r_vec, a_vec)); + g_vec = hn::IfThenElse(skip_mask, g_vec, + hn::Mul(g_vec, a_vec)); + b_vec = hn::IfThenElse(skip_mask, b_vec, + hn::Mul(b_vec, a_vec)); + } else { + auto one = hn::Set(d, static_cast(1.0)); + auto is_one = hn::Eq(a_vec, one); + + r_vec = hn::IfThenElse(is_one, r_vec, + hn::Mul(r_vec, a_vec)); + g_vec = hn::IfThenElse(is_one, g_vec, + hn::Mul(g_vec, a_vec)); + b_vec = hn::IfThenElse(is_one, b_vec, + hn::Mul(b_vec, a_vec)); + } + // a_vec unchanged + + // Store N RGBA pixels + StoreInterleaved4Demote(d, + r_ptr + x * 4, + r_vec, g_vec, + b_vec, a_vec); + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v + ? 1.0f / 65535.0f + : std::is_same_v + ? 1.0f / 32767.0f + : std::is_same_v + ? 1.0f / 255.0f + : std::is_same_v + ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v + ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 127.0f + : 255.0f) + : 1.0f; + + float alpha = static_cast(a_ptr[x * 4 + 3]) + * norm_factor; + if ((preserve_alpha0 && alpha == 0.0f) || alpha == 1.0f) { + if (&R != &A) { + r_ptr[x * 4 + 0] = a_ptr[x * 4 + 0]; + r_ptr[x * 4 + 1] = a_ptr[x * 4 + 1]; + r_ptr[x * 4 + 2] = a_ptr[x * 4 + 2]; + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + continue; + } + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[x * 4 + 0] = static_cast( + static_cast(a_ptr[x * 4 + 0]) * norm_factor + * alpha * denorm_factor + + 0.5f); + r_ptr[x * 4 + 1] = static_cast( + static_cast(a_ptr[x * 4 + 1]) * norm_factor + * alpha * denorm_factor + + 0.5f); + r_ptr[x * 4 + 2] = static_cast( + static_cast(a_ptr[x * 4 + 2]) * norm_factor + * alpha * denorm_factor + + 0.5f); + } else { + r_ptr[x * 4 + 0] = static_cast( + static_cast(a_ptr[x * 4 + 0]) * norm_factor + * alpha * denorm_factor); + r_ptr[x * 4 + 1] = static_cast( + static_cast(a_ptr[x * 4 + 1]) * norm_factor + * alpha * denorm_factor); + r_ptr[x * 4 + 2] = static_cast( + static_cast(a_ptr[x * 4 + 2]) * norm_factor + * alpha * denorm_factor); + } + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + } else { + // Fallback to scalar per-pixel processing + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + float alpha = static_cast(a_pixel[alpha_channel]); + bool skip = (alpha == 1.0f) + || (preserve_alpha0 && alpha == 0.0f); + + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) { + r_pixel[c] = a_pixel[c]; + } else if (skip) { + r_pixel[c] = a_pixel[c]; + } else { + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * alpha); + } + } + } + } + } + }); + return true; +} + + + +template +static bool +unpremult_hwy(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int 
alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Check if we can use the RGBA fast path + bool can_use_rgba_simd = (nchannels == 4 && alpha_channel == 3 + && z_channel < 0 && roi.chbegin == 0); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && can_use_rgba_simd) { + // RGBA fast path: interleaved load/store + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load N RGBA pixels + auto [r_vec, g_vec, b_vec, a_vec] + = LoadInterleaved4Promote( + d, a_ptr + x * 4); + + // Unpremultiply: RGB /= A (with div-by-zero protection) + auto zero = hn::Set(d, static_cast(0.0)); + auto one = hn::Set(d, static_cast(1.0)); + auto is_zero = hn::Eq(a_vec, zero); + auto is_one = hn::Eq(a_vec, one); + auto skip_mask = hn::Or(is_zero, is_one); + + // Avoid division by zero + auto safe_a = hn::IfThenElse(is_zero, one, a_vec); + + r_vec = hn::IfThenElse(skip_mask, r_vec, + hn::Div(r_vec, safe_a)); + g_vec = hn::IfThenElse(skip_mask, g_vec, + hn::Div(g_vec, safe_a)); + b_vec = hn::IfThenElse(skip_mask, b_vec, + hn::Div(b_vec, safe_a)); + // a_vec unchanged + + // Store N RGBA pixels + StoreInterleaved4Demote(d, + r_ptr + x * 4, + r_vec, g_vec, + b_vec, a_vec); + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v + ? 1.0f / 65535.0f + : std::is_same_v + ? 1.0f / 32767.0f + : std::is_same_v + ? 1.0f / 255.0f + : std::is_same_v + ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v + ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 
127.0f + : 255.0f) + : 1.0f; + + float alpha = static_cast(a_ptr[x * 4 + 3]) + * norm_factor; + if (alpha == 0.0f || alpha == 1.0f) { + if (&R != &A) { + r_ptr[x * 4 + 0] = a_ptr[x * 4 + 0]; + r_ptr[x * 4 + 1] = a_ptr[x * 4 + 1]; + r_ptr[x * 4 + 2] = a_ptr[x * 4 + 2]; + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + continue; + } + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[x * 4 + 0] = static_cast( + (static_cast(a_ptr[x * 4 + 0]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + r_ptr[x * 4 + 1] = static_cast( + (static_cast(a_ptr[x * 4 + 1]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + r_ptr[x * 4 + 2] = static_cast( + (static_cast(a_ptr[x * 4 + 2]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + } else { + r_ptr[x * 4 + 0] = static_cast( + (static_cast(a_ptr[x * 4 + 0]) * norm_factor + / alpha) + * denorm_factor); + r_ptr[x * 4 + 1] = static_cast( + (static_cast(a_ptr[x * 4 + 1]) * norm_factor + / alpha) + * denorm_factor); + r_ptr[x * 4 + 2] = static_cast( + (static_cast(a_ptr[x * 4 + 2]) * norm_factor + / alpha) + * denorm_factor); + } + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + } else { + // Fallback to scalar per-pixel processing + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + float alpha = static_cast(a_pixel[alpha_channel]); + + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) { + r_pixel[c] = a_pixel[c]; + } else if (alpha == 0.0f || alpha == 1.0f) { + r_pixel[c] = a_pixel[c]; + } else { + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) / alpha); + } + } + } + } + } + }); + return true; +} + + + template static bool premult_(ImageBuf& R, const ImageBuf& A, bool preserve_alpha0, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return premult_hwy(R, A, preserve_alpha0, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { int alpha_channel = A.spec().alpha_channel; int z_channel = A.spec().z_channel; @@ -997,6 +2375,113 @@ allspan(cspan s, const T& v) +// Highway SIMD implementation for contrast_remap (linear stretch only, no sigmoid) +template +static bool +contrast_remap_hwy(ImageBuf& dst, const ImageBuf& src, cspan black, + cspan white, cspan min, cspan max, + bool do_minmax, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Dv = HwyPixels(dst); + auto Sv = HwyPixels(src); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Dv, nchannels) + && ChannelsContiguous(Sv, nchannels); + + const hn::ScalableTag d; + size_t lanes = hn::Lanes(d); + + // Pre-compute per-channel constants for linear stretch + // Formula: (x - black) * scale = x * scale + (-black * scale) + // This allows using FMA (MulAdd) instead of Sub + Mul + MathT scale_pattern[hn::MaxLanes(d)]; // 1/(white-black) + MathT offset_pattern[hn::MaxLanes(d)]; // -black * scale + MathT min_pattern[hn::MaxLanes(d)]; + MathT max_pattern[hn::MaxLanes(d)]; + + for (size_t i = 0; i < lanes; ++i) { + int ch = static_cast(i % nchannels); + MathT black_val = static_cast(black[roi.chbegin + ch]); + MathT white_val = 
static_cast(white[roi.chbegin + ch]); + MathT scale = static_cast(1.0) / (white_val - black_val); + scale_pattern[i] = scale; + offset_pattern[i] = -black_val + * scale; // Precompute offset for FMA + min_pattern[i] = static_cast(min[roi.chbegin + ch]); + max_pattern[i] = static_cast(max[roi.chbegin + ch]); + } + auto v_scale = hn::Load(d, scale_pattern); + auto v_offset = hn::Load(d, offset_pattern); + auto v_min = hn::Load(d, min_pattern); + auto v_max = hn::Load(d, max_pattern); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + D* d_row = RoiRowPtr(Dv, y, roi); + const S* s_row = RoiRowPtr(Sv, y, roi); + + if (contig && nchannels > 0) { + size_t total = static_cast(roi.width()) * nchannels; + size_t x = 0; + // SIMD loop - pattern wraps correctly even when lanes % nchannels != 0 + for (; x + lanes <= total; x += lanes) { + auto va = LoadPromote(d, s_row + x); + // Linear stretch using FMA: x * scale + offset + // where offset = -black * scale + auto stretched = hn::MulAdd(va, v_scale, v_offset); + // Optional remap to [min, max]: min + stretched * (max - min) + auto res = do_minmax + ? hn::MulAdd(stretched, + hn::Sub(v_max, v_min), v_min) + : stretched; + DemoteStore(d, d_row + x, res); + } + // Scalar tail for remaining pixels + for (; x < total; ++x) { + int ch = static_cast(x % nchannels); + float val = static_cast(s_row[x]); + float black_val = black[roi.chbegin + ch]; + float white_val = white[roi.chbegin + ch]; + float scale = 1.0f / (white_val - black_val); + float offset = -black_val * scale; + float result = val * scale + offset; + if (do_minmax) { + float min_val = min[roi.chbegin + ch]; + float max_val = max[roi.chbegin + ch]; + result = result * (max_val - min_val) + min_val; + } + d_row[x] = static_cast(result); + } + } else { + // Non-contiguous fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + D* d_ptr = reinterpret_cast(PixelBase(Dv, x, y)); + const S* s_ptr = reinterpret_cast( + PixelBase(Sv, x, y)); + for (int c = roi.chbegin; c < roi.chend; ++c) { + float val = static_cast(s_ptr[c]); + float black_val = black[c]; + float white_val = white[c]; + float scale = 1.0f / (white_val - black_val); + float offset = -black_val * scale; + float result = val * scale + offset; // FMA + if (do_minmax) { + float min_val = min[c]; + float max_val = max[c]; + result = result * (max_val - min_val) + min_val; + } + d_ptr[c] = static_cast(result); + } + } + } + } + }); + return true; +} + + template static bool contrast_remap_(ImageBuf& dst, const ImageBuf& src, cspan black, @@ -1011,6 +2496,14 @@ contrast_remap_(ImageBuf& dst, const ImageBuf& src, cspan black, bool use_sigmoid = !allspan(scontrast, 1.0f); bool do_minmax = !(allspan(min, 0.0f) && allspan(max, 1.0f)); + // Use Highway SIMD for simple linear stretch (no sigmoid) + if (OIIO::pvt::enable_hwy && !use_sigmoid && !same_black_white + && dst.localpixels() && src.localpixels()) { + return contrast_remap_hwy(dst, src, black, white, min, max, + do_minmax, roi, nthreads); + } + + // Scalar fallback for complex cases (sigmoid, thresholding, etc.) 
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { if (same_black_white) { // Special case -- black & white are the same value, which is diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp index 0abbb1ace8..2863d1ae90 100644 --- a/src/libOpenImageIO/imagebufalgo_xform.cpp +++ b/src/libOpenImageIO/imagebufalgo_xform.cpp @@ -21,6 +21,8 @@ #include +#include "imagebufalgo_hwy_pvt.h" + OIIO_NAMESPACE_3_1_BEGIN @@ -932,11 +934,7 @@ ImageBufAlgo::fit(ImageBuf& dst, const ImageBuf& src, KWArgs options, ROI roi, OIIO::pvt::LoggedTimer logtime("IBA::fit"); static const ustring recognized[] = { - filtername_us, - filterwidth_us, - filterptr_us, - fillmode_us, - exact_us, + filtername_us, filterwidth_us, filterptr_us, fillmode_us, exact_us, #if 0 /* Not currently recognized */ wrap_us, edgeclamp_us, @@ -1072,15 +1070,39 @@ ImageBufAlgo::fit(const ImageBuf& src, KWArgs options, ROI roi, int nthreads) template static bool -resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, - int nthreads) +resample_scalar(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) { + // This operates just like the internals of ImageBuf::interppixel(), but + // reuses the provided iterator to avoid the overhead of constructing a new + // one each time. This speeds it up by 20x! The iterator `it` must already + // be associated with `img`, but it need not be positioned correctly. + auto interppixel = + [](const ImageBuf& img, ImageBuf::ConstIterator& it, float x, + float y, span pixel, ImageBuf::WrapMode wrap) -> bool { + int n = std::min(int(pixel.size()), img.spec().nchannels); + float* localpixel = OIIO_ALLOCA(float, n * 4); + float* p[4] = { localpixel, localpixel + n, localpixel + 2 * n, + localpixel + 3 * n }; + x -= 0.5f; + y -= 0.5f; + int xtexel, ytexel; + float xfrac, yfrac; + xfrac = floorfrac(x, &xtexel); + yfrac = floorfrac(y, &ytexel); + it.rerange(xtexel, xtexel + 2, ytexel, ytexel + 2, 0, 1, wrap); + for (int i = 0; i < 4; ++i, ++it) + for (int c = 0; c < n; ++c) + p[i][c] = it[c]; //NOSONAR + bilerp(p[0], p[1], p[2], p[3], xfrac, yfrac, n, pixel.data()); + return true; + }; + OIIO_ASSERT(src.deep() == dst.deep()); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& srcspec(src.spec()); const ImageSpec& dstspec(dst.spec()); int nchannels = src.nchannels(); - bool deep = src.deep(); // Local copies of the source image window, converted to float float srcfx = srcspec.full_x; @@ -1109,25 +1131,10 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, float s = (x - dstfx + 0.5f) * dstpixelwidth; float src_xf = srcfx + s * srcfw; int src_x = ifloor(src_xf); - if (deep) { - srcpel.pos(src_x, src_y, 0); - int nsamps = srcpel.deep_samples(); - OIIO_DASSERT(nsamps == out.deep_samples()); - if (!nsamps || nsamps != out.deep_samples()) - continue; - for (int c = 0; c < nchannels; ++c) { - if (dstspec.channelformat(c) == TypeDesc::UINT32) - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value( - c, samp, srcpel.deep_value_uint(c, samp)); - else - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value(c, samp, - srcpel.deep_value(c, samp)); - } - } else if (interpolate) { + if (interpolate) { // Non-deep image, bilinearly interpolate - src.interppixel(src_xf, src_yf, pel, ImageBuf::WrapClamp); + interppixel(src, srcpel, src_xf, src_yf, pel, + ImageBuf::WrapClamp); for (int c = roi.chbegin; c < roi.chend; ++c) out[c] = pel[c]; } else { @@ -1142,6 +1149,261 @@ 
resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, return true; } +static bool +resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + // If it's deep, figure out the sample allocations first, because + // it's not thread-safe to do that simultaneously with copying the + // values. + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + float dstpixelwidth = 1.0f / dstspec.full_width; + float dstpixelheight = 1.0f / dstspec.full_height; + ImageBuf::ConstIterator srcpel(src, roi); + ImageBuf::Iterator dstpel(dst, roi); + for (; !dstpel.done(); ++dstpel, ++srcpel) { + float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; + float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; + int src_y = ifloor(srcfy + t * srcfh); + int src_x = ifloor(srcfx + s * srcfw); + srcpel.pos(src_x, src_y, 0); + dstpel.set_deep_samples(srcpel.deep_samples()); + } + + OIIO_ASSERT(src.deep() == dst.deep()); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + int nchannels = src.nchannels(); + + // Local copies of the source image window, converted to float + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + ImageBuf::Iterator out(dst, roi); + ImageBuf::ConstIterator srcpel(src); + for (int y = roi.ybegin; y < roi.yend; ++y) { + // s,t are NDC space + float t = (y - dstfy + 0.5f) * dstpixelheight; + // src_xf, src_xf are image space float coordinates + float src_yf = srcfy + t * srcfh; + // src_x, src_y are image space integer coordinates of the floor + int src_y = ifloor(src_yf); + for (int x = roi.xbegin; x < roi.xend; ++x, ++out) { + float s = (x - dstfx + 0.5f) * dstpixelwidth; + float src_xf = srcfx + s * srcfw; + int src_x = ifloor(src_xf); + srcpel.pos(src_x, src_y, 0); + int nsamps = srcpel.deep_samples(); + OIIO_DASSERT(nsamps == out.deep_samples()); + if (!nsamps || nsamps != out.deep_samples()) + continue; + for (int c = 0; c < nchannels; ++c) { + if (dstspec.channelformat(c) == TypeDesc::UINT32) + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value_uint(c, samp)); + else + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value(c, samp)); + } + } + } + }); + + return true; +} + + + +template +static bool +resample_hwy(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + using D = hn::ScalableTag; + using Rebind = hn::Rebind; + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + + // Local copies of the source image window, converted to SimdType + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float 
dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + const size_t src_scanline_bytes = srcspec.scanline_bytes(); + const size_t dst_scanline_bytes = dstspec.scanline_bytes(); + const size_t src_pixel_bytes = srcspec.pixel_bytes(); + const size_t dst_pixel_bytes = dstspec.pixel_bytes(); + + const uint8_t* src_base = (const uint8_t*)src.localpixels(); + uint8_t* dst_base = (uint8_t*)dst.localpixels(); + + D d; + Rebind d_i32; + int N = hn::Lanes(d); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + float t = (y - dstfy + 0.5f) * dstpixelheight; + float src_yf = srcfy + t * srcfh; + // Pixel-center convention: subtract 0.5 before interpolation + src_yf -= 0.5f; + int src_y = ifloor(src_yf); + SimdType fy = (SimdType)(src_yf - src_y); + + // Clamp Y to valid range + int src_y_clamped = clamp(src_y, src.ybegin(), src.yend() - 1); + // Neighbor Y (for bilinear) + int src_y_next_clamped = clamp(src_y + 1, src.ybegin(), + src.yend() - 1); + + // Pre-calculate row pointers + const uint8_t* row0 = src_base + + (src_y_clamped - src.ybegin()) + * src_scanline_bytes; + const uint8_t* row1 = src_base + + (src_y_next_clamped - src.ybegin()) + * src_scanline_bytes; + + uint8_t* dst_row = dst_base + + (y - dst.ybegin()) * dst_scanline_bytes; + + for (int x = roi.xbegin; x < roi.xend; x += N) { + // Handle remaining pixels if less than N + int n = std::min(N, roi.xend - x); + + // Compute src_xf for N pixels + auto idx_i32 = hn::Iota(d_i32, (float)x); + + auto x_simd = hn::ConvertTo(d, idx_i32); + auto s = hn::Mul(hn::Sub(hn::Add(x_simd, + hn::Set(d, (SimdType)0.5f)), + hn::Set(d, (SimdType)dstfx)), + hn::Set(d, (SimdType)dstpixelwidth)); + auto src_xf_vec = hn::MulAdd(s, hn::Set(d, (SimdType)srcfw), + hn::Set(d, (SimdType)srcfx)); + // Pixel-center convention: subtract 0.5 before interpolation + src_xf_vec = hn::Sub(src_xf_vec, hn::Set(d, (SimdType)0.5f)); + + auto src_x_vec = hn::Floor(src_xf_vec); + auto fx = hn::Sub(src_xf_vec, src_x_vec); + auto ix = hn::ConvertTo(d_i32, src_x_vec); + + // Clamp X + auto min_x = hn::Set(d_i32, src.xbegin()); + auto max_x = hn::Set(d_i32, src.xend() - 1); + auto ix0 = hn::Min(hn::Max(ix, min_x), max_x); + auto ix1 + = hn::Min(hn::Max(hn::Add(ix, hn::Set(d_i32, 1)), min_x), + max_x); + + // Adjust to 0-based offset from buffer start + auto x_offset = hn::Sub(ix0, min_x); + auto x1_offset = hn::Sub(ix1, min_x); + + // Loop over channels + for (int c = roi.chbegin; c < roi.chend; ++c) { + // Manual gather loop for now to be safe with types and offsets + SimdType v00_arr[16], v01_arr[16], v10_arr[16], v11_arr[16]; + int32_t x0_arr[16], x1_arr[16]; + hn::Store(x_offset, d_i32, x0_arr); + hn::Store(x1_offset, d_i32, x1_arr); + + for (int i = 0; i < n; ++i) { + size_t off0 = (size_t)x0_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + size_t off1 = (size_t)x1_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + + auto load_val = [](const uint8_t* ptr) -> SimdType { + return (SimdType)(*(const SRCTYPE*)ptr); + }; + + v00_arr[i] = load_val(row0 + off0); + v01_arr[i] = load_val(row0 + off1); + v10_arr[i] = load_val(row1 + off0); + v11_arr[i] = load_val(row1 + off1); + } + + auto val00 = hn::Load(d, v00_arr); + auto val01 = hn::Load(d, v01_arr); + auto val10 = hn::Load(d, v10_arr); + auto val11 = hn::Load(d, v11_arr); + + // Bilinear Interpolation + auto one = hn::Set(d, (SimdType)1.0f); + auto w00 = hn::Mul(hn::Sub(one, fx), + hn::Sub(one, hn::Set(d, fy))); + auto w01 = hn::Mul(fx, hn::Sub(one, hn::Set(d, fy))); + auto w10 = 
hn::Mul(hn::Sub(one, fx), hn::Set(d, fy)); + auto w11 = hn::Mul(fx, hn::Set(d, fy)); + + // Use FMA (Fused Multiply-Add) for better performance + auto res = hn::Mul(val00, w00); + res = hn::MulAdd(val01, w01, + res); // res = res + val01 * w01 + res = hn::MulAdd(val10, w10, + res); // res = res + val10 * w10 + res = hn::MulAdd(val11, w11, + res); // res = res + val11 * w11 + + // Store + SimdType res_arr[16]; + hn::Store(res, d, res_arr); + for (int i = 0; i < n; ++i) { + DSTTYPE* dptr + = (DSTTYPE*)(dst_row + + (size_t)(x - roi.xbegin + i) + * dst_pixel_bytes + + (size_t)c * sizeof(DSTTYPE)); + *dptr = (DSTTYPE)res_arr[i]; + } + } + } + } + }); + return true; +} + +template +static bool +resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels()) + return resample_hwy(dst, src, interpolate, roi, + nthreads); + + return resample_scalar(dst, src, interpolate, roi, + nthreads); +} bool @@ -1155,27 +1417,7 @@ ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate, return false; if (dst.deep()) { - // If it's deep, figure out the sample allocations first, because - // it's not thread-safe to do that simultaneously with copying the - // values. - const ImageSpec& srcspec(src.spec()); - const ImageSpec& dstspec(dst.spec()); - float srcfx = srcspec.full_x; - float srcfy = srcspec.full_y; - float srcfw = srcspec.full_width; - float srcfh = srcspec.full_height; - float dstpixelwidth = 1.0f / dstspec.full_width; - float dstpixelheight = 1.0f / dstspec.full_height; - ImageBuf::ConstIterator srcpel(src, roi); - ImageBuf::Iterator dstpel(dst, roi); - for (; !dstpel.done(); ++dstpel, ++srcpel) { - float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; - float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; - int src_y = ifloor(srcfy + t * srcfh); - int src_x = ifloor(srcfx + s * srcfw); - srcpel.pos(src_x, src_y, 0); - dstpel.set_deep_samples(srcpel.deep_samples()); - } + return resample_deep(dst, src, interpolate, roi, nthreads); } bool ok; diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp index 909f8529d4..aa8babf9b4 100644 --- a/src/libOpenImageIO/imageio.cpp +++ b/src/libOpenImageIO/imageio.cpp @@ -53,6 +53,7 @@ int png_linear_premult(0); int tiff_half(0); int tiff_multithread(1); int dds_bc5normal(0); +int enable_hwy(1); // Enable Google Highway SIMD optimizations by default int limit_channels(1024); int limit_imagesize_MB(std::min(32 * 1024, int(Sysutil::physical_memory() >> 20))); @@ -406,6 +407,10 @@ attribute(string_view name, TypeDesc type, const void* val) dds_bc5normal = *(const int*)val; return true; } + if (name == "enable_hwy" && type == TypeInt) { + enable_hwy = *(const int*)val; + return true; + } if (name == "limits:channels" && type == TypeInt) { limit_channels = *(const int*)val; return true; @@ -612,6 +617,10 @@ getattribute(string_view name, TypeDesc type, void* val) *(int*)val = dds_bc5normal; return true; } + if (name == "enable_hwy" && type == TypeInt) { + *(int*)val = enable_hwy; + return true; + } if (name == "oiio:print_uncaught_errors" && type == TypeInt) { *(int*)val = oiio_print_uncaught_errors; return true;
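
The `enable_hwy` attribute added in `imageio.cpp` gates every Highway code path above at runtime (it defaults to 1). A minimal usage sketch; the wrapper function name is hypothetical, the attribute calls are the standard OIIO global attribute API:

    #include <OpenImageIO/imageio.h>

    // Force ImageBufAlgo onto the *_scalar implementations for this process.
    void use_scalar_paths_only()
    {
        OIIO::attribute("enable_hwy", 0);
        int hwy_on = 1;
        OIIO::getattribute("enable_hwy", hwy_on);  // now reads back 0
    }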
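
The `max` and `absdiff` dispatchers route same-typed integer images to the native-integer runners because both operations commute with the normalization scale: operating on the raw integers gives the same result as promoting to [0,1], operating in float, and demoting again. An illustrative exhaustive check for `uint8_t` (a sketch, not part of the patch):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    int main()
    {
        for (int a = 0; a <= 255; ++a)
            for (int b = 0; b <= 255; ++b) {
                float fa = a / 255.0f, fb = b / 255.0f;
                // max: native integer result == normalized-float result
                auto max_native = (uint8_t)std::max(a, b);
                auto max_float  = (uint8_t)std::lround(std::max(fa, fb) * 255.0f);
                assert(max_native == max_float);
                // absdiff: same scale-invariance property
                auto ad_native = (uint8_t)std::abs(a - b);
                auto ad_float  = (uint8_t)std::lround(std::fabs(fa - fb) * 255.0f);
                assert(ad_native == ad_float);
            }
        return 0;
    }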
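
The contiguous fast path in `pow_impl_hwy` evaluates x^p as exp(p * log(x)) via `hn::Exp` and `hn::Log`. A scalar sketch of the identity it relies on; it holds for x > 0, while zero and negative inputs follow whatever the log/exp routines return:

    #include <cmath>

    // pow(x, p) == exp(p * log(x)) for x > 0.
    inline float pow_via_explog(float x, float p)
    {
        return std::exp(p * std::log(x));
    }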
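
Both luma-mode kernels (`rangecompress_hwy` and `rangeexpand_hwy`) reduce to the same rescale: compute luma from R, G, B, derive a single scale factor from the compressed or expanded luma, and multiply the three channels by it, guarding against division by zero. A scalar sketch that mirrors the tail loops, with `curve` standing in for `rangecompress()`/`rangeexpand()`:

    // Rec.709-style weights, as used in the kernels above.
    inline void luma_rescale(float& r, float& g, float& b, float (*curve)(float))
    {
        constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f;
        float luma  = wr * r + wg * g + wb * b;
        float scale = (luma > 0.0f) ? curve(luma) / luma : 0.0f;  // avoid /0
        r *= scale;
        g *= scale;
        b *= scale;
    }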
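
For integer pixel types, the RGBA fast paths in `premult_hwy`/`unpremult_hwy` follow the normalized model end to end: promote to [0,1], multiply (or divide) RGB by alpha, then denormalize with round-to-nearest, passing pixels whose alpha is exactly 0 or 1 through unchanged. A scalar sketch of the unpremult case for `uint8_t`, assuming properly premultiplied input (each channel value <= alpha, so the result stays in range):

    #include <cstdint>

    inline uint8_t unpremult_u8(uint8_t value, uint8_t alpha8)
    {
        float a = alpha8 / 255.0f;
        if (a == 0.0f || a == 1.0f)
            return value;                         // matches the skip mask in the SIMD loop
        float v = value / 255.0f;
        return (uint8_t)(v / a * 255.0f + 0.5f);  // denormalize, round for integer types
    }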
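
`contrast_remap_hwy` refactors the linear stretch so the per-element work is a single fused multiply-add: (x - black) / (white - black) becomes x * scale + offset with scale = 1/(white - black) and offset = -black * scale, followed by the optional remap into [min, max]. A scalar sketch of the same algebra:

    // Equivalent scalar form of the vectorized linear stretch.
    inline float remap_linear(float x, float black, float white,
                              float out_lo, float out_hi, bool do_minmax)
    {
        float scale  = 1.0f / (white - black);
        float offset = -black * scale;
        float y = x * scale + offset;              // == (x - black) / (white - black)
        return do_minmax ? out_lo + y * (out_hi - out_lo) : y;
    }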