diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ed1589cfc..d576057003 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,6 +276,7 @@ if (SKBUILD) set (ENABLE_idiff OFF) set (ENABLE_igrep OFF) set (ENABLE_iinfo OFF) + set (ENABLE_hwy_test OFF) set (ENABLE_testtex OFF) set (ENABLE_iv OFF) endif () @@ -286,6 +287,7 @@ if (OIIO_BUILD_TOOLS AND NOT BUILD_OIIOUTIL_ONLY) add_subdirectory (src/idiff) add_subdirectory (src/igrep) add_subdirectory (src/iinfo) + add_subdirectory (src/hwy_test) add_subdirectory (src/maketx) add_subdirectory (src/oiiotool) add_subdirectory (src/testtex) diff --git a/docs/dev/Architecture.md b/docs/dev/Architecture.md index 5e52bf4143..72f81d8907 100644 --- a/docs/dev/Architecture.md +++ b/docs/dev/Architecture.md @@ -117,6 +117,10 @@ objects. These algorithms include simple operations like copying, resizing, and compositing images, as well as more complex operations like color conversions, resizing, filtering, etc. +Some performance-critical `ImageBufAlgo` implementations have SIMD-accelerated +paths using Google Highway. For implementation details and guidance for adding +new kernels, see `docs/dev/ImageBufAlgo_Highway.md`. + ## Image caching: TextureSystem and ImageCache There are situations where ImageBuf is still not the right abstraction, diff --git a/docs/dev/ImageBufAlgo_Highway.md b/docs/dev/ImageBufAlgo_Highway.md new file mode 100644 index 0000000000..960766978c --- /dev/null +++ b/docs/dev/ImageBufAlgo_Highway.md @@ -0,0 +1,264 @@ +ImageBufAlgo Highway (hwy) Implementation Guide +============================================== + +This document explains how OpenImageIO uses Google Highway (hwy) to accelerate +selected `ImageBufAlgo` operations, and how to add or modify kernels in a way +that preserves OIIO semantics while keeping the code maintainable. + +This is a developer-facing document about the implementation structure in +`src/libOpenImageIO/`. It does not describe the public API behavior of the +algorithms. + + +Goals and non-goals +------------------- + +Goals: +- Make the hwy-backed code paths easy to read and easy to extend. +- Centralize repetitive boilerplate (type conversion, tails, ROI pointer math). +- Preserve OIIO's numeric semantics (normalized integer model). +- Keep scalar fallbacks as the source of truth for tricky layout cases. + +Non-goals: +- Explain Highway itself. Refer to the upstream Highway documentation. +- Guarantee that every ImageBufAlgo op has a hwy implementation. + + +Where the code lives +-------------------- + +Core helpers: +- `src/libOpenImageIO/imagebufalgo_hwy_pvt.h` + +Typical hwy call sites: +- `src/libOpenImageIO/imagebufalgo_addsub.cpp` +- `src/libOpenImageIO/imagebufalgo_muldiv.cpp` +- `src/libOpenImageIO/imagebufalgo_mad.cpp` +- `src/libOpenImageIO/imagebufalgo_pixelmath.cpp` +- `src/libOpenImageIO/imagebufalgo_xform.cpp` (some ops are hwy-accelerated) + + +Enabling and gating the hwy path +------------------------------- + +The hwy path is only used when: +- Highway usage is enabled at runtime (`OIIO::pvt::enable_hwy`). +- The relevant `ImageBuf` objects have local pixel storage (`localpixels()` is + non-null), meaning the data is in process memory rather than accessed through + an `ImageCache` tile abstraction. +- The operation can be safely expressed as contiguous streams of pixels/channels + for the hot path, or the code falls back to a scalar implementation for + strided/non-contiguous layouts. 
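In code form, the typed dispatcher for `add` in `imagebufalgo_addsub.cpp` applies these conditions roughly like this (simplified sketch; the real dispatcher also routes matching integral types to a native-integer kernel, described later in this document):

```cpp
template<class Rtype, class Atype, class Btype>
static bool
add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         int nthreads)
{
    // Take the hwy path only when it is enabled and every buffer owns
    // local pixel memory; otherwise use the iterator-based scalar path.
    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
        && B.localpixels())
        return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
    return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
}
```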
+ +The common gating pattern looks like: +- In a typed `*_impl` dispatcher: check `OIIO::pvt::enable_hwy` and `localpixels` + and then call a `*_impl_hwy` function; otherwise call `*_impl_scalar`. + +Important: the hwy path is an optimization. Correctness must not depend on hwy. + + +OIIO numeric semantics: why we promote to float +---------------------------------------------- + +OIIO treats integer image pixels as normalized values: +- Unsigned integers represent [0, 1]. +- Signed integers represent approximately [-1, 1] with clamping for INT_MIN. + +Therefore, most pixel math must be performed in float (or double) space, even +when the stored data is integer. This is why the hwy layer uses the +"LoadPromote/Operate/DemoteStore" pattern. + +For additional discussion (and pitfalls of saturating integer arithmetic), see: +- `HIGHWAY_SATURATING_ANALYSIS.md` + + +The core pattern: LoadPromote -> RunHwy* -> DemoteStore +------------------------------------------------------- + +The helper header `imagebufalgo_hwy_pvt.h` defines the reusable building blocks: + +1) Computation type selection + - `SimdMathType` selects `float` for most types, and `double` only when + the destination type is `double`. + + Rationale: + - Float math is significantly faster on many targets. + - For OIIO, integer images are normalized to [0,1] (or ~[-1,1]), so float + precision is sufficient for typical image processing workloads. + +2) Load and promote (with normalization) + - `LoadPromote(d, ptr)` and `LoadPromoteN(d, ptr, count)` load values and + normalize integer ranges into the computation space. + + Rationale: + - Consolidates all normalization and conversion logic in one place. + - Prevents subtle drift where each operation re-implements integer scaling. + - Ensures tail handling ("N" variants) is correct and consistent. + +3) Demote and store (with denormalization/clamp/round) + - `DemoteStore(d, ptr, v)` and `DemoteStoreN(d, ptr, v, count)` reverse the + normalization and store results in the destination pixel type. + + Rationale: + - Centralizes rounding and clamping behavior for all destination types. + - Ensures output matches OIIO scalar semantics. + +4) Generic kernel runners (streaming arrays) + - `RunHwyUnaryCmd`, `RunHwyCmd` (binary), `RunHwyTernaryCmd` + - These are the primary entry points for most hwy kernels. + + Rationale: + - Encapsulates lane iteration and tail processing once. + - The call sites only provide the per-lane math lambda, not the boilerplate. + + +Native integer runners: when they are valid +------------------------------------------- + +Some operations are "scale-invariant" under OIIO's normalized integer model. +For example, for unsigned integer add: +- `(a/max + b/max)` in float space, then clamped to [0,1], then scaled by max + matches saturated integer add `SaturatedAdd(a, b)` for the same bit depth. + +For those cases, `imagebufalgo_hwy_pvt.h` provides: +- `RunHwyUnaryNativeInt` +- `RunHwyBinaryNativeInt` + +These should only be used when all of the following are true: +- The operation is known to be scale-invariant under the normalization model. +- Input and output types are the same integral type. +- The operation does not depend on mixed types or float-range behavior. + +Rationale: +- Avoids promotion/demotion overhead and can be materially faster. +- Must be opt-in and explicit, because many operations are NOT compatible with + raw integer arithmetic (e.g. multiplication, division, pow). 
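As a small self-contained illustration (not part of the library) of why saturated add qualifies but multiply does not, consider `uint8` values under the normalization model:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main()
{
    const std::uint8_t a = 200, b = 100;

    // add: normalize, add, clamp to [0,1], denormalize back to uint8
    float add_norm    = std::min(a / 255.0f + b / 255.0f, 1.0f);
    int add_via_float = int(std::lround(add_norm * 255.0f));  // 255
    int add_saturated = std::min(a + b, 255);                 // 255
    assert(add_via_float == add_saturated);  // identical: scale-invariant

    // mul: (a/255)*(b/255)*255 == a*b/255, which is NOT the raw integer
    // product, so a "native" integer multiply would give the wrong answer.
    float mul_norm    = (a / 255.0f) * (b / 255.0f);
    int mul_via_float = int(std::lround(mul_norm * 255.0f));  // 78
    assert(mul_via_float != (a * b) % 256);  // wrapped raw product is 32
    return 0;
}
```

The same reasoning rules out division and `pow` for the native-integer runners.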
+ + +Local pixel pointer helpers: reducing boilerplate safely +------------------------------------------------------- + +Most hwy call sites need repeated pointer and stride computations: +- Pixel size in bytes. +- Scanline size in bytes. +- Base pointer to local pixels. +- Per-row pointer for a given ROI and scanline. +- Per-pixel pointer for non-contiguous fallbacks. + +To centralize that, `imagebufalgo_hwy_pvt.h` defines: +- `HwyPixels(ImageBuf&)` and `HwyPixels(const ImageBuf&)` + returning a small view (`HwyLocalPixelsView`) with: + - base pointer (`std::byte*` / `const std::byte*`) + - `pixel_bytes`, `scanline_bytes` + - `xbegin`, `ybegin`, `nchannels` +- `RoiNChannels(roi)` for `roi.chend - roi.chbegin` +- `ChannelsContiguous(view, nchannels)`: + true only when the pixel stride exactly equals `nchannels * sizeof(T)` +- `PixelBase(view, x, y)`, `ChannelPtr(view, x, y, ch)` +- `RoiRowPtr(view, y, roi)` for the start of the ROI row at `roi.xbegin` and + `roi.chbegin`. + +Rationale: +- Avoids duplicating fragile byte-offset math across many ops. +- Makes it visually obvious what the code is doing: "get row pointer" vs + "compute offset by hand." +- Makes non-contiguous fallback paths less error-prone by reusing the same + pointer computations. + +Important: these helpers are only valid for `ImageBuf` instances with local +pixels (`localpixels()` non-null). The call sites must check that before using +them. + + +Contiguous fast path vs non-contiguous fallback +----------------------------------------------- + +Most operations implement two paths: + +1) Contiguous fast path: + - Used when pixels are tightly packed for the ROI's channel range. + - The operation is executed as a 1D stream of length: + `roi.width() * (roi.chend - roi.chbegin)` + - Uses `RunHwy*Cmd` (or native-int runner) and benefits from: + - fewer branches + - fewer pointer computations + - auto tail handling + +2) Non-contiguous fallback: + - Used when pixels have padding, unusual strides, or channel subsets that do + not form a dense stream. + - Typically loops pixel-by-pixel and channel-by-channel. + - May still use the `ChannelPtr` helpers to compute correct addresses. + +Rationale: +- The contiguous path is where SIMD delivers large gains. +- Trying to SIMD-optimize arbitrary strided layouts often increases complexity + and risk for marginal benefit. Keeping a scalar fallback preserves + correctness and maintainability. + + +How to add a new hwy kernel +--------------------------- + +Step 1: Choose the kernel shape +- Unary: `R = f(A)` -> use `RunHwyUnaryCmd` +- Binary: `R = f(A, B)` -> use `RunHwyCmd` +- Ternary: `R = f(A, B, C)` -> use `RunHwyTernaryCmd` + +Step 2: Decide if a native-int fast path is valid +- Only for scale-invariant ops and same-type integral inputs/outputs. +- Use `RunHwyUnaryNativeInt` / `RunHwyBinaryNativeInt` when safe. +- Otherwise, always use the promote/demote runners. + +Step 3: Implement the hwy body with a contig check +Typical structure inside `*_impl_hwy`: +- Acquire views once: + - `auto Rv = HwyPixels(R);` + - `auto Av = HwyPixels(A);` etc. +- In the parallel callback: + - compute `nchannels = RoiNChannels(roi)` + - compute `contig = ChannelsContiguous<...>(...)` for each image + - for each scanline y: + - `Rtype* r_row = RoiRowPtr(Rv, y, roi);` + - `const Atype* a_row = RoiRowPtr(Av, y, roi);` etc. 
+ - if contig: call `RunHwy*` with `n = roi.width() * nchannels` + - else: fall back per pixel, per channel + +Step 4: Keep the scalar path as the reference +- The scalar implementation should remain correct for all layouts and types. +- The hwy path should match scalar results for supported cases. + + +Design rationale summary +------------------------ + +This design intentionally separates concerns: +- Type conversion and normalization are centralized (`LoadPromote`, + `DemoteStore`). +- SIMD lane iteration and tail handling are centralized (`RunHwy*` runners). +- Image address computations are centralized (`HwyPixels`, `RoiRowPtr`, + `ChannelPtr`). +- Operation-specific code is reduced to short lambdas expressing the math. + +This makes the hwy layer: +- Easier to maintain: fewer places to fix bugs when semantics change. +- Easier to extend: adding an op mostly means writing the math lambda and the + dispatch glue. +- Safer: correctness for unusual layouts remains in scalar fallbacks. + + +Notes on `half` +--------------- + +The hwy conversion helpers handle `half` by converting through +`hwy::float16_t`. This currently assumes the underlying `half` representation +is compatible with how Highway loads/stores 16-bit floats. + +If this assumption is revisited in the future, it should be changed as a +separate, explicit correctness/performance project. + + + + + + diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake index 12467ae6b6..34084f1743 100644 --- a/src/cmake/externalpackages.cmake +++ b/src/cmake/externalpackages.cmake @@ -225,6 +225,9 @@ if (USE_QT AND OPENGL_FOUND) endif () +# Google Highway for SIMD +checked_find_package (hwy REQUIRED) + # Tessil/robin-map checked_find_package (Robinmap REQUIRED VERSION_MIN 1.2.0 diff --git a/src/doc/imagebufalgo.rst b/src/doc/imagebufalgo.rst index b013ce0d20..200417accc 100644 --- a/src/doc/imagebufalgo.rst +++ b/src/doc/imagebufalgo.rst @@ -152,6 +152,68 @@ the computation without spawning additional threads, which might tend to crowd out the other application threads. +SIMD Performance and Data Types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many ImageBufAlgo operations use SIMD (Single Instruction, Multiple Data) +optimizations powered by the Google Highway library to achieve significant +performance improvements, particularly for integer image formats. + +**Integer Type Optimizations:** + +OpenImageIO treats all integer images as normalized Standard Dynamic Range +(SDR) data: + +* Unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) are + normalized to the [0.0, 1.0] range: ``float_value = int_value / max_value`` +* Signed integers (``int8``, ``int16``, ``int32``, ``int64``) are normalized + to approximately the [-1.0, 1.0] range: ``float_value = int_value / max_value`` + +Most ImageBufAlgo operations convert integer data to float, perform the +operation, and convert back. Highway SIMD provides 3-5x speedup for these +operations compared to scalar code. + +**Scale-Invariant Operations:** + +Certain operations are *scale-invariant*, meaning they produce identical +results whether performed on raw integers or normalized floats. 
For these +operations, OpenImageIO uses native integer SIMD paths that avoid float +conversion entirely, achieving 6-12x speedup (2-3x faster than the float +promotion path): + +* ``add``, ``sub`` (with saturation) +* ``min``, ``max`` +* ``abs``, ``absdiff`` + +These optimizations automatically activate when all input and output images +have matching integer types (e.g., all ``uint8``). When types differ or when +mixing integer and float images, the standard float promotion path is used. + +**Controlling SIMD Optimizations:** + +Highway SIMD is enabled by default. To disable it globally:: + + OIIO::attribute("enable_hwy", 0); + +Or via environment variable:: + + export OPENIMAGEIO_ENABLE_HWY=0 + +This is primarily useful for debugging or performance comparison. In normal +use, the optimizations should remain enabled for best performance. + +**Performance Expectations:** + +Typical speedups with Highway SIMD (compared to scalar code): + +* Float operations: 3-5x faster +* Integer operations (with float conversion): 3-5x faster +* Integer scale-invariant operations (native int): 6-12x faster +* Half-float operations: 3-5x faster + +Actual performance depends on the specific operation, image size, data types, +and hardware capabilities (AVX2, AVX-512, ARM NEON, etc.). + .. _sec-iba-patterns: diff --git a/src/doc/imageioapi.rst b/src/doc/imageioapi.rst index d2d6b192b4..dca5a66da5 100644 --- a/src/doc/imageioapi.rst +++ b/src/doc/imageioapi.rst @@ -397,16 +397,36 @@ inside the source code. line, but not the full human-readable command line. (This was added in OpenImageIO 2.5.11.) +.. cpp:var:: OPENIMAGEIO_ENABLE_HWY + + Controls whether to use Google Highway SIMD library optimizations for + ImageBufAlgo operations. If set to "1" (the default), Highway SIMD + optimizations will be enabled for supported operations, providing + significant performance improvements (typically 3-12x faster) on integer + image types. If set to "0", these optimizations will be disabled and fall + back to scalar implementations. + + This can also be controlled at runtime via:: + + OIIO::attribute("enable_hwy", 1); // enable (default) + OIIO::attribute("enable_hwy", 0); // disable + + Note: Highway SIMD optimizations are particularly beneficial for integer + image formats (uint8, uint16, int8, int16, uint32, int32, etc.) and provide + additional speedup for scale-invariant operations (add, sub, min, max, + absdiff) that can operate directly on integer data without float conversion. + (This was added in OpenImageIO 3.1.) + .. cpp:var:: OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH - Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. - If set to "1", all directories under ``PATH`` will be added to the DLL load + Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. + If set to "1", all directories under ``PATH`` will be added to the DLL load path before attempting to import the OpenImageIO module. (This was added in OpenImageIO 3.0.3.0) - Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" - Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` - environment variable (added in OpenImageIO 2.4.0/2.3.18). + Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" + Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` + environment variable (added in OpenImageIO 2.4.0/2.3.18). 
- In other words, to reproduce the default Python-module-loading behavior of + In other words, to reproduce the default Python-module-loading behavior of earlier versions of OIIO, set ``OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1``. diff --git a/src/hwy_test/CMakeLists.txt b/src/hwy_test/CMakeLists.txt new file mode 100644 index 0000000000..735b88def5 --- /dev/null +++ b/src/hwy_test/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright Contributors to the OpenImageIO project. +# SPDX-License-Identifier: Apache-2.0 +# https://github.com/AcademySoftwareFoundation/OpenImageIO + +fancy_add_executable (LINK_LIBRARIES OpenImageIO) diff --git a/src/hwy_test/hwy_test.cpp b/src/hwy_test/hwy_test.cpp new file mode 100644 index 0000000000..4fbf51af32 --- /dev/null +++ b/src/hwy_test/hwy_test.cpp @@ -0,0 +1,1096 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +/// Benchmark Highway SIMD vs Scalar implementations +/// Compares performance by toggling OIIO::attribute("enable_hwy", 0/1) + +#include +#include +#include + +#include +#include +#include + +using namespace OIIO; + +struct BenchResult { + double scalar_ms; + double simd_ms; + double speedup; +}; + +struct TestResult { + const char* type_name; + BenchResult bench; + float max_error; + bool pass; +}; + +// Run a benchmark function multiple times and return average time in milliseconds +template +double +benchmark_ms(Func&& func, int iterations = 100, int warmup = 5) +{ + // Warmup + for (int i = 0; i < warmup; ++i) { + func(); + } + + Timer timer; + for (int i = 0; i < iterations; ++i) { + func(); + } + return timer() * 1000.0 / iterations; // Convert to ms +} + +// Compare two ImageBufs and return max error between them +// Tolerance accounts for rounding differences between SIMD (round-to-nearest) +// and scalar (truncate) conversions. 
For uint8: 1/255 = 0.004, uint16: 1/65535 = 0.00002 +float +verify_match(const ImageBuf& scalar_result, const ImageBuf& simd_result, + float tolerance = 0.005f) +{ + auto comp = ImageBufAlgo::compare(scalar_result, simd_result, tolerance, + tolerance); + return comp.maxerror; +} + +// Benchmark add operation +BenchResult +bench_add(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + // Scalar version + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::add(R, A, B); }, + iterations); + + // SIMD version + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::add(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark sub operation +BenchResult +bench_sub(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::sub(R, A, B); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::sub(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark mul operation +BenchResult +bench_mul(const ImageBuf& A, const ImageBuf& B, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::mul(R, A, B); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::mul(R, A, B); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark pow operation +BenchResult +bench_pow(const ImageBuf& A, cspan exponent, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::pow(R, A, exponent); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::pow(R, A, exponent); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark rangecompress operation +BenchResult +bench_rangecompress(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::rangecompress(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::rangecompress(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark rangeexpand operation +BenchResult +bench_rangeexpand(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::rangeexpand(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::rangeexpand(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark premult operation +BenchResult +bench_premult(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::premult(R, A); }, + 
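                                   // timed iteration count; warmup repetitions are handled inside benchmark_ms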
iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::premult(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark unpremult operation +BenchResult +bench_unpremult(const ImageBuf& A, int iterations = 100) +{ + BenchResult result; + ImageBuf R(A.spec()); + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::unpremult(R, A); }, + iterations); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::unpremult(R, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + return result; +} + +// Benchmark resample operation +BenchResult +bench_resample(const ImageBuf& A, int new_width, int new_height, + int iterations = 50) +{ + BenchResult result; + ImageSpec newspec = A.spec(); + newspec.width = new_width; + newspec.height = new_height; + + // Scalar version - ensure proper allocation + ImageBuf R_scalar(newspec); + ImageBufAlgo::zero(R_scalar); // Ensure buffer is allocated! + + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::resample(R_scalar, A); }, + iterations); + + // SIMD version + ImageBuf R_simd(newspec); + ImageBufAlgo::zero(R_simd); + + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::resample(R_simd, A); }, + iterations); + + result.speedup = result.scalar_ms / result.simd_ms; + + // Validate results - check for differences + auto comp = ImageBufAlgo::compare(R_scalar, R_simd, 0.001f, 0.001f); + if (comp.maxerror > 0.001f) { + printf(" \033[33m[INFO] max error: %.6f at (%d, %d, c%d)\033[0m\n", + comp.maxerror, comp.maxx, comp.maxy, comp.maxc); + + // Print actual pixel values at the error location + std::vector scalar_pixel(R_scalar.nchannels()); + std::vector simd_pixel(R_simd.nchannels()); + R_scalar.getpixel(comp.maxx, comp.maxy, scalar_pixel.data()); + R_simd.getpixel(comp.maxx, comp.maxy, simd_pixel.data()); + printf(" Scalar ch%d: %.6f, SIMD ch%d: %.6f, diff: %.6f\n", + comp.maxc, scalar_pixel[comp.maxc], comp.maxc, + simd_pixel[comp.maxc], + std::abs(scalar_pixel[comp.maxc] - simd_pixel[comp.maxc])); + } + + return result; +} + +// Print results +void +print_result(const char* type_name, const BenchResult& result) +{ + const char* color = result.speedup > 1.0 ? "\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-10s | %10.2f | %10.2f | %s%6.2fx%s\n", type_name, + result.scalar_ms, result.simd_ms, color, result.speedup, reset); +} + +void +print_header() +{ + printf("%-10s | %10s | %10s | %-8s\n", "Type", "Scalar(ms)", "SIMD(ms)", + "Speedup"); + printf("----------------------------------------------------\n"); +} + +void +print_test_results(const char* test_name, + const std::vector& results) +{ + printf("\n[ %s ]\n", test_name); + printf("-----------------------------------------------\n"); + + // Print all timing results + for (const auto& r : results) { + const char* color = r.bench.speedup > 1.0 ? "\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-10s | %10.2f | %10.2f | %s%6.2fx%s\n", r.type_name, + r.bench.scalar_ms, r.bench.simd_ms, color, r.bench.speedup, + reset); + } + + // Print separator + printf("-----------------------------------------------\n"); + + // Print all PASS/FAIL results + for (const auto& r : results) { + const char* color = r.pass ? 
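                                       // ANSI escape codes: green for PASS, red for FAIL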
"\033[32m" : "\033[31m"; + const char* reset = "\033[0m"; + printf("%-6s: %s%s%s (max_err: %.6f)\n", r.type_name, color, + r.pass ? "PASS" : "FAIL", reset, r.max_error); + } +} + +// Get appropriate file extension for type +const char* +get_extension(TypeDesc format) +{ + if (format == TypeDesc::HALF) + return ".exr"; + return ".tif"; +} + +// Save image with appropriate format +void +save_image(const ImageBuf& buf, const char* basename, const char* type_name) +{ + char filename[256]; + snprintf(filename, sizeof(filename), "%s_%s%s", basename, type_name, + get_extension(buf.spec().format)); + if (!buf.write(filename)) { + printf(" Warning: Failed to save %s\n", filename); + } +} + +// Create test images +ImageBuf +create_test_image(int width, int height, int nchannels, TypeDesc format) +{ + ImageSpec spec(width, height, nchannels, format); + ImageBuf buf(spec); + + // Create a gradient to ensure meaningful resampling + std::vector tl(nchannels), tr(nchannels), bl(nchannels), + br(nchannels); + for (int c = 0; c < nchannels; ++c) { + tl[c] = 0.0f; + tr[c] = 1.0f; + bl[c] = 0.5f; + br[c] = 0.0f; + if (c % 2 == 1) { // Vary channels + tl[c] = 1.0f; + tr[c] = 0.0f; + bl[c] = 0.0f; + br[c] = 1.0f; + } + } + ImageBufAlgo::fill(buf, tl, tr, bl, br); + return buf; +} + +ImageBuf +create_checkerboard_image(int width, int height, int nchannels, TypeDesc format, + int checker_size = 64) +{ + ImageSpec spec(width, height, nchannels, format); + ImageBuf buf(spec); + + // Fill with checkerboard pattern + ImageBufAlgo::checker(buf, checker_size, checker_size, nchannels, + { 0.1f, 0.1f, 0.1f }, { 0.9f, 0.9f, 0.9f }, 0, 0, 0); + return buf; +} + +ImageBuf +create_rgba_image(int width, int height, TypeDesc format) +{ + ImageSpec spec(width, height, 4, format); + spec.alpha_channel = 3; + ImageBuf buf(spec); + // Fill with semi-transparent colors + ImageBufAlgo::fill(buf, { 0.8f, 0.6f, 0.4f, 0.7f }); + return buf; +} + +int +main(int argc, char* argv[]) +{ + // Default parameters + int width = 2048; + int height = 2048; + int iterations = 20; + + // Parse command line args + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--size") == 0 && i + 1 < argc) { + if (sscanf(argv[++i], "%dx%d", &width, &height) != 2) { + fprintf(stderr, + "Invalid size format. 
Use WxH (e.g., 2048x2048)\n"); + return 1; + } + } else if (strcmp(argv[i], "--iterations") == 0 && i + 1 < argc) { + iterations = atoi(argv[++i]); + } else if (strcmp(argv[i], "--help") == 0) { + printf("Usage: %s [options]\n", argv[0]); + printf("Options:\n"); + printf(" --size WxH Image size (default: 2048x2048)\n"); + printf(" --iterations N Number of iterations (default: 20)\n"); + printf(" --help Show this help\n"); + return 0; + } + } + + printf("Highway SIMD Benchmark\n"); + printf("======================\n"); + printf("Image size: %dx%d\n", width, height); + printf("Iterations: %d\n", iterations); + + // Verify enable_hwy attribute works + int hwy_enabled = 0; + OIIO::getattribute("enable_hwy", hwy_enabled); + printf("Initial enable_hwy: %d\n", hwy_enabled); + + // Test types + struct TestConfig { + const char* name; + TypeDesc format; + }; + + std::vector configs = { + { "uint8", TypeDesc::UINT8 }, { "uint16", TypeDesc::UINT16 }, + { "uint32", TypeDesc::UINT32 }, { "float", TypeDesc::FLOAT }, + { "half", TypeDesc::HALF }, { "double", TypeDesc::DOUBLE }, + }; + + // Add + std::vector add_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_add(A, B, iterations); + + // Verify: compute scalar and SIMD results + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::add(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::add(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + add_results.push_back(result); + + save_image(A, "src_A", cfg.name); + save_image(B, "src_B", cfg.name); + save_image(R_simd, "result_add", cfg.name); + } + print_test_results("Add", add_results); + + // Sub + std::vector sub_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_sub(A, B, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::sub(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::sub(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + sub_results.push_back(result); + + save_image(R_simd, "result_sub", cfg.name); + } + print_test_results("Sub", sub_results); + + // Mul + std::vector mul_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_mul(A, B, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::mul(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::mul(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + mul_results.push_back(result); + + save_image(R_simd, "result_mul", cfg.name); + } + print_test_results("Mul", mul_results); + + // Pow + std::vector pow_results; + float exponent_vals[] = { 2.2f, 2.2f, 2.2f }; + for (const auto& cfg : configs) { + 
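        // Benchmark pow with a per-channel exponent of 2.2 (a typical gamma value)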
ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_pow(A, exponent_vals, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::pow(R_scalar, A, exponent_vals); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::pow(R_simd, A, exponent_vals); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + pow_results.push_back(result); + + save_image(R_simd, "result_pow", cfg.name); + } + print_test_results("Pow", pow_results); + + + // Div + std::vector div_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_div = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::div(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::div(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_div(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::div(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::div(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + div_results.push_back(result); + + save_image(R_simd, "result_div", cfg.name); + } + print_test_results("Div", div_results); + + // Min + std::vector min_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_min = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::min(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::min(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_min(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::min(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::min(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + min_results.push_back(result); + + save_image(R_simd, "result_min", cfg.name); + } + print_test_results("Min", min_results); + + // Max + std::vector max_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_max = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::max(R, A, B); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::max(R, A, B); }, + iters); + result.speedup = result.scalar_ms / 
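                             // speedup > 1.0 means the SIMD path is faster than scalar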
result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_max(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::max(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::max(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + max_results.push_back(result); + + save_image(R_simd, "result_max", cfg.name); + } + print_test_results("Max", max_results); + + // Abs + std::vector abs_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_abs = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms([&]() { ImageBufAlgo::abs(R, A); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::abs(R, A); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_abs(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::abs(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::abs(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + abs_results.push_back(result); + + save_image(R_simd, "result_abs", cfg.name); + } + print_test_results("Abs", abs_results); + + // Absdiff + std::vector absdiff_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_absdiff = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::absdiff(R, A, B); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::absdiff(R, A, B); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_absdiff(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::absdiff(R_scalar, A, B); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::absdiff(R_simd, A, B); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + absdiff_results.push_back(result); + + save_image(R_simd, "result_absdiff", cfg.name); + } + print_test_results("Absdiff", absdiff_results); + + // MAD + std::vector mad_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf B = create_test_image(width, height, 3, cfg.format); + ImageBuf C = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_mad = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::mad(R, A, B, C); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::mad(R, A, B, C); }, iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = 
cfg.name; + result.bench = bench_mad(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::mad(R_scalar, A, B, C); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::mad(R_simd, A, B, C); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + mad_results.push_back(result); + + save_image(R_simd, "result_mad", cfg.name); + } + print_test_results("MAD", mad_results); + + // Clamp + std::vector clamp_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_clamp = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::clamp(R, A, 0.1f, 0.9f); }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms + = benchmark_ms([&]() { ImageBufAlgo::clamp(R, A, 0.1f, 0.9f); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_clamp(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::clamp(R_scalar, A, 0.1f, 0.9f); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::clamp(R_simd, A, 0.1f, 0.9f); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + clamp_results.push_back(result); + + save_image(R_simd, "result_clamp", cfg.name); + } + print_test_results("Clamp", clamp_results); + + // RangeCompress + std::vector rangecompress_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_rangecompress(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::rangecompress(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::rangecompress(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + rangecompress_results.push_back(result); + + save_image(R_simd, "result_rangecompress", cfg.name); + } + print_test_results("RangeCompress", rangecompress_results); + + // RangeExpand + std::vector rangeexpand_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_rangeexpand(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::rangeexpand(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::rangeexpand(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + rangeexpand_results.push_back(result); + + save_image(R_simd, "result_rangeexpand", cfg.name); + } + print_test_results("RangeExpand", rangeexpand_results); + + // Premult + std::vector premult_results; + for (const auto& cfg : configs) { + ImageBuf A = create_rgba_image(width, height, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_premult(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::premult(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::premult(R_simd, A); + + 
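        // Compare scalar vs SIMD premult outputs; verify_match's tolerance absorbs integer rounding differences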
result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + premult_results.push_back(result); + + save_image(A, "src_RGBA", cfg.name); + save_image(R_simd, "result_premult", cfg.name); + } + print_test_results("Premult", premult_results); + + // Unpremult + std::vector unpremult_results; + for (const auto& cfg : configs) { + ImageBuf A = create_rgba_image(width, height, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_unpremult(A, iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::unpremult(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::unpremult(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + unpremult_results.push_back(result); + + save_image(R_simd, "result_unpremult", cfg.name); + } + print_test_results("Unpremult", unpremult_results); + + + // Invert + std::vector invert_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + auto bench_invert = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms + = benchmark_ms([&]() { ImageBufAlgo::invert(R, A); }, iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms([&]() { ImageBufAlgo::invert(R, A); }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_invert(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::invert(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::invert(R_simd, A); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + invert_results.push_back(result); + + save_image(R_simd, "result_invert", cfg.name); + } + print_test_results("Invert", invert_results); + + // Contrast Remap (linear stretch) + std::vector contrast_remap_results; + for (const auto& cfg : configs) { + ImageBuf A = create_test_image(width, height, 3, cfg.format); + ImageBuf R_scalar(A.spec()); + ImageBuf R_simd(A.spec()); + + // Linear stretch: remap [0.2, 0.8] -> [0.0, 1.0] + float black_vals[] = { 0.2f, 0.2f, 0.2f }; + float white_vals[] = { 0.8f, 0.8f, 0.8f }; + + auto bench_contrast = [&](int iters = 100) { + BenchResult result; + ImageBuf R(A.spec()); + OIIO::attribute("enable_hwy", 0); + result.scalar_ms = benchmark_ms( + [&]() { + ImageBufAlgo::contrast_remap(R, A, black_vals, white_vals); + }, + iters); + OIIO::attribute("enable_hwy", 1); + result.simd_ms = benchmark_ms( + [&]() { + ImageBufAlgo::contrast_remap(R, A, black_vals, white_vals); + }, + iters); + result.speedup = result.scalar_ms / result.simd_ms; + return result; + }; + + TestResult result; + result.type_name = cfg.name; + result.bench = bench_contrast(iterations); + + // Verify + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::contrast_remap(R_scalar, A, black_vals, white_vals); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::contrast_remap(R_simd, A, black_vals, white_vals); + + result.max_error = verify_match(R_scalar, R_simd); + result.pass = result.max_error < 0.005f; + contrast_remap_results.push_back(result); + + save_image(R_simd, "result_contrast_remap", cfg.name); + } + print_test_results("Contrast Remap (linear)", 
contrast_remap_results); + + // Resample 75% + printf("\n[ Resample 75%% ]\n"); + //print_header(); + int resample_iters = std::max(1, iterations / 2); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width * 3 / 4; + newspec.height = height * 3 / 4; + + // Create separate buffers for scalar and SIMD + ImageBuf R_scalar(newspec); + ImageBuf R_simd(newspec); + ImageBufAlgo::zero(R_scalar); + ImageBufAlgo::zero(R_simd); + + print_result(cfg.name, bench_resample(A, width * 3 / 4, height * 3 / 4, + resample_iters)); + + // Save both scalar and SIMD results for comparison + OIIO::attribute("enable_hwy", 0); + ImageBufAlgo::resample(R_scalar, A); + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R_simd, A); + + save_image(A, "src_checkerboard", cfg.name); + save_image(R_scalar, "result_resample75_scalar", cfg.name); + save_image(R_simd, "result_resample75_simd", cfg.name); + } + + // Resample 50% + printf("\n[ Resample 50%% ]\n"); + //print_header(); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 2; + newspec.height = height / 2; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 2, height / 2, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample50", cfg.name); + } + + // Resample 25% + printf("\n[ Resample 25%% ]\n"); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 4; + newspec.height = height / 4; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 4, height / 4, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample25", cfg.name); + } + + // Resample 16.66% + printf("\n[ Resample 16.66%% ]\n"); + for (const auto& cfg : configs) { + ImageBuf A = create_checkerboard_image(width, height, 3, cfg.format); + ImageSpec newspec = A.spec(); + newspec.width = width / 6; + newspec.height = height / 6; + ImageBuf R(newspec); + ImageBufAlgo::zero(R); + + print_result(cfg.name, + bench_resample(A, width / 6, height / 6, resample_iters)); + + // Save final result + OIIO::attribute("enable_hwy", 1); + ImageBufAlgo::resample(R, A); + save_image(R, "result_resample25", cfg.name); + } + + print_header(); + + printf("\nBenchmark complete!\n"); + printf("Note: Speedup > 1.0x means SIMD is faster (shown in green)\n"); + printf(" Speedup < 1.0x means scalar is faster (shown in red)\n"); + + return 0; +} diff --git a/src/include/imageio_pvt.h b/src/include/imageio_pvt.h index 273375cd77..13d5c06140 100644 --- a/src/include/imageio_pvt.h +++ b/src/include/imageio_pvt.h @@ -43,6 +43,7 @@ extern int oiio_log_times; extern int openexr_core; extern int jpeg_com_attributes; extern int png_linear_premult; +extern int enable_hwy; extern int limit_channels; extern int limit_imagesize_MB; extern int imagebuf_print_uncaught_errors; diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt index f2459b2d32..7e8dadd1ad 100644 --- a/src/libOpenImageIO/CMakeLists.txt +++ b/src/libOpenImageIO/CMakeLists.txt @@ -165,6 +165,8 @@ target_link_libraries (OpenImageIO $ 
${BZIP2_LIBRARIES} ZLIB::ZLIB + hwy::hwy + hwy::hwy_contrib ${CMAKE_DL_LIBS} ) diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp index c7a4d83e9c..9cf1c61183 100644 --- a/src/libOpenImageIO/imagebufalgo_addsub.cpp +++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp @@ -18,6 +18,8 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" + #include "imageio_pvt.h" @@ -26,8 +28,8 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -44,7 +46,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -58,6 +61,288 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer add using SaturatedAdd (scale-invariant, no float conversion) +template +static bool +add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated add - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedAdd(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating add in scalar + int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c]; + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (T)sum; + } else { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (sum < std::numeric_limits::min()) + ? 
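                                       // clamp at the low end of T's range (mirrors SaturatedAdd's behavior)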
std::numeric_limits::min() + : (T)sum; + } + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Process whole line as one vector stream + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Add(a, b); + }); + } else { + // Process pixel by pixel (scalar fallback for strided channels) + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + + static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant add when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return add_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return add_impl_hwy(R, A, B, roi, nthreads); + } + return add_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return add_impl_hwy(R, A, b, roi, nthreads); + return add_impl_scalar(R, A, b, roi, nthreads); +} + +// Native integer sub using SaturatedSub (scale-invariant, no float conversion) +template +static bool +sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const 
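        // The vector fast path requires tightly packed channels in R, A, and B.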
bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated sub - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedSub(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating sub in scalar + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (a_ptr[c] > b_ptr[c]) + ? (a_ptr[c] - b_ptr[c]) + : T(0); + } else { + int64_t diff = (int64_t)a_ptr[c] + - (int64_t)b_ptr[c]; + r_ptr[c] = (diff > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (diff < std::numeric_limits::min()) + ? std::numeric_limits::min() + : (T)diff; + } + } + } + } + } + }); + return true; +} + +template +static bool +sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Sub(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + - static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant sub when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return sub_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return sub_impl_hwy(R, A, B, roi, nthreads); + } + return sub_impl_scalar(R, A, B, roi, nthreads); +} + static bool add_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -155,8 +440,8 @@ ImageBufAlgo::add(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, 
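                                  // scalar reference path: iterator-based, correct for any storage layout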
nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h new file mode 100644 index 0000000000..fe4c9b0d8a --- /dev/null +++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h @@ -0,0 +1,970 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +OIIO_NAMESPACE_BEGIN + +// Alias for Highway's namespace for convenience +namespace hn = hwy::HWY_NAMESPACE; + +// ----------------------------------------------------------------------- +// ImageBuf local pixel helpers (header-only) +// ----------------------------------------------------------------------- + +template struct HwyLocalPixelsView { + ByteT* base = nullptr; + size_t pixel_bytes = 0; + size_t scanline_bytes = 0; + int xbegin = 0; + int ybegin = 0; + int nchannels = 0; +}; + +inline HwyLocalPixelsView +HwyPixels(ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline HwyLocalPixelsView +HwyPixels(const ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline int +RoiNChannels(const ROI& roi) noexcept +{ + return roi.chend - roi.chbegin; +} + +template +inline bool +ChannelsContiguous(const HwyLocalPixelsView& v, int nchannels) noexcept +{ + return size_t(nchannels) * sizeof(T) == v.pixel_bytes; +} + +template +inline ByteT* +PixelBase(const HwyLocalPixelsView& v, int x, int y) noexcept +{ + return v.base + size_t(y - v.ybegin) * v.scanline_bytes + + size_t(x - v.xbegin) * v.pixel_bytes; +} + +template +inline std::conditional_t, const T*, T*> +ChannelPtr(const HwyLocalPixelsView& v, int x, int y, int ch) noexcept +{ + using RetT = std::conditional_t, const T, T>; + return reinterpret_cast(PixelBase(v, x, y) + size_t(ch) * sizeof(T)); +} + +template +inline std::conditional_t, const T*, T*> +RoiRowPtr(const HwyLocalPixelsView& v, int y, const ROI& roi) noexcept +{ + return ChannelPtr(v, roi.xbegin, y, roi.chbegin); +} + +// ----------------------------------------------------------------------- +// Type Traits +// ----------------------------------------------------------------------- + +/// Determine the appropriate SIMD math type for a given result type. +/// Promotes smaller types to float, keeps double as double. +/// Note: uint32_t uses float (not double) for image processing performance. +/// In OIIO, uint32 images are normalized to 0-1 range like uint8/uint16, +/// so float precision (24-bit mantissa) is sufficient and much faster than double. +template struct SimdMathType { + using type = float; +}; +template<> struct SimdMathType { + using type = double; +}; + +// ----------------------------------------------------------------------- +// Load and Promote +// ----------------------------------------------------------------------- + +/// Load and promote source data to target SIMD type. +/// Handles type conversions from various source formats (uint8_t, int8_t, uint16_t, +/// int16_t, uint32_t, int32_t, uint64_t, int64_t, half, float, double) to the +/// target SIMD computation type. 
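+/// Integer sources are normalized on load (unsigned to [0,1], signed to
+/// roughly [-1,1]); for example, a uint8 value of 255 loads as 1.0f and
+/// 128 loads as approximately 0.502f.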
+/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @return SIMD vector with promoted values +template +inline auto +LoadPromote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::Load(d, ptr); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::Load(d16, reinterpret_cast(ptr)); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::Load(d_u8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::Load(d_i8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::Load(d_u16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::Load(d_i16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::Load(d_u32, ptr); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::Load(d_i32, ptr); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + // Note: Precision loss expected for large values (>24 bits) + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::Load(d_u64, ptr); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::Load(d_i64, ptr); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +/// Load and promote partial source data to target SIMD type. 
+/// Same as LoadPromote but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @param count Number of elements to load (must be <= lane count) +/// @return SIMD vector with promoted values (undefined in unused lanes) +template +inline auto +LoadPromoteN(D d, const SrcT* ptr, size_t count) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::LoadN(d, ptr, count); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::LoadN(d16, reinterpret_cast(ptr), count); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::LoadN(d_u8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::LoadN(d_i8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::LoadN(d_u16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::LoadN(d_i16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::LoadN(d_u32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::LoadN(d_i32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. 
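+        // (INT32_MIN * (1/2147483647) is about -1.0000000005 before the clamp.)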
+ auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::LoadN(d_u64, ptr, count); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::LoadN(d_i64, ptr, count); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +// ----------------------------------------------------------------------- +// Demote and Store +// ----------------------------------------------------------------------- + +/// Demote SIMD values and store to destination type. +/// Handles type conversions from SIMD computation type (float/double) back to +/// various destination formats with proper rounding and clamping for integer types. +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +template +inline void +DemoteStore(D d, DstT* ptr, VecT v) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::Store(v, d, ptr); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::Store(v16, d16, reinterpret_cast(ptr)); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::Store(v_u8, d_u8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -128-127 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
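+        // e.g. -2.6 + (-0.5) = -3.1 truncates to -3, and 2.6 + 0.5 = 3.1
+        // truncates to 3, so both signs round to the nearest integer.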
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::Store(v_i8, d_i8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::Store(v_u16, d_u16, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -32768-32767 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::Store(v_i16, d_i16, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::Store(v_u32, d_u32, ptr); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] to int32 range + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -2147483648-2147483647 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::Store(v_i32, d_i32, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + // Note: Precision loss expected (float has only 24-bit mantissa) + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::Store(v_u64, d_u64, ptr); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::Store(v_i64, d_i64, ptr); + } +} + +/// Demote and store partial SIMD values to destination type. +/// Same as DemoteStore but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +/// @param count Number of elements to store (must be <= lane count) +template +inline void +DemoteStoreN(D d, DstT* ptr, VecT v, size_t count) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::StoreN(v, d, ptr, count); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::StoreN(v16, d16, reinterpret_cast(ptr), count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::StoreN(v_u8, d_u8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-128, 127] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::StoreN(v_i8, d_i8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::StoreN(v_u16, d_u16, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-32768, 32767] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::StoreN(v_i16, d_i16, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::StoreN(v_u32, d_u32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] range to [-2147483648, 2147483647] range + VecD v_val = (VecD)v; + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::StoreN(v_i32, d_i32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::StoreN(v_u64, d_u64, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::StoreN(v_i64, d_i64, ptr, count); + } +} + +// ----------------------------------------------------------------------- +// Native Integer Kernel Runners (No Type Conversion) +// ----------------------------------------------------------------------- + +/// Execute a unary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like abs, where int_op(a) == denorm(float_op(norm(a))). +/// Much faster than promotion path - operates directly on integer SIMD vectors. +/// @param r Destination array (same type as source) +/// @param a Source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector) and returning result vector +/// Example: [](auto d, auto va) { return hn::Abs(va); } +template +inline void +RunHwyUnaryNativeInt(T* r, const T* a, size_t n, OpFunc op) +{ + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = hn::Load(d, a + x); + auto res = op(d, va); + hn::Store(res, d, r + x); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = hn::LoadN(d, a + x, remaining); + auto res = op(d, va); + hn::StoreN(res, d, r + x, remaining); + } +} + +/// Execute a binary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like saturated add, min, max, where: +/// int_op(a, b) == denorm(float_op(norm(a), norm(b))). +/// Much faster than promotion path - no conversion overhead. 
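+/// (Concretely, for uint8 add: 200 + 100 saturates to 255, and the float path
+/// clamps 200/255 + 100/255 to 1.0, which denormalizes back to 255.)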
+/// @param r Destination array (same type as sources)
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result
+///           Example: [](auto d, auto va, auto vb) { return hn::SaturatedAdd(va, vb); }
+template<typename T, typename OpFunc>
+inline void
+RunHwyBinaryNativeInt(T* r, const T* a, const T* b, size_t n, OpFunc op)
+{
+    const hn::ScalableTag<T> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = hn::Load(d, a + x);
+        auto vb  = hn::Load(d, b + x);
+        auto res = op(d, va, vb);
+        hn::Store(res, d, r + x);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = hn::LoadN(d, a + x, remaining);
+        auto vb  = hn::LoadN(d, b + x, remaining);
+        auto res = op(d, va, vb);
+        hn::StoreN(res, d, r + x, remaining);
+    }
+}
+
+// -----------------------------------------------------------------------
+// Generic Kernel Runners (With Type Conversion)
+// -----------------------------------------------------------------------
+
+/// Execute a unary SIMD operation on an array.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
+/// @param r Destination array
+/// @param a Source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector) and returning result vector
+///           Example: [](auto d, auto va) { return hn::Sqrt(va); }
+template<typename Rtype, typename Atype, typename OpFunc>
+inline void
+RunHwyUnaryCmd(Rtype* r, const Atype* a, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = LoadPromote(d, a + x);
+        auto res = op(d, va);
+        DemoteStore(d, r + x, res);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = LoadPromoteN(d, a + x, remaining);
+        auto res = op(d, va);
+        DemoteStoreN(d, r + x, res, remaining);
+    }
+}
+
+/// Execute a binary SIMD operation on two arrays.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
+/// @param r Destination array
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result
+///           Example: [](auto d, auto va, auto vb) { return hn::Add(va, vb); }
+template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
+inline void
+RunHwyCmd(Rtype* r, const Atype* a, const Btype* b, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    size_t x     = 0;
+    size_t lanes = hn::Lanes(d);
+    for (; x + lanes <= n; x += lanes) {
+        auto va  = LoadPromote(d, a + x);
+        auto vb  = LoadPromote(d, b + x);
+        auto res = op(d, va, vb);
+        DemoteStore(d, r + x, res);
+    }
+    size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va  = LoadPromoteN(d, a + x, remaining);
+        auto vb  = LoadPromoteN(d, b + x, remaining);
+        auto res = op(d, va, vb);
+        DemoteStoreN(d, r + x, res, remaining);
+    }
+}
+
+/// Execute a ternary SIMD operation on three arrays.
+/// Processes array elements in SIMD batches, handling type promotion/demotion
+/// and partial vectors at the end.
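+/// All three source arrays share one element type (ABCtype); see
+/// mad_impl_hwy in imagebufalgo_mad.cpp for a call site using hn::MulAdd.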
+/// @param r Destination array +/// @param a First source array +/// @param b Second source array +/// @param c Third source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector_a, vector_b, vector_c) and returning result +/// Example: [](auto d, auto va, auto vb, auto vc) { return hn::MulAdd(va, vb, vc); } +template +inline void +RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c, + size_t n, OpFunc op) +{ + using MathT = typename SimdMathType::type; + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = LoadPromote(d, a + x); + auto vb = LoadPromote(d, b + x); + auto vc = LoadPromote(d, c + x); + auto res = op(d, va, vb, vc); + DemoteStore(d, r + x, res); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = LoadPromoteN(d, a + x, remaining); + auto vb = LoadPromoteN(d, b + x, remaining); + auto vc = LoadPromoteN(d, c + x, remaining); + auto res = op(d, va, vb, vc); + DemoteStoreN(d, r + x, res, remaining); + } +} + +// ----------------------------------------------------------------------- +// Interleaved Channel Load/Store Helpers +// ----------------------------------------------------------------------- + +/// Load 4 interleaved channels (RGBA) with type promotion. +/// For matching types, uses Highway's native LoadInterleaved4. +/// For type promotion, loads and manually deinterleaves. +/// @param d Highway descriptor tag for the target SIMD type +/// @param ptr Pointer to interleaved RGBA data (R0,G0,B0,A0,R1,G1,B1,A1,...) +/// @return Tuple of (R, G, B, A) SIMD vectors in promoted type +template +inline auto +LoadInterleaved4Promote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + using Vec = hn::Vec; + + if constexpr (std::is_same_v) { + // No promotion needed - use Highway's optimized LoadInterleaved4 + Vec r, g, b, a; + hn::LoadInterleaved4(d, ptr, r, g, b, a); + return std::make_tuple(r, g, b, a); + } else if constexpr (std::is_same_v) { + // Special handling for half type - convert through hwy::float16_t + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + + // Load interleaved half data as float16_t + hn::Vec r16, g16, b16, a16; + hn::LoadInterleaved4(d16, reinterpret_cast(ptr), r16, g16, + b16, a16); + + // Promote to computation type + Vec r_vec = hn::PromoteTo(d, r16); + Vec g_vec = hn::PromoteTo(d, g16); + Vec b_vec = hn::PromoteTo(d, b16); + Vec a_vec = hn::PromoteTo(d, a16); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } else { + // Generic type promotion - deinterleave manually with normalization + const size_t N = hn::Lanes(d); + SrcT r_src[hn::MaxLanes(d)]; + SrcT g_src[hn::MaxLanes(d)]; + SrcT b_src[hn::MaxLanes(d)]; + SrcT a_src[hn::MaxLanes(d)]; + + for (size_t i = 0; i < N; ++i) { + r_src[i] = ptr[i * 4 + 0]; + g_src[i] = ptr[i * 4 + 1]; + b_src[i] = ptr[i * 4 + 2]; + a_src[i] = ptr[i * 4 + 3]; + } + + // Use LoadPromote for proper normalization of integer types + auto r_vec = LoadPromote(d, r_src); + auto g_vec = LoadPromote(d, g_src); + auto b_vec = LoadPromote(d, b_src); + auto a_vec = LoadPromote(d, a_src); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } +} + +/// Store 4 interleaved channels (RGBA) with type demotion. +/// For matching types, uses Highway's native StoreInterleaved4. +/// For type demotion, manually interleaves and stores. 
+/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination interleaved RGBA data +/// @param r Red channel SIMD vector +/// @param g Green channel SIMD vector +/// @param b Blue channel SIMD vector +/// @param a Alpha channel SIMD vector +template +inline void +StoreInterleaved4Demote(D d, DstT* ptr, VecT r, VecT g, VecT b, VecT a) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + // No demotion needed - use Highway's optimized StoreInterleaved4 + hn::StoreInterleaved4(r, g, b, a, d, ptr); + } else if constexpr (std::is_same_v) { + // Special handling for half type - convert through hwy::float16_t + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + + // Demote to float16_t + auto r16 = hn::DemoteTo(d16, r); + auto g16 = hn::DemoteTo(d16, g); + auto b16 = hn::DemoteTo(d16, b); + auto a16 = hn::DemoteTo(d16, a); + + // Store interleaved float16_t data + hn::StoreInterleaved4(r16, g16, b16, a16, d16, + reinterpret_cast(ptr)); + } else { + // Generic type demotion - use DemoteStore for each channel then interleave + const size_t N = hn::Lanes(d); + + // Temporary arrays for demoted values + DstT r_demoted[hn::MaxLanes(d)]; + DstT g_demoted[hn::MaxLanes(d)]; + DstT b_demoted[hn::MaxLanes(d)]; + DstT a_demoted[hn::MaxLanes(d)]; + + // Use DemoteStoreN to properly denormalize integer types + DemoteStoreN(d, r_demoted, r, N); + DemoteStoreN(d, g_demoted, g, N); + DemoteStoreN(d, b_demoted, b, N); + DemoteStoreN(d, a_demoted, a, N); + + // Interleave the demoted values + for (size_t i = 0; i < N; ++i) { + ptr[i * 4 + 0] = r_demoted[i]; + ptr[i * 4 + 1] = g_demoted[i]; + ptr[i * 4 + 2] = b_demoted[i]; + ptr[i * 4 + 3] = a_demoted[i]; + } + } +} + +// ----------------------------------------------------------------------- +// Rangecompress/Rangeexpand SIMD Kernels +// ----------------------------------------------------------------------- + +/// Apply rangecompress formula to a SIMD vector. +/// Formula (courtesy Sony Pictures Imageworks): +/// if (|x| <= 0.18) return x +/// else return copysign(a + b * log(c * |x| + 1), x) +/// where a = -0.545768857, b = 0.183516696, c = 284.357788 +/// @param d Highway descriptor tag +/// @param x Input SIMD vector +/// @return Compressed SIMD vector +template +inline auto +rangecompress_simd(D d, VecT x) +{ + using T = typename D::T; + + // Constants from Sony Pictures Imageworks + constexpr T x1 = static_cast(0.18); + constexpr T a = static_cast(-0.54576885700225830078); + constexpr T b = static_cast(0.18351669609546661377); + constexpr T c = static_cast(284.3577880859375); + + auto abs_x = hn::Abs(x); + auto mask_passthrough = hn::Le(abs_x, hn::Set(d, x1)); + + // compressed = a + b * log(c * |x| + 1.0) + auto c_vec = hn::Set(d, c); + auto one = hn::Set(d, static_cast(1.0)); + auto temp = hn::MulAdd(c_vec, abs_x, one); // c * |x| + 1.0 + auto log_val = hn::Log(d, temp); + auto b_vec = hn::Set(d, b); + auto a_vec = hn::Set(d, a); + auto compressed = hn::MulAdd(b_vec, log_val, a_vec); // a + b * log + + // Apply sign of original x + auto result = hn::CopySign(compressed, x); + + // If |x| <= x1, return x; else return compressed + return hn::IfThenElse(mask_passthrough, x, result); +} + +/// Apply rangeexpand formula to a SIMD vector (inverse of rangecompress). 
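+/// Applying rangecompress and then rangeexpand should reproduce the input to
+/// within floating-point rounding error.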
+/// Formula: +/// if (|y| <= 0.18) return y +/// else x = exp((|y| - a) / b); x = (x - 1) / c +/// if x < 0.18 then x = (-x_intermediate - 1) / c +/// return copysign(x, y) +/// @param d Highway descriptor tag +/// @param y Input SIMD vector (compressed values) +/// @return Expanded SIMD vector +template +inline auto +rangeexpand_simd(D d, VecT y) +{ + using T = typename D::T; + + // Constants (same as rangecompress) + constexpr T x1 = static_cast(0.18); + constexpr T a = static_cast(-0.54576885700225830078); + constexpr T b = static_cast(0.18351669609546661377); + constexpr T c = static_cast(284.3577880859375); + + auto abs_y = hn::Abs(y); + auto mask_passthrough = hn::Le(abs_y, hn::Set(d, x1)); + + // x_intermediate = exp((|y| - a) / b) + auto a_vec = hn::Set(d, a); + auto b_vec = hn::Set(d, b); + auto intermediate = hn::Div(hn::Sub(abs_y, a_vec), b_vec); // (|y| - a) / b + auto x_intermediate = hn::Exp(d, intermediate); + + // x = (x_intermediate - 1.0) / c + auto one = hn::Set(d, static_cast(1.0)); + auto c_vec = hn::Set(d, c); + auto x = hn::Div(hn::Sub(x_intermediate, one), c_vec); + + // If x < x1, use alternate solution: (-x_intermediate - 1.0) / c + auto mask_alternate = hn::Lt(x, hn::Set(d, x1)); + auto x_alternate = hn::Div(hn::Sub(hn::Neg(x_intermediate), one), c_vec); + x = hn::IfThenElse(mask_alternate, x_alternate, x); + + // Apply sign of input y + auto result = hn::CopySign(x, y); + + return hn::IfThenElse(mask_passthrough, y, result); +} + +OIIO_NAMESPACE_END diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp index 5707fcd6ac..74ca2b03d7 100644 --- a/src/libOpenImageIO/imagebufalgo_mad.cpp +++ b/src/libOpenImageIO/imagebufalgo_mad.cpp @@ -12,6 +12,7 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" #include "imageio_pvt.h" @@ -21,67 +22,88 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, - ROI roi, int nthreads) +mad_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) +{ + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + ImageBuf::Iterator r(R, roi); + ImageBuf::ConstIterator a(A, roi); + ImageBuf::ConstIterator b(B, roi); + ImageBuf::ConstIterator c(C, roi); + for (; !r.done(); ++r, ++a, ++b, ++c) { + for (int ch = roi.chbegin; ch < roi.chend; ++ch) + r[ch] = a[ch] * b[ch] + c[ch]; + } + }); + return true; +} + + + +template +static bool +mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) { + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + auto Cv = HwyPixels(C); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { - if ((std::is_same::value - || std::is_same::value) - && (std::is_same::value - || std::is_same::value) - // && R.localpixels() // has to be, because it's writable - && A.localpixels() && B.localpixels() - && C.localpixels() - // && R.contains_roi(roi) // has to be, because IBAPrep - && A.contains_roi(roi) && B.contains_roi(roi) && C.contains_roi(roi) - && roi.chbegin == 0 && roi.chend == R.nchannels() - && roi.chend == A.nchannels() && roi.chend == B.nchannels() - && roi.chend == C.nchannels()) { - // Special case when all inputs are either float or half, with in- - // memory contiguous data and we're operating on the full channel - // range: skip iterators: For these circumstances, we can operate on - // the raw memory very efficiently. 
Otherwise, we will need the - // magic of the the Iterators (and pay the price). - int nxvalues = roi.width() * R.nchannels(); - for (int z = roi.zbegin; z < roi.zend; ++z) - for (int y = roi.ybegin; y < roi.yend; ++y) { - Rtype* rraw = (Rtype*)R.pixeladdr(roi.xbegin, y, z); - const ABCtype* araw - = (const ABCtype*)A.pixeladdr(roi.xbegin, y, z); - const ABCtype* braw - = (const ABCtype*)B.pixeladdr(roi.xbegin, y, z); - const ABCtype* craw - = (const ABCtype*)C.pixeladdr(roi.xbegin, y, z); - OIIO_DASSERT(araw && braw && craw); - // The straightforward loop auto-vectorizes very well, - // there's no benefit to using explicit SIMD here. - for (int x = 0; x < nxvalues; ++x) - rraw[x] = araw[x] * braw[x] + craw[x]; - // But if you did want to explicitly vectorize, this is - // how it would look: - // int simdend = nxvalues & (~3); // how many float4's? - // for (int x = 0; x < simdend; x += 4) { - // simd::float4 a_simd(araw+x), b_simd(braw+x), c_simd(craw+x); - // simd::float4 r_simd = a_simd * b_simd + c_simd; - // r_simd.store (rraw+x); - // } - // for (int x = simdend; x < nxvalues; ++x) - // rraw[x] = araw[x] * braw[x] + craw[x]; + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels) + && ChannelsContiguous(Cv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const ABCtype* a_row = RoiRowPtr(Av, y, roi); + const ABCtype* b_row = RoiRowPtr(Bv, y, roi); + const ABCtype* c_row = RoiRowPtr(Cv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + // Use Highway SIMD for a*b+c (fused multiply-add) + RunHwyTernaryCmd(r_row, a_row, b_row, c_row, n, + [](auto d, auto a, auto b, + auto c) { + return hn::MulAdd(a, b, c); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const ABCtype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const ABCtype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + const ABCtype* c_ptr = ChannelPtr(Cv, x, y, + roi.chbegin); + for (int ch = 0; ch < nchannels; ++ch) { + r_ptr[ch] = static_cast( + static_cast(a_ptr[ch]) + * static_cast(b_ptr[ch]) + + static_cast(c_ptr[ch])); + } } - } else { - ImageBuf::Iterator r(R, roi); - ImageBuf::ConstIterator a(A, roi); - ImageBuf::ConstIterator b(B, roi); - ImageBuf::ConstIterator c(C, roi); - for (; !r.done(); ++r, ++a, ++b, ++c) { - for (int ch = roi.chbegin; ch < roi.chend; ++ch) - r[ch] = a[ch] * b[ch] + c[ch]; } } }); return true; } +template +static bool +mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, + ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels() && C.localpixels()) + return mad_impl_hwy(R, A, B, C, roi, nthreads); + return mad_impl_scalar(R, A, B, C, roi, nthreads); +} + template @@ -232,11 +254,73 @@ ImageBufAlgo::mad(Image_or_Const A, Image_or_Const B, Image_or_Const C, ROI roi, +// Highway SIMD implementation for invert: 1 - x +template +static bool +invert_impl_hwy(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels); + + for (int y = roi.ybegin; 
y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyUnaryCmd( + r_row, a_row, n, [](auto d, auto va) { + auto one = hn::Set(d, static_cast(1.0)); + return hn::Sub(one, va); + }); + } else { + // Non-contiguous fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + 1.0f - static_cast(a_ptr[c])); + } + } + } + } + }); + return true; +} + + +// Dispatcher for invert +template +static bool +invert_impl(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return invert_impl_hwy(R, A, roi, nthreads); + + // Scalar fallback: use mad(A, -1.0, 1.0) + return ImageBufAlgo::mad(R, A, -1.0, 1.0, roi, nthreads); +} + + bool ImageBufAlgo::invert(ImageBuf& dst, const ImageBuf& A, ROI roi, int nthreads) { - // Calculate invert as simply 1-A == A*(-1)+1 - return mad(dst, A, -1.0, 1.0, roi, nthreads); + OIIO::pvt::LoggedTimer logtime("IBA::invert"); + if (!IBAprep(roi, &dst, &A)) + return false; + bool ok; + OIIO_DISPATCH_COMMON_TYPES2(ok, "invert", invert_impl, dst.spec().format, + A.spec().format, dst, A, roi, nthreads); + return ok; } diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp index 4fa1a6cba0..8c5c4582e8 100644 --- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp +++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp @@ -12,6 +12,7 @@ #include +#include "imagebufalgo_hwy_pvt.h" #include #include #include @@ -86,8 +87,8 @@ ImageBufAlgo::scale(const ImageBuf& A, const ImageBuf& B, KWArgs options, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -104,7 +105,8 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator a(A, roi); @@ -117,6 +119,100 @@ mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Mul(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, 
+ roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + * static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] * (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return mul_impl_hwy(R, A, B, roi, nthreads); + return mul_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return mul_impl_hwy(R, A, b, roi, nthreads); + return mul_impl_scalar(R, A, b, roi, nthreads); +} + static bool mul_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -198,8 +294,8 @@ ImageBufAlgo::mul(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +div_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -216,6 +312,69 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, +template +static bool +div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd( + r_row, a_row, b_row, n, [](auto d, auto a, auto b) { + // Check for zero division: if b == 0, return 0 + auto zero = hn::Zero(d); + auto mask = hn::Eq(b, zero); + return hn::IfThenElse(mask, zero, hn::Div(a, b)); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + float v = static_cast(b_ptr[c]); + r_ptr[c] = (v == 0.0f) + ? 
static_cast(0.0f) + : static_cast( + static_cast(a_ptr[c]) / v); + } + } + } + } + }); + return true; +} + +template +static bool +div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return div_impl_hwy(R, A, B, roi, nthreads); + return div_impl_scalar(R, A, B, roi, nthreads); +} + + + bool ImageBufAlgo::div(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) diff --git a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp index 04daeaad27..1be1acc9ad 100644 --- a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp +++ b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp @@ -19,6 +19,7 @@ #include #include +#include "imagebufalgo_hwy_pvt.h" #include "imageio_pvt.h" @@ -27,8 +28,8 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +min_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -45,7 +46,8 @@ min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +min_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -59,6 +61,155 @@ min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer min (scale-invariant, no float conversion) +template +static bool +min_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer min - much faster than float conversion! 
+ size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Min(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = std::min(a_ptr[c], b_ptr[c]); + } + } + } + } + }); + return true; +} + +template +static bool +min_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Min(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::min(static_cast(a_ptr[c]), + static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +min_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::min((SimdType)a_ptr[c], + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +min_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant min when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return min_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return min_impl_hwy(R, A, B, roi, nthreads); + } + return min_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +min_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return min_impl_hwy(R, A, b, roi, nthreads); + return min_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::min(ImageBuf& dst, 
Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -124,8 +275,8 @@ ImageBufAlgo::min(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +max_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -142,7 +293,8 @@ max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +max_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -156,6 +308,155 @@ max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +// Native integer max (scale-invariant, no float conversion) +template +static bool +max_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer max - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Max(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = std::max(a_ptr[c], b_ptr[c]); + } + } + } + } + }); + return true; +} + +template +static bool +max_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Max(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::max(static_cast(a_ptr[c]), + static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +max_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + 
auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::max((SimdType)a_ptr[c], + (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +max_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant max when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return max_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return max_impl_hwy(R, A, B, roi, nthreads); + } + return max_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +max_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return max_impl_hwy(R, A, b, roi, nthreads); + return max_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::max(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -221,8 +522,8 @@ ImageBufAlgo::max(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, - bool clampalpha01, ROI roi, int nthreads) +clamp_scalar(ImageBuf& dst, const ImageBuf& src, const float* min, + const float* max, bool clampalpha01, ROI roi, int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator s(src, roi); @@ -240,6 +541,105 @@ clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, } +template +static bool +clamp_hwy(ImageBuf& dst, const ImageBuf& src, const float* min_vals, + const float* max_vals, bool clampalpha01, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Dv = HwyPixels(dst); + auto Sv = HwyPixels(src); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Dv, nchannels) + && ChannelsContiguous(Sv, nchannels); + + // Set up Highway + const hn::ScalableTag d; + size_t lanes = hn::Lanes(d); + + // Pre-compute min/max pattern repeated to fill vector lanes + // Pattern: [min[0], min[1], ..., min[nch-1], min[0], min[1], ...] 
+ MathT min_pattern[hn::MaxLanes(d)]; + MathT max_pattern[hn::MaxLanes(d)]; + for (size_t i = 0; i < lanes; ++i) { + int ch = static_cast(i % nchannels); + min_pattern[i] = static_cast(min_vals[roi.chbegin + ch]); + max_pattern[i] = static_cast(max_vals[roi.chbegin + ch]); + } + auto v_min = hn::Load(d, min_pattern); + auto v_max = hn::Load(d, max_pattern); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Dtype* d_row = RoiRowPtr(Dv, y, roi); + const Stype* s_row = RoiRowPtr(Sv, y, roi); + + if (contig) { + size_t total = static_cast(roi.width()) * nchannels; + size_t x = 0; + + // Process full vectors (pattern wraps correctly even when lanes % nchannels != 0) + if (nchannels > 0) { + for (; x + lanes <= total; x += lanes) { + auto va = LoadPromote(d, s_row + x); + auto res = hn::Clamp(va, v_min, v_max); + DemoteStore(d, d_row + x, res); + } + } + + // Handle remaining values with partial vector load/store + if (x < total) { + size_t remaining = total - x; + auto va = LoadPromoteN(d, s_row + x, remaining); + auto res = hn::Clamp(va, v_min, v_max); + DemoteStoreN(d, d_row + x, res, remaining); + } + } else { + // Non-contiguous: scalar fallback per pixel + for (int x = roi.xbegin; x < roi.xend; ++x) { + Dtype* d_ptr = reinterpret_cast( + PixelBase(Dv, x, y)); + const Stype* s_ptr = reinterpret_cast( + PixelBase(Sv, x, y)); + for (int c = roi.chbegin; c < roi.chend; ++c) { + d_ptr[c] = static_cast( + OIIO::clamp(static_cast(s_ptr[c]), + min_vals[c], max_vals[c])); + } + } + } + } + + // Handle clampalpha01 separately (clamp alpha to [0,1]) + int a = src.spec().alpha_channel; + if (clampalpha01 && a >= roi.chbegin && a < roi.chend) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Dtype* d_ptr = reinterpret_cast( + PixelBase(Dv, x, y)); + d_ptr[a] = static_cast( + OIIO::clamp(static_cast(d_ptr[a]), 0.0f, + 1.0f)); + } + } + } + }); + return true; +} + + +template +static bool +clamp_(ImageBuf& dst, const ImageBuf& src, const float* min, const float* max, + bool clampalpha01, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels()) + return clamp_hwy(dst, src, min, max, clampalpha01, roi, nthreads); + return clamp_scalar(dst, src, min, max, clampalpha01, roi, nthreads); +} + + bool ImageBufAlgo::clamp(ImageBuf& dst, const ImageBuf& src, cspan min, @@ -277,8 +677,8 @@ ImageBufAlgo::clamp(const ImageBuf& src, cspan min, cspan max, template static bool -absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +absdiff_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -294,8 +694,8 @@ absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, - int nthreads) +absdiff_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -309,6 +709,170 @@ absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, +// Native integer absdiff (scale-invariant, no float conversion) +template +static bool +absdiff_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + 
const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer absdiff - much faster than float conversion! + // AbsDiff(a,b) = |a - b| = max(a,b) - min(a,b) + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt( + r_row, a_row, b_row, n, [](auto d, auto a, auto b) { + if constexpr (std::is_unsigned_v) { + return hn::Sub(hn::Max(a, b), hn::Min(a, b)); + } else { + return hn::Abs(hn::SaturatedSub(a, b)); + } + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (a_ptr[c] > b_ptr[c]) + ? (a_ptr[c] - b_ptr[c]) + : (b_ptr[c] - a_ptr[c]); + } else { + int64_t diff = (int64_t)a_ptr[c] + - (int64_t)b_ptr[c]; + r_ptr[c] = (T)std::abs(diff); + } + } + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Abs( + hn::Sub(a, b)); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + std::abs(static_cast(a_ptr[c]) + - static_cast(b_ptr[c]))); + } + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)std::abs((SimdType)a_ptr[c] + - (SimdType)b[c]); + } + } + } + }); + return true; +} + +template +static bool +absdiff_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() 
+ && B.localpixels()) { + // Use native integer path for scale-invariant absdiff when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return absdiff_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return absdiff_impl_hwy(R, A, B, roi, nthreads); + } + return absdiff_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +absdiff_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return absdiff_impl_hwy(R, A, b, roi, nthreads); + return absdiff_impl_scalar(R, A, b, roi, nthreads); +} + + + bool ImageBufAlgo::absdiff(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) @@ -396,7 +960,8 @@ ImageBufAlgo::abs(const ImageBuf& A, ROI roi, int nthreads) template static bool -pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +pow_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator a(A, roi); @@ -408,6 +973,100 @@ pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) } + +template +static bool +pow_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + bool scalar_pow = (b.size() == 1); + float p_val = b[0]; + + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + + if (contig && scalar_pow) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyUnaryCmd( + r_row, a_row, n, [p_val](auto d, auto va) { + auto vpow = hn::Set(d, static_cast(p_val)); + return hn::Exp(d, hn::Mul(vpow, hn::Log(d, va))); + }); + } else { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v ? 1.0f / 65535.0f + : std::is_same_v ? 1.0f / 32767.0f + : std::is_same_v ? 1.0f / 255.0f + : std::is_same_v ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 
127.0f + : 255.0f) + : 1.0f; + + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + using SimdType + = std::conditional_t, + double, float>; + SimdType normalized = static_cast(a_ptr[c]) + * norm_factor; + SimdType result + = pow(normalized, + static_cast(b[roi.chbegin + c])); + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[c] = static_cast(result * denorm_factor + + 0.5f); + } else { + r_ptr[c] = static_cast(result + * denorm_factor); + } + } + } + } + } + }); + return true; +} + +template +static bool +pow_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return pow_impl_hwy(R, A, b, roi, nthreads); + return pow_impl_scalar(R, A, b, roi, nthreads); +} + + bool ImageBufAlgo::pow(ImageBuf& dst, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -604,11 +1263,179 @@ rangeexpand(float y) +template +static bool +rangecompress_hwy(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Luma weights + constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f; + + // Check if luma mode is viable + bool can_use_luma + = useluma && roi.nchannels() >= 3 + && !(alpha_channel >= roi.chbegin && alpha_channel < roi.chbegin + 3) + && !(z_channel >= roi.chbegin && z_channel < roi.chbegin + 3); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && !can_use_luma && alpha_channel < 0 && z_channel < 0) { + // Per-channel mode with no alpha/z to skip: process all channels + size_t n = static_cast(roi.width()) * nchannels; + RunHwyUnaryCmd( + reinterpret_cast(r_row), + reinterpret_cast(a_row), n, + [](auto d, auto va) { return rangecompress_simd(d, va); }); + } else if (contig && can_use_luma && nchannels >= 3) { + // Luma mode: process RGB with luma-based scaling + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load RGB for N pixels + auto r_vec = LoadPromote(d, a_ptr + x * nchannels + 0); + auto g_vec = LoadPromote(d, a_ptr + x * nchannels + 1); + auto b_vec = LoadPromote(d, a_ptr + x * nchannels + 2); + + // Compute luma: 0.21264*R + 0.71517*G + 0.07219*B + auto luma = hn::MulAdd( + 
hn::Set(d, static_cast(wr)), r_vec, + hn::MulAdd(hn::Set(d, static_cast(wg)), g_vec, + hn::Mul(hn::Set(d, static_cast(wb)), + b_vec))); + + // Compress luma + auto compressed_luma = rangecompress_simd(d, luma); + + // Compute scale = compressed_luma / luma (avoid div by zero) + auto zero = hn::Set(d, static_cast(0.0)); + auto is_zero = hn::Eq(luma, zero); + auto safe_luma = hn::IfThenElse( + is_zero, hn::Set(d, static_cast(1.0)), luma); + auto scale = hn::Div(compressed_luma, safe_luma); + scale = hn::IfThenElse(is_zero, zero, scale); + + // Apply scale to RGB + r_vec = hn::Mul(r_vec, scale); + g_vec = hn::Mul(g_vec, scale); + b_vec = hn::Mul(b_vec, scale); + + // Store RGB + DemoteStore(d, r_ptr + x * nchannels + 0, r_vec); + DemoteStore(d, r_ptr + x * nchannels + 1, g_vec); + DemoteStore(d, r_ptr + x * nchannels + 2, b_vec); + + // Copy remaining channels (alpha, etc.) - scalar + for (size_t i = 0; + i < N && x + static_cast(i) < roi.width(); ++i) { + for (int c = 3; c < nchannels; ++c) { + r_ptr[(x + static_cast(i)) * nchannels + c] + = a_ptr[(x + static_cast(i)) * nchannels + + c]; + } + } + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + float r = static_cast(a_ptr[x * nchannels + 0]); + float g = static_cast(a_ptr[x * nchannels + 1]); + float b = static_cast(a_ptr[x * nchannels + 2]); + float luma = wr * r + wg * g + wb * b; + float scale = luma > 0.0f ? rangecompress(luma) / luma + : 0.0f; + r_ptr[x * nchannels + 0] = static_cast(r * scale); + r_ptr[x * nchannels + 1] = static_cast(g * scale); + r_ptr[x * nchannels + 2] = static_cast(b * scale); + for (int c = 3; c < nchannels; ++c) { + r_ptr[x * nchannels + c] = a_ptr[x * nchannels + c]; + } + } + } else { + // Fallback: scalar per-pixel processing with channel skipping + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + if (can_use_luma) { + float r_val = static_cast(a_pixel[0]); + float g_val = static_cast(a_pixel[1]); + float b_val = static_cast(a_pixel[2]); + float luma = wr * r_val + wg * g_val + wb * b_val; + float scale = luma > 0.0f ? 
rangecompress(luma) / luma + : 0.0f; + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * scale); + } + } else { + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast(rangecompress( + static_cast(a_pixel[c]))); + } + } + } + } + } + }); + return true; +} + + + template static bool rangecompress_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return rangecompress_hwy(R, A, useluma, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& Aspec(A.spec()); int alpha_channel = Aspec.alpha_channel; @@ -672,11 +1499,178 @@ rangecompress_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, +template +static bool +rangeexpand_hwy(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Luma weights + constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f; + + // Check if luma mode is viable + bool can_use_luma + = useluma && roi.nchannels() >= 3 + && !(alpha_channel >= roi.chbegin && alpha_channel < roi.chbegin + 3) + && !(z_channel >= roi.chbegin && z_channel < roi.chbegin + 3); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && !can_use_luma && alpha_channel < 0 && z_channel < 0) { + // Per-channel mode with no alpha/z to skip: process all channels + size_t n = static_cast(roi.width()) * nchannels; + RunHwyUnaryCmd( + reinterpret_cast(r_row), + reinterpret_cast(a_row), n, + [](auto d, auto va) { return rangeexpand_simd(d, va); }); + } else if (contig && can_use_luma && nchannels >= 3) { + // Luma mode: process RGB with luma-based scaling + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load RGB for N pixels + auto r_vec = LoadPromote(d, a_ptr + x * nchannels + 0); + auto g_vec = LoadPromote(d, a_ptr + x * nchannels + 1); + auto b_vec = LoadPromote(d, a_ptr + x * nchannels + 2); + + // Compute luma: 
0.21264*R + 0.71517*G + 0.07219*B + auto luma = hn::MulAdd( + hn::Set(d, static_cast(wr)), r_vec, + hn::MulAdd(hn::Set(d, static_cast(wg)), g_vec, + hn::Mul(hn::Set(d, static_cast(wb)), + b_vec))); + + // Expand luma + auto expanded_luma = rangeexpand_simd(d, luma); + + // Compute scale = expanded_luma / luma (avoid div by zero) + auto zero = hn::Set(d, static_cast(0.0)); + auto is_zero = hn::Eq(luma, zero); + auto safe_luma = hn::IfThenElse( + is_zero, hn::Set(d, static_cast(1.0)), luma); + auto scale = hn::Div(expanded_luma, safe_luma); + scale = hn::IfThenElse(is_zero, zero, scale); + + // Apply scale to RGB + r_vec = hn::Mul(r_vec, scale); + g_vec = hn::Mul(g_vec, scale); + b_vec = hn::Mul(b_vec, scale); + + // Store RGB + DemoteStore(d, r_ptr + x * nchannels + 0, r_vec); + DemoteStore(d, r_ptr + x * nchannels + 1, g_vec); + DemoteStore(d, r_ptr + x * nchannels + 2, b_vec); + + // Copy remaining channels (alpha, etc.) - scalar + for (size_t i = 0; + i < N && x + static_cast(i) < roi.width(); ++i) { + for (int c = 3; c < nchannels; ++c) { + r_ptr[(x + static_cast(i)) * nchannels + c] + = a_ptr[(x + static_cast(i)) * nchannels + + c]; + } + } + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + float r = static_cast(a_ptr[x * nchannels + 0]); + float g = static_cast(a_ptr[x * nchannels + 1]); + float b = static_cast(a_ptr[x * nchannels + 2]); + float luma = wr * r + wg * g + wb * b; + float scale = luma > 0.0f ? rangeexpand(luma) / luma : 0.0f; + r_ptr[x * nchannels + 0] = static_cast(r * scale); + r_ptr[x * nchannels + 1] = static_cast(g * scale); + r_ptr[x * nchannels + 2] = static_cast(b * scale); + for (int c = 3; c < nchannels; ++c) { + r_ptr[x * nchannels + c] = a_ptr[x * nchannels + c]; + } + } + } else { + // Fallback: scalar per-pixel processing with channel skipping + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + if (can_use_luma) { + float r_val = static_cast(a_pixel[0]); + float g_val = static_cast(a_pixel[1]); + float b_val = static_cast(a_pixel[2]); + float luma = wr * r_val + wg * g_val + wb * b_val; + float scale = luma > 0.0f ? 
rangeexpand(luma) / luma + : 0.0f; + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * scale); + } + } else { + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) + r_pixel[c] = a_pixel[c]; + else + r_pixel[c] = static_cast(rangeexpand( + static_cast(a_pixel[c]))); + } + } + } + } + } + }); + return true; +} + + + template static bool rangeexpand_(ImageBuf& R, const ImageBuf& A, bool useluma, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return rangeexpand_hwy(R, A, useluma, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& Aspec(A.spec()); int alpha_channel = Aspec.alpha_channel; @@ -800,6 +1794,12 @@ template static bool unpremult_(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return unpremult_hwy(R, A, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { int alpha_channel = A.spec().alpha_channel; int z_channel = A.spec().z_channel; @@ -873,11 +1873,389 @@ ImageBufAlgo::unpremult(const ImageBuf& src, ROI roi, int nthreads) +template +static bool +premult_hwy(ImageBuf& R, const ImageBuf& A, bool preserve_alpha0, ROI roi, + int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Check if we can use the RGBA fast path + bool can_use_rgba_simd = (nchannels == 4 && alpha_channel == 3 + && z_channel < 0 && roi.chbegin == 0); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && can_use_rgba_simd) { + // RGBA fast path: interleaved load/store + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load N RGBA pixels + auto [r_vec, g_vec, b_vec, a_vec] + = LoadInterleaved4Promote( + d, a_ptr + x * 4); + + // Premultiply: RGB *= A + if (preserve_alpha0) { + auto zero = hn::Set(d, static_cast(0.0)); + auto one = hn::Set(d, 
static_cast(1.0)); + auto is_zero = hn::Eq(a_vec, zero); + auto is_one = hn::Eq(a_vec, one); + auto skip_mask = hn::Or(is_zero, is_one); + + r_vec = hn::IfThenElse(skip_mask, r_vec, + hn::Mul(r_vec, a_vec)); + g_vec = hn::IfThenElse(skip_mask, g_vec, + hn::Mul(g_vec, a_vec)); + b_vec = hn::IfThenElse(skip_mask, b_vec, + hn::Mul(b_vec, a_vec)); + } else { + auto one = hn::Set(d, static_cast(1.0)); + auto is_one = hn::Eq(a_vec, one); + + r_vec = hn::IfThenElse(is_one, r_vec, + hn::Mul(r_vec, a_vec)); + g_vec = hn::IfThenElse(is_one, g_vec, + hn::Mul(g_vec, a_vec)); + b_vec = hn::IfThenElse(is_one, b_vec, + hn::Mul(b_vec, a_vec)); + } + // a_vec unchanged + + // Store N RGBA pixels + StoreInterleaved4Demote(d, + r_ptr + x * 4, + r_vec, g_vec, + b_vec, a_vec); + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v + ? 1.0f / 65535.0f + : std::is_same_v + ? 1.0f / 32767.0f + : std::is_same_v + ? 1.0f / 255.0f + : std::is_same_v + ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v + ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 127.0f + : 255.0f) + : 1.0f; + + float alpha = static_cast(a_ptr[x * 4 + 3]) + * norm_factor; + if ((preserve_alpha0 && alpha == 0.0f) || alpha == 1.0f) { + if (&R != &A) { + r_ptr[x * 4 + 0] = a_ptr[x * 4 + 0]; + r_ptr[x * 4 + 1] = a_ptr[x * 4 + 1]; + r_ptr[x * 4 + 2] = a_ptr[x * 4 + 2]; + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + continue; + } + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[x * 4 + 0] = static_cast( + static_cast(a_ptr[x * 4 + 0]) * norm_factor + * alpha * denorm_factor + + 0.5f); + r_ptr[x * 4 + 1] = static_cast( + static_cast(a_ptr[x * 4 + 1]) * norm_factor + * alpha * denorm_factor + + 0.5f); + r_ptr[x * 4 + 2] = static_cast( + static_cast(a_ptr[x * 4 + 2]) * norm_factor + * alpha * denorm_factor + + 0.5f); + } else { + r_ptr[x * 4 + 0] = static_cast( + static_cast(a_ptr[x * 4 + 0]) * norm_factor + * alpha * denorm_factor); + r_ptr[x * 4 + 1] = static_cast( + static_cast(a_ptr[x * 4 + 1]) * norm_factor + * alpha * denorm_factor); + r_ptr[x * 4 + 2] = static_cast( + static_cast(a_ptr[x * 4 + 2]) * norm_factor + * alpha * denorm_factor); + } + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + } else { + // Fallback to scalar per-pixel processing + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + float alpha = static_cast(a_pixel[alpha_channel]); + bool skip = (alpha == 1.0f) + || (preserve_alpha0 && alpha == 0.0f); + + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) { + r_pixel[c] = a_pixel[c]; + } else if (skip) { + r_pixel[c] = a_pixel[c]; + } else { + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) * alpha); + } + } + } + } + } + }); + return true; +} + + + +template +static bool +unpremult_hwy(ImageBuf& R, const ImageBuf& A, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + const ImageSpec& Rspec = R.spec(); + const ImageSpec& Aspec = A.spec(); + int 
alpha_channel = Aspec.alpha_channel; + int z_channel = Aspec.z_channel; + int nchannels = roi.chend - roi.chbegin; + + // Check if we can use the RGBA fast path + bool can_use_rgba_simd = (nchannels == 4 && alpha_channel == 3 + && z_channel < 0 && roi.chbegin == 0); + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + size_t r_pixel_bytes = Rspec.pixel_bytes(); + size_t a_pixel_bytes = Aspec.pixel_bytes(); + size_t r_scanline_bytes = Rspec.scanline_bytes(); + size_t a_scanline_bytes = Aspec.scanline_bytes(); + + char* r_base = reinterpret_cast(R.localpixels()); + const char* a_base = reinterpret_cast(A.localpixels()); + + bool contig = (nchannels * sizeof(Rtype) == r_pixel_bytes) + && (nchannels * sizeof(Atype) == a_pixel_bytes); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + char* r_row = r_base + (y - R.ybegin()) * r_scanline_bytes + + (roi.xbegin - R.xbegin()) * r_pixel_bytes; + const char* a_row = a_base + (y - A.ybegin()) * a_scanline_bytes + + (roi.xbegin - A.xbegin()) * a_pixel_bytes; + + r_row += roi.chbegin * sizeof(Rtype); + a_row += roi.chbegin * sizeof(Atype); + + if (contig && can_use_rgba_simd) { + // RGBA fast path: interleaved load/store + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + Rtype* r_ptr = reinterpret_cast(r_row); + const Atype* a_ptr = reinterpret_cast(a_row); + + int x = 0; + for (; x + static_cast(N) <= roi.width(); + x += static_cast(N)) { + // Load N RGBA pixels + auto [r_vec, g_vec, b_vec, a_vec] + = LoadInterleaved4Promote( + d, a_ptr + x * 4); + + // Unpremultiply: RGB /= A (with div-by-zero protection) + auto zero = hn::Set(d, static_cast(0.0)); + auto one = hn::Set(d, static_cast(1.0)); + auto is_zero = hn::Eq(a_vec, zero); + auto is_one = hn::Eq(a_vec, one); + auto skip_mask = hn::Or(is_zero, is_one); + + // Avoid division by zero + auto safe_a = hn::IfThenElse(is_zero, one, a_vec); + + r_vec = hn::IfThenElse(skip_mask, r_vec, + hn::Div(r_vec, safe_a)); + g_vec = hn::IfThenElse(skip_mask, g_vec, + hn::Div(g_vec, safe_a)); + b_vec = hn::IfThenElse(skip_mask, b_vec, + hn::Div(b_vec, safe_a)); + // a_vec unchanged + + // Store N RGBA pixels + StoreInterleaved4Demote(d, + r_ptr + x * 4, + r_vec, g_vec, + b_vec, a_vec); + } + + // Scalar tail for remaining pixels + for (; x < roi.width(); ++x) { + // Normalize: unsigned ints to [0,1], signed ints to [-1,1] + constexpr float norm_factor + = std::is_integral_v + ? (std::is_same_v + ? 1.0f / 4294967295.0f + : std::is_same_v + ? 1.0f / 2147483647.0f + : std::is_same_v + ? 1.0f / 65535.0f + : std::is_same_v + ? 1.0f / 32767.0f + : std::is_same_v + ? 1.0f / 255.0f + : std::is_same_v + ? 1.0f / 127.0f + : 1.0f / 255.0f) + : 1.0f; + constexpr float denorm_factor + = std::is_integral_v + ? (std::is_same_v ? 4294967295.0f + : std::is_same_v + ? 2147483647.0f + : std::is_same_v ? 65535.0f + : std::is_same_v ? 32767.0f + : std::is_same_v ? 255.0f + : std::is_same_v ? 
127.0f + : 255.0f) + : 1.0f; + + float alpha = static_cast(a_ptr[x * 4 + 3]) + * norm_factor; + if (alpha == 0.0f || alpha == 1.0f) { + if (&R != &A) { + r_ptr[x * 4 + 0] = a_ptr[x * 4 + 0]; + r_ptr[x * 4 + 1] = a_ptr[x * 4 + 1]; + r_ptr[x * 4 + 2] = a_ptr[x * 4 + 2]; + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + continue; + } + // Only add rounding offset for integer types + if constexpr (std::is_integral_v) { + r_ptr[x * 4 + 0] = static_cast( + (static_cast(a_ptr[x * 4 + 0]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + r_ptr[x * 4 + 1] = static_cast( + (static_cast(a_ptr[x * 4 + 1]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + r_ptr[x * 4 + 2] = static_cast( + (static_cast(a_ptr[x * 4 + 2]) * norm_factor + / alpha) + * denorm_factor + + 0.5f); + } else { + r_ptr[x * 4 + 0] = static_cast( + (static_cast(a_ptr[x * 4 + 0]) * norm_factor + / alpha) + * denorm_factor); + r_ptr[x * 4 + 1] = static_cast( + (static_cast(a_ptr[x * 4 + 1]) * norm_factor + / alpha) + * denorm_factor); + r_ptr[x * 4 + 2] = static_cast( + (static_cast(a_ptr[x * 4 + 2]) * norm_factor + / alpha) + * denorm_factor); + } + r_ptr[x * 4 + 3] = a_ptr[x * 4 + 3]; + } + } else { + // Fallback to scalar per-pixel processing + for (int x = 0; x < roi.width(); ++x) { + Rtype* r_pixel = reinterpret_cast( + r_row + x * r_pixel_bytes); + const Atype* a_pixel = reinterpret_cast( + a_row + x * a_pixel_bytes); + + float alpha = static_cast(a_pixel[alpha_channel]); + + for (int c = 0; c < nchannels; ++c) { + int abs_c = roi.chbegin + c; + if (abs_c == alpha_channel || abs_c == z_channel) { + r_pixel[c] = a_pixel[c]; + } else if (alpha == 0.0f || alpha == 1.0f) { + r_pixel[c] = a_pixel[c]; + } else { + r_pixel[c] = static_cast( + static_cast(a_pixel[c]) / alpha); + } + } + } + } + } + }); + return true; +} + + + template static bool premult_(ImageBuf& R, const ImageBuf& A, bool preserve_alpha0, ROI roi, int nthreads) { + // Use SIMD fast path if Highway enabled and buffers are in local memory + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) { + return premult_hwy(R, A, preserve_alpha0, roi, nthreads); + } + + // Original scalar implementation for non-local buffers ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { int alpha_channel = A.spec().alpha_channel; int z_channel = A.spec().z_channel; @@ -997,6 +2375,113 @@ allspan(cspan s, const T& v) +// Highway SIMD implementation for contrast_remap (linear stretch only, no sigmoid) +template +static bool +contrast_remap_hwy(ImageBuf& dst, const ImageBuf& src, cspan black, + cspan white, cspan min, cspan max, + bool do_minmax, ROI roi, int nthreads) +{ + using MathT = typename SimdMathType::type; + + auto Dv = HwyPixels(dst); + auto Sv = HwyPixels(src); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Dv, nchannels) + && ChannelsContiguous(Sv, nchannels); + + const hn::ScalableTag d; + size_t lanes = hn::Lanes(d); + + // Pre-compute per-channel constants for linear stretch + // Formula: (x - black) * scale = x * scale + (-black * scale) + // This allows using FMA (MulAdd) instead of Sub + Mul + MathT scale_pattern[hn::MaxLanes(d)]; // 1/(white-black) + MathT offset_pattern[hn::MaxLanes(d)]; // -black * scale + MathT min_pattern[hn::MaxLanes(d)]; + MathT max_pattern[hn::MaxLanes(d)]; + + for (size_t i = 0; i < lanes; ++i) { + int ch = static_cast(i % nchannels); + MathT black_val = static_cast(black[roi.chbegin + ch]); + MathT white_val = 
static_cast(white[roi.chbegin + ch]); + MathT scale = static_cast(1.0) / (white_val - black_val); + scale_pattern[i] = scale; + offset_pattern[i] = -black_val + * scale; // Precompute offset for FMA + min_pattern[i] = static_cast(min[roi.chbegin + ch]); + max_pattern[i] = static_cast(max[roi.chbegin + ch]); + } + auto v_scale = hn::Load(d, scale_pattern); + auto v_offset = hn::Load(d, offset_pattern); + auto v_min = hn::Load(d, min_pattern); + auto v_max = hn::Load(d, max_pattern); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + D* d_row = RoiRowPtr(Dv, y, roi); + const S* s_row = RoiRowPtr(Sv, y, roi); + + if (contig && nchannels > 0) { + size_t total = static_cast(roi.width()) * nchannels; + size_t x = 0; + // SIMD loop - pattern wraps correctly even when lanes % nchannels != 0 + for (; x + lanes <= total; x += lanes) { + auto va = LoadPromote(d, s_row + x); + // Linear stretch using FMA: x * scale + offset + // where offset = -black * scale + auto stretched = hn::MulAdd(va, v_scale, v_offset); + // Optional remap to [min, max]: min + stretched * (max - min) + auto res = do_minmax + ? hn::MulAdd(stretched, + hn::Sub(v_max, v_min), v_min) + : stretched; + DemoteStore(d, d_row + x, res); + } + // Scalar tail for remaining pixels + for (; x < total; ++x) { + int ch = static_cast(x % nchannels); + float val = static_cast(s_row[x]); + float black_val = black[roi.chbegin + ch]; + float white_val = white[roi.chbegin + ch]; + float scale = 1.0f / (white_val - black_val); + float offset = -black_val * scale; + float result = val * scale + offset; + if (do_minmax) { + float min_val = min[roi.chbegin + ch]; + float max_val = max[roi.chbegin + ch]; + result = result * (max_val - min_val) + min_val; + } + d_row[x] = static_cast(result); + } + } else { + // Non-contiguous fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + D* d_ptr = reinterpret_cast(PixelBase(Dv, x, y)); + const S* s_ptr = reinterpret_cast( + PixelBase(Sv, x, y)); + for (int c = roi.chbegin; c < roi.chend; ++c) { + float val = static_cast(s_ptr[c]); + float black_val = black[c]; + float white_val = white[c]; + float scale = 1.0f / (white_val - black_val); + float offset = -black_val * scale; + float result = val * scale + offset; // FMA + if (do_minmax) { + float min_val = min[c]; + float max_val = max[c]; + result = result * (max_val - min_val) + min_val; + } + d_ptr[c] = static_cast(result); + } + } + } + } + }); + return true; +} + + template static bool contrast_remap_(ImageBuf& dst, const ImageBuf& src, cspan black, @@ -1011,6 +2496,14 @@ contrast_remap_(ImageBuf& dst, const ImageBuf& src, cspan black, bool use_sigmoid = !allspan(scontrast, 1.0f); bool do_minmax = !(allspan(min, 0.0f) && allspan(max, 1.0f)); + // Use Highway SIMD for simple linear stretch (no sigmoid) + if (OIIO::pvt::enable_hwy && !use_sigmoid && !same_black_white + && dst.localpixels() && src.localpixels()) { + return contrast_remap_hwy(dst, src, black, white, min, max, + do_minmax, roi, nthreads); + } + + // Scalar fallback for complex cases (sigmoid, thresholding, etc.) 
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { if (same_black_white) { // Special case -- black & white are the same value, which is diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp index 0abbb1ace8..2863d1ae90 100644 --- a/src/libOpenImageIO/imagebufalgo_xform.cpp +++ b/src/libOpenImageIO/imagebufalgo_xform.cpp @@ -21,6 +21,8 @@ #include +#include "imagebufalgo_hwy_pvt.h" + OIIO_NAMESPACE_3_1_BEGIN @@ -932,11 +934,7 @@ ImageBufAlgo::fit(ImageBuf& dst, const ImageBuf& src, KWArgs options, ROI roi, OIIO::pvt::LoggedTimer logtime("IBA::fit"); static const ustring recognized[] = { - filtername_us, - filterwidth_us, - filterptr_us, - fillmode_us, - exact_us, + filtername_us, filterwidth_us, filterptr_us, fillmode_us, exact_us, #if 0 /* Not currently recognized */ wrap_us, edgeclamp_us, @@ -1072,15 +1070,39 @@ ImageBufAlgo::fit(const ImageBuf& src, KWArgs options, ROI roi, int nthreads) template static bool -resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, - int nthreads) +resample_scalar(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) { + // This operates just like the internals of ImageBuf::interppixel(), but + // reuses the provided iterator to avoid the overhead of constructing a new + // one each time. This speeds it up by 20x! The iterator `it` must already + // be associated with `img`, but it need not be positioned correctly. + auto interppixel = + [](const ImageBuf& img, ImageBuf::ConstIterator& it, float x, + float y, span pixel, ImageBuf::WrapMode wrap) -> bool { + int n = std::min(int(pixel.size()), img.spec().nchannels); + float* localpixel = OIIO_ALLOCA(float, n * 4); + float* p[4] = { localpixel, localpixel + n, localpixel + 2 * n, + localpixel + 3 * n }; + x -= 0.5f; + y -= 0.5f; + int xtexel, ytexel; + float xfrac, yfrac; + xfrac = floorfrac(x, &xtexel); + yfrac = floorfrac(y, &ytexel); + it.rerange(xtexel, xtexel + 2, ytexel, ytexel + 2, 0, 1, wrap); + for (int i = 0; i < 4; ++i, ++it) + for (int c = 0; c < n; ++c) + p[i][c] = it[c]; //NOSONAR + bilerp(p[0], p[1], p[2], p[3], xfrac, yfrac, n, pixel.data()); + return true; + }; + OIIO_ASSERT(src.deep() == dst.deep()); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& srcspec(src.spec()); const ImageSpec& dstspec(dst.spec()); int nchannels = src.nchannels(); - bool deep = src.deep(); // Local copies of the source image window, converted to float float srcfx = srcspec.full_x; @@ -1109,25 +1131,10 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, float s = (x - dstfx + 0.5f) * dstpixelwidth; float src_xf = srcfx + s * srcfw; int src_x = ifloor(src_xf); - if (deep) { - srcpel.pos(src_x, src_y, 0); - int nsamps = srcpel.deep_samples(); - OIIO_DASSERT(nsamps == out.deep_samples()); - if (!nsamps || nsamps != out.deep_samples()) - continue; - for (int c = 0; c < nchannels; ++c) { - if (dstspec.channelformat(c) == TypeDesc::UINT32) - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value( - c, samp, srcpel.deep_value_uint(c, samp)); - else - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value(c, samp, - srcpel.deep_value(c, samp)); - } - } else if (interpolate) { + if (interpolate) { // Non-deep image, bilinearly interpolate - src.interppixel(src_xf, src_yf, pel, ImageBuf::WrapClamp); + interppixel(src, srcpel, src_xf, src_yf, pel, + ImageBuf::WrapClamp); for (int c = roi.chbegin; c < roi.chend; ++c) out[c] = pel[c]; } else { @@ -1142,6 +1149,261 @@ 
resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, return true; } +static bool +resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + // If it's deep, figure out the sample allocations first, because + // it's not thread-safe to do that simultaneously with copying the + // values. + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + float dstpixelwidth = 1.0f / dstspec.full_width; + float dstpixelheight = 1.0f / dstspec.full_height; + ImageBuf::ConstIterator srcpel(src, roi); + ImageBuf::Iterator dstpel(dst, roi); + for (; !dstpel.done(); ++dstpel, ++srcpel) { + float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; + float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; + int src_y = ifloor(srcfy + t * srcfh); + int src_x = ifloor(srcfx + s * srcfw); + srcpel.pos(src_x, src_y, 0); + dstpel.set_deep_samples(srcpel.deep_samples()); + } + + OIIO_ASSERT(src.deep() == dst.deep()); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + int nchannels = src.nchannels(); + + // Local copies of the source image window, converted to float + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + ImageBuf::Iterator out(dst, roi); + ImageBuf::ConstIterator srcpel(src); + for (int y = roi.ybegin; y < roi.yend; ++y) { + // s,t are NDC space + float t = (y - dstfy + 0.5f) * dstpixelheight; + // src_xf, src_xf are image space float coordinates + float src_yf = srcfy + t * srcfh; + // src_x, src_y are image space integer coordinates of the floor + int src_y = ifloor(src_yf); + for (int x = roi.xbegin; x < roi.xend; ++x, ++out) { + float s = (x - dstfx + 0.5f) * dstpixelwidth; + float src_xf = srcfx + s * srcfw; + int src_x = ifloor(src_xf); + srcpel.pos(src_x, src_y, 0); + int nsamps = srcpel.deep_samples(); + OIIO_DASSERT(nsamps == out.deep_samples()); + if (!nsamps || nsamps != out.deep_samples()) + continue; + for (int c = 0; c < nchannels; ++c) { + if (dstspec.channelformat(c) == TypeDesc::UINT32) + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value_uint(c, samp)); + else + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value(c, samp)); + } + } + } + }); + + return true; +} + + + +template +static bool +resample_hwy(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + using D = hn::ScalableTag; + using Rebind = hn::Rebind; + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + + // Local copies of the source image window, converted to SimdType + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float 
dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + const size_t src_scanline_bytes = srcspec.scanline_bytes(); + const size_t dst_scanline_bytes = dstspec.scanline_bytes(); + const size_t src_pixel_bytes = srcspec.pixel_bytes(); + const size_t dst_pixel_bytes = dstspec.pixel_bytes(); + + const uint8_t* src_base = (const uint8_t*)src.localpixels(); + uint8_t* dst_base = (uint8_t*)dst.localpixels(); + + D d; + Rebind d_i32; + int N = hn::Lanes(d); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + float t = (y - dstfy + 0.5f) * dstpixelheight; + float src_yf = srcfy + t * srcfh; + // Pixel-center convention: subtract 0.5 before interpolation + src_yf -= 0.5f; + int src_y = ifloor(src_yf); + SimdType fy = (SimdType)(src_yf - src_y); + + // Clamp Y to valid range + int src_y_clamped = clamp(src_y, src.ybegin(), src.yend() - 1); + // Neighbor Y (for bilinear) + int src_y_next_clamped = clamp(src_y + 1, src.ybegin(), + src.yend() - 1); + + // Pre-calculate row pointers + const uint8_t* row0 = src_base + + (src_y_clamped - src.ybegin()) + * src_scanline_bytes; + const uint8_t* row1 = src_base + + (src_y_next_clamped - src.ybegin()) + * src_scanline_bytes; + + uint8_t* dst_row = dst_base + + (y - dst.ybegin()) * dst_scanline_bytes; + + for (int x = roi.xbegin; x < roi.xend; x += N) { + // Handle remaining pixels if less than N + int n = std::min(N, roi.xend - x); + + // Compute src_xf for N pixels + auto idx_i32 = hn::Iota(d_i32, (float)x); + + auto x_simd = hn::ConvertTo(d, idx_i32); + auto s = hn::Mul(hn::Sub(hn::Add(x_simd, + hn::Set(d, (SimdType)0.5f)), + hn::Set(d, (SimdType)dstfx)), + hn::Set(d, (SimdType)dstpixelwidth)); + auto src_xf_vec = hn::MulAdd(s, hn::Set(d, (SimdType)srcfw), + hn::Set(d, (SimdType)srcfx)); + // Pixel-center convention: subtract 0.5 before interpolation + src_xf_vec = hn::Sub(src_xf_vec, hn::Set(d, (SimdType)0.5f)); + + auto src_x_vec = hn::Floor(src_xf_vec); + auto fx = hn::Sub(src_xf_vec, src_x_vec); + auto ix = hn::ConvertTo(d_i32, src_x_vec); + + // Clamp X + auto min_x = hn::Set(d_i32, src.xbegin()); + auto max_x = hn::Set(d_i32, src.xend() - 1); + auto ix0 = hn::Min(hn::Max(ix, min_x), max_x); + auto ix1 + = hn::Min(hn::Max(hn::Add(ix, hn::Set(d_i32, 1)), min_x), + max_x); + + // Adjust to 0-based offset from buffer start + auto x_offset = hn::Sub(ix0, min_x); + auto x1_offset = hn::Sub(ix1, min_x); + + // Loop over channels + for (int c = roi.chbegin; c < roi.chend; ++c) { + // Manual gather loop for now to be safe with types and offsets + SimdType v00_arr[16], v01_arr[16], v10_arr[16], v11_arr[16]; + int32_t x0_arr[16], x1_arr[16]; + hn::Store(x_offset, d_i32, x0_arr); + hn::Store(x1_offset, d_i32, x1_arr); + + for (int i = 0; i < n; ++i) { + size_t off0 = (size_t)x0_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + size_t off1 = (size_t)x1_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + + auto load_val = [](const uint8_t* ptr) -> SimdType { + return (SimdType)(*(const SRCTYPE*)ptr); + }; + + v00_arr[i] = load_val(row0 + off0); + v01_arr[i] = load_val(row0 + off1); + v10_arr[i] = load_val(row1 + off0); + v11_arr[i] = load_val(row1 + off1); + } + + auto val00 = hn::Load(d, v00_arr); + auto val01 = hn::Load(d, v01_arr); + auto val10 = hn::Load(d, v10_arr); + auto val11 = hn::Load(d, v11_arr); + + // Bilinear Interpolation + auto one = hn::Set(d, (SimdType)1.0f); + auto w00 = hn::Mul(hn::Sub(one, fx), + hn::Sub(one, hn::Set(d, fy))); + auto w01 = hn::Mul(fx, hn::Sub(one, hn::Set(d, fy))); + auto w10 = 
hn::Mul(hn::Sub(one, fx), hn::Set(d, fy)); + auto w11 = hn::Mul(fx, hn::Set(d, fy)); + + // Use FMA (Fused Multiply-Add) for better performance + auto res = hn::Mul(val00, w00); + res = hn::MulAdd(val01, w01, + res); // res = res + val01 * w01 + res = hn::MulAdd(val10, w10, + res); // res = res + val10 * w10 + res = hn::MulAdd(val11, w11, + res); // res = res + val11 * w11 + + // Store + SimdType res_arr[16]; + hn::Store(res, d, res_arr); + for (int i = 0; i < n; ++i) { + DSTTYPE* dptr + = (DSTTYPE*)(dst_row + + (size_t)(x - roi.xbegin + i) + * dst_pixel_bytes + + (size_t)c * sizeof(DSTTYPE)); + *dptr = (DSTTYPE)res_arr[i]; + } + } + } + } + }); + return true; +} + +template +static bool +resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels()) + return resample_hwy(dst, src, interpolate, roi, + nthreads); + + return resample_scalar(dst, src, interpolate, roi, + nthreads); +} bool @@ -1155,27 +1417,7 @@ ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate, return false; if (dst.deep()) { - // If it's deep, figure out the sample allocations first, because - // it's not thread-safe to do that simultaneously with copying the - // values. - const ImageSpec& srcspec(src.spec()); - const ImageSpec& dstspec(dst.spec()); - float srcfx = srcspec.full_x; - float srcfy = srcspec.full_y; - float srcfw = srcspec.full_width; - float srcfh = srcspec.full_height; - float dstpixelwidth = 1.0f / dstspec.full_width; - float dstpixelheight = 1.0f / dstspec.full_height; - ImageBuf::ConstIterator srcpel(src, roi); - ImageBuf::Iterator dstpel(dst, roi); - for (; !dstpel.done(); ++dstpel, ++srcpel) { - float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; - float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; - int src_y = ifloor(srcfy + t * srcfh); - int src_x = ifloor(srcfx + s * srcfw); - srcpel.pos(src_x, src_y, 0); - dstpel.set_deep_samples(srcpel.deep_samples()); - } + return resample_deep(dst, src, interpolate, roi, nthreads); } bool ok; diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp index 909f8529d4..aa8babf9b4 100644 --- a/src/libOpenImageIO/imageio.cpp +++ b/src/libOpenImageIO/imageio.cpp @@ -53,6 +53,7 @@ int png_linear_premult(0); int tiff_half(0); int tiff_multithread(1); int dds_bc5normal(0); +int enable_hwy(1); // Enable Google Highway SIMD optimizations by default int limit_channels(1024); int limit_imagesize_MB(std::min(32 * 1024, int(Sysutil::physical_memory() >> 20))); @@ -406,6 +407,10 @@ attribute(string_view name, TypeDesc type, const void* val) dds_bc5normal = *(const int*)val; return true; } + if (name == "enable_hwy" && type == TypeInt) { + enable_hwy = *(const int*)val; + return true; + } if (name == "limits:channels" && type == TypeInt) { limit_channels = *(const int*)val; return true; @@ -612,6 +617,10 @@ getattribute(string_view name, TypeDesc type, void* val) *(int*)val = dds_bc5normal; return true; } + if (name == "enable_hwy" && type == TypeInt) { + *(int*)val = enable_hwy; + return true; + } if (name == "oiio:print_uncaught_errors" && type == TypeInt) { *(int*)val = oiio_print_uncaught_errors; return true;
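
The `enable_hwy` attribute added in `imageio.cpp` gates every Highway code path above at runtime (it defaults to 1). A minimal usage sketch; the wrapper function name is hypothetical, the attribute calls are the standard OIIO global attribute API:

    #include <OpenImageIO/imageio.h>

    // Force ImageBufAlgo onto the *_scalar implementations for this process.
    void use_scalar_paths_only()
    {
        OIIO::attribute("enable_hwy", 0);
        int hwy_on = 1;
        OIIO::getattribute("enable_hwy", hwy_on);  // now reads back 0
    }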
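
The `max` and `absdiff` dispatchers route same-typed integer images to the native-integer runners because both operations commute with the normalization scale: operating on the raw integers gives the same result as promoting to [0,1], operating in float, and demoting again. An illustrative exhaustive check for `uint8_t` (a sketch, not part of the patch):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    int main()
    {
        for (int a = 0; a <= 255; ++a)
            for (int b = 0; b <= 255; ++b) {
                float fa = a / 255.0f, fb = b / 255.0f;
                // max: native integer result == normalized-float result
                auto max_native = (uint8_t)std::max(a, b);
                auto max_float  = (uint8_t)std::lround(std::max(fa, fb) * 255.0f);
                assert(max_native == max_float);
                // absdiff: same scale-invariance property
                auto ad_native = (uint8_t)std::abs(a - b);
                auto ad_float  = (uint8_t)std::lround(std::fabs(fa - fb) * 255.0f);
                assert(ad_native == ad_float);
            }
        return 0;
    }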
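
The contiguous fast path in `pow_impl_hwy` evaluates x^p as exp(p * log(x)) via `hn::Exp` and `hn::Log`. A scalar sketch of the identity it relies on; it holds for x > 0, while zero and negative inputs follow whatever the log/exp routines return:

    #include <cmath>

    // pow(x, p) == exp(p * log(x)) for x > 0.
    inline float pow_via_explog(float x, float p)
    {
        return std::exp(p * std::log(x));
    }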
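
Both luma-mode kernels (`rangecompress_hwy` and `rangeexpand_hwy`) reduce to the same rescale: compute luma from R, G, B, derive a single scale factor from the compressed or expanded luma, and multiply the three channels by it, guarding against division by zero. A scalar sketch that mirrors the tail loops, with `curve` standing in for `rangecompress()`/`rangeexpand()`:

    // Rec.709-style weights, as used in the kernels above.
    inline void luma_rescale(float& r, float& g, float& b, float (*curve)(float))
    {
        constexpr float wr = 0.21264f, wg = 0.71517f, wb = 0.07219f;
        float luma  = wr * r + wg * g + wb * b;
        float scale = (luma > 0.0f) ? curve(luma) / luma : 0.0f;  // avoid /0
        r *= scale;
        g *= scale;
        b *= scale;
    }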
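
For integer pixel types, the RGBA fast paths in `premult_hwy`/`unpremult_hwy` follow the normalized model end to end: promote to [0,1], multiply (or divide) RGB by alpha, then denormalize with round-to-nearest, passing pixels whose alpha is exactly 0 or 1 through unchanged. A scalar sketch of the unpremult case for `uint8_t`, assuming properly premultiplied input (each channel value <= alpha, so the result stays in range):

    #include <cstdint>

    inline uint8_t unpremult_u8(uint8_t value, uint8_t alpha8)
    {
        float a = alpha8 / 255.0f;
        if (a == 0.0f || a == 1.0f)
            return value;                         // matches the skip mask in the SIMD loop
        float v = value / 255.0f;
        return (uint8_t)(v / a * 255.0f + 0.5f);  // denormalize, round for integer types
    }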
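
`contrast_remap_hwy` refactors the linear stretch so the per-element work is a single fused multiply-add: (x - black) / (white - black) becomes x * scale + offset with scale = 1/(white - black) and offset = -black * scale, followed by the optional remap into [min, max]. A scalar sketch of the same algebra:

    // Equivalent scalar form of the vectorized linear stretch.
    inline float remap_linear(float x, float black, float white,
                              float out_lo, float out_hi, bool do_minmax)
    {
        float scale  = 1.0f / (white - black);
        float offset = -black * scale;
        float y = x * scale + offset;              // == (x - black) / (white - black)
        return do_minmax ? out_lo + y * (out_hi - out_lo) : y;
    }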