Skip to content

Commit 0132094

Browse files
extra
1 parent 2993dc6 commit 0132094

File tree

1 file changed

+32
-0
lines changed

1 file changed

+32
-0
lines changed

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <type_traits>
1515
#include <utility>
1616

17+
#include "../types/xsimd_batch_constant.hpp"
1718
#include "../types/xsimd_rvv_register.hpp"
1819
#include "xsimd_constants.hpp"
1920

@@ -1514,13 +1515,44 @@ namespace xsimd
15141515
{
15151516
XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool<T, A>::size)
15161517
{
1518+
// (A) Easy case: one bit per lane fits within the bits of a single T.
15171519
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
15181520
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
15191521
auto iota = detail::vindex<A, as_unsigned_integer_t<T>>();
15201522
auto upowers = detail::rvvsll(ones, iota);
15211523
auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
15221524
return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r);
15231525
}
1526+
else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool<T, A>::size)
1527+
{
1528+
// (B) There are exactly twice as many lanes as bits in T, so we need two rounds: one for the low half of the lanes, one for the high half.
1529+
1530+
// Generator type passed to make_batch_bool_constant to build the mask that
// keeps only the low part of `self`.
// NOTE(review): presumably `i` is the lane index and `n` the lane count — confirm
// against the batch_bool_constant generator contract.
struct LowerHalf
1531+
{
1532+
// True when i < n / 2 (integer division), i.e. i lies in the first half of [0, n).
static constexpr bool get(unsigned i, unsigned n) { return i < n / 2; }
1533+
};
1534+
// Generator type passed to make_batch_bool_constant to build the mask that
// keeps only the high part of `self`.
// NOTE(review): presumably `i` is the lane index and `n` the lane count — confirm
// against the batch_bool_constant generator contract.
struct UpperHalf
1535+
{
1536+
// True when i >= n / 2 (integer division), i.e. i lies in the second half of [0, n).
static constexpr bool get(unsigned i, unsigned n) { return i >= n / 2; }
1537+
};
1538+
1539+
// The low part is similar to the approach in (A).
1540+
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
1541+
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
1542+
auto iota = detail::vindex<A, as_unsigned_integer_t<T>>();
1543+
auto upowers_low = detail::rvvsll(ones, iota);
1544+
auto low_mask = self & make_batch_bool_constant<T, LowerHalf, A>();
1545+
auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers_low, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1546+
1547+
// The high part requires subtracting the bit width of T from the lane index before the shift, so the shift amounts start again from 0.
1548+
auto iota_high = __riscv_vsub(detail::vindex<A, as_unsigned_integer_t<T>>(), 8 * sizeof(T), batch_bool<T, A>::size);
1549+
auto upowers_high = detail::rvvsll(ones, iota_high);
1550+
auto high_mask = self & make_batch_bool_constant<T, UpperHalf, A>();
1551+
auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers_high, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1552+
1553+
// Combine the two parts: the low result fills the low bits, the high result is shifted up by the bit width of T.
1554+
return (uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | ((uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << (8 * sizeof(T)));
1555+
}
15241556
else
15251557
{
15261558
return mask(self, common {});

0 commit comments

Comments
 (0)