|
14 | 14 | #include <type_traits> |
15 | 15 | #include <utility> |
16 | 16 |
|
| 17 | +#include "../types/xsimd_batch_constant.hpp" |
17 | 18 | #include "../types/xsimd_rvv_register.hpp" |
18 | 19 | #include "xsimd_constants.hpp" |
19 | 20 |
|
@@ -1514,13 +1515,44 @@ namespace xsimd |
1514 | 1515 | { |
1515 | 1516 | XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool<T, A>::size) |
1516 | 1517 | { |
| 1518 | + // (A) Easy case: one bit per lane fits in a single integer as wide as T. |
1517 | 1519 | const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0)); |
1518 | 1520 | auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1); |
1519 | 1521 | auto iota = detail::vindex<A, as_unsigned_integer_t<T>>(); |
1520 | 1522 | auto upowers = detail::rvvsll(ones, iota); |
1521 | 1523 | auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size); |
1522 | 1524 | return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r); |
1523 | 1525 | } |
| 1526 | + else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool<T, A>::size) |
| 1527 | + { |
| 1528 | + // (B) There are exactly twice as many lanes as bits in T: we need two rounds, one for the low half, one for the high half. |
| 1529 | + |
| 1530 | + struct LowerHalf |
| 1531 | + { |
| 1532 | + static constexpr bool get(unsigned i, unsigned n) { return i < n / 2; } |
| 1533 | + }; |
| 1534 | + struct UpperHalf |
| 1535 | + { |
| 1536 | + static constexpr bool get(unsigned i, unsigned n) { return i >= n / 2; } |
| 1537 | + }; |
| 1538 | + |
| 1539 | + // The low part is similar to the approach in (A). |
| 1540 | + const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0)); |
| 1541 | + auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1); |
| 1542 | + auto iota = detail::vindex<A, as_unsigned_integer_t<T>>(); |
| 1543 | + auto upowers_low = detail::rvvsll(ones, iota); |
| 1544 | + auto low_mask = self & make_batch_bool_constant<T, LowerHalf, A>(); |
| 1545 | + auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers_low, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size); |
| 1546 | + |
| 1547 | + // The high part requires subtracting the bit width of T from the lane index before the shift. |
| 1548 | + auto iota_high = __riscv_vsub(detail::vindex<A, as_unsigned_integer_t<T>>(), 8 * sizeof(T), batch_bool<T, A>::size); |
| 1549 | + auto upowers_high = detail::rvvsll(ones, iota_high); |
| 1550 | + auto high_mask = self & make_batch_bool_constant<T, UpperHalf, A>(); |
| 1551 | + auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers_high, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size); |
| 1552 | + |
| 1553 | + // Combine the two parts: the high half is shifted left by the bit width of T and OR-ed with the low half. |
| 1554 | + return (uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | ((uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << (8 * sizeof(T))); |
| 1555 | + } |
1524 | 1556 | else |
1525 | 1557 | { |
1526 | 1558 | return mask(self, common {}); |
|
0 commit comments