arm_sve-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ARM SVE[2] vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// If running on hardware whose vector length is known to be a power of two, we
// can skip fixups for non-power-of-two sizes.
#undef HWY_SVE_IS_POW2
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_IS_POW2 1
#else
#define HWY_SVE_IS_POW2 0
#endif
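
// Illustration (hedged sketch, compile-disabled; not used by this header):
// SVE permits non-power-of-two vector lengths such as 384 bits. There,
// svcntb_pat(SV_ALL) would report 48 u8 lanes, while the SV_POW2 pattern
// rounds down to 32; the scalar helper below mirrors that rounding.
#if 0
static size_t RoundDownToPow2(size_t n) {  // e.g. 48 -> 32
  size_t pow2 = 1;
  while (pow2 * 2 <= n) pow2 *= 2;
  return pow2;
}
#endif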

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

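// Illustration of the X-macro pattern (hedged sketch, compile-disabled): each
// X_MACRO receives (BASE, CHAR, BITS, HALF, NAME, OP) and pastes tokens to
// stamp out one overload per type. MY_FOREACH/MY_DEFINE/MyAdd are
// hypothetical names for exposition only.
#if 0
#define MY_FOREACH(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)
#define MY_DEFINE(BASE, CHAR, BITS, HALF, NAME, OP) \
  inline sv##BASE##BITS##_t NAME(sv##BASE##BITS##_t a, \
                                 sv##BASE##BITS##_t b) { \
    return sv##OP##_##CHAR##BITS##_x(svptrue_b##BITS(), a, b); \
  }
// Expands to MyAdd(svuint32_t, svuint32_t) calling svadd_u32_x, and
// MyAdd(svuint64_t, svuint64_t) calling svadd_u64_x.
MY_FOREACH(MY_DEFINE, MyAdd, add)
#endif
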
namespace detail {  // for code folding

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t

}  // namespace detail

#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>; \
  };

HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we only use it when the predicate is ptrue anyway.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b); \
  }

#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
       HWY_SVE_V(BASE, BITS) c) { \
    return sv##OP##_##CHAR##BITS(a, b, c); \
  }

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of
// two.
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
  return svcntb_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
  return svcnth_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
  return svcntw_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
  return svcntd_pat(SV_ALL);
}

// All-true mask from a macro
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)

#if HWY_SVE_IS_POW2
#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
#else
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)

// Returns actual lanes of a hardware vector, rounded down to a power of two.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE size_t HardwareLanes() {
  return svcntb_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE size_t HardwareLanes() {
  return svcnth_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE size_t HardwareLanes() {
  return svcntw_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE size_t HardwareLanes() {
  return svcntd_pat(SV_POW2);
}

#endif  // HWY_SVE_IS_POW2

}  // namespace detail

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
#if HWY_TARGET == HWY_SVE_256
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
}
#elif HWY_TARGET == HWY_SVE2_128
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
}
#else
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::HardwareLanes<T>();
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
}
#endif  // HWY_TARGET
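
// Usage sketch (hedged, compile-disabled): on generic SVE targets Lanes() is
// a runtime value, so loops advance by it instead of a compile-time constant.
// Assumes count is a multiple of Lanes(d); see FirstN below for remainders.
#if 0
void AddArrays(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
               size_t count, float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < count; i += N) {
    StoreU(Add(LoadU(d, a + i), LoadU(d, b + i)), d, out + i);
  }
}
#endif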

// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN

template <class D>
using MFromD = decltype(FirstN(D(), 0));

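// Remainder-handling sketch with FirstN (hedged, compile-disabled): lanes at
// index >= count - i are masked off, so the final partial iteration is safe.
#if 0
void CopyArray(const float* HWY_RESTRICT from, size_t count,
               float* HWY_RESTRICT to) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) StoreU(LoadU(d, from + i), d, to + i);
  const svbool_t m = FirstN(d, count - i);  // possibly empty remainder mask
  BlendedStore(MaskedLoad(m, d, from + i), m, d, to + i);
}
#endif
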
namespace detail {

#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_PTRUE(BITS); \
  } \
  template <size_t N, int kPow2> \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_ALL_PTRUE(BITS); \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) { \
    return sv##OP##_##CHAR##BITS(arg); \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#undef HWY_SVE_SET

// Required for Zero and VFromD
template <size_t N, int kPow2>
HWY_API svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

// ------------------------------ Undefined

#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return sv##OP##_##CHAR##BITS(); \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)

// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
    return v; \
  } \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v; \
  }

// All other types
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_u8_##CHAR##BITS(v); \
  } \
  template <size_t N, int kPow2> \
  HWY_INLINE HWY_SVE_V(BASE, BITS) \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v); \
  }

HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

template <size_t N, int kPow2>
HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
                                      svuint8_t v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().

// ------------------------------ Not
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )  // NOLINT

// ------------------------------ And

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ AndNot

namespace detail {
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
#undef HWY_SVE_RETV_ARGPVN_SWAP
}  // namespace detail

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AndNot(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor3

#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVVV, Xor3, eor3)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor3(const V x1, const V x2, const V x3) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
}

#else
template <class V>
HWY_API V Xor3(V x1, V x2, V x3) {
  return Xor(x1, Xor(x2, x3));
}
#endif

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Need to return original type instead of unsigned.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return BitCast(DFromV<decltype(v)>(), \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
  }
HWY_SVE_FOREACH(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ CopySign[ToAbs]

template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(abs, And(msb, sign));
}

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.
  // TODO(janwas): on SVE2, we can instead use svaddp.
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
}

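// Scalar reference for SumsOf8 (hedged, compile-disabled): each u64 output
// lane equals the sum of the corresponding eight consecutive u8 inputs.
#if 0
void SumsOf8Scalar(const uint8_t* bytes, size_t num_groups, uint64_t* sums) {
  for (size_t g = 0; g < num_groups; ++g) {
    uint64_t sum = 0;
    for (size_t i = 0; i < 8; ++i) sum += bytes[g * 8 + i];
    sums[g] = sum;  // matches one u64 lane of SumsOf8
  }
}
#endif
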
// ------------------------------ SaturatedAdd

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
  } \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

// ------------------------------ RotateRight

// TODO(janwas): svxar on SVE2
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}

// ------------------------------ Shl/r

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du; \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
                                     BitCast(du, bits)); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)

// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
// Not part of API, used internally:
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)

// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}

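// Scalar reference for the generic path above (hedged, compile-disabled):
// MulFixedPoint15 is a Q1.15 product with round-to-nearest, i.e.
// (a*b + 2^14) >> 15 computed in 32-bit arithmetic. Note svqrdmulh_s16
// saturates the single overflow case a == b == -32768; this sketch wraps.
#if 0
int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
  const int32_t product = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  return static_cast<int16_t>((product + 16384) >> 15);
}
#endif
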
// ------------------------------ Div
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
       HWY_SVE_V(BASE, BITS) add) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

// ================================================== MASK

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}

HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
  return svnor_b_z(HWY_SVE_PTRUE(8), a, b);  // !a && !b, undefined if a && b.
}

// ------------------------------ CountTrue

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}

// ------------------------------ FindKnownFirstTrue
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
  return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
}

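// Worked example of the BRKB idiom above (8 lanes, lowest first): for
// m = [0 0 1 0 1 0 0 0], svbrkb_b_z yields [1 1 0 0 0 0 0 0], i.e. all lanes
// *before* the first true lane. CountTrue of that mask is 2, exactly the
// index FindFirstTrue and FindKnownFirstTrue return.
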
// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no); \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class V>
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
template <class V>
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}

// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple)

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

// ------------------------------ Gt/Ge (swapped order)
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return detail::NeN(v, static_cast<TFromV<V>>(0));
}

// ------------------------------ VecFromMask
template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const RebindToSigned<D> di;
  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
  // requires an extra instruction plus M0 pipeline.
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128

#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
       HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(yes, no, mask); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
#undef HWY_SVE_IF_VEC

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#else

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128

// ------------------------------ Floating-point classification (Ne)

template <class V>
HWY_API svbool_t IsNaN(const V v) {
  return Ne(v, v);  // could also use cmpuo
}

template <class V>
HWY_API svbool_t IsInf(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}

// Returns whether normal/subnormal/zero.
template <class V>
HWY_API svbool_t IsFinite(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
}

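// Worked example of the "shift left by adding v to itself" trick (hedged,
// compile-disabled): for f32, +/-inf is exponent all-ones with zero mantissa,
// so doubling the bit pattern discards the sign and leaves exactly
// 0xFF000000, the bit pattern of MaxExponentTimes2<float>().
#if 0
bool IsInfScalar(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // requires <cstring>
  // inf: 0x7F800000 + 0x7F800000 == 0xFF000000. NaNs have nonzero mantissa
  // bits and therefore compare unequal.
  return (bits + bits) == 0xFF000000u;
}
#endif
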
// ================================================== MEMORY

// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream

#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \
  }

#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(m, p); \
  }

#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    /* All-true predicate to load all 128 bits. */ \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
  }

#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \
  }

#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(m, p, v); \
  }

HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq)
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)

#undef HWY_SVE_LOAD
#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_STORE
#undef HWY_SVE_BLENDED_STORE

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
template <size_t N, int kPow2>
HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
                        const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ Load/StoreU

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ ScatterOffset/Index

#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
                    HWY_SVE_V(int, BITS) offset) { \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
                                          v); \
  }

#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME( \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

// ------------------------------ GatherOffset/Index

#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
       HWY_SVE_V(int, BITS) offset) { \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
                                                 offset); \
  }
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
       HWY_SVE_V(int, BITS) index) { \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
                                                index); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX

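// Usage sketch (hedged, compile-disabled): gather f32 elements at signed
// 32-bit indices; lane i of the result is base[index[i]].
#if 0
void GatherExample(const float* HWY_RESTRICT base, float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  const RebindToSigned<decltype(d)> di;
  const auto index = Iota(di, 0);              // 0, 1, 2, ... (identity)
  const auto v = GatherIndex(d, base, index);  // equivalent to LoadU(d, base)
  StoreU(v, d, out);
}
#endif
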
// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
    const sv##BASE##BITS##x2_t tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget2(tuple, 0); \
    v1 = svget2(tuple, 1); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

#undef HWY_SVE_LOAD2

// ------------------------------ LoadInterleaved3

#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2) { \
    const sv##BASE##BITS##x3_t tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget3(tuple, 0); \
    v1 = svget3(tuple, 1); \
    v2 = svget3(tuple, 2); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

#undef HWY_SVE_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
    const sv##BASE##BITS##x4_t tuple = \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
    v0 = svget4(tuple, 0); \
    v1 = svget4(tuple, 1); \
    v2 = svget4(tuple, 2); \
    v3 = svget4(tuple, 3); \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

#undef HWY_SVE_LOAD4

// ------------------------------ StoreInterleaved2

#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)

#undef HWY_SVE_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)

#undef HWY_SVE_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    const sv##BASE##BITS##x4_t quad = \
        svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad); \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)

#undef HWY_SVE_STORE4

// ================================================== CONVERT

// ------------------------------ PromoteTo

// Same sign
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }

HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)

// 2x
template <size_t N, int kPow2>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}

// Sign change
template <size_t N, int kPow2>
HWY_API svint16_t PromoteTo(Simd<int16_t, N, kPow2> dto, svuint8_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint16_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
  const Repartition<int16_t, decltype(du16)> di16;
  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
}

// ------------------------------ PromoteTo F

// Unlike Highway's ZipLower, this returns the same type.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
}  // namespace detail

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* tag */,
                              const svfloat16_t v) {
  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
  // first replicate each lane once.
  const svfloat16_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* tag */,
                              const svfloat32_t v) {
  const svfloat32_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* tag */,
                              const svint32_t v) {
  const svint32_t vv = detail::ZipLowerSame(v, v);
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}

// For 16-bit Compress
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
#undef HWY_SVE_PROMOTE_TO

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
  const RebindToUnsigned<decltype(df)> du;
  const RepartitionToNarrow<decltype(du)> dn;
  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
}

}  // namespace detail

// ------------------------------ DemoteTo U

namespace detail {

// Saturates unsigned vectors to half/quarter-width TN.
template <typename TN, class VU>
VU SaturateU(VU v) {
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}

// Saturates signed vectors to half/quarter-width TN.
template <typename TN, class VI>
VI SaturateI(VI v) {
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
}

}  // namespace detail
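
// Worked example (hedged): demoting svint16_t lanes {300, -5, 17} to u8 first
// clamps negatives via MaxN(v, 0) -> {300, 0, 17}, then SaturateU<uint8_t>
// caps at LimitsMax<uint8_t>() = 255 -> {255, 0, 17}; the final uzp1 packs
// the low byte of every lane into the lower half.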

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const RepartitionToNarrow<decltype(du)> d2;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and quarter the width.
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);
}

HWY_API svuint8_t U8FromU32(const svuint32_t v) {
  const DFromV<svuint32_t> du32;
  const RepartitionToNarrow<decltype(du32)> du16;
  const RepartitionToNarrow<decltype(du16)> du8;

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);
}

// ------------------------------ Truncations

template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint64_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  const svuint8_t v2 = svuzp1_u8(v1, v1);
  const svuint8_t v3 = svuzp1_u8(v2, v2);
  return svuzp1_u8(v3, v3);
}

template <size_t N, int kPow2>
HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
                              const svuint64_t v) {
  const DFromV<svuint16_t> d;
  const svuint16_t v1 = BitCast(d, v);
  const svuint16_t v2 = svuzp1_u16(v1, v1);
  return svuzp1_u16(v2, v2);
}

template <size_t N, int kPow2>
HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
                              const svuint64_t v) {
  const DFromV<svuint32_t> d;
  const svuint32_t v1 = BitCast(d, v);
  return svuzp1_u32(v1, v1);
}

template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint32_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  const svuint8_t v2 = svuzp1_u8(v1, v1);
  return svuzp1_u8(v2, v2);
}

template <size_t N, int kPow2>
HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
                              const svuint32_t v) {
  const DFromV<svuint16_t> d;
  const svuint16_t v1 = BitCast(d, v);
  return svuzp1_u16(v1, v1);
}

template <size_t N, int kPow2>
HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
                             const svuint16_t v) {
  const DFromV<svuint8_t> d;
  const svuint8_t v1 = BitCast(d, v);
  return svuzp1_u8(v1, v1);
}

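// Worked example (hedged): after BitCast, u8 lane 4k holds the low byte of
// u32 lane k. Each svuzp1_u8 keeps the even-indexed lanes, halving the byte
// stride, so two passes leave only the lowest byte of every u32, packed into
// the lower quarter of the vector (three passes for u64 -> u8).
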
// ------------------------------ DemoteTo I

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
  const RepartitionToWide<decltype(dn)> d2;
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
#endif
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
}

// ------------------------------ ConcatEven/ConcatOdd

// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
// full vector length, not rounded down to a power of two as we require).
namespace detail {

#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi); \
  }
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
#endif
#undef HWY_SVE_CONCAT_EVERY_SECOND

// Used to slide up / shift whole register left; mask indicates which range
// to take from lo, and the rest is filled from hi starting at its lowest.
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, lo, hi); \
  }
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
#undef HWY_SVE_SPLICE

}  // namespace detail

template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatOddFull(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatEvenFull(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

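// Worked example (hedged, 4 lanes per vector): with hi = {h0 h1 h2 h3} and
// lo = {l0 l1 l2 l3} (lane 0 first), ConcatOdd returns {l1 l3 h1 h3} and
// ConcatEven returns {l0 l2 h0 h2}. On non-power-of-two lengths, Splice
// stitches the two half-results together at lane Lanes(d) / 2.
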
// ------------------------------ DemoteTo F

template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
  return detail::ConcatEvenFull(in_even, in_even);  // lower half
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
  return detail::ConcatOddFull(in_even, in_even);  // lower half
}

template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEvenFull(in_even, in_even);  // lower half
}

template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEvenFull(in_even, in_even);  // lower half
}

// ------------------------------ ConvertTo F

#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
  /* signed integers */ \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  } \
  /* unsigned integers */ \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  } \
  /* Truncates (rounds toward zero). */ \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(int, BITS) \
  NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  }

// API only requires f32 but we provide f64 for use by Iota.
HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
#undef HWY_SVE_CONVERT

// ------------------------------ NearestInt (Round, ConvertTo)
template <class VF, class DI = RebindToSigned<DFromV<VF>>>
HWY_API VFromD<DI> NearestInt(VF v) {
  // No single instruction, round then truncate.
  return ConvertTo(DI(), Round(v));
}

// ------------------------------ Iota (Add, ConvertTo)

#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) first) { \
    return sv##OP##_##CHAR##BITS(first, 1); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
#undef HWY_SVE_IOTA

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
  const RebindToSigned<D> di;
  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
}

// ------------------------------ InterleaveLower

template <class D, class V>
HWY_API V InterleaveLower(D d, const V a, const V b) {
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipLowerSame(a, b);
#else
  // Move lower halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatEvenFull(a64, a64);  // lower half
  const auto b_blocks = detail::ConcatEvenFull(b64, b64);
  return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

template <class V>
HWY_API V InterleaveLower(const V a, const V b) {
  return InterleaveLower(DFromV<V>(), a, b);
}

// ------------------------------ InterleaveUpper

// Only use zip2 if vectors are a power of two; otherwise obtaining the actual
// "upper half" requires MaskUpperHalf.
#if HWY_TARGET == HWY_SVE2_128
namespace detail {
// Unlike Highway's ZipUpper, this returns the same type.
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
}  // namespace detail
#endif

// Full vector: guaranteed to have at least one block
template <class D, class V = VFromD<D>,
          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipUpperSame(a, b);
#else
  // Move upper halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatOddFull(a64, a64);  // lower half
  const auto b_blocks = detail::ConcatOddFull(b64, b64);
  return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

// Capped/fraction: need runtime check
template <class D, class V = VFromD<D>,
          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
  // Less than one block: treat as capped
  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
    const Half<decltype(d)> d2;
    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
  }
  return InterleaveUpper(DFromV<V>(), a, b);
}

// ================================================== COMBINE

namespace detail {

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 32:
      return svptrue_pat_b8(SV_VL16);
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b16(SV_VL8);
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b32(SV_VL4);
    case 4:
      return svptrue_pat_b32(SV_VL2);
    default:
      return svptrue_pat_b32(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 4:
      return svptrue_pat_b64(SV_VL2);
    default:
      return svptrue_pat_b64(SV_VL1);
  }
}
#endif
#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D /*d*/) {
  return svptrue_pat_b64(SV_VL1);
}
#endif  // HWY_TARGET == HWY_SVE2_128
#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
template <class D>
svbool_t MaskLowerHalf(D d) {
  return FirstN(d, Lanes(d) / 2);
}
#endif

template <class D>
svbool_t MaskUpperHalf(D d) {
  // TODO(janwas): WHILEGE on pow2 SVE2
  if (HWY_SVE_IS_POW2 && IsFull(d)) {
    return Not(MaskLowerHalf(d));
  }

  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
  return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
}

// Right-shift vector pair by constexpr; can be used to slide down (=N) or up
// (=Lanes()-N).
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t kIndex> \
  HWY_API HWY_SVE_V(BASE, BITS) \
  NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
  }
HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
#undef HWY_SVE_EXT

}  // namespace detail

1788// ------------------------------ ConcatUpperLower
1789template <class D, class V>
1790HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
1791 return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
1792}
1793
1794// ------------------------------ ConcatLowerLower
1795template <class D, class V>
1796HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
1797 if (detail::IsFull(d)) {
1798#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1799 return detail::ConcatEvenBlocks(hi, lo);
1800#endif
1801#if HWY_TARGET == HWY_SVE2_128
1802 const Repartition<uint64_t, D> du64;
1803 const auto lo64 = BitCast(du64, lo);
1804 return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
1805#endif
1806 }
1807 return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
1808}
1809
1810// ------------------------------ ConcatLowerUpper
1811template <class D, class V>
1812HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
1813#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
1814 if (detail::IsFull(d)) {
1815 return detail::Ext<Lanes(d) / 2>(hi, lo);
1816 }
1817#endif
1818 return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
1819}
1820
1821// ------------------------------ ConcatUpperUpper
1822template <class D, class V>
1823HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
1824 if (detail::IsFull(d)) {
1825#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1826 return detail::ConcatOddBlocks(hi, lo);
1827#endif
1828#if HWY_TARGET == HWY_SVE2_128
1829 const Repartition<uint64_t, D> du64;
1830 const auto lo64 = BitCast(du64, lo);
1831 return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
1832#endif
1833 }
1834 const svbool_t mask_upper = detail::MaskUpperHalf(d);
1835 const V lo_upper = detail::Splice(lo, lo, mask_upper);
1836 return IfThenElse(mask_upper, hi, lo_upper);
1837}
1838
1839// ------------------------------ Combine
1840template <class D, class V2>
1841HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
1842 return ConcatLowerLower(d, hi, lo);
1843}
1844
1845// ------------------------------ ZeroExtendVector
1846template <class D, class V>
1847HWY_API V ZeroExtendVector(const D d, const V lo) {
1848 return Combine(d, Zero(Half<D>()), lo);
1849}
1850
1851// ------------------------------ Lower/UpperHalf
1852
1853template <class D2, class V>
1854HWY_API V LowerHalf(D2 /* tag */, const V v) {
1855 return v;
1856}
1857
1858template <class V>
1859HWY_API V LowerHalf(const V v) {
1860 return v;
1861}
1862
1863template <class DH, class V>
1864HWY_API V UpperHalf(const DH dh, const V v) {
1865 const Twice<decltype(dh)> d;
1866 // Cast so that we support bfloat16_t.
1867 const RebindToUnsigned<decltype(d)> du;
1868 const VFromD<decltype(du)> vu = BitCast(du, v);
1869#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
1870 return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
1871#else
1872 const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
1873 return BitCast(d, detail::Splice(vu, vu, mask));
1874#endif
1875}
1876
1877// ================================================== REDUCE
1878
1879// These return T, whereas the Highway op returns a broadcasted vector.
1880namespace detail {
1881#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
1882 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
1883 /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
1884 using T = HWY_SVE_T(BASE, BITS); \
1885 using TU = MakeUnsigned<T>; \
1886 constexpr uint64_t kMask = LimitsMax<TU>(); \
1887 return static_cast<T>(static_cast<TU>( \
1888 static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
1889 }
1890
1891#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
1892 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
1893 return sv##OP##_##CHAR##BITS(pg, v); \
1894 }
1895
1896HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
1897HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
1898
1899HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
1900HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
1901// Returns NaN only if all lanes are NaN.
1902HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
1903HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
1904
1905#undef HWY_SVE_REDUCE
1906#undef HWY_SVE_REDUCE_ADD
1907} // namespace detail
1908
1909template <class D, class V>
1910V SumOfLanes(D d, V v) {
1911 return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
1912}
1913
1914template <class D, class V>
1915V MinOfLanes(D d, V v) {
1916 return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
1917}
1918
1919template <class D, class V>
1920V MaxOfLanes(D d, V v) {
1921 return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
1922}
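
// Usage sketch (illustrative, not part of the original header):
//   const ScalableTag<uint32_t> d;
//   const auto v = Iota(d, 1);        // 1, 2, ..., N
//   const auto s = SumOfLanes(d, v);  // every lane holds N * (N + 1) / 2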
1923
1924
1925// ================================================== SWIZZLE
1926
1927// ------------------------------ GetLane
1928
1929namespace detail {
1930#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
1931 HWY_INLINE HWY_SVE_T(BASE, BITS) \
1932 NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1933 return sv##OP##_##CHAR##BITS(mask, v); \
1934 }
1935
1936HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
1937#undef HWY_SVE_GET_LANE
1938} // namespace detail
1939
1940template <class V>
1941HWY_API TFromV<V> GetLane(V v) {
1942 return detail::GetLaneM(v, detail::PFalse());
1943}
1944
1945// ------------------------------ ExtractLane
1946template <class V>
1947HWY_API TFromV<V> ExtractLane(V v, size_t i) {
1948 return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
1949}
1950
1951// ------------------------------ InsertLane (IfThenElse)
1952template <class V>
1953HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1954 const DFromV<V> d;
1955 const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
1956 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1957}
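
// Example (illustrative): InsertLane(v, 2, 9) compares Iota(d, 0) with 2 and
// blends Set(d, 9) into lane 2 only; all other lanes of v are unchanged.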
1958
1959// ------------------------------ DupEven
1960
1961namespace detail {
1962HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
1963} // namespace detail
1964
1965template <class V>
1966HWY_API V DupEven(const V v) {
1967 return detail::InterleaveEven(v, v);
1968}
1969
1970// ------------------------------ DupOdd
1971
1972namespace detail {
1973HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
1974} // namespace detail
1975
1976template <class V>
1977HWY_API V DupOdd(const V v) {
1978 return detail::InterleaveOdd(v, v);
1979}
1980
1981// ------------------------------ OddEven
1982
1983#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
1984
1985#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
1986 HWY_API HWY_SVE_V(BASE, BITS) \
1987 NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
1988 return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \
1989 }
1990
1991HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
1992#undef HWY_SVE_ODD_EVEN
1993
1994template <class V, HWY_IF_FLOAT_V(V)>
1995HWY_API V OddEven(const V odd, const V even) {
1996 const DFromV<V> d;
1997 const RebindToUnsigned<decltype(d)> du;
1998 return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
1999}
2000
2001#else
2002
2003template <class V>
2004HWY_API V OddEven(const V odd, const V even) {
2005 const auto odd_in_even = detail::Ext<1>(odd, odd);
2006 return detail::InterleaveEven(even, odd_in_even);
2007}
2008
2009#endif // HWY_TARGET
2010
2011// ------------------------------ OddEvenBlocks
2012template <class V>
2013HWY_API V OddEvenBlocks(const V odd, const V even) {
2014 const DFromV<V> d;
2015#if HWY_TARGET == HWY_SVE_256
2016 return ConcatUpperLower(d, odd, even);
2017#elif HWY_TARGET == HWY_SVE2_128
2018 (void)odd;
2019 (void)d;
2020 return even;
2021#else
2022 const RebindToUnsigned<decltype(d)> du;
2023 using TU = TFromD<decltype(du)>;
2024 constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
2025 const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
2026 const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
2027 const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
2028 return IfThenElse(is_even, even, odd);
2029#endif
2030}
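
// Illustrative note for the generic path above: with 32-bit lanes,
// kShift = CeilLog2(16 / 4) = 2, so the shifted Iota is the block index
// 0,0,0,0,1,1,1,1,..; its LSB selects `even` in even blocks, else `odd`.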
2031
2032// ------------------------------ TableLookupLanes
2033
2034template <class D, class VI>
2035HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
2036 using TI = TFromV<VI>;
2037 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
2038 const RebindToUnsigned<D> du;
2039 const auto indices = BitCast(du, vec);
2040#if HWY_IS_DEBUG_BUILD
2041 HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast<TI>(Lanes(d)))));
2042#else
2043 (void)d;
2044#endif
2045 return indices;
2046}
2047
2048template <class D, typename TI>
2049HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2050 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
2051 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2052}
2053
2054// Types smaller than 32 bits are not part of the Highway API, but are used in Broadcast.
2055#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
2056 HWY_API HWY_SVE_V(BASE, BITS) \
2057 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
2058 return sv##OP##_##CHAR##BITS(v, idx); \
2059 }
2060
2061HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
2062#undef HWY_SVE_TABLE
2063
2064// ------------------------------ SwapAdjacentBlocks (TableLookupLanes)
2065
2066namespace detail {
2067
2068template <typename T, size_t N, int kPow2>
2069constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
2070 // We might have a capped vector smaller than a block, so honor that.
2071 return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2));
2072}
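
// Example (illustrative): for T = uint16_t, a full 128-bit block holds
// 16 / 2 = 8 lanes, whereas a vector capped to 4 lanes yields HWY_MIN(8, 4).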
2073
2074} // namespace detail
2075
2076template <class V>
2077HWY_API V SwapAdjacentBlocks(V v) {
2078 const DFromV<V> d;
2079#if HWY_TARGET == HWY_SVE_256
2080 return ConcatLowerUpper(d, v, v);
2081#elif HWY_TARGET == HWY_SVE2_128
2082 (void)d;
2083 return v;
2084#else
2085 const RebindToUnsigned<decltype(d)> du;
2086 constexpr auto kLanesPerBlock =
2087 static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
2088 const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
2089 return TableLookupLanes(v, idx);
2090#endif
2091}
2092
2093// ------------------------------ Reverse
2094
2095namespace detail {
2096
2097#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
2098 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2099 return sv##OP##_##CHAR##BITS(v); \
2100 }
2101
2102HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
2103#undef HWY_SVE_REVERSE
2104
2105} // namespace detail
2106
2107template <class D, class V>
2108HWY_API V Reverse(D d, V v) {
2109 using T = TFromD<D>;
2110 const auto reversed = detail::ReverseFull(v);
2111 if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
2112 // Shift right to remove extra (non-pow2 and remainder) lanes.
2113 // TODO(janwas): on SVE2, use WHILEGE.
2114 // Avoids FirstN truncating to the return vector size. Must also avoid Not
2115 // because that is limited to SV_POW2.
2116 const ScalableTag<T> dfull;
2117 const svbool_t all_true = detail::AllPTrue(dfull);
2118 const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
2119 const svbool_t mask =
2120 svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
2121 return detail::Splice(reversed, reversed, mask);
2122}
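
// Worked example (illustrative): Lanes(d) = 6 on 8-lane hardware. ReverseFull
// yields {?, ?, v5, v4, v3, v2, v1, v0}; the mask enables all but the lowest
// 8 - 6 = 2 lanes, so Splice moves {v5, .., v0} down to the lowest lanes.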
2123
2124// ------------------------------ Reverse2
2125
2126template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
2127HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2128 const RebindToUnsigned<decltype(d)> du;
2129 const RepartitionToWide<decltype(du)> dw;
2130 return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
2131}
2132
2133template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2134HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2135 const RebindToUnsigned<decltype(d)> du;
2136 const RepartitionToWide<decltype(du)> dw;
2137 return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
2138}
2139
2140template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2141HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
2142#if HWY_TARGET == HWY_SVE2_128
2143 if (detail::IsFull(d)) {
2144 return detail::Ext<1>(v, v);
2145 }
2146#endif
2147 (void)d;
2148 const auto odd_in_even = detail::Ext<1>(v, v); // x321
2149 return detail::InterleaveEven(odd_in_even, v); // 2301
2150}
2151// ------------------------------ Reverse4 (TableLookupLanes)
2152template <class D>
2153HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2154 if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
2155 detail::IsFull(d)) {
2156 return detail::ReverseFull(v);
2157 }
2158 // TODO(janwas): is this approach faster than Shuffle0123?
2159 const RebindToUnsigned<decltype(d)> du;
2160 const auto idx = detail::XorN(Iota(du, 0), 3);
2161 return TableLookupLanes(v, idx);
2162}
2163
2164// ------------------------------ Reverse8 (TableLookupLanes)
2165template <class D>
2166HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2167 const RebindToUnsigned<decltype(d)> du;
2168 const auto idx = detail::XorN(Iota(du, 0), 7);
2169 return TableLookupLanes(v, idx);
2170}
2171
2172// ------------------------------ Compress (PromoteTo)
2173
2174template <typename T>
2175struct CompressIsPartition {
2176#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2177 // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
2178 // requires a larger table).
2179 enum { value = (sizeof(T) == 8) };
2180#else
2181 enum { value = 0 };
2182#endif // HWY_TARGET == HWY_SVE_256
2183};
2184
2185#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
2186 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
2187 return sv##OP##_##CHAR##BITS(mask, v); \
2188 }
2189
2190#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2191HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
2192HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
2193#else
2194HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
2195#endif
2196#undef HWY_SVE_COMPRESS
2197
2198#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2199template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2200HWY_API V Compress(V v, svbool_t mask) {
2201 const DFromV<V> d;
2202 const RebindToUnsigned<decltype(d)> du64;
2203
2204 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2205 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2206 // SetTableIndices.
2207 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2208 const size_t offset = detail::SumOfLanesM(mask, bits);
2209
2210 // See CompressIsPartition.
2211 alignas(16) static constexpr uint64_t table[4 * 16] = {
2212 // PrintCompress64x4Tables
2213 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
2214 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
2215 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
2216 return TableLookupLanes(v, SetTableIndices(d, table + offset));
2217}
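
// Worked example (illustrative): mask = {1, 0, 1, 0} sums masked bits
// 4 + 16 = 20, selecting table row 20 / 4 = 5 = {0, 2, 1, 3}; the result is
// {v0, v2, v1, v3}: the selected lanes first, then the rest (a partition).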
2218
2219#endif // HWY_TARGET == HWY_SVE_256
2220#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2221template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2222HWY_API V Compress(V v, svbool_t mask) {
2223 // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2224 // swaps upper/lower (the lower half is set to the upper half, and the
2225 // remaining upper half is filled from the lower half of the second v), and
2226 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
2227 // unchanged and map everything else to 00.
2228 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2229 return detail::Splice(v, v, AndNot(maskLL, mask));
2230}
2231
2232#endif // HWY_TARGET == HWY_SVE2_128
2233
2234template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
2235HWY_API V Compress(V v, svbool_t mask16) {
2236 static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
2237 const DFromV<V> d16;
2238
2239 // Promote vector and mask to 32-bit
2240 const RepartitionToWide<decltype(d16)> dw;
2241 const auto v32L = PromoteTo(dw, v);
2242 const auto v32H = detail::PromoteUpperTo(dw, v);
2243 const svbool_t mask32L = svunpklo_b(mask16);
2244 const svbool_t mask32H = svunpkhi_b(mask16);
2245
2246 const auto compressedL = Compress(v32L, mask32L);
2247 const auto compressedH = Compress(v32H, mask32H);
2248
2249 // Demote to 16-bit (already in range) - separately so we can splice
2250 const V evenL = BitCast(d16, compressedL);
2251 const V evenH = BitCast(d16, compressedH);
2252 const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
2253 const V v16H = detail::ConcatEvenFull(evenH, evenH);
2254
2255 // We need to combine two vectors of non-constexpr length, so the only option
2256 // is Splice, which requires us to synthesize a mask. NOTE: this function uses
2257 // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
2258 const size_t countL = detail::CountTrueFull(dw, mask32L);
2259 const auto compressed_maskL = FirstN(d16, countL);
2260 return detail::Splice(v16H, v16L, compressed_maskL);
2261}
2262
2263// Must treat float16_t as integers so we can ConcatEven.
2264HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
2265 const DFromV<decltype(v)> df;
2266 const RebindToSigned<decltype(df)> di;
2267 return BitCast(df, Compress(BitCast(di, v), mask16));
2268}
2269
2270// ------------------------------ CompressNot
2271
2272// 2 or 4 bytes
2273template <class V, typename T = TFromV<V>, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
2274HWY_API V CompressNot(V v, const svbool_t mask) {
2275 return Compress(v, Not(mask));
2276}
2277
2278template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2279HWY_API V CompressNot(V v, svbool_t mask) {
2280#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2281 // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2282 // swaps upper/lower (the lower half is set to the upper half, and the
2283 // remaining upper half is filled from the lower half of the second v), and
2284 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
2285 // 01 to 10, and everything else to 00.
2286 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2287 return detail::Splice(v, v, AndNot(mask, maskLL));
2288#endif
2289#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2290 const DFromV<V> d;
2291 const RebindToUnsigned<decltype(d)> du64;
2292
2293 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2294 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2295 // SetTableIndices.
2296 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2297 const size_t offset = detail::SumOfLanesM(mask, bits);
2298
2299 // See CompressIsPartition.
2300 alignas(16) static constexpr uint64_t table[4 * 16] = {
2301 // PrintCompressNot64x4Tables
2302 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
2303 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
2304 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
2305 return TableLookupLanes(v, SetTableIndices(d, table + offset));
2306#endif // HWY_TARGET == HWY_SVE_256
2307
2308 return Compress(v, Not(mask));
2309}
2310
2311// ------------------------------ CompressBlocksNot
2312HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
2313#if HWY_TARGET == HWY_SVE2_128
2314 (void)mask;
2315 return v;
2316#endif
2317#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2318 uint64_t bits = 0; // predicate reg is 32-bit
2319 CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient
2320 // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
2321 const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
2322 // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
2323 alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
2324 0, 1, 2, 3, 0, 1, 2, 3};
2325  const ScalableTag<uint64_t> d;
2326 return TableLookupLanes(v, SetTableIndices(d, table + offset));
2327#endif
2328
2329 return CompressNot(v, mask);
2330}
2331
2332// ------------------------------ CompressStore
2333template <class V, class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2334HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
2335 TFromD<D>* HWY_RESTRICT unaligned) {
2336 StoreU(Compress(v, mask), d, unaligned);
2337 return CountTrue(d, mask);
2338}
2339
2340// ------------------------------ CompressBlendedStore
2341template <class V, class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2342HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
2343 TFromD<D>* HWY_RESTRICT unaligned) {
2344 const size_t count = CountTrue(d, mask);
2345 const svbool_t store_mask = FirstN(d, count);
2346 BlendedStore(Compress(v, mask), store_mask, d, unaligned);
2347 return count;
2348}
2349
2350// ================================================== BLOCKWISE
2351
2352// ------------------------------ CombineShiftRightBytes
2353
2354// Prevent accidentally using these for 128-bit vectors - should not be
2355// necessary.
2356#if HWY_TARGET != HWY_SVE2_128
2357namespace detail {
2358
2359// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
2360// offsets are implicitly relative to the start of their 128-bit block.
2361template <class D, class V>
2362HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
2363 using T = MakeUnsigned<TFromD<D>>;
2364 return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
2365}
2366
2367template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 1)>
2368svbool_t FirstNPerBlock(D d) {
2369 const RebindToUnsigned<decltype(d)> du;
2370 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2371 const svuint8_t idx_mod =
2372 svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2373 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2374 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
2375 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
2376 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
2377 15 % kLanesPerBlock);
2378 return detail::LtN(BitCast(du, idx_mod), kLanes);
2379}
2380template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 2)>
2381svbool_t FirstNPerBlock(D d) {
2382 const RebindToUnsigned<decltype(d)> du;
2383 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2384 const svuint16_t idx_mod =
2385 svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2386 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2387 6 % kLanesPerBlock, 7 % kLanesPerBlock);
2388 return detail::LtN(BitCast(du, idx_mod), kLanes);
2389}
2390template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 4)>
2391svbool_t FirstNPerBlock(D d) {
2392 const RebindToUnsigned<decltype(d)> du;
2393 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2394 const svuint32_t idx_mod =
2395 svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2396 3 % kLanesPerBlock);
2397 return detail::LtN(BitCast(du, idx_mod), kLanes);
2398}
2399template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 8)>
2400svbool_t FirstNPerBlock(D d) {
2401 const RebindToUnsigned<decltype(d)> du;
2402 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2403 const svuint64_t idx_mod =
2404 svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
2405 return detail::LtN(BitCast(du, idx_mod), kLanes);
2406}
2407
2408} // namespace detail
2409#endif // HWY_TARGET != HWY_SVE2_128
2410
2411template <size_t kBytes, class D, class V = VFromD<D>>
2412HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
2413 const Repartition<uint8_t, decltype(d)> d8;
2414 const auto hi8 = BitCast(d8, hi);
2415 const auto lo8 = BitCast(d8, lo);
2416#if HWY_TARGET == HWY_SVE2_128
2417 return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
2418#else
2419 const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
2420 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
2421 const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2422 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2423#endif
2424}
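
// Example (illustrative): for u8 lanes and kBytes = 4, each 128-bit block of
// the result is {lo[4..15], hi[0..3]}, matching the x86 palignr semantics
// required by the Highway API.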
2425
2426// ------------------------------ Shuffle2301
2427template <class V>
2428HWY_API V Shuffle2301(const V v) {
2429 const DFromV<V> d;
2430 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2431 return Reverse2(d, v);
2432}
2433
2434// ------------------------------ Shuffle2103
2435template <class V>
2436HWY_API V Shuffle2103(const V v) {
2437 const DFromV<V> d;
2438 const Repartition<uint8_t, decltype(d)> d8;
2439 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2440 const svuint8_t v8 = BitCast(d8, v);
2441 return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
2442}
2443
2444// ------------------------------ Shuffle0321
2445template <class V>
2446HWY_API V Shuffle0321(const V v) {
2447 const DFromV<V> d;
2448 const Repartition<uint8_t, decltype(d)> d8;
2449 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2450 const svuint8_t v8 = BitCast(d8, v);
2451 return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
2452}
2453
2454// ------------------------------ Shuffle1032
2455template <class V>
2456HWY_API V Shuffle1032(const V v) {
2457 const DFromV<V> d;
2458 const Repartition<uint8_t, decltype(d)> d8;
2459 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2460 const svuint8_t v8 = BitCast(d8, v);
2461 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2462}
2463
2464// ------------------------------ Shuffle01
2465template <class V>
2466HWY_API V Shuffle01(const V v) {
2467 const DFromV<V> d;
2468 const Repartition<uint8_t, decltype(d)> d8;
2469 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2470 const svuint8_t v8 = BitCast(d8, v);
2471 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2472}
2473
2474// ------------------------------ Shuffle0123
2475template <class V>
2476HWY_API V Shuffle0123(const V v) {
2477 return Shuffle2301(Shuffle1032(v));
2478}
2479
2480// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2481template <class D, class V = VFromD<D>>
2482HWY_API V ReverseBlocks(D d, V v) {
2483#if HWY_TARGET == HWY_SVE_256
2484 if (detail::IsFull(d)) {
2485 return SwapAdjacentBlocks(v);
2486 } else if (detail::IsFull(Twice<D>())) {
2487 return v;
2488 }
2489#elif HWY_TARGET == HWY_SVE2_128
2490 (void)d;
2491 return v;
2492#endif
2493 const Repartition<uint64_t, D> du64;
2494 return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
2495}
2496
2497// ------------------------------ TableLookupBytes
2498
2499template <class V, class VI>
2500HWY_API VI TableLookupBytes(const V v, const VI idx) {
2501 const DFromV<VI> d;
2502 const Repartition<uint8_t, decltype(d)> du8;
2503#if HWY_TARGET == HWY_SVE2_128
2504 return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
2505#else
2506 const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
2507 const auto idx8 = Add(BitCast(du8, idx), offsets128);
2508 return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
2509#endif
2510}
2511
2512template <class V, class VI>
2513HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
2514 const DFromV<VI> d;
2515 // Mask size must match vector type, so cast everything to this type.
2516 const Repartition<int8_t, decltype(d)> di8;
2517
2518 auto idx8 = BitCast(di8, idx);
2519 const auto msb = detail::LtN(idx8, 0);
2520
2521 const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
2522 return BitCast(d, IfThenZeroElse(msb, lookup));
2523}
2524
2525// ------------------------------ Broadcast
2526
2527#if HWY_TARGET == HWY_SVE2_128
2528namespace detail {
2529#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
2530 template <int kLane> \
2531 HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2532 return sv##OP##_##CHAR##BITS(v, kLane); \
2533 }
2534
2535HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
2536#undef HWY_SVE_BROADCAST
2537} // namespace detail
2538#endif
2539
2540template <int kLane, class V>
2541HWY_API V Broadcast(const V v) {
2542 const DFromV<V> d;
2543 const RebindToUnsigned<decltype(d)> du;
2544 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2545 static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
2546#if HWY_TARGET == HWY_SVE2_128
2547 return detail::BroadcastLane<kLane>(v);
2548#else
2549 auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
2550 if (kLane != 0) {
2551 idx = detail::AddN(idx, kLane);
2552 }
2553 return TableLookupLanes(v, idx);
2554#endif
2555}
2556
2557// ------------------------------ ShiftLeftLanes
2558
2559template <size_t kLanes, class D, class V = VFromD<D>>
2560HWY_API V ShiftLeftLanes(D d, const V v) {
2561 const auto zero = Zero(d);
2562 const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
2563#if HWY_TARGET == HWY_SVE2_128
2564 return shifted;
2565#else
2566 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2567 return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
2568#endif
2569}
2570
2571template <size_t kLanes, class V>
2572HWY_API V ShiftLeftLanes(const V v) {
2573 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2574}
2575
2576// ------------------------------ ShiftRightLanes
2577template <size_t kLanes, class D, class V = VFromD<D>>
2578HWY_API V ShiftRightLanes(D d, V v) {
2579 // For capped/fractional vectors, clear upper lanes so we shift in zeros.
2580 if (!detail::IsFull(d)) {
2581    v = IfThenElseZero(detail::MakeMask(d), v);
2582 }
2583
2584#if HWY_TARGET == HWY_SVE2_128
2585 return detail::Ext<kLanes>(Zero(d), v);
2586#else
2587 const auto shifted = detail::Ext<kLanes>(v, v);
2588 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2589 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
2590 const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
2591 return IfThenElseZero(mask, shifted);
2592#endif
2593}
2594
2595// ------------------------------ ShiftLeftBytes
2596
2597template <int kBytes, class D, class V = VFromD<D>>
2598HWY_API V ShiftLeftBytes(const D d, const V v) {
2599 const Repartition<uint8_t, decltype(d)> d8;
2600 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2601}
2602
2603template <int kBytes, class V>
2604HWY_API V ShiftLeftBytes(const V v) {
2605 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2606}
2607
2608// ------------------------------ ShiftRightBytes
2609template <int kBytes, class D, class V = VFromD<D>>
2610HWY_API V ShiftRightBytes(const D d, const V v) {
2611 const Repartition<uint8_t, decltype(d)> d8;
2612 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2613}
2614
2615// ------------------------------ ZipLower
2616
2617template <class V, class DW = RepartitionToWide<DFromV<V>>>
2618HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2619 const RepartitionToNarrow<DW> dn;
2620 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2621 return BitCast(dw, InterleaveLower(dn, a, b));
2622}
2623template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2624HWY_API VFromD<DW> ZipLower(const V a, const V b) {
2625 return BitCast(DW(), InterleaveLower(D(), a, b));
2626}
2627
2628// ------------------------------ ZipUpper
2629template <class V, class DW = RepartitionToWide<DFromV<V>>>
2630HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2631 const RepartitionToNarrow<DW> dn;
2632 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2633 return BitCast(dw, InterleaveUpper(dn, a, b));
2634}
2635
2636// ================================================== Ops with dependencies
2637
2638// ------------------------------ PromoteTo bfloat16 (ZipLower)
2639template <size_t N, int kPow2>
2640HWY_API svfloat32_t PromoteTo(Simd<float, N, kPow2> df32,
2641 const svuint16_t v) {
2642 return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v));
2643}
2644
2645// ------------------------------ ReorderDemote2To (OddEven)
2646
2647template <size_t N, int kPow2>
2648HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
2649 svfloat32_t a, svfloat32_t b) {
2650 const RebindToUnsigned<decltype(dbf16)> du16;
2651 const Repartition<uint32_t, decltype(dbf16)> du32;
2652 const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
2653 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2654}
2655
2656template <size_t N, int kPow2>
2657HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
2658 svint32_t b) {
2659#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2660 (void)d16;
2661 const svint16_t a_in_even = svqxtnb_s32(a);
2662 return svqxtnt_s32(a_in_even, b);
2663#else
2664 const Half<decltype(d16)> dh;
2665 const svint16_t a16 = BitCast(dh, detail::SaturateI<int16_t>(a));
2666 const svint16_t b16 = BitCast(dh, detail::SaturateI<int16_t>(b));
2667 return detail::InterleaveEven(a16, b16);
2668#endif
2669}
2670
2671// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
2672template <class V>
2673HWY_API V ZeroIfNegative(const V v) {
2674 return IfThenZeroElse(detail::LtN(v, 0), v);
2675}
2676
2677// ------------------------------ BroadcastSignBit (ShiftRight)
2678template <class V>
2679HWY_API V BroadcastSignBit(const V v) {
2680 return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
2681}
2682
2683// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
2684template <class V>
2685HWY_API V IfNegativeThenElse(V v, V yes, V no) {
2686 static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
2687 const DFromV<V> d;
2688 const RebindToSigned<decltype(d)> di;
2689
2690 const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2691 return IfThenElse(m, yes, no);
2692}
2693
2694// ------------------------------ AverageRound (ShiftRight)
2695
2696#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2697HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2698HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2699#else
2700template <class V>
2701V AverageRound(const V a, const V b) {
2702 return ShiftRight<1>(detail::AddN(Add(a, b), 1));
2703}
2704#endif // HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2705
2706// ------------------------------ LoadMaskBits (TestBit)
2707
2708// `p` points to at least 8 readable bytes, not all of which need be valid.
2709template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2710HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2711 const RebindToUnsigned<D> du;
2712 const svuint8_t iota = Iota(du, 0);
2713
2714 // Load correct number of bytes (bits/8) with 7 zeros after each.
2715 const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
2716 // Replicate bytes 8x such that each byte contains the bit that governs it.
2717 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
2718
2719 const svuint8_t bit =
2720 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
2721 return TestBit(rep8, bit);
2722}
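
// Worked example (illustrative): bits[0] = 0b00000101 is replicated into the
// first eight byte lanes; TestBit against {1, 2, 4, ..} then yields mask
// lanes {1, 0, 1, 0, 0, 0, 0, 0}.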
2723
2724template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
2725HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2726 const uint8_t* HWY_RESTRICT bits) {
2727 const RebindToUnsigned<D> du;
2728 const Repartition<uint8_t, D> du8;
2729
2730 // There may be up to 128 bits; avoid reading past the end.
2731 const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
2732
2733 // Replicate bytes 16x such that each lane contains the bit that governs it.
2734 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
2735
2736 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
2737 return TestBit(BitCast(du, rep16), bit);
2738}
2739
2740template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2741HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2742 const uint8_t* HWY_RESTRICT bits) {
2743 const RebindToUnsigned<D> du;
2744 const Repartition<uint8_t, D> du8;
2745
2746 // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
2747 // so we can skip computing the actual length (Lanes(du)+7)/8.
2748 const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
2749
2750 // Replicate bytes 32x such that each lane contains the bit that governs it.
2751 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
2752
2753 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
2754 const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
2755
2756 return TestBit(BitCast(du, rep32), bit);
2757}
2758
2759template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2760HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2761 const uint8_t* HWY_RESTRICT bits) {
2762 const RebindToUnsigned<D> du;
2763
2764 // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
2765 // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
2766 uint32_t mask_bits;
2767 CopyBytes<4>(bits, &mask_bits); // copy from bytes
2768 const auto vbits = Set(du, mask_bits);
2769
2770 // 2 ^ {0,1, .., 31}, will not have more lanes than that.
2771 const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
2772
2773 return TestBit(vbits, bit);
2774}
2775
2776// ------------------------------ StoreMaskBits
2777
2778namespace detail {
2779
2780// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
2781template <class T, HWY_IF_LANE_SIZE(T, 1)>
2782HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2783 return svdup_n_u8_z(m, 1);
2784}
2785template <class T, HWY_IF_LANE_SIZE(T, 2)>
2786HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2787 const ScalableTag<uint8_t> d8;
2788 const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
2789 return detail::ConcatEvenFull(b16, b16); // lower half
2790}
2791template <class T, HWY_IF_LANE_SIZE(T, 4)>
2792HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2793 return U8FromU32(svdup_n_u32_z(m, 1));
2794}
2795template <class T, HWY_IF_LANE_SIZE(T, 8)>
2796HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2797 const ScalableTag<uint32_t> d32;
2798 const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
2799 return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
2800}
2801
2802// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
2803HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
2804 const ScalableTag<uint8_t> d8;
2805 const ScalableTag<uint16_t> d16;
2806 const ScalableTag<uint32_t> d32;
2807 const ScalableTag<uint64_t> d64;
2808 // TODO(janwas): could use SVE2 BDEP, but it's optional.
2809 x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
2810 x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
2811 x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
2812 return BitCast(d64, x);
2813}
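
// Illustrative trace: for 0/1 bytes {b0, .., b7}, the three Or/ShiftRight
// steps pack pairs (b0 | b1 << 1), then nibbles, then whole bytes, so the low
// byte of each u64 lane ends up holding b0 | b1 << 1 | .. | b7 << 7.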
2814
2815} // namespace detail
2816
2817// `p` points to at least 8 writable bytes.
2818// TODO(janwas): specialize for HWY_SVE_256
2819template <class D>
2820HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
2821 svuint64_t bits_in_u64 =
2822      detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
2823
2824 const size_t num_bits = Lanes(d);
2825 const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
2826
2827 // Truncate each u64 to 8 bits and store to u8.
2828 svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
2829
2830 // Non-full byte, need to clear the undefined upper bits. Can happen for
2831 // capped/fractional vectors or large T and small hardware vectors.
2832 if (num_bits < 8) {
2833 const int mask = static_cast<int>((1ull << num_bits) - 1);
2834 bits[0] = static_cast<uint8_t>(bits[0] & mask);
2835 }
2836 // Else: we wrote full bytes because num_bits is a power of two >= 8.
2837
2838 return num_bytes;
2839}
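
// Usage sketch (illustrative, not part of the original header):
//   uint8_t bits[8];
//   const size_t num_bytes = StoreMaskBits(d, m, bits);
//   const svbool_t m2 = LoadMaskBits(d, bits);  // recovers m for valid lanes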
2840
2841// ------------------------------ CompressBits (LoadMaskBits)
2842template <class V, class D = DFromV<V>, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2843HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2844 return Compress(v, LoadMaskBits(D(), bits));
2845}
2846
2847// ------------------------------ CompressBitsStore (LoadMaskBits)
2848template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2849HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2850 D d, TFromD<D>* HWY_RESTRICT unaligned) {
2851 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2852}
2853
2854// ------------------------------ MulEven (InterleaveEven)
2855
2856#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2857namespace detail {
2858#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2859 HWY_API HWY_SVE_V(BASE, BITS) \
2860 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2861 return sv##OP##_##CHAR##BITS(a, b); \
2862 }
2863
2864HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
2865#undef HWY_SVE_MUL_EVEN
2866} // namespace detail
2867#endif
2868
2869template <class V, class DW = RepartitionToWide<DFromV<V>>>
2870HWY_API VFromD<DW> MulEven(const V a, const V b) {
2871#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2872 return BitCast(DW(), detail::MulEvenNative(a, b));
2873#else
2874 const auto lo = Mul(a, b);
2875 const auto hi = MulHigh(a, b);
2876 return BitCast(DW(), detail::InterleaveEven(lo, hi));
2877#endif
2878}
2879
2880HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
2881 const auto lo = Mul(a, b);
2882 const auto hi = MulHigh(a, b);
2883 return detail::InterleaveEven(lo, hi);
2884}
2885
2886HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
2887 const auto lo = Mul(a, b);
2888 const auto hi = MulHigh(a, b);
2889 return detail::InterleaveOdd(lo, hi);
2890}
2891
2892// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2893
2894template <size_t N, int kPow2>
2895HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2896 svuint16_t a, svuint16_t b,
2897 const svfloat32_t sum0,
2898 svfloat32_t& sum1) {
2899 // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
2900 const RebindToUnsigned<decltype(df32)> du32;
2901 // Using shift/and instead of Zip leads to the odd/even order that
2902 // RearrangeToOddPlusEven prefers.
2903 using VU32 = VFromD<decltype(du32)>;
2904 const VU32 odd = Set(du32, 0xFFFF0000u);
2905 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2906 const VU32 ao = And(BitCast(du32, a), odd);
2907 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2908 const VU32 bo = And(BitCast(du32, b), odd);
2909 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2910 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2911}
2912
2913template <size_t N, int kPow2>
2914HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
2915 svint16_t a, svint16_t b,
2916 const svint32_t sum0,
2917 svint32_t& sum1) {
2918#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2919 (void)d32;
2920 sum1 = svmlalt_s32(sum1, a, b);
2921 return svmlalb_s32(sum0, a, b);
2922#else
2923 const svbool_t pg = detail::PTrue(d32);
2924 // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
2925 // Fortunately SVE has sign-extension for the even lanes.
2926 const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
2927 const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
2928 const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
2929 const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
2930 sum1 = svmla_s32_x(pg, sum1, ao, bo);
2931 return svmla_s32_x(pg, sum0, ae, be);
2932#endif
2933}
2934
2935// ------------------------------ RearrangeToOddPlusEven
2936template <class VW>
2937HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
2938 // sum0 is the sum of bottom/even lanes and sum1 of top/odd lanes.
2939 return Add(sum0, sum1);
2940}
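
// Usage sketch (illustrative): a widening i16 dot product.
//   svint32_t sum1 = Zero(d32);
//   svint32_t sum0 = ReorderWidenMulAccumulate(d32, a, b, Zero(d32), sum1);
//   const svint32_t dot = RearrangeToOddPlusEven(sum0, sum1);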
2941
2942// ------------------------------ AESRound / CLMul
2943
2944#if defined(__ARM_FEATURE_SVE2_AES) || \
2945 ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \
2946 HWY_HAVE_RUNTIME_DISPATCH)
2947
2948// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2949#ifdef HWY_NATIVE_AES
2950#undef HWY_NATIVE_AES
2951#else
2952#define HWY_NATIVE_AES
2953#endif
2954
2955HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
2956 // It is not clear whether E and MC fuse like they did on NEON.
2957 const svuint8_t zero = svdup_n_u8(0);
2958 return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2959}
2960
2961HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
2962 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2963}
2964
2965HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
2966 return svpmullb_pair(a, b);
2967}
2968
2969HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
2970 return svpmullt_pair(a, b);
2971}
2972
2973#endif // __ARM_FEATURE_SVE2_AES
2974
2975// ------------------------------ Lt128
2976
2977namespace detail {
2978#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
2979 template <size_t N, int kPow2> \
2980 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
2981 return sv##OP##_b##BITS(m, m); \
2982 }
2983
2984HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool
2985HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
2986#undef HWY_SVE_DUP
2987
2988#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2989template <class D>
2990HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
2991 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
2992 "D must be u64");
2993 const svbool_t eqHx = Eq(a, b); // only odd lanes used
2994 // Convert to vector: more pipelines can execute vector TRN* instructions
2995 // than the predicate version.
2996 const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
2997 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
2998 // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
2999 const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
3000 // Duplicate upper lane into lower.
3001 return DupOdd(ltHx);
3002}
3003#endif
3004} // namespace detail
3005
3006template <class D>
3007HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
3008#if HWY_TARGET == HWY_SVE_256
3009 return MaskFromVec(detail::Lt128Vec(d, a, b));
3010#else
3011 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3012 "D must be u64");
3013 const svbool_t eqHx = Eq(a, b); // only odd lanes used
3014 const svbool_t ltHL = Lt(a, b);
3015 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
3016 const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
3017 // Duplicate upper lane into lower.
3018 return detail::DupOddB(d, ltHx);
3019#endif // HWY_TARGET != HWY_SVE_256
3020}
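
// Example (illustrative): per 128-bit pair {L, H} (lane 0 = low), both result
// lanes receive (aH < bH), or (aL < bL) when aH == bH, i.e. an unsigned
// 128-bit comparison broadcast to both halves.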
3021
3022// ------------------------------ Lt128Upper
3023
3024template <class D>
3025HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
3026 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3027 "D must be u64");
3028 const svbool_t ltHL = Lt(a, b);
3029 return detail::DupOddB(d, ltHL);
3030}
3031
3032// ------------------------------ Eq128, Ne128
3033
3034#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3035namespace detail {
3036
3037template <class D>
3038HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
3039 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3040 "D must be u64");
3041 // Convert to vector: more pipelines can execute vector TRN* instructions
3042 // than the predicate version.
3043 const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
3044 // Duplicate upper and lower.
3045 const svuint64_t eqHH = DupOdd(eqHL);
3046 const svuint64_t eqLL = DupEven(eqHL);
3047 return And(eqLL, eqHH);
3048}
3049
3050template <class D>
3051HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
3052 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3053 "D must be u64");
3054 // Convert to vector: more pipelines can execute vector TRN* instructions
3055 // than the predicate version.
3056 const svuint64_t neHL = VecFromMask(d, Ne(a, b));
3057 // Duplicate upper and lower.
3058 const svuint64_t neHH = DupOdd(neHL);
3059 const svuint64_t neLL = DupEven(neHL);
3060 return Or(neLL, neHH);
3061}
3062
3063} // namespace detail
3064#endif
3065
3066template <class D>
3067HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
3068#if HWY_TARGET == HWY_SVE_256
3069 return MaskFromVec(detail::Eq128Vec(d, a, b));
3070#else
3071 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3072 "D must be u64");
3073 const svbool_t eqHL = Eq(a, b);
3074 const svbool_t eqHH = detail::DupOddB(d, eqHL);
3075 const svbool_t eqLL = detail::DupEvenB(d, eqHL);
3076 return And(eqLL, eqHH);
3077#endif // HWY_TARGET != HWY_SVE_256
3078}
3079
3080template <class D>
3081HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
3082#if HWY_TARGET == HWY_SVE_256
3083 return MaskFromVec(detail::Ne128Vec(d, a, b));
3084#else
3085 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3086 "D must be u64");
3087 const svbool_t neHL = Ne(a, b);
3088 const svbool_t neHH = detail::DupOddB(d, neHL);
3089 const svbool_t neLL = detail::DupEvenB(d, neHL);
3090 return Or(neLL, neHH);
3091#endif // HWY_TARGET != HWY_SVE_256
3092}
3093
3094// ------------------------------ Eq128Upper, Ne128Upper
3095
3096template <class D>
3097HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
3098 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3099 "D must be u64");
3100 const svbool_t eqHL = Eq(a, b);
3101 return detail::DupOddB(d, eqHL);
3102}
3103
3104template <class D>
3105HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
3106 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
3107 "D must be u64");
3108 const svbool_t neHL = Ne(a, b);
3109 return detail::DupOddB(d, neHL);
3110}
3111
3112// ------------------------------ Min128, Max128 (Lt128)
3113
3114template <class D>
3115HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
3116#if HWY_TARGET == HWY_SVE_256
3117 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
3118#else
3119 return IfThenElse(Lt128(d, a, b), a, b);
3120#endif
3121}
3122
3123template <class D>
3124HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
3125#if HWY_TARGET == HWY_SVE_256
3126 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
3127#else
3128 return IfThenElse(Lt128(d, b, a), a, b);
3129#endif
3130}
3131
3132template <class D>
3133HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
3134 return IfThenElse(Lt128Upper(d, a, b), a, b);
3135}
3136
3137template <class D>
3138HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
3139 return IfThenElse(Lt128Upper(d, b, a), a, b);
3140}
3141
3142// ================================================== END MACROS
3143namespace detail { // for code folding
3144#undef HWY_IF_FLOAT_V
3145#undef HWY_IF_LANE_SIZE_V
3146#undef HWY_SVE_ALL_PTRUE
3147#undef HWY_SVE_D
3148#undef HWY_SVE_FOREACH
3149#undef HWY_SVE_FOREACH_F
3150#undef HWY_SVE_FOREACH_F16
3151#undef HWY_SVE_FOREACH_F32
3152#undef HWY_SVE_FOREACH_F64
3153#undef HWY_SVE_FOREACH_I
3154#undef HWY_SVE_FOREACH_I08
3155#undef HWY_SVE_FOREACH_I16
3156#undef HWY_SVE_FOREACH_I32
3157#undef HWY_SVE_FOREACH_I64
3158#undef HWY_SVE_FOREACH_IF
3159#undef HWY_SVE_FOREACH_U
3160#undef HWY_SVE_FOREACH_U08
3161#undef HWY_SVE_FOREACH_U16
3162#undef HWY_SVE_FOREACH_U32
3163#undef HWY_SVE_FOREACH_U64
3164#undef HWY_SVE_FOREACH_UI
3165#undef HWY_SVE_FOREACH_UI08
3166#undef HWY_SVE_FOREACH_UI16
3167#undef HWY_SVE_FOREACH_UI32
3168#undef HWY_SVE_FOREACH_UI64
3169#undef HWY_SVE_FOREACH_UIF3264
3170#undef HWY_SVE_PTRUE
3171#undef HWY_SVE_RETV_ARGPV
3172#undef HWY_SVE_RETV_ARGPVN
3173#undef HWY_SVE_RETV_ARGPVV
3174#undef HWY_SVE_RETV_ARGV
3175#undef HWY_SVE_RETV_ARGVN
3176#undef HWY_SVE_RETV_ARGVV
3177#undef HWY_SVE_RETV_ARGVVV
3178#undef HWY_SVE_T
3179#undef HWY_SVE_UNDEFINED
3180#undef HWY_SVE_V
3181
3182} // namespace detail
3183// NOLINTNEXTLINE(google-readability-namespace-comments)
3184} // namespace HWY_NAMESPACE
3185} // namespace hwy
3186HWY_AFTER_NAMESPACE();
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:103
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:59
HWY_AFTER_NAMESPACE()
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1121
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:71
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2978
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:772
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1486
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1073
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1471
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:55
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1584
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1155
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:365
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:710
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:354
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:126
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:266
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2097
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:994
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:559
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1088
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:310
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:986
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:184
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1985
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:118
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2529
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:111
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:155
#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1881
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:63
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:340
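// Minimal usage sketch (editorial addition, not part of the generated
// listing): code does not include this header directly; it includes
// hwy/highway.h, which pulls in arm_sve-inl.h when an SVE target is
// compiled. The sketch below follows Highway's documented pattern of
// wrapping SIMD code in HWY_BEFORE_NAMESPACE()/HWY_AFTER_NAMESPACE() and
// HWY_NAMESPACE, and uses only ops defined by this header (ScalableTag,
// Lanes, Load, MulAdd, Store). MulAddLoop is an illustrative name, and n
// is assumed to be a multiple of Lanes(d) to keep the loop short; real
// code would handle the remainder (e.g. with masked loads/stores).

#include <stddef.h>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// Computes out[i] += a[i] * b[i] for i in [0, n), n a multiple of Lanes(d).
void MulAddLoop(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                float* HWY_RESTRICT out, size_t n) {
  const ScalableTag<float> d;  // on SVE, the lane count is known at runtime
  for (size_t i = 0; i < n; i += Lanes(d)) {
    const auto mul = Load(d, a + i);
    const auto x = Load(d, b + i);
    const auto add = Load(d, out + i);
    Store(MulAdd(mul, x, add), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

// Because SVE vectors have no compile-time length, Lanes(d) is a runtime
// value here; the same source also compiles for fixed-width targets, where
// it folds to a constant.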
Definition base.h:297