17#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
22#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
36template <
size_t kBits>
38template <
size_t kBits>
// Fragment: merges eight vectors raw0..raw7 into one value by shifting rawK
// left by K bits and combining the results with Or/Xor3.
// NOTE(review): the declarations of d8, d16 and raw0..raw7, and the variable
// that receives the Xor3 result, are not in the visible lines.
// Presumably each raw byte holds a single bit, so the shifted operands do not
// overlap and Or/Xor3 act as a plain merge - confirm against the caller.
47 using VU16 =
Vec<
decltype(d16)>;
// Lane count of the d8 descriptor.
48 const size_t N8 =
Lanes(d8);
// rawK is moved to bit position K before merging.
60 Xor3(
Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
61 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
62 Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
// Fragment: splits `packed` into eight vectors raw0..raw7, where rawK holds
// bit K of every byte of `packed`.
// NOTE(review): the declaration of `packed` and whatever consumes raw0..raw7
// are not in the visible lines.
70 using VU16 =
Vec<
decltype(d16)>;
71 const size_t N8 =
Lanes(d8);
// 0x0101 keeps bit 0 of both bytes in each 16-bit lane. The shifts below act
// on 16-bit lanes, so bits that cross the byte boundary are discarded by this
// mask.
72 const VU16 mask =
Set(d16, 0x0101u);
76 const VU16 raw0 =
And(packed, mask);
79 const VU16 raw1 =
And(ShiftRight<1>(packed), mask);
82 const VU16 raw2 =
And(ShiftRight<2>(packed), mask);
85 const VU16 raw3 =
And(ShiftRight<3>(packed), mask);
88 const VU16 raw4 =
And(ShiftRight<4>(packed), mask);
91 const VU16 raw5 =
And(ShiftRight<5>(packed), mask);
94 const VU16 raw6 =
And(ShiftRight<6>(packed), mask);
97 const VU16 raw7 =
And(ShiftRight<7>(packed), mask);
// Fragment: loads eight byte vectors from `raw` (vector K starts at
// raw + K * N8) and merges them into two vectors, each input occupying a
// 2-bit field of every byte.
// NOTE(review): the stores of packed0/packed1 are not in the visible lines.
// Presumably every raw byte fits in 2 bits; otherwise the 16-bit shifts would
// spill into neighbouring fields - confirm against the caller.
108 using VU16 =
Vec<
decltype(d16)>;
109 const size_t N8 =
Lanes(d8);
// Bytes are loaded through d8 and reinterpreted as 16-bit lanes.
111 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
112 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
113 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
114 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
115 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
116 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
117 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
118 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// packed0 takes the even-numbered inputs at bit offsets 0, 2, 4, 6.
120 const VU16 packed0 =
Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
121 Or(ShiftLeft<2>(raw2), raw0));
// packed1 takes the odd-numbered inputs at the same offsets.
122 const VU16 packed1 =
Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
123 Or(ShiftLeft<2>(raw3), raw1));
// Fragment: loads two vectors from `packed_in` and splits them into eight
// vectors raw0..raw7 of 2-bit fields. Even-numbered outputs come from packed0
// and odd-numbered outputs from packed1, at bit offsets 0, 2, 4, 6.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
132 using VU16 =
Vec<
decltype(d16)>;
133 const size_t N8 =
Lanes(d8);
// 0x0303 keeps the low two bits of each byte.
134 const VU16 mask =
Set(d16, 0x0303u);
136 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
137 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
139 const VU16 raw0 =
And(packed0, mask);
142 const VU16 raw1 =
And(packed1, mask);
145 const VU16 raw2 =
And(ShiftRight<2>(packed0), mask);
148 const VU16 raw3 =
And(ShiftRight<2>(packed1), mask);
151 const VU16 raw4 =
And(ShiftRight<4>(packed0), mask);
154 const VU16 raw5 =
And(ShiftRight<4>(packed1), mask);
157 const VU16 raw6 =
And(ShiftRight<6>(packed0), mask);
160 const VU16 raw7 =
And(ShiftRight<6>(packed1), mask);
// Fragment: loads eight byte vectors from `raw` and merges them into three
// vectors packed0..packed2. A fourth intermediate (packed3) is not kept as a
// separate result; its six low bits are spread over the top two bits of
// packed0..packed2.
// NOTE(review): the stores of packed0..packed2 are not in the visible lines.
// Presumably every raw byte fits in 3 bits - confirm against the caller.
171 using VU16 =
Vec<
decltype(d16)>;
172 const size_t N8 =
Lanes(d8);
173 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
174 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
175 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
176 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
177 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
178 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
179 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
180 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// packedK = rawK in bits 0..2 plus raw(K+4) in bits 3..5.
183 VU16 packed0 =
Or(ShiftLeft<3>(raw4), raw0);
184 VU16 packed1 =
Or(ShiftLeft<3>(raw5), raw1);
185 VU16 packed2 =
Or(ShiftLeft<3>(raw6), raw2);
186 const VU16 packed3 =
Or(ShiftLeft<3>(raw7), raw3);
// 0xC0C0 selects the top two bits of each byte.
188 const VU16 hi2 =
Set(d16, 0xC0C0u);
// Bits 4..5 of packed3 go into packed0, bits 2..3 into packed1 and bits 0..1
// into packed2 (OrAnd(o, a, b) is o | (a & b) per the Highway op reference).
189 packed0 =
OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
190 packed1 =
OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
191 packed2 =
OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
// Fragment: loads three vectors from `packed_in` and splits them into eight
// vectors raw0..raw7 of 3-bit fields. raw3 and raw7 are reassembled from the
// top two bits of each of the three packed vectors.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
201 using VU16 =
Vec<
decltype(d16)>;
202 const size_t N8 =
Lanes(d8);
// 0x0707 keeps the low three bits of each byte.
203 const VU16 mask =
Set(d16, 0x0707u);
205 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
206 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
207 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
// Bits 0..2 of each packed byte.
209 const VU16 raw0 =
And(packed0, mask);
212 const VU16 raw1 =
And(packed1, mask);
215 const VU16 raw2 =
And(packed2, mask);
// Bits 3..5 of each packed byte.
218 const VU16 raw4 =
And(ShiftRight<3>(packed0), mask);
221 const VU16 raw5 =
And(ShiftRight<3>(packed1), mask);
224 const VU16 raw6 =
And(ShiftRight<3>(packed2), mask);
// Gather the top two bits of packed2/packed1/packed0 into bits 0..1, 2..3 and
// 4..5 of raw73; the three operands occupy disjoint bits.
228 const VU16 hi2 =
Set(d16, 0xC0C0u);
229 const VU16 raw73 =
Xor3(ShiftRight<6>(
And(packed2, hi2)),
230 ShiftRight<4>(
And(packed1, hi2)),
231 ShiftRight<2>(
And(packed0, hi2)));
// Low three bits of raw73 are raw3, the next three bits are raw7.
233 const VU16 raw3 =
And(mask, raw73);
236 const VU16 raw7 =
And(mask, ShiftRight<3>(raw73));
// Fragment: loads eight byte vectors from `raw` and merges them pairwise into
// four vectors, one input in the low nibble and one in the high nibble of
// every byte.
// NOTE(review): the stores of packed0..packed3 are not in the visible lines.
// Presumably every raw byte fits in 4 bits - confirm against the caller.
247 using VU16 =
Vec<
decltype(d16)>;
248 const size_t N8 =
Lanes(d8);
250 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
251 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
252 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
253 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
254 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
255 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
256 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
257 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// Pairings (low, high nibble): (raw0, raw2), (raw1, raw3), (raw4, raw6),
// (raw5, raw7).
259 const VU16 packed0 =
Or(ShiftLeft<4>(raw2), raw0);
260 const VU16 packed1 =
Or(ShiftLeft<4>(raw3), raw1);
261 const VU16 packed2 =
Or(ShiftLeft<4>(raw6), raw4);
262 const VU16 packed3 =
Or(ShiftLeft<4>(raw7), raw5);
// Fragment: loads four vectors from `packed_in` and splits every byte into
// its low and high nibble, producing eight vectors raw0..raw7.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
274 using VU16 =
Vec<
decltype(d16)>;
275 const size_t N8 =
Lanes(d8);
// 0x0F0F keeps the low nibble of each byte.
276 const VU16 mask =
Set(d16, 0x0F0Fu);
278 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
279 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
280 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
281 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
// raw0/raw2 are the low/high nibbles of packed0, raw1/raw3 those of packed1.
283 const VU16 raw0 =
And(packed0, mask);
286 const VU16 raw1 =
And(packed1, mask);
289 const VU16 raw2 =
And(ShiftRight<4>(packed0), mask);
292 const VU16 raw3 =
And(ShiftRight<4>(packed1), mask);
// raw4/raw6 are the low/high nibbles of packed2, raw5/raw7 those of packed3.
295 const VU16 raw4 =
And(packed2, mask);
298 const VU16 raw5 =
And(packed3, mask);
301 const VU16 raw6 =
And(ShiftRight<4>(packed2), mask);
304 const VU16 raw7 =
And(ShiftRight<4>(packed3), mask);
// Fragment: loads eight byte vectors from `raw` and merges them into five
// vectors packed0..packed4.
// NOTE(review): the stores of packed0..packed4 are not in the visible lines.
// Presumably every raw byte fits in 5 bits - confirm against the caller.
315 using VU16 =
Vec<
decltype(d16)>;
316 const size_t N8 =
Lanes(d8);
317 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
318 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
319 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
320 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
321 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
322 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
323 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
324 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// 0xE0E0 selects the top three bits of each byte. packedK keeps rawK in the
// low bits and receives bits 2..4 of raw(K+4) in bits 5..7.
327 const VU16 hi3 =
Set(d16, 0xE0E0u);
328 const VU16 packed0 =
OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
329 const VU16 packed1 =
OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
330 const VU16 packed2 =
OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
331 const VU16 packed3 =
OrAnd(raw3, ShiftLeft<3>(raw7), hi3);
// packed4 collects the remaining low two bits of raw4..raw7 at bit offsets
// 0, 2, 4 and 6.
339 const VU16 lo2 =
Set(d16, 0x0303u);
340 const VU16 packed4 =
Or(
And(raw4, lo2),
Xor3(ShiftLeft<2>(
And(raw5, lo2)),
341 ShiftLeft<4>(
And(raw6, lo2)),
342 ShiftLeft<6>(
And(raw7, lo2))));
// Fragment: loads five vectors from `packed_in` and splits them into eight
// vectors raw0..raw7 of 5-bit fields.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
350 using VU16 =
Vec<
decltype(d16)>;
351 const size_t N8 =
Lanes(d8);
353 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
354 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
355 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
356 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
357 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
// 0x1F1F keeps the low five bits of each byte.
359 const VU16 mask =
Set(d16, 0x1F1Fu);
361 const VU16 raw0 =
And(packed0, mask);
364 const VU16 raw1 =
And(packed1, mask);
367 const VU16 raw2 =
And(packed2, mask);
370 const VU16 raw3 =
And(packed3, mask);
// AndNot(mask, v) is (~mask) & v per the Highway op reference: the top three
// bits of each byte, moved down to bits 2..4.
374 const VU16 top4 = ShiftRight<3>(
AndNot(mask, packed0));
375 const VU16 top5 = ShiftRight<3>(
AndNot(mask, packed1));
376 const VU16 top6 = ShiftRight<3>(
AndNot(mask, packed2));
377 const VU16 top7 = ShiftRight<3>(
AndNot(mask, packed3));
// The low two bits of raw4..raw7 come from successive 2-bit fields of packed4.
380 const VU16 lo2 =
Set(d16, 0x0303u);
381 const VU16 raw4 =
OrAnd(top4, lo2, packed4);
382 const VU16 raw5 =
OrAnd(top5, lo2, ShiftRight<2>(packed4));
383 const VU16 raw6 =
OrAnd(top6, lo2, ShiftRight<4>(packed4));
384 const VU16 raw7 =
OrAnd(top7, lo2, ShiftRight<6>(packed4));
// Fragment: loads eight byte vectors from `raw` and merges them into six
// vectors packed0..packed5. raw3 and raw7 are not stored on their own; each
// is split into three 2-bit pieces placed in the top two bits of three other
// vectors.
// NOTE(review): the stores of packed0..packed5 are not in the visible lines.
// Presumably every raw byte fits in 6 bits - confirm against the caller.
399 using VU16 =
Vec<
decltype(d16)>;
400 const size_t N8 =
Lanes(d8);
401 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
402 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
403 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
404 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
405 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
406 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
407 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
408 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// 0xC0C0 selects the top two bits of each byte.
410 const VU16 hi2 =
Set(d16, 0xC0C0u);
// raw3: bits 4..5 go to packed0, bits 2..3 to packed1, bits 0..1 to packed2.
412 const VU16 packed0 =
OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
413 const VU16 packed1 =
OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
414 const VU16 packed2 =
OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
// raw7 is distributed the same way over packed3..packed5.
415 const VU16 packed3 =
OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
416 const VU16 packed4 =
OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
417 const VU16 packed5 =
OrAnd(raw6, ShiftLeft<6>(raw7), hi2);
// Fragment: loads six vectors from `packed_in` and splits them into eight
// vectors raw0..raw7 of 6-bit fields. raw3 and raw7 are rebuilt from the top
// two bits of three packed vectors each.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
431 using VU16 =
Vec<
decltype(d16)>;
432 const size_t N8 =
Lanes(d8);
// 0x3F3F keeps the low six bits of each byte.
433 const VU16 mask =
Set(d16, 0x3F3Fu);
435 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
436 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
437 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
438 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
439 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
440 const VU16 packed5 =
BitCast(d16,
LoadU(d8, packed_in + 5 * N8));
// Low six bits: packed0..2 yield raw0..2, packed3..5 yield raw4..6.
442 const VU16 raw0 =
And(packed0, mask);
445 const VU16 raw1 =
And(packed1, mask);
448 const VU16 raw2 =
And(packed2, mask);
451 const VU16 raw4 =
And(packed3, mask);
454 const VU16 raw5 =
And(packed4, mask);
457 const VU16 raw6 =
And(packed5, mask);
// Top two bits of packed2/packed1/packed0 become bits 0..1, 2..3 and 4..5 of
// raw3; the three operands occupy disjoint bits.
461 const VU16 raw3 =
Xor3(ShiftRight<6>(
AndNot(mask, packed2)),
462 ShiftRight<4>(
AndNot(mask, packed1)),
463 ShiftRight<2>(
AndNot(mask, packed0)));
// Same reconstruction for raw7 from packed5/packed4/packed3.
464 const VU16 raw7 =
Xor3(ShiftRight<6>(
AndNot(mask, packed5)),
465 ShiftRight<4>(
AndNot(mask, packed4)),
466 ShiftRight<2>(
AndNot(mask, packed3)));
// Fragment: loads eight byte vectors from `raw` and merges them into seven
// vectors packed0..packed6. raw7 is not stored on its own; one of its bits is
// placed in the top bit of each of the seven outputs.
// NOTE(review): the stores of packed0..packed6 are not in the visible lines.
// Presumably every raw byte fits in 7 bits - confirm against the caller.
478 using VU16 =
Vec<
decltype(d16)>;
479 const size_t N8 =
Lanes(d8);
480 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
481 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
482 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
483 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
484 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
485 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
486 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
488 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
// 0x8080 selects the top bit of each byte.
490 const VU16 hi1 =
Set(d16, 0x8080u);
// packedK receives bit (6 - K) of raw7 in bit 7. Add(raw7, raw7) doubles the
// value, i.e. shifts it left by one bit.
491 const VU16 packed0 =
OrAnd(raw0, Add(raw7, raw7), hi1);
492 const VU16 packed1 =
OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
493 const VU16 packed2 =
OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
494 const VU16 packed3 =
OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
495 const VU16 packed4 =
OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
496 const VU16 packed5 =
OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
497 const VU16 packed6 =
OrAnd(raw6, ShiftLeft<7>(raw7), hi1);
// Fragment: loads seven vectors from `packed_in` and splits them into eight
// vectors raw0..raw7 of 7-bit fields. raw7 is rebuilt from the top bit of
// each of the seven packed vectors.
// NOTE(review): whatever consumes raw0..raw7 is not in the visible lines.
512 using VU16 =
Vec<
decltype(d16)>;
513 const size_t N8 =
Lanes(d8);
515 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
516 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
517 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
518 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
519 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
520 const VU16 packed5 =
BitCast(d16,
LoadU(d8, packed_in + 5 * N8));
521 const VU16 packed6 =
BitCast(d16,
LoadU(d8, packed_in + 6 * N8));
// 0x7F7F keeps the low seven bits of each byte.
523 const VU16 mask =
Set(d16, 0x7F7Fu);
525 const VU16 raw0 =
And(packed0, mask);
528 const VU16 raw1 =
And(packed1, mask);
531 const VU16 raw2 =
And(packed2, mask);
534 const VU16 raw3 =
And(packed3, mask);
537 const VU16 raw4 =
And(packed4, mask);
540 const VU16 raw5 =
And(packed5, mask);
543 const VU16 raw6 =
And(packed6, mask);
// p0 gathers the top bits of packed6/5/4 into bits 0, 1 and 2.
546 const VU16 p0 =
Xor3(ShiftRight<7>(
AndNot(mask, packed6)),
547 ShiftRight<6>(
AndNot(mask, packed5)),
548 ShiftRight<5>(
AndNot(mask, packed4)));
// p1 gathers the top bits of packed3/2/1 into bits 3, 4 and 5.
549 const VU16 p1 =
Xor3(ShiftRight<4>(
AndNot(mask, packed3)),
550 ShiftRight<3>(
AndNot(mask, packed2)),
551 ShiftRight<2>(
AndNot(mask, packed1)));
// The top bit of packed0 supplies bit 6; all pieces occupy disjoint bits.
552 const VU16 raw7 =
Xor3(ShiftRight<1>(
AndNot(mask, packed0)), p0, p1);
// Fragment: copies eight consecutive vectors of N8 bytes each from `raw` to
// `packed_out` without modifying them.
// NOTE(review): the declarations of d8, raw and packed_out are not in the
// visible lines.
562 using VU8 =
Vec<
decltype(d8)>;
563 const size_t N8 =
Lanes(d8);
// All eight loads are issued before the first store.
564 const VU8 raw0 =
LoadU(d8, raw + 0 * N8);
565 const VU8 raw1 =
LoadU(d8, raw + 1 * N8);
566 const VU8 raw2 =
LoadU(d8, raw + 2 * N8);
567 const VU8 raw3 =
LoadU(d8, raw + 3 * N8);
568 const VU8 raw4 =
LoadU(d8, raw + 4 * N8);
569 const VU8 raw5 =
LoadU(d8, raw + 5 * N8);
570 const VU8 raw6 =
LoadU(d8, raw + 6 * N8);
571 const VU8 raw7 =
LoadU(d8, raw + 7 * N8);
573 StoreU(raw0, d8, packed_out + 0 * N8);
574 StoreU(raw1, d8, packed_out + 1 * N8);
575 StoreU(raw2, d8, packed_out + 2 * N8);
576 StoreU(raw3, d8, packed_out + 3 * N8);
577 StoreU(raw4, d8, packed_out + 4 * N8);
578 StoreU(raw5, d8, packed_out + 5 * N8);
579 StoreU(raw6, d8, packed_out + 6 * N8);
580 StoreU(raw7, d8, packed_out + 7 * N8);
// Fragment: copies eight consecutive vectors of N8 bytes each from
// `packed_in` to `raw` without modifying them.
// NOTE(review): the declarations of d8, packed_in and raw are not in the
// visible lines.
586 using VU8 =
Vec<
decltype(d8)>;
587 const size_t N8 =
Lanes(d8);
// All eight loads are issued before the first store.
588 const VU8 raw0 =
LoadU(d8, packed_in + 0 * N8);
589 const VU8 raw1 =
LoadU(d8, packed_in + 1 * N8);
590 const VU8 raw2 =
LoadU(d8, packed_in + 2 * N8);
591 const VU8 raw3 =
LoadU(d8, packed_in + 3 * N8);
592 const VU8 raw4 =
LoadU(d8, packed_in + 4 * N8);
593 const VU8 raw5 =
LoadU(d8, packed_in + 5 * N8);
594 const VU8 raw6 =
LoadU(d8, packed_in + 6 * N8);
595 const VU8 raw7 =
LoadU(d8, packed_in + 7 * N8);
597 StoreU(raw0, d8, raw + 0 * N8);
598 StoreU(raw1, d8, raw + 1 * N8);
599 StoreU(raw2, d8, raw + 2 * N8);
600 StoreU(raw3, d8, raw + 3 * N8);
601 StoreU(raw4, d8, raw + 4 * N8);
602 StoreU(raw5, d8, raw + 5 * N8);
603 StoreU(raw6, d8, raw + 6 * N8);
604 StoreU(raw7, d8, raw + 7 * N8);
// Fragment: loads sixteen vectors raw0..rawF from `raw` (vector K starts at
// raw + K * N) and merges them into one value with rawK shifted to bit K.
// NOTE(review): the definition of N, the names receiving the four unnamed
// Xor3 results below (referenced later as p1..p4), the target of the final Or
// and the store are not in the visible lines.
// Presumably N is the lane count of d and each raw lane holds a single bit -
// confirm against the caller.
613 using VU16 =
Vec<
decltype(
d)>;
615 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
616 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
617 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
618 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
619 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
620 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
621 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
622 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
623 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
624 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
625 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
626 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
627 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
628 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
629 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
630 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// Groups of three inputs are merged first; Add(raw1, raw1) doubles raw1,
// i.e. shifts it left by one bit.
632 const VU16 p0 =
Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
634 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
636 Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
638 Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
640 Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
// Final merge of rawF (bit 15) with the five partial results.
642 Or(
Xor3(ShiftLeft<0xF>(rawF), p0, p1),
Xor3(p2, p3, p4));
// Fragment: loads one vector from `packed_in` and splits it into sixteen
// vectors raw0..rawF, where rawK holds bit K of every 16-bit lane.
// NOTE(review): whatever consumes raw0..rawF is not in the visible lines.
649 using VU16 =
Vec<
decltype(
d)>;
// Keeps bit 0 of each lane.
651 const VU16 mask =
Set(
d, 1u);
653 const VU16 packed =
LoadU(
d, packed_in);
655 const VU16 raw0 =
And(packed, mask);
658 const VU16 raw1 =
And(ShiftRight<1>(packed), mask);
661 const VU16 raw2 =
And(ShiftRight<2>(packed), mask);
664 const VU16 raw3 =
And(ShiftRight<3>(packed), mask);
667 const VU16 raw4 =
And(ShiftRight<4>(packed), mask);
670 const VU16 raw5 =
And(ShiftRight<5>(packed), mask);
673 const VU16 raw6 =
And(ShiftRight<6>(packed), mask);
676 const VU16 raw7 =
And(ShiftRight<7>(packed), mask);
679 const VU16 raw8 =
And(ShiftRight<8>(packed), mask);
682 const VU16 raw9 =
And(ShiftRight<9>(packed), mask);
685 const VU16 rawA =
And(ShiftRight<0xA>(packed), mask);
688 const VU16 rawB =
And(ShiftRight<0xB>(packed), mask);
691 const VU16 rawC =
And(ShiftRight<0xC>(packed), mask);
694 const VU16 rawD =
And(ShiftRight<0xD>(packed), mask);
697 const VU16 rawE =
And(ShiftRight<0xE>(packed), mask);
// No mask here: shifting a 16-bit lane right by 15 leaves only the top bit.
700 const VU16 rawF = ShiftRight<0xF>(packed);
// Fragment: loads sixteen vectors raw0..rawF from `raw` and merges them into
// two vectors of 2-bit fields, then stores both to `packed_out`.
// packed0 takes the even-numbered inputs and packed1 the odd-numbered ones,
// at bit offsets 0, 2, ..., 14.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 2 bits.
710 using VU16 =
Vec<
decltype(
d)>;
712 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
713 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
714 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
715 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
716 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
717 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
718 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
719 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
720 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
721 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
722 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
723 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
724 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
725 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
726 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
727 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// Fields at bit offsets 0, 2, 4.
729 VU16 packed0 =
Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
730 VU16 packed1 =
Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
// Fields at bit offsets 6, 8.
731 packed0 =
Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
732 packed1 =
Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));
// Fields at bit offsets 10, 12.
734 packed0 =
Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
735 packed1 =
Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));
// Field at bit offset 14.
737 packed0 =
Or(packed0, ShiftLeft<14>(rawE));
738 packed1 =
Or(packed1, ShiftLeft<14>(rawF));
739 StoreU(packed0,
d, packed_out + 0 *
N);
740 StoreU(packed1,
d, packed_out + 1 *
N);
// Fragment: loads two vectors from `packed_in` and splits them into sixteen
// vectors raw0..rawF of 2-bit fields. Even-numbered outputs come from packed0
// and odd-numbered outputs from packed1.
// NOTE(review): the definition of N and whatever consumes raw0..rawF are not
// in the visible lines.
746 using VU16 =
Vec<
decltype(
d)>;
// Keeps the low two bits of each lane.
748 const VU16 mask =
Set(
d, 0x3u);
750 const VU16 packed0 =
LoadU(
d, packed_in + 0 *
N);
751 const VU16 packed1 =
LoadU(
d, packed_in + 1 *
N);
753 const VU16 raw0 =
And(packed0, mask);
756 const VU16 raw1 =
And(packed1, mask);
759 const VU16 raw2 =
And(ShiftRight<2>(packed0), mask);
762 const VU16 raw3 =
And(ShiftRight<2>(packed1), mask);
765 const VU16 raw4 =
And(ShiftRight<4>(packed0), mask);
768 const VU16 raw5 =
And(ShiftRight<4>(packed1), mask);
771 const VU16 raw6 =
And(ShiftRight<6>(packed0), mask);
774 const VU16 raw7 =
And(ShiftRight<6>(packed1), mask);
777 const VU16 raw8 =
And(ShiftRight<8>(packed0), mask);
780 const VU16 raw9 =
And(ShiftRight<8>(packed1), mask);
783 const VU16 rawA =
And(ShiftRight<0xA>(packed0), mask);
786 const VU16 rawB =
And(ShiftRight<0xA>(packed1), mask);
789 const VU16 rawC =
And(ShiftRight<0xC>(packed0), mask);
792 const VU16 rawD =
And(ShiftRight<0xC>(packed1), mask);
// No mask needed: shifting right by 14 leaves only the top two bits.
795 const VU16 rawE = ShiftRight<0xE>(packed0);
798 const VU16 rawF = ShiftRight<0xE>(packed1);
// Fragment: loads sixteen vectors raw0..rawF from `raw`, merges them into
// three vectors of 3-bit fields and stores those to `packed_out`.
// Each output holds five fields in bits 0..14; the three bits of rawF are
// spread over bit 15 of the three outputs.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 3 bits.
808 using VU16 =
Vec<
decltype(
d)>;
810 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
811 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
812 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
813 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
814 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
815 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
816 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
817 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
818 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
819 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
820 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
821 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
822 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
823 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
824 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
825 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// packedK starts with rawK, raw(K+3), raw(K+6) at bit offsets 0, 3, 6.
828 VU16 packed0 =
Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
829 VU16 packed1 =
Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
830 VU16 packed2 =
Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);
// Then raw(K+9) and raw(K+12) at bit offsets 9 and 12.
833 packed0 =
Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
834 packed1 =
Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
835 packed2 =
Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));
// 0x8000 selects bit 15. Bit K of rawF lands in bit 15 of packedK; packed0
// needs no mask because shifting by 15 leaves only bit 0 of rawF.
837 const VU16 hi1 =
Set(
d, 0x8000u);
838 packed0 =
Or(packed0, ShiftLeft<15>(rawF));
839 packed1 =
OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
840 packed2 =
OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
841 StoreU(packed0,
d, packed_out + 0 *
N);
842 StoreU(packed1,
d, packed_out + 1 *
N);
843 StoreU(packed2,
d, packed_out + 2 *
N);
// Fragment: loads three vectors from `packed_in` and splits them into sixteen
// vectors raw0..rawF of 3-bit fields. rawF is rebuilt from bit 15 of the
// three packed vectors.
// NOTE(review): the definition of N and whatever consumes raw0..rawF are not
// in the visible lines.
849 using VU16 =
Vec<
decltype(
d)>;
// Keeps the low three bits of each lane.
851 const VU16 mask =
Set(
d, 0x7u);
853 const VU16 packed0 =
LoadU(
d, packed_in + 0 *
N);
854 const VU16 packed1 =
LoadU(
d, packed_in + 1 *
N);
855 const VU16 packed2 =
LoadU(
d, packed_in + 2 *
N);
// Fields at bit offset 0.
857 const VU16 raw0 =
And(mask, packed0);
860 const VU16 raw1 =
And(mask, packed1);
863 const VU16 raw2 =
And(mask, packed2);
// Fields at bit offset 3.
866 const VU16 raw3 =
And(mask, ShiftRight<3>(packed0));
869 const VU16 raw4 =
And(mask, ShiftRight<3>(packed1));
872 const VU16 raw5 =
And(mask, ShiftRight<3>(packed2));
// Fields at bit offset 6.
875 const VU16 raw6 =
And(mask, ShiftRight<6>(packed0));
878 const VU16 raw7 =
And(mask, ShiftRight<6>(packed1));
881 const VU16 raw8 =
And(mask, ShiftRight<6>(packed2));
// Fields at bit offset 9.
884 const VU16 raw9 =
And(mask, ShiftRight<9>(packed0));
887 const VU16 rawA =
And(mask, ShiftRight<9>(packed1));
890 const VU16 rawB =
And(mask, ShiftRight<9>(packed2));
// Fields at bit offset 12.
893 const VU16 rawC =
And(mask, ShiftRight<12>(packed0));
896 const VU16 rawD =
And(mask, ShiftRight<12>(packed1));
899 const VU16 rawE =
And(mask, ShiftRight<12>(packed2));
// Bit 15 of packedK becomes bit K of rawF; Add(down1, down1) doubles down1,
// i.e. shifts it left by one bit.
903 const VU16 down0 = ShiftRight<15>(packed0);
904 const VU16 down1 = ShiftRight<15>(packed1);
905 const VU16 down2 = ShiftRight<15>(packed2);
906 const VU16 rawF =
Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
// Fragment: loads sixteen vectors raw0..rawF from `raw`, merges them into
// four vectors of four 4-bit fields each and stores those to `packed_out`.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 4 bits.
916 using VU16 =
Vec<
decltype(
d)>;
918 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
919 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
920 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
921 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
922 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
923 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
924 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
925 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
926 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
927 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
928 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
929 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
930 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
931 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
932 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
933 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// packed0 = raw0, raw2, raw4, raw6 and packed1 = raw1, raw3, raw5, raw7 at
// bit offsets 0, 4, 8, 12.
935 VU16 packed0 =
Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
936 VU16 packed1 =
Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
937 packed0 =
Or(packed0, ShiftLeft<12>(raw6));
938 packed1 =
Or(packed1, ShiftLeft<12>(raw7));
// packed2 = raw8, rawA, rawC, rawE and packed3 = raw9, rawB, rawD, rawF at
// the same offsets.
939 VU16 packed2 =
Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
940 VU16 packed3 =
Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
941 packed2 =
Or(packed2, ShiftLeft<12>(rawE));
942 packed3 =
Or(packed3, ShiftLeft<12>(rawF));
944 StoreU(packed0,
d, packed_out + 0 *
N);
945 StoreU(packed1,
d, packed_out + 1 *
N);
946 StoreU(packed2,
d, packed_out + 2 *
N);
947 StoreU(packed3,
d, packed_out + 3 *
N);
// Fragment: loads four vectors from `packed_in` and splits every 16-bit lane
// into four 4-bit fields, producing sixteen vectors raw0..rawF.
// NOTE(review): the definition of N and whatever consumes raw0..rawF are not
// in the visible lines.
953 using VU16 =
Vec<
decltype(
d)>;
// Keeps the low four bits of each lane.
955 const VU16 mask =
Set(
d, 0xFu);
957 const VU16 packed0 =
LoadU(
d, packed_in + 0 *
N);
958 const VU16 packed1 =
LoadU(
d, packed_in + 1 *
N);
959 const VU16 packed2 =
LoadU(
d, packed_in + 2 *
N);
960 const VU16 packed3 =
LoadU(
d, packed_in + 3 *
N);
// packed0 yields raw0, raw2, raw4, raw6; packed1 yields raw1, raw3, raw5,
// raw7.
962 const VU16 raw0 =
And(packed0, mask);
965 const VU16 raw1 =
And(packed1, mask);
968 const VU16 raw2 =
And(ShiftRight<4>(packed0), mask);
971 const VU16 raw3 =
And(ShiftRight<4>(packed1), mask);
974 const VU16 raw4 =
And(ShiftRight<8>(packed0), mask);
977 const VU16 raw5 =
And(ShiftRight<8>(packed1), mask);
// No mask needed: shifting right by 12 leaves only the top four bits.
980 const VU16 raw6 = ShiftRight<12>(packed0);
983 const VU16 raw7 = ShiftRight<12>(packed1);
// packed2 yields raw8, rawA, rawC, rawE; packed3 yields raw9, rawB, rawD,
// rawF.
986 const VU16 raw8 =
And(packed2, mask);
989 const VU16 raw9 =
And(packed3, mask);
992 const VU16 rawA =
And(ShiftRight<4>(packed2), mask);
995 const VU16 rawB =
And(ShiftRight<4>(packed3), mask);
998 const VU16 rawC =
And(ShiftRight<8>(packed2), mask);
1001 const VU16 rawD =
And(ShiftRight<8>(packed3), mask);
1004 const VU16 rawE = ShiftRight<12>(packed2);
1007 const VU16 rawF = ShiftRight<12>(packed3);
// Fragment: loads sixteen vectors raw0..rawF from `raw`, merges them into
// five vectors of 5-bit fields and stores those to `packed_out`.
// Each output holds three fields in bits 0..14; the five bits of rawF are
// spread over bit 15 of the five outputs.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 5 bits.
1017 using VU16 =
Vec<
decltype(
d)>;
1019 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1020 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1021 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1022 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1023 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1024 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1025 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1026 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1027 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1028 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1029 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1030 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1031 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1032 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1033 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1034 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// packedK = rawK, raw(K+5), raw(K+10) at bit offsets 0, 5, 10.
1037 VU16 packed0 =
Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
1038 VU16 packed1 =
Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
1039 VU16 packed2 =
Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
1040 VU16 packed3 =
Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
1041 VU16 packed4 =
Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);
// 0x8000 selects bit 15. Bit K of rawF lands in bit 15 of packedK; packed0
// needs no mask because shifting by 15 leaves only bit 0 of rawF.
1044 const VU16 hi1 =
Set(
d, 0x8000u);
1045 packed0 =
Or(packed0, ShiftLeft<15>(rawF));
1046 packed1 =
OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
1047 packed2 =
OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
1048 packed3 =
OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
1049 packed4 =
OrAnd(packed4, ShiftLeft<11>(rawF), hi1);
1051 StoreU(packed0,
d, packed_out + 0 *
N);
1052 StoreU(packed1,
d, packed_out + 1 *
N);
1053 StoreU(packed2,
d, packed_out + 2 *
N);
1054 StoreU(packed3,
d, packed_out + 3 *
N);
1055 StoreU(packed4,
d, packed_out + 4 *
N);
// Fragment: loads five vectors from `packed_in` and splits them into sixteen
// vectors raw0..rawF of 5-bit fields. rawF is rebuilt from bit 15 of the five
// packed vectors.
// NOTE(review): the definition of N, the name receiving the unnamed Xor3
// result below (referenced later as p0) and whatever consumes raw0..rawF are
// not in the visible lines.
1061 using VU16 =
Vec<
decltype(
d)>;
1064 const VU16 packed0 =
LoadU(
d, packed_in + 0 *
N);
1065 const VU16 packed1 =
LoadU(
d, packed_in + 1 *
N);
1066 const VU16 packed2 =
LoadU(
d, packed_in + 2 *
N);
1067 const VU16 packed3 =
LoadU(
d, packed_in + 3 *
N);
1068 const VU16 packed4 =
LoadU(
d, packed_in + 4 *
N);
// Keeps the low five bits of each lane.
1070 const VU16 mask =
Set(
d, 0x1Fu);
// Fields at bit offset 0.
1072 const VU16 raw0 =
And(packed0, mask);
1075 const VU16 raw1 =
And(packed1, mask);
1078 const VU16 raw2 =
And(packed2, mask);
1081 const VU16 raw3 =
And(packed3, mask);
1084 const VU16 raw4 =
And(packed4, mask);
// Fields at bit offset 5.
1087 const VU16 raw5 =
And(ShiftRight<5>(packed0), mask);
1090 const VU16 raw6 =
And(ShiftRight<5>(packed1), mask);
1093 const VU16 raw7 =
And(ShiftRight<5>(packed2), mask);
1096 const VU16 raw8 =
And(ShiftRight<5>(packed3), mask);
1099 const VU16 raw9 =
And(ShiftRight<5>(packed4), mask);
// Fields at bit offset 10.
1102 const VU16 rawA =
And(ShiftRight<10>(packed0), mask);
1105 const VU16 rawB =
And(ShiftRight<10>(packed1), mask);
1108 const VU16 rawC =
And(ShiftRight<10>(packed2), mask);
1111 const VU16 rawD =
And(ShiftRight<10>(packed3), mask);
1114 const VU16 rawE =
And(ShiftRight<10>(packed4), mask);
// Bit 15 of packedK becomes bit K of rawF. The first Xor3 combines bits 0..2
// (packed0..packed2), the second adds bits 3 and 4 (packed3, packed4).
1118 const VU16 down0 = ShiftRight<15>(packed0);
1119 const VU16 down1 = ShiftRight<15>(packed1);
1120 const VU16 hi1 =
Set(
d, 0x8000u);
1122 Xor3(ShiftRight<13>(
And(packed2, hi1)), Add(down1, down1), down0);
1123 const VU16 rawF =
Xor3(ShiftRight<11>(
And(packed4, hi1)),
1124 ShiftRight<12>(
And(packed3, hi1)), p0);
// Fragment: loads sixteen vectors raw0..rawF from `raw`, merges them into six
// vectors of 6-bit fields and stores those to `packed_out`.
// Two 12-bit intermediates (packed3, packed7) are not stored on their own;
// each is split into three 4-bit pieces placed in bits 12..15 of three
// outputs.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 6 bits.
1134 using VU16 =
Vec<
decltype(
d)>;
1136 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1137 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1138 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1139 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1140 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1141 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1142 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1143 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1144 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1145 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1146 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1147 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1148 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1149 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1150 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1151 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// 12-bit intermediates: packed3 = raw3 + raw7, packed7 = rawB + rawF.
1153 const VU16 packed3 =
Or(ShiftLeft<6>(raw7), raw3);
1154 const VU16 packed7 =
Or(ShiftLeft<6>(rawF), rawB);
// packed0/packed4 take the low four bits of packed3/packed7 in bits 12..15
// (the 16-bit shift by 12 drops the higher bits).
1157 const VU16 packed0 =
Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
1158 VU16 packed1 =
Or(ShiftLeft<6>(raw5), raw1);
1159 VU16 packed2 =
Or(ShiftLeft<6>(raw6), raw2);
1160 const VU16 packed4 =
Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
1161 VU16 packed5 =
Or(ShiftLeft<6>(rawD), raw9);
1162 VU16 packed6 =
Or(ShiftLeft<6>(rawE), rawA);
// 0xF000 selects bits 12..15. Bits 4..7 of the intermediate go to
// packed1/packed5, bits 8..11 to packed2/packed6.
1164 const VU16 hi4 =
Set(
d, 0xF000u);
1165 packed1 =
OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
1166 packed2 =
OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
1167 packed5 =
OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
1168 packed6 =
OrAnd(packed6, ShiftLeft<4>(packed7), hi4);
// Output order skips the intermediates: packed0, 1, 2, 4, 5, 6 go to slots
// 0..5.
1170 StoreU(packed0,
d, packed_out + 0 *
N);
1171 StoreU(packed1,
d, packed_out + 1 *
N);
1172 StoreU(packed2,
d, packed_out + 2 *
N);
1173 StoreU(packed4,
d, packed_out + 3 *
N);
1174 StoreU(packed5,
d, packed_out + 4 *
N);
1175 StoreU(packed6,
d, packed_out + 5 *
N);
// Fragment: loads six vectors from `packed_in` and splits them into sixteen
// vectors raw0..rawF of 6-bit fields. raw3/raw7 and rawB/rawF are rebuilt
// from bits 12..15 of three packed vectors each.
// NOTE(review): the definition of N and whatever consumes raw0..rawF are not
// in the visible lines.
1181 using VU16 =
Vec<
decltype(
d)>;
// Keeps the low six bits of each lane.
1183 const VU16 mask =
Set(
d, 0x3Fu);
// Slots 0..5 are named packed0, 1, 2, 4, 5, 6; packed3 and packed7 are
// reconstructed further below.
1185 const VU16 packed0 =
LoadU(
d, packed_in + 0 *
N);
1186 const VU16 packed1 =
LoadU(
d, packed_in + 1 *
N);
1187 const VU16 packed2 =
LoadU(
d, packed_in + 2 *
N);
1188 const VU16 packed4 =
LoadU(
d, packed_in + 3 *
N);
1189 const VU16 packed5 =
LoadU(
d, packed_in + 4 *
N);
1190 const VU16 packed6 =
LoadU(
d, packed_in + 5 *
N);
1192 const VU16 raw0 =
And(packed0, mask);
1195 const VU16 raw1 =
And(packed1, mask);
1198 const VU16 raw2 =
And(packed2, mask);
1201 const VU16 raw4 =
And(ShiftRight<6>(packed0), mask);
1204 const VU16 raw5 =
And(ShiftRight<6>(packed1), mask);
1207 const VU16 raw6 =
And(ShiftRight<6>(packed2), mask);
1210 const VU16 raw8 =
And(packed4, mask);
1213 const VU16 raw9 =
And(packed5, mask);
1216 const VU16 rawA =
And(packed6, mask);
1219 const VU16 rawC =
And(ShiftRight<6>(packed4), mask);
1222 const VU16 rawD =
And(ShiftRight<6>(packed5), mask);
1225 const VU16 rawE =
And(ShiftRight<6>(packed6), mask);
// Rebuild the 12-bit intermediates from three 4-bit pieces: bits 0..3 from
// packed0/packed4, bits 4..7 from packed1/packed5, bits 8..11 from
// packed2/packed6.
1229 const VU16 down0 = ShiftRight<12>(packed0);
1230 const VU16 down4 = ShiftRight<12>(packed4);
1231 const VU16 hi4 =
Set(
d, 0xF000u);
1232 const VU16 packed3 =
Xor3(ShiftRight<4>(
And(packed2, hi4)),
1233 ShiftRight<8>(
And(packed1, hi4)), down0);
1234 const VU16 packed7 =
Xor3(ShiftRight<4>(
And(packed6, hi4)),
1235 ShiftRight<8>(
And(packed5, hi4)), down4);
// Low six bits of each intermediate.
1236 const VU16 raw3 =
And(packed3, mask);
1239 const VU16 rawB =
And(packed7, mask);
// Upper six bits; no mask needed since the intermediates hold only 12 bits.
1242 const VU16 raw7 = ShiftRight<6>(packed3);
1245 const VU16 rawF = ShiftRight<6>(packed7);
// Fragment: loads sixteen vectors raw0..rawF from `raw`, merges them into
// seven vectors of 7-bit fields and stores those to `packed_out`.
// A 14-bit intermediate (packed7 = raw7 + rawF) is not stored on its own; it
// is split into seven 2-bit pieces placed in bits 14..15 of the outputs.
// NOTE(review): the definition of N is not in the visible lines; presumably
// it is the lane count of d. Presumably every raw lane fits in 7 bits.
1255 using VU16 =
Vec<
decltype(
d)>;
1257 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1258 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1259 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1260 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1261 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1262 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1263 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1264 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1265 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1266 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1267 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1268 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1269 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1270 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1271 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1272 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
// 14-bit intermediate.
1274 const VU16 packed7 =
Or(ShiftLeft<7>(rawF), raw7);
// packedK = rawK in bits 0..6 and raw(K+8) in bits 7..13. packed0 also takes
// the low two bits of packed7 in bits 14..15 (the 16-bit shift by 14 drops
// the higher bits).
1277 const VU16 packed0 =
Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
1278 VU16 packed1 =
Or(ShiftLeft<7>(raw9), raw1);
1279 VU16 packed2 =
Or(ShiftLeft<7>(rawA), raw2);
1280 VU16 packed3 =
Or(ShiftLeft<7>(rawB), raw3);
1281 VU16 packed4 =
Or(ShiftLeft<7>(rawC), raw4);
1282 VU16 packed5 =
Or(ShiftLeft<7>(rawD), raw5);
1283 VU16 packed6 =
Or(ShiftLeft<7>(rawE), raw6);
// 0xC000 selects bits 14..15. packedK (K = 1..6) receives bits 2K..2K+1 of
// packed7.
1285 const VU16 hi2 =
Set(
d, 0xC000u);
1286 packed1 =
OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
1287 packed2 =
OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
1288 packed3 =
OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
1289 packed4 =
OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
1290 packed5 =
OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
1291 packed6 =
OrAnd(packed6, ShiftLeft<2>(packed7), hi2);
1293 StoreU(packed0,
d, packed_out + 0 *
N);
1294 StoreU(packed1,
d, packed_out + 1 *
N);
1295 StoreU(packed2,
d, packed_out + 2 *
N);
1296 StoreU(packed3,
d, packed_out + 3 *
N);
1297 StoreU(packed4,
d, packed_out + 4 *
N);
1298 StoreU(packed5,
d, packed_out + 5 *
N);
1299 StoreU(packed6,
d, packed_out + 6 *
N);
1305 using VU16 =
Vec<
decltype(
d)>;
1316 const VU16 mask =
Set(
d, 0x7Fu);
1318 const VU16 raw0 =
And(packed0, mask);
1321 const VU16 raw1 =
And(packed1, mask);
1324 const VU16 raw2 =
And(packed2, mask);
1327 const VU16 raw3 =
And(packed3, mask);
1330 const VU16 raw4 =
And(packed4, mask);
1333 const VU16 raw5 =
And(packed5, mask);
1336 const VU16 raw6 =
And(packed6, mask);
1339 const VU16 raw8 =
And(ShiftRight<7>(packed0), mask);
1342 const VU16 raw9 =
And(ShiftRight<7>(packed1), mask);
1345 const VU16 rawA =
And(ShiftRight<7>(packed2), mask);
1348 const VU16 rawB =
And(ShiftRight<7>(packed3), mask);
1351 const VU16 rawC =
And(ShiftRight<7>(packed4), mask);
1354 const VU16 rawD =
And(ShiftRight<7>(packed5), mask);
1357 const VU16 rawE =
And(ShiftRight<7>(packed6), mask);
1361 const VU16 down0 = ShiftRight<14>(packed0);
1362 const VU16 hi2 =
Set(
d, 0xC000u);
1363 const VU16 p0 =
Xor3(ShiftRight<12>(
And(packed1, hi2)),
1364 ShiftRight<10>(
And(packed2, hi2)), down0);
1365 const VU16 p1 =
Xor3(ShiftRight<8>(
And(packed3, hi2)),
1366 ShiftRight<6>(
And(packed4, hi2)),
1367 ShiftRight<4>(
And(packed5, hi2)));
1368 const VU16 packed7 =
Xor3(ShiftRight<2>(
And(packed6, hi2)), p1, p0);
1370 const VU16 raw7 =
And(packed7, mask);
1373 const VU16 rawF = ShiftRight<7>(packed7);
1383 using VU16 =
Vec<
decltype(
d)>;
1385 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1386 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1387 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1388 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1389 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1390 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1391 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1392 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1393 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1394 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1395 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1396 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1397 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1398 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1399 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1400 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
1404 const VU16 packed0 =
Or(ShiftLeft<8>(raw2), raw0);
1405 const VU16 packed1 =
Or(ShiftLeft<8>(raw3), raw1);
1406 const VU16 packed2 =
Or(ShiftLeft<8>(raw6), raw4);
1407 const VU16 packed3 =
Or(ShiftLeft<8>(raw7), raw5);
1408 const VU16 packed4 =
Or(ShiftLeft<8>(rawA), raw8);
1409 const VU16 packed5 =
Or(ShiftLeft<8>(rawB), raw9);
1410 const VU16 packed6 =
Or(ShiftLeft<8>(rawE), rawC);
1411 const VU16 packed7 =
Or(ShiftLeft<8>(rawF), rawD);
1413 StoreU(packed0,
d, packed_out + 0 *
N);
1414 StoreU(packed1,
d, packed_out + 1 *
N);
1415 StoreU(packed2,
d, packed_out + 2 *
N);
1416 StoreU(packed3,
d, packed_out + 3 *
N);
1417 StoreU(packed4,
d, packed_out + 4 *
N);
1418 StoreU(packed5,
d, packed_out + 5 *
N);
1419 StoreU(packed6,
d, packed_out + 6 *
N);
1420 StoreU(packed7,
d, packed_out + 7 *
N);
1426 using VU16 =
Vec<
decltype(
d)>;
1437 const VU16 mask =
Set(
d, 0xFFu);
1439 const VU16 raw0 =
And(packed0, mask);
1442 const VU16 raw1 =
And(packed1, mask);
1445 const VU16 raw2 = ShiftRight<8>(packed0);
1448 const VU16 raw3 = ShiftRight<8>(packed1);
1451 const VU16 raw4 =
And(packed2, mask);
1454 const VU16 raw5 =
And(packed3, mask);
1457 const VU16 raw6 = ShiftRight<8>(packed2);
1460 const VU16 raw7 = ShiftRight<8>(packed3);
1463 const VU16 raw8 =
And(packed4, mask);
1466 const VU16 raw9 =
And(packed5, mask);
1469 const VU16 rawA = ShiftRight<8>(packed4);
1472 const VU16 rawB = ShiftRight<8>(packed5);
1475 const VU16 rawC =
And(packed6, mask);
1478 const VU16 rawD =
And(packed7, mask);
1481 const VU16 rawE = ShiftRight<8>(packed6);
1484 const VU16 rawF = ShiftRight<8>(packed7);
1494 using VU16 =
Vec<
decltype(
d)>;
1496 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1497 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1498 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1499 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1500 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1501 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1502 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1503 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1504 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1505 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1506 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1507 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1508 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1509 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1510 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1511 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
1513 const VU16 packed0 =
Or(ShiftLeft<9>(raw8), raw0);
1514 const VU16 packed1 =
Or(ShiftLeft<9>(raw9), raw1);
1515 const VU16 packed2 =
Or(ShiftLeft<9>(rawA), raw2);
1516 const VU16 packed3 =
Or(ShiftLeft<9>(rawB), raw3);
1517 const VU16 packed4 =
Or(ShiftLeft<9>(rawC), raw4);
1518 const VU16 packed5 =
Or(ShiftLeft<9>(rawD), raw5);
1519 const VU16 packed6 =
Or(ShiftLeft<9>(rawE), raw6);
1520 const VU16 packed7 =
Or(ShiftLeft<9>(rawF), raw7);
1525 const VU16 mid2 =
Set(
d, 0x180u);
1526 const VU16 part8 = ShiftRight<7>(
And(raw8, mid2));
1527 const VU16 part9 = ShiftRight<5>(
And(raw9, mid2));
1528 const VU16 partA = ShiftRight<3>(
And(rawA, mid2));
1529 const VU16 partB = ShiftRight<1>(
And(rawB, mid2));
1530 const VU16 partC = ShiftLeft<1>(
And(rawC, mid2));
1531 const VU16 partD = ShiftLeft<3>(
And(rawD, mid2));
1532 const VU16 partE = ShiftLeft<5>(
And(rawE, mid2));
1533 const VU16 partF = ShiftLeft<7>(
And(rawF, mid2));
1534 const VU16 packed8 =
Xor3(
Xor3(part8, part9, partA),
1535 Xor3(partB, partC, partD),
Or(partE, partF));
1537 StoreU(packed0,
d, packed_out + 0 *
N);
1538 StoreU(packed1,
d, packed_out + 1 *
N);
1539 StoreU(packed2,
d, packed_out + 2 *
N);
1540 StoreU(packed3,
d, packed_out + 3 *
N);
1541 StoreU(packed4,
d, packed_out + 4 *
N);
1542 StoreU(packed5,
d, packed_out + 5 *
N);
1543 StoreU(packed6,
d, packed_out + 6 *
N);
1544 StoreU(packed7,
d, packed_out + 7 *
N);
1545 StoreU(packed8,
d, packed_out + 8 *
N);
1551 using VU16 =
Vec<
decltype(
d)>;
1564 const VU16 mask =
Set(
d, 0x1FFu);
1566 const VU16 raw0 =
And(packed0, mask);
1569 const VU16 raw1 =
And(packed1, mask);
1572 const VU16 raw2 =
And(packed2, mask);
1575 const VU16 raw3 =
And(packed3, mask);
1578 const VU16 raw4 =
And(packed4, mask);
1581 const VU16 raw5 =
And(packed5, mask);
1584 const VU16 raw6 =
And(packed6, mask);
1587 const VU16 raw7 =
And(packed7, mask);
1590 const VU16 mid2 =
Set(
d, 0x180u);
1592 OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
1594 OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
1596 OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
1598 OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
1600 OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
1602 OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
1604 OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
1606 OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);
1624 using VU16 =
Vec<
decltype(
d)>;
1626 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1627 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1628 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1629 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1630 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1631 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1632 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1633 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1634 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1635 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1636 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1637 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1638 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1639 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1640 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1641 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
1645 const VU16 packed0 =
Or(ShiftLeft<10>(raw8), raw0);
1646 const VU16 packed1 =
Or(ShiftLeft<10>(raw9), raw1);
1647 const VU16 packed2 =
Or(ShiftLeft<10>(rawA), raw2);
1648 const VU16 packed3 =
Or(ShiftLeft<10>(rawB), raw3);
1649 const VU16 packed4 =
Or(ShiftLeft<10>(rawC), raw4);
1650 const VU16 packed5 =
Or(ShiftLeft<10>(rawD), raw5);
1651 const VU16 packed6 =
Or(ShiftLeft<10>(rawE), raw6);
1652 const VU16 packed7 =
Or(ShiftLeft<10>(rawF), raw7);
1657 const VU16 mid4 =
Set(
d, 0x3C0u);
1658 const VU16 part8 = ShiftRight<6>(
And(raw8, mid4));
1659 const VU16 part9 = ShiftRight<2>(
And(raw9, mid4));
1660 const VU16 partA = ShiftLeft<2>(
And(rawA, mid4));
1661 const VU16 partB = ShiftLeft<6>(
And(rawB, mid4));
1662 const VU16 partC = ShiftRight<6>(
And(rawC, mid4));
1663 const VU16 partD = ShiftRight<2>(
And(rawD, mid4));
1664 const VU16 partE = ShiftLeft<2>(
And(rawE, mid4));
1665 const VU16 partF = ShiftLeft<6>(
And(rawF, mid4));
1666 const VU16 packed8 =
Or(
Xor3(part8, part9, partA), partB);
1667 const VU16 packed9 =
Or(
Xor3(partC, partD, partE), partF);
1669 StoreU(packed0,
d, packed_out + 0 *
N);
1670 StoreU(packed1,
d, packed_out + 1 *
N);
1671 StoreU(packed2,
d, packed_out + 2 *
N);
1672 StoreU(packed3,
d, packed_out + 3 *
N);
1673 StoreU(packed4,
d, packed_out + 4 *
N);
1674 StoreU(packed5,
d, packed_out + 5 *
N);
1675 StoreU(packed6,
d, packed_out + 6 *
N);
1676 StoreU(packed7,
d, packed_out + 7 *
N);
1677 StoreU(packed8,
d, packed_out + 8 *
N);
1678 StoreU(packed9,
d, packed_out + 9 *
N);
1684 using VU16 =
Vec<
decltype(
d)>;
1698 const VU16 mask =
Set(
d, 0x3FFu);
1700 const VU16 raw0 =
And(packed0, mask);
1703 const VU16 raw1 =
And(packed1, mask);
1706 const VU16 raw2 =
And(packed2, mask);
1709 const VU16 raw3 =
And(packed3, mask);
1712 const VU16 raw4 =
And(packed4, mask);
1715 const VU16 raw5 =
And(packed5, mask);
1718 const VU16 raw6 =
And(packed6, mask);
1721 const VU16 raw7 =
And(packed7, mask);
1724 const VU16 mid4 =
Set(
d, 0x3C0u);
1726 OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
1728 OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
1730 OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
1732 OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
1734 OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
1736 OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
1738 OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
1740 OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);
1758 using VU16 =
Vec<
decltype(
d)>;
1760 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1761 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1762 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1763 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1764 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1765 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1766 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1767 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1768 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1769 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1770 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1771 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1772 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1773 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1774 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1775 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
1781 const VU16 lo8 =
Set(
d, 0xFFu);
1784 const VU16 packed0 =
OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
1785 const VU16 packed1 =
OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
1786 const VU16 packed2 =
OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
1787 const VU16 packed3 =
OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
1788 const VU16 packed4 =
OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
1789 const VU16 packed5 =
OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
1790 const VU16 packed6 =
OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
1791 const VU16 packed7 =
OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
1793 StoreU(packed0,
d, packed_out + 0 *
N);
1794 StoreU(packed1,
d, packed_out + 1 *
N);
1795 StoreU(packed2,
d, packed_out + 2 *
N);
1796 StoreU(packed3,
d, packed_out + 3 *
N);
1797 StoreU(packed4,
d, packed_out + 4 *
N);
1798 StoreU(packed5,
d, packed_out + 5 *
N);
1799 StoreU(packed6,
d, packed_out + 6 *
N);
1800 StoreU(packed7,
d, packed_out + 7 *
N);
1803 const VU16 top0 = ShiftRight<8>(raw0);
1804 const VU16 top1 = ShiftRight<8>(raw1);
1805 const VU16 top2 = ShiftRight<8>(raw2);
1808 VU16 next =
Set(
d, 0x38u);
1809 VU16 packed8 =
OrAnd(top0, ShiftRight<5>(raw3), next);
1810 VU16 packed9 =
OrAnd(top1, ShiftRight<5>(raw4), next);
1811 VU16 packedA =
OrAnd(top2, ShiftRight<5>(raw5), next);
1812 next = ShiftLeft<3>(next);
1813 packed8 =
OrAnd(packed8, ShiftRight<2>(raw6), next);
1814 packed9 =
OrAnd(packed9, ShiftRight<2>(raw7), next);
1815 packedA =
OrAnd(packedA, ShiftRight<2>(raw8), next);
1816 next = ShiftLeft<3>(next);
1817 packed8 =
OrAnd(packed8, Add(raw9, raw9), next);
1818 packed9 =
OrAnd(packed9, Add(rawA, rawA), next);
1819 packedA =
OrAnd(packedA, Add(rawB, rawB), next);
1820 next = ShiftLeft<3>(next);
1821 packed8 =
OrAnd(packed8, ShiftLeft<4>(rawC), next);
1822 packed9 =
OrAnd(packed9, ShiftLeft<4>(rawD), next);
1823 packedA =
OrAnd(packedA, ShiftLeft<4>(rawE), next);
1826 next = ShiftLeft<3>(next);
1827 packed8 =
OrAnd(packed8, ShiftLeft<7>(rawF), next);
1828 packed9 =
OrAnd(packed9, ShiftLeft<6>(rawF), next);
1829 packedA =
OrAnd(packedA, ShiftLeft<5>(rawF), next);
1831 StoreU(packed8,
d, packed_out + 8 *
N);
1832 StoreU(packed9,
d, packed_out + 9 *
N);
1833 StoreU(packedA,
d, packed_out + 0xA *
N);
1839 using VU16 =
Vec<
decltype(
d)>;
1854 const VU16 mask =
Set(
d, 0xFFu);
1856 const VU16 down0 =
And(packed0, mask);
1857 const VU16 down1 = ShiftRight<8>(packed0);
1858 const VU16 down2 =
And(packed1, mask);
1859 const VU16 down3 = ShiftRight<8>(packed1);
1860 const VU16 down4 =
And(packed2, mask);
1861 const VU16 down5 = ShiftRight<8>(packed2);
1862 const VU16 down6 =
And(packed3, mask);
1863 const VU16 down7 = ShiftRight<8>(packed3);
1864 const VU16 down8 =
And(packed4, mask);
1865 const VU16 down9 = ShiftRight<8>(packed4);
1866 const VU16 downA =
And(packed5, mask);
1867 const VU16 downB = ShiftRight<8>(packed5);
1868 const VU16 downC =
And(packed6, mask);
1869 const VU16 downD = ShiftRight<8>(packed6);
1870 const VU16 downE =
And(packed7, mask);
1871 const VU16 downF = ShiftRight<8>(packed7);
1874 const VU16 hi3 =
Set(
d, 0x700u);
1875 const VU16 raw0 =
OrAnd(down0, ShiftLeft<8>(packed8), hi3);
1876 const VU16 raw1 =
OrAnd(down1, ShiftLeft<8>(packed9), hi3);
1877 const VU16 raw2 =
OrAnd(down2, ShiftLeft<8>(packedA), hi3);
1879 const VU16 raw3 =
OrAnd(down3, ShiftLeft<5>(packed8), hi3);
1880 const VU16 raw4 =
OrAnd(down4, ShiftLeft<5>(packed9), hi3);
1881 const VU16 raw5 =
OrAnd(down5, ShiftLeft<5>(packedA), hi3);
1883 const VU16 raw6 =
OrAnd(down6, ShiftLeft<2>(packed8), hi3);
1884 const VU16 raw7 =
OrAnd(down7, ShiftLeft<2>(packed9), hi3);
1885 const VU16 raw8 =
OrAnd(down8, ShiftLeft<2>(packedA), hi3);
1887 const VU16 raw9 =
OrAnd(down9, ShiftRight<1>(packed8), hi3);
1888 const VU16 rawA =
OrAnd(downA, ShiftRight<1>(packed9), hi3);
1889 const VU16 rawB =
OrAnd(downB, ShiftRight<1>(packedA), hi3);
1891 const VU16 rawC =
OrAnd(downC, ShiftRight<4>(packed8), hi3);
1892 const VU16 rawD =
OrAnd(downD, ShiftRight<4>(packed9), hi3);
1893 const VU16 rawE =
OrAnd(downE, ShiftRight<4>(packedA), hi3);
1896 const VU16 rawF =
Or(downF,
Xor3(
And(ShiftRight<7>(packed8), hi3),
1897 And(ShiftRight<6>(packed9), hi3),
1898 And(ShiftRight<5>(packedA), hi3)));
1924 using VU16 =
Vec<
decltype(
d)>;
1926 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
1927 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
1928 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
1929 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
1930 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
1931 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
1932 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
1933 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
1934 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
1935 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
1936 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
1937 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
1938 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
1939 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
1940 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
1941 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
1945 const VU16 packed0 =
Or(ShiftLeft<12>(raw8), raw0);
1946 const VU16 packed1 =
Or(ShiftLeft<12>(raw9), raw1);
1947 const VU16 packed2 =
Or(ShiftLeft<12>(rawA), raw2);
1948 const VU16 packed3 =
Or(ShiftLeft<12>(rawB), raw3);
1949 const VU16 packed4 =
Or(ShiftLeft<12>(rawC), raw4);
1950 const VU16 packed5 =
Or(ShiftLeft<12>(rawD), raw5);
1951 const VU16 packed6 =
Or(ShiftLeft<12>(rawE), raw6);
1952 const VU16 packed7 =
Or(ShiftLeft<12>(rawF), raw7);
1955 const VU16 hi8 =
Set(
d, 0xFF00u);
1956 const VU16 packed8 =
OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
1957 const VU16 packed9 =
OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
1958 const VU16 packedA =
OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
1959 const VU16 packedB =
OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
1960 StoreU(packed0,
d, packed_out + 0 *
N);
1961 StoreU(packed1,
d, packed_out + 1 *
N);
1962 StoreU(packed2,
d, packed_out + 2 *
N);
1963 StoreU(packed3,
d, packed_out + 3 *
N);
1964 StoreU(packed4,
d, packed_out + 4 *
N);
1965 StoreU(packed5,
d, packed_out + 5 *
N);
1966 StoreU(packed6,
d, packed_out + 6 *
N);
1967 StoreU(packed7,
d, packed_out + 7 *
N);
1968 StoreU(packed8,
d, packed_out + 8 *
N);
1969 StoreU(packed9,
d, packed_out + 9 *
N);
1970 StoreU(packedA,
d, packed_out + 0xA *
N);
1971 StoreU(packedB,
d, packed_out + 0xB *
N);
1977 using VU16 =
Vec<
decltype(
d)>;
1993 const VU16 mask =
Set(
d, 0xFFFu);
1995 const VU16 raw0 =
And(packed0, mask);
1998 const VU16 raw1 =
And(packed1, mask);
2001 const VU16 raw2 =
And(packed2, mask);
2004 const VU16 raw3 =
And(packed3, mask);
2007 const VU16 raw4 =
And(packed4, mask);
2010 const VU16 raw5 =
And(packed5, mask);
2013 const VU16 raw6 =
And(packed6, mask);
2016 const VU16 raw7 =
And(packed7, mask);
2019 const VU16 mid8 =
Set(
d, 0xFF0u);
2021 OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
2023 OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
2025 OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
2027 OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
2029 OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
2031 OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
2033 OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
2035 OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
2052 using VU16 =
Vec<
decltype(
d)>;
2054 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
2055 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
2056 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
2057 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
2058 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
2059 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
2060 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
2061 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
2062 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
2063 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
2064 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
2065 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
2066 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
2067 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
2068 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
2069 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
2073 const VU16 lo8 =
Set(
d, 0xFFu);
2076 const VU16 packed0 =
OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
2077 const VU16 packed1 =
OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
2078 const VU16 packed2 =
OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
2079 const VU16 packed3 =
OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
2080 const VU16 packed4 =
OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
2081 const VU16 packed5 =
OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
2082 const VU16 packed6 =
OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
2083 const VU16 packed7 =
OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
2085 StoreU(packed0,
d, packed_out + 0 *
N);
2086 StoreU(packed1,
d, packed_out + 1 *
N);
2087 StoreU(packed2,
d, packed_out + 2 *
N);
2088 StoreU(packed3,
d, packed_out + 3 *
N);
2089 StoreU(packed4,
d, packed_out + 4 *
N);
2090 StoreU(packed5,
d, packed_out + 5 *
N);
2091 StoreU(packed6,
d, packed_out + 6 *
N);
2092 StoreU(packed7,
d, packed_out + 7 *
N);
// Pack the upper 5 bits (bits 8..12) of the sixteen 13-bit raw vectors into
// packed8..packedC. Resulting layout of each 16-bit lane:
//   bits 0..4   : top bits of raw0..raw4
//   bits 5..9   : top bits of raw5..raw9
//   bits 10..14 : top bits of rawA..rawE
//   bit 15      : one of rawF's five top bits (bit 8 in packed8 .. bit 12 in
//                 packedC)
2095 const VU16 top0 = ShiftRight<8>(raw0);
2096 const VU16 top1 = ShiftRight<8>(raw1);
2097 const VU16 top2 = ShiftRight<8>(raw2);
2098 const VU16 top3 = ShiftRight<8>(raw3);
2099 const VU16 top4 = ShiftRight<8>(raw4);
// `next` is a moving mask selecting the destination bit group, so the raw
// inputs need no separate masking before OrAnd.
2103 VU16 next =
Set(
d, 0x3E0u);
2104 VU16 packed8 =
OrAnd(top0, ShiftRight<3>(raw5), next);
2105 VU16 packed9 =
OrAnd(top1, ShiftRight<3>(raw6), next);
2106 VU16 packedA =
OrAnd(top2, ShiftRight<3>(raw7), next);
2107 VU16 packedB =
OrAnd(top3, ShiftRight<3>(raw8), next);
2108 VU16 packedC =
OrAnd(top4, ShiftRight<3>(raw9), next);
// 0x3E0 << 5 = 0x7C00: bits 10..14.
2109 next = ShiftLeft<5>(next);
2110 packed8 =
OrAnd(packed8, ShiftLeft<2>(rawA), next);
2111 packed9 =
OrAnd(packed9, ShiftLeft<2>(rawB), next);
2112 packedA =
OrAnd(packedA, ShiftLeft<2>(rawC), next);
2113 packedB =
OrAnd(packedB, ShiftLeft<2>(rawD), next);
2114 packedC =
OrAnd(packedC, ShiftLeft<2>(rawE), next);
// Only bit 15 is still free, so the mask must become 0x8000. Shifting 0x7C00
// by 5 drops everything but its lowest set bit in a 16-bit lane (= 0x8000).
// A shift by 3 would give 0xE000 and OR rawF bits into bits 13..14, which
// already hold the top bits of rawA..rawE.
2117 next = ShiftLeft<5>(next);
2118 packed8 =
OrAnd(packed8, ShiftLeft<7>(rawF), next);
2119 packed9 =
OrAnd(packed9, ShiftLeft<6>(rawF), next);
2120 packedA =
OrAnd(packedA, ShiftLeft<5>(rawF), next);
2121 packedB =
OrAnd(packedB, ShiftLeft<4>(rawF), next);
2122 packedC =
OrAnd(packedC, ShiftLeft<3>(rawF), next);
2124 StoreU(packed8,
d, packed_out + 8 *
N);
2125 StoreU(packed9,
d, packed_out + 9 *
N);
2126 StoreU(packedA,
d, packed_out + 0xA *
N);
2127 StoreU(packedB,
d, packed_out + 0xB *
N);
2128 StoreU(packedC,
d, packed_out + 0xC *
N);
2134 using VU16 =
Vec<
decltype(
d)>;
2151 const VU16 mask =
Set(
d, 0xFFu);
2153 const VU16 down0 =
And(packed0, mask);
2154 const VU16 down1 = ShiftRight<8>(packed0);
2155 const VU16 down2 =
And(packed1, mask);
2156 const VU16 down3 = ShiftRight<8>(packed1);
2157 const VU16 down4 =
And(packed2, mask);
2158 const VU16 down5 = ShiftRight<8>(packed2);
2159 const VU16 down6 =
And(packed3, mask);
2160 const VU16 down7 = ShiftRight<8>(packed3);
2161 const VU16 down8 =
And(packed4, mask);
2162 const VU16 down9 = ShiftRight<8>(packed4);
2163 const VU16 downA =
And(packed5, mask);
2164 const VU16 downB = ShiftRight<8>(packed5);
2165 const VU16 downC =
And(packed6, mask);
2166 const VU16 downD = ShiftRight<8>(packed6);
2167 const VU16 downE =
And(packed7, mask);
2168 const VU16 downF = ShiftRight<8>(packed7);
2171 const VU16 hi5 =
Set(
d, 0x1F00u);
2172 const VU16 raw0 =
OrAnd(down0, ShiftLeft<8>(packed8), hi5);
2173 const VU16 raw1 =
OrAnd(down1, ShiftLeft<8>(packed9), hi5);
2174 const VU16 raw2 =
OrAnd(down2, ShiftLeft<8>(packedA), hi5);
2175 const VU16 raw3 =
OrAnd(down3, ShiftLeft<8>(packedB), hi5);
2176 const VU16 raw4 =
OrAnd(down4, ShiftLeft<8>(packedC), hi5);
2178 const VU16 raw5 =
OrAnd(down5, ShiftLeft<3>(packed8), hi5);
2179 const VU16 raw6 =
OrAnd(down6, ShiftLeft<3>(packed9), hi5);
2180 const VU16 raw7 =
OrAnd(down7, ShiftLeft<3>(packedA), hi5);
2181 const VU16 raw8 =
OrAnd(down8, ShiftLeft<3>(packed9), hi5);
2182 const VU16 raw9 =
OrAnd(down9, ShiftLeft<3>(packedA), hi5);
2184 const VU16 rawA =
OrAnd(downA, ShiftRight<2>(packed8), hi5);
2185 const VU16 rawB =
OrAnd(downB, ShiftRight<2>(packed9), hi5);
2186 const VU16 rawC =
OrAnd(downC, ShiftRight<2>(packedA), hi5);
2187 const VU16 rawD =
OrAnd(downD, ShiftRight<2>(packed9), hi5);
2188 const VU16 rawE =
OrAnd(downE, ShiftRight<2>(packedA), hi5);
2191 const VU16 p0 =
Xor3(
And(ShiftRight<7>(packed8), hi5),
2192 And(ShiftRight<6>(packed9), hi5),
2193 And(ShiftRight<5>(packedA), hi5));
2194 const VU16 p1 =
Xor3(
And(ShiftRight<4>(packedB), hi5),
2195 And(ShiftRight<3>(packedC), hi5), downF);
2196 const VU16 rawF =
Or(p0, p1);
2222 using VU16 =
Vec<
decltype(
d)>;
2224 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
2225 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
2226 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
2227 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
2228 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
2229 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
2230 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
2231 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
2232 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
2233 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
2234 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
2235 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
2236 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
2237 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
2238 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
2239 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
2243 const VU16 hi2 =
Set(
d, 0xC000u);
2244 const VU16 packed0 =
Or(raw0, ShiftLeft<14>(rawE));
2245 const VU16 packed1 =
OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
2246 const VU16 packed2 =
OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
2247 const VU16 packed3 =
OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
2248 const VU16 packed4 =
OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
2249 const VU16 packed5 =
OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
2250 const VU16 packed6 =
OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
2251 const VU16 packed7 =
Or(raw7, ShiftLeft<14>(rawF));
2252 const VU16 packed8 =
OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
2253 const VU16 packed9 =
OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
2254 const VU16 packedA =
OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
2255 const VU16 packedB =
OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
2256 const VU16 packedC =
OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
2257 const VU16 packedD =
OrAnd(rawD, ShiftLeft<2>(rawF), hi2);
2259 StoreU(packed0,
d, packed_out + 0 *
N);
2260 StoreU(packed1,
d, packed_out + 1 *
N);
2261 StoreU(packed2,
d, packed_out + 2 *
N);
2262 StoreU(packed3,
d, packed_out + 3 *
N);
2263 StoreU(packed4,
d, packed_out + 4 *
N);
2264 StoreU(packed5,
d, packed_out + 5 *
N);
2265 StoreU(packed6,
d, packed_out + 6 *
N);
2266 StoreU(packed7,
d, packed_out + 7 *
N);
2267 StoreU(packed8,
d, packed_out + 8 *
N);
2268 StoreU(packed9,
d, packed_out + 9 *
N);
2269 StoreU(packedA,
d, packed_out + 0xA *
N);
2270 StoreU(packedB,
d, packed_out + 0xB *
N);
2271 StoreU(packedC,
d, packed_out + 0xC *
N);
2272 StoreU(packedD,
d, packed_out + 0xD *
N);
2278 using VU16 =
Vec<
decltype(
d)>;
2296 const VU16 mask =
Set(
d, 0x3FFFu);
2298 const VU16 raw0 =
And(packed0, mask);
2301 const VU16 raw1 =
And(packed1, mask);
2304 const VU16 raw2 =
And(packed2, mask);
2307 const VU16 raw3 =
And(packed3, mask);
2310 const VU16 raw4 =
And(packed4, mask);
2313 const VU16 raw5 =
And(packed5, mask);
2316 const VU16 raw6 =
And(packed6, mask);
2319 const VU16 raw7 =
And(packed7, mask);
2322 const VU16 raw8 =
And(packed8, mask);
2325 const VU16 raw9 =
And(packed9, mask);
2328 const VU16 rawA =
And(packedA, mask);
2331 const VU16 rawB =
And(packedB, mask);
2334 const VU16 rawC =
And(packedC, mask);
2337 const VU16 rawD =
And(packedD, mask);
2341 const VU16 E0 =
Xor3(ShiftRight<14>(packed0),
2342 ShiftRight<12>(
AndNot(mask, packed1)),
2343 ShiftRight<10>(
AndNot(mask, packed2)));
2344 const VU16 E1 =
Xor3(ShiftRight<8>(
AndNot(mask, packed3)),
2345 ShiftRight<6>(
AndNot(mask, packed4)),
2346 ShiftRight<4>(
AndNot(mask, packed5)));
2347 const VU16 rawE =
Xor3(ShiftRight<2>(
AndNot(mask, packed6)), E0, E1);
2348 const VU16 F0 =
Xor3(ShiftRight<14>(
AndNot(mask, packed7)),
2349 ShiftRight<12>(
AndNot(mask, packed8)),
2350 ShiftRight<10>(
AndNot(mask, packed9)));
2351 const VU16 F1 =
Xor3(ShiftRight<8>(
AndNot(mask, packedA)),
2352 ShiftRight<6>(
AndNot(mask, packedB)),
2353 ShiftRight<4>(
AndNot(mask, packedC)));
2354 const VU16 rawF =
Xor3(ShiftRight<2>(
AndNot(mask, packedD)), F0, F1);
2365 using VU16 =
Vec<
decltype(
d)>;
2367 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
2368 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
2369 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
2370 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
2371 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
2372 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
2373 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
2374 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
2375 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
2376 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
2377 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
2378 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
2379 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
2380 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
2381 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
2382 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
2386 const VU16 hi1 =
Set(
d, 0x8000u);
2387 const VU16 packed0 =
Or(raw0, ShiftLeft<15>(rawF));
2388 const VU16 packed1 =
OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
2389 const VU16 packed2 =
OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
2390 const VU16 packed3 =
OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
2391 const VU16 packed4 =
OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
2392 const VU16 packed5 =
OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
2393 const VU16 packed6 =
OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
2394 const VU16 packed7 =
OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
2395 const VU16 packed8 =
OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
2396 const VU16 packed9 =
OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
2397 const VU16 packedA =
OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
2398 const VU16 packedB =
OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
2399 const VU16 packedC =
OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
2400 const VU16 packedD =
OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
2401 const VU16 packedE =
OrAnd(rawE, ShiftLeft<1>(rawF), hi1);
2403 StoreU(packed0,
d, packed_out + 0 *
N);
2404 StoreU(packed1,
d, packed_out + 1 *
N);
2405 StoreU(packed2,
d, packed_out + 2 *
N);
2406 StoreU(packed3,
d, packed_out + 3 *
N);
2407 StoreU(packed4,
d, packed_out + 4 *
N);
2408 StoreU(packed5,
d, packed_out + 5 *
N);
2409 StoreU(packed6,
d, packed_out + 6 *
N);
2410 StoreU(packed7,
d, packed_out + 7 *
N);
2411 StoreU(packed8,
d, packed_out + 8 *
N);
2412 StoreU(packed9,
d, packed_out + 9 *
N);
2413 StoreU(packedA,
d, packed_out + 0xA *
N);
2414 StoreU(packedB,
d, packed_out + 0xB *
N);
2415 StoreU(packedC,
d, packed_out + 0xC *
N);
2416 StoreU(packedD,
d, packed_out + 0xD *
N);
2417 StoreU(packedE,
d, packed_out + 0xE *
N);
2423 using VU16 =
Vec<
decltype(
d)>;
2442 const VU16 mask =
Set(
d, 0x7FFFu);
2444 const VU16 raw0 =
And(packed0, mask);
2447 const VU16 raw1 =
And(packed1, mask);
2450 const VU16 raw2 =
And(packed2, mask);
2453 const VU16 raw3 =
And(packed3, mask);
2456 const VU16 raw4 =
And(packed4, mask);
2459 const VU16 raw5 =
And(packed5, mask);
2462 const VU16 raw6 =
And(packed6, mask);
2465 const VU16 raw7 =
And(packed7, mask);
2468 const VU16 raw8 =
And(packed8, mask);
2471 const VU16 raw9 =
And(packed9, mask);
2474 const VU16 rawA =
And(packedA, mask);
2477 const VU16 rawB =
And(packedB, mask);
2480 const VU16 rawC =
And(packedC, mask);
2483 const VU16 rawD =
And(packedD, mask);
2486 const VU16 rawE =
And(packedE, mask);
2490 const VU16 F0 =
Xor3(ShiftRight<15>(packed0),
2491 ShiftRight<14>(
AndNot(mask, packed1)),
2492 ShiftRight<13>(
AndNot(mask, packed2)));
2493 const VU16 F1 =
Xor3(ShiftRight<12>(
AndNot(mask, packed3)),
2494 ShiftRight<11>(
AndNot(mask, packed4)),
2495 ShiftRight<10>(
AndNot(mask, packed5)));
2496 const VU16 F2 =
Xor3(ShiftRight<9>(
AndNot(mask, packed6)),
2497 ShiftRight<8>(
AndNot(mask, packed7)),
2498 ShiftRight<7>(
AndNot(mask, packed8)));
2499 const VU16 F3 =
Xor3(ShiftRight<6>(
AndNot(mask, packed9)),
2500 ShiftRight<5>(
AndNot(mask, packedA)),
2501 ShiftRight<4>(
AndNot(mask, packedB)));
2502 const VU16 F4 =
Xor3(ShiftRight<3>(
AndNot(mask, packedC)),
2503 ShiftRight<2>(
AndNot(mask, packedD)),
2504 ShiftRight<1>(
AndNot(mask, packedE)));
2505 const VU16 rawF =
Xor3(F0, F1,
Xor3(F2, F3, F4));
2515 using VU16 =
Vec<
decltype(
d)>;
2517 const VU16 raw0 =
LoadU(
d, raw + 0 *
N);
2518 const VU16 raw1 =
LoadU(
d, raw + 1 *
N);
2519 const VU16 raw2 =
LoadU(
d, raw + 2 *
N);
2520 const VU16 raw3 =
LoadU(
d, raw + 3 *
N);
2521 const VU16 raw4 =
LoadU(
d, raw + 4 *
N);
2522 const VU16 raw5 =
LoadU(
d, raw + 5 *
N);
2523 const VU16 raw6 =
LoadU(
d, raw + 6 *
N);
2524 const VU16 raw7 =
LoadU(
d, raw + 7 *
N);
2525 const VU16 raw8 =
LoadU(
d, raw + 8 *
N);
2526 const VU16 raw9 =
LoadU(
d, raw + 9 *
N);
2527 const VU16 rawA =
LoadU(
d, raw + 0xA *
N);
2528 const VU16 rawB =
LoadU(
d, raw + 0xB *
N);
2529 const VU16 rawC =
LoadU(
d, raw + 0xC *
N);
2530 const VU16 rawD =
LoadU(
d, raw + 0xD *
N);
2531 const VU16 rawE =
LoadU(
d, raw + 0xE *
N);
2532 const VU16 rawF =
LoadU(
d, raw + 0xF *
N);
2534 StoreU(raw0,
d, packed_out + 0 *
N);
2535 StoreU(raw1,
d, packed_out + 1 *
N);
2536 StoreU(raw2,
d, packed_out + 2 *
N);
2537 StoreU(raw3,
d, packed_out + 3 *
N);
2538 StoreU(raw4,
d, packed_out + 4 *
N);
2539 StoreU(raw5,
d, packed_out + 5 *
N);
2540 StoreU(raw6,
d, packed_out + 6 *
N);
2541 StoreU(raw7,
d, packed_out + 7 *
N);
2542 StoreU(raw8,
d, packed_out + 8 *
N);
2543 StoreU(raw9,
d, packed_out + 9 *
N);
2544 StoreU(rawA,
d, packed_out + 0xA *
N);
2545 StoreU(rawB,
d, packed_out + 0xB *
N);
2546 StoreU(rawC,
d, packed_out + 0xC *
N);
2547 StoreU(rawD,
d, packed_out + 0xD *
N);
2548 StoreU(rawE,
d, packed_out + 0xE *
N);
2549 StoreU(rawF,
d, packed_out + 0xF *
N);
2555 using VU16 =
Vec<
decltype(
d)>;
#define HWY_RESTRICT
Definition base.h:64
#define HWY_INLINE
Definition base.h:70
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
N
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:40
Definition aligned_allocator.h:27
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1682
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1622
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1756
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1837
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1975
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1922
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2050
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2132
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2276
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2220
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2363
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2421
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2553
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2513
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:647
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:611
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:708
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:744
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:847
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:806
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:951
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:914
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1059
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1015
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1179
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1132
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1303
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1253
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1381
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1424
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1549
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1492
Definition bit_pack-inl.h:39
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:67
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:44
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:129
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:105
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:198
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:168
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:271
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:244
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:312
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:347
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:428
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:396
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:475
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:509
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:584
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:560
Definition bit_pack-inl.h:37