POWER Vector Library Manual  1.0.4
vec_int32_ppc.h
1 /*
2  Copyright (c) [2018] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_int32_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Mar 29, 2018
21  */
22 
23 #ifndef VEC_INT32_PPC_H_
24 #define VEC_INT32_PPC_H_
25 
26 #include <pveclib/vec_int16_ppc.h>
27 
394 #ifdef _ARCH_PWR8
396 /*
397  * Vector Shift Left Doubleword was added to PowerISA 2.07 (PWR8).
398  * Operations vec_vsld and vec_sldi are defined in vec_int64_ppc.h,
399  * but using those here would create a circular dependency.
400  * So we need the equivalent of the altivec.h specific vec_vsld.
401  * Currently GCC defines both vec_vsld and vec_sl for type long long.
402  * But older GCC versions may not and are more likely to support only
403  * vec_vsld. Clang seems to only support the generic vec_sl for the
404  * long long type and does not define the macro vec_vsld.
405  *
406  * The following allows vec_int32_ppc.h to use __pvec_vsld as a
407  * workaround for clang and possible future versions of GCC that drop
408  * support for the altivec specific built-ins.
409  */
410 #ifdef vec_vsld
411 #define __pvec_vsld vec_vsld
412 #else
413 #define __pvec_vsld vec_sl
414 #endif
415 #endif
416 
417 static inline vui64_t vec_muleuw (vui32_t a, vui32_t b);
418 static inline vui64_t vec_mulouw (vui32_t a, vui32_t b);
419 #ifndef vec_popcntw
420 static inline vui32_t vec_popcntw (vui32_t vra);
421 #else
422 /* Work around for GCC PR85830. */
423 #undef vec_popcntw
424 #define vec_popcntw __builtin_vec_vpopcntw
425 #endif
426 static inline vi32_t vec_srawi (vi32_t vra, const unsigned int shb);
427 static inline vui64_t
428 vec_vlxsiwzx (const signed long long ra, const unsigned int *rb);
429 static inline vi64_t
430 vec_vlxsiwax (const signed long long ra, const signed int *rb);
431 static inline vui64_t vec_vmuleuw (vui32_t a, vui32_t b);
432 static inline vui64_t vec_vmulouw (vui32_t a, vui32_t b);
433 static inline void
434 vec_vsstwso (vui64_t xs, unsigned int *array,
435  const long long offset0, const long long offset1);
436 static inline void
437 vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb);
439 
455 static inline vui32_t
456 vec_absduw (vui32_t vra, vui32_t vrb)
457 {
458  vui32_t result;
459 #ifdef _ARCH_PWR9
460 #ifdef vec_absdw
461  result = vec_absdw (vra, vrb);
462 #else
463  __asm__(
464  "vabsduw %0,%1,%2;"
465  : "=v" (result)
466  : "v" (vra), "v" (vrb)
467  : );
468 #endif
469 #else
470  vui32_t vmin, vmax;
471 
472  vmin = vec_min (vra, vrb);
473  vmax = vec_max (vra, vrb);
474  result = vec_sub (vmax, vmin);
475 #endif
476  return (result);
477 }
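A minimal usage sketch (not part of the header source above); the element values are illustrative:

#include <pveclib/vec_int32_ppc.h>

vui32_t
example_absduw (void)
{
  // Element-wise |va - vb| for unsigned words: yields { 2, 100, 0, 1000 }.
  vui32_t va = { 10, 200, 30, 4000 };
  vui32_t vb = { 12, 100, 30, 5000 };
  return vec_absduw (va, vb);
}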
478 
502 static inline vui32_t
503 vec_clzw (vui32_t vra)
504 {
505  vui32_t r;
506 #ifdef _ARCH_PWR8
507 #if defined (vec_vclzw)
508  r = vec_vclzw (vra);
509 #elif defined (__clang__)
510  r = vec_cntlz (vra);
511 #else
512  __asm__(
513  "vclzw %0,%1;"
514  : "=v" (r)
515  : "v" (vra)
516  : );
517 #endif
518 #else
519 //#warning Implementation pre POWER8
520  vui32_t n, nt, y, x, s, m;
521  vui32_t z= {0,0,0,0};
522  vui32_t one = {1,1,1,1};
523 
524  /* n = 32 s = 16 */
525  s = vec_splat_u32(8);
526  s = vec_add (s, s);
527  n = vec_add (s, s);
528 
529  x = vra;
530  /* y=x>>16 if (y!=0) (n=n-16 x=y) */
531  y = vec_sr(x, s);
532  nt = vec_sub(n,s);
533  m = (vui32_t)vec_cmpgt(y, z);
534  s = vec_sr(s,one);
535  x = vec_sel (x, y, m);
536  n = vec_sel (n, nt, m);
537 
538  /* y=x>>8 if (y!=0) (n=n-8 x=y) */
539  y = vec_sr(x, s);
540  nt = vec_sub(n,s);
541  m = (vui32_t)vec_cmpgt(y, z);
542  s = vec_sr(s,one);
543  x = vec_sel (x, y, m);
544  n = vec_sel (n, nt, m);
545 
546  /* y=x>>4 if (y!=0) (n=n-4 x=y) */
547  y = vec_sr(x, s);
548  nt = vec_sub(n,s);
549  m = (vui32_t)vec_cmpgt(y, z);
550  s = vec_sr(s,one);
551  x = vec_sel (x, y, m);
552  n = vec_sel (n, nt, m);
553 
554  /* y=x>>2 if (y!=0) (n=n-2 x=y) */
555  y = vec_sr(x, s);
556  nt = vec_sub(n,s);
557  m = (vui32_t)vec_cmpgt(y, z);
558  s = vec_sr(s,one);
559  x = vec_sel (x, y, m);
560  n = vec_sel (n, nt, m);
561 
562  /* y=x>>1 if (y!=0) return (n=n-2) */
563  y = vec_sr(x, s);
564  nt = vec_sub(n,s);
565  nt = vec_sub(nt,s);
566  m = (vui32_t)vec_cmpgt(y, z);
567  n = vec_sel (n, nt, m);
568 
569  /* else return (n-x) */
570  nt = vec_sub (n, x);
571  n = vec_sel (nt, n, m);
572  r = n;
573 #endif
574  return ((vui32_t) r);
575 }
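The pre-POWER8 path above is a branchless binary search for the leading-zero count. A scalar sketch of the same steps (illustrative, not from the header) may make the shift/select sequence easier to follow:

static inline unsigned int
example_clz32 (unsigned int x)
{
  unsigned int n = 32, s, y;
  // If the upper s bits are non-zero, keep them and credit s positions.
  for (s = 16; s != 0; s >>= 1)
    {
      y = x >> s;
      if (y != 0)
        {
          n = n - s;
          x = y;
        }
    }
  // x is now 0 or 1, so n - x is the leading-zero count (32 for x == 0).
  return n - x;
}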
576 
600 static inline vui32_t
601 vec_ctzw (vui32_t vra)
602 {
603  vui32_t r;
604 #ifdef _ARCH_PWR9
605 #if defined (vec_cnttz) || defined (__clang__)
606  r = vec_cnttz (vra);
607 #else
608  __asm__(
609  "vctzw %0,%1;"
610  : "=v" (r)
611  : "v" (vra)
612  : );
613 #endif
614 #else
615 // For _ARCH_PWR8 and earlier. Generate 1's for the trailing zeros
616 // and 0's otherwise. Then count (popcnt) the 1's. _ARCH_PWR8 uses
617 // the hardware vpopcntw instruction. _ARCH_PWR7 and earlier use the
618 // PVECLIB vec_popcntw implementation which runs ~20-28 instructions.
619  const vui32_t ones = { 1, 1, 1, 1 };
620  vui32_t tzmask;
621  // tzmask = (~vra & (vra - 1))
622  tzmask = vec_andc (vec_sub (vra, ones), vra);
623  // return = vec_popcntw (~vra & (vra - 1))
624  r = vec_popcntw (tzmask);
625 #endif
626  return ((vui32_t) r);
627 }
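For example, with vra = 0x00000018 the fallback computes vra - 1 = 0x00000017 and tzmask = ~vra & (vra - 1) = 0x00000007, whose population count is the 3 trailing zeros; for vra = 0 the mask is all ones and the result is 32.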
628 
652 static inline vui32_t
653 vec_mrgahw (vui64_t vra, vui64_t vrb)
654 {
655  vui32_t res;
656 #ifdef _ARCH_PWR8
657 #ifdef vec_vmrgew
658 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
659  res = vec_vmrgow ((vui32_t)vrb, (vui32_t)vra);
660 #else
661  res = vec_vmrgew ((vui32_t)vra, (vui32_t)vrb);
662 #endif
663 #else
664  __asm__(
665  "vmrgew %0,%1,%2;\n"
666  : "=v" (res)
667  : "v" (vra),
668  "v" (vrb)
669  : );
670 #endif
671 #else
672  const vui32_t vconstp =
673  CONST_VINT32_W(0x00010203, 0x10111213, 0x08090a0b, 0x18191a1b);
674  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
675 #endif
676  return (res);
677 }
678 
702 static inline vui32_t
703 vec_mrgalw (vui64_t vra, vui64_t vrb)
704 {
705  vui32_t res;
706 #ifdef _ARCH_PWR8
707 #ifdef vec_vmrgow
708 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
709  res = vec_vmrgew ((vui32_t)vrb, (vui32_t)vra);
710 #else
711  res = vec_vmrgow ((vui32_t)vra, (vui32_t)vrb);
712 #endif
713 #else
714  __asm__(
715  "vmrgow %0,%1,%2;\n"
716  : "=v" (res)
717  : "v" (vra),
718  "v" (vrb)
719  : );
720 #endif
721 #else
722  const vui32_t vconstp =
723  CONST_VINT32_W(0x04050607, 0x14151617, 0x0c0d0e0f, 0x1c1d1e1f);
724  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
725 #endif
726  return (res);
727 }
728 
752 static inline vui32_t
753 vec_mrgew (vui32_t vra, vui32_t vrb)
754 {
755  vui32_t res;
756 #ifdef _ARCH_PWR8
757 #ifdef vec_vmrgew
758  res = vec_vmrgew (vra, vrb);
759 #else
760  __asm__(
761 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
762  "vmrgow %0,%2,%1;\n"
763 #else
764  "vmrgew %0,%1,%2;\n"
765 #endif
766  : "=v" (res)
767  : "v" (vra),
768  "v" (vrb)
769  : );
770 #endif
771 #else
772  const vui32_t vconstp =
773  CONST_VINT32_W(0x00010203, 0x10111213, 0x08090a0b, 0x18191a1b);
774  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
775 #endif
776  return (res);
777 }
778 
802 static inline vui32_t
803 vec_mrgow (vui32_t vra, vui32_t vrb)
804 {
805  vui32_t res;
806 #ifdef _ARCH_PWR8
807 #ifdef vec_vmrgew
808  res = vec_vmrgow (vra, vrb);
809 #else
810  __asm__(
811 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
812  "vmrgew %0,%2,%1;\n"
813 #else
814  "vmrgow %0,%1,%2;\n"
815 #endif
816  : "=v" (res)
817  : "v" (vra),
818  "v" (vrb)
819  : );
820 #endif
821 #else
822  const vui32_t vconstp =
823  CONST_VINT32_W(0x04050607, 0x14151617, 0x0c0d0e0f, 0x1c1d1e1f);
824  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
825 #endif
826  return (res);
827 }
828 
853 static inline vi64_t
854 vec_mulesw (vi32_t a, vi32_t b)
855 {
856  vi64_t res;
857 #ifdef _ARCH_PWR8
858  // The vector vmulosw/vmulesw instructions were introduced in PWR8
859 #if defined __GNUC__ && (__GNUC__ > 7)
860  res = vec_mule (a, b);
861 #else
862  __asm__(
863 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
864  "vmulosw %0,%1,%2;\n"
865 #else
866  "vmulesw %0,%1,%2;\n"
867 #endif
868  : "=v" (res)
869  : "v" (a),
870  "v" (b)
871  : );
872 #endif
873 #else
874  // must be PWR7 or older
875  vui32_t uia, uib;
876  vui32_t amask, bmask, t1, t2, r;
877  vui64_t ui_prod;
878  const vui32_t zero= { 0,0,0,0};
879 
880  uia = (vui32_t) a;
881  uib = (vui32_t) b;
882  // Generate 32-bit masks from the sign of each input word.
883  amask = (vui32_t) vec_srawi (a, 31);
884  bmask = (vui32_t) vec_srawi (b, 31);
885  // Extend the even masks to the right with zeros to form two 64-bit
886  // masks. We need the trailing zeros as the low 32-bits of the
887  // product are correct as-is and should not change.
888  amask = vec_mrgew (amask, zero);
889  bmask = vec_mrgew (bmask, zero);
890  // Compute the doubleword even unsigned word product
891  ui_prod = vec_muleuw (uia, uib);
892 
893  // Generate t1 = amask & b and t2 = bmask & a
894  t1 = vec_and (amask, uib);
895  t2 = vec_and (bmask, uia);
896  // Apply the correction res = ui_prod - t1 - t2
897  r = vec_sub ((vui32_t) ui_prod, t1);
898  res = (vi64_t) vec_sub (r, t2);
899 #endif
900  return (res);
901 }
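The sign correction in the pre-POWER8 path follows from writing each signed word as its unsigned value minus 2^32 times its sign bit: (ua - 2^32*sa) * (ub - 2^32*sb) = ua*ub - 2^32*(sa*ub + sb*ua) + 2^64*sa*sb, and the last term vanishes modulo 2^64. The masked terms t1 and t2, placed in the even (high) word of each doubleword, are exactly the 2^32-scaled corrections subtracted from the unsigned product. The same identity drives vec_mulosw below.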
902 
927 static inline vi64_t
928 vec_mulosw (vi32_t a, vi32_t b)
929 {
930  vi64_t res;
931 #ifdef _ARCH_PWR8
932  // The vector vmulosw/vmulesw instructions were introduced in PWR8
933 #if defined __GNUC__ && (__GNUC__ > 7) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
934  res = vec_mulo (a, b);
935 #else
936  __asm__(
937 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
938  "vmulesw %0,%1,%2;\n"
939 #else
940  "vmulosw %0,%1,%2;\n"
941 #endif
942  : "=v" (res)
943  : "v" (a),
944  "v" (b)
945  : );
946 #endif
947 #else
948  // must be PWR7 or older
949  vui32_t uia, uib;
950  vui32_t amask, bmask, t1, t2, r;
951  vui64_t ui_prod;
952  const vui32_t zero= { 0,0,0,0};
953 
954  // duplicate odd words to even
955  uia = (vui32_t) a;
956  uib = (vui32_t) b;
957  uia = vec_mrgow (uia, uia);
958  uib = vec_mrgow (uib, uib);
959  // Generate 32-bit masks from the sign of each input word.
960  amask = (vui32_t) vec_srawi ((vi32_t) uia, 31);
961  bmask = (vui32_t) vec_srawi ((vi32_t) uib, 31);
962  // Shift the odd masks to the left 32 and extend to the right with
963  // zeros to form two 64-bit masks. We need the trailing zeros as the
964  // low 32-bits of the product are correct as-is.
965  amask = vec_mrgow (amask, zero);
966  bmask = vec_mrgow (bmask, zero);
967  // Compute the doubleword odd unsigned word product
968  ui_prod = vec_mulouw (uia, uib);
969 
970  // Generate t1 = amask & b and t2 = bmask & a
971  t1 = vec_and (amask, uib);
972  t2 = vec_and (bmask, uia);
973  // Apply the correction res = ui_prod - t1 - t2
974  r = vec_sub ((vui32_t) ui_prod, t1);
975  res = (vi64_t) vec_sub (r, t2);
976 #endif
977  return (res);
978 }
979 
1006 static inline vui64_t
1007 vec_muleuw (vui32_t a, vui32_t b)
1008 {
1009 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1010  return vec_vmulouw (a, b);
1011 #else
1012  return vec_vmuleuw (a, b);
1013 #endif
1014 }
1015 
1042 static inline vui64_t
1043 vec_mulouw (vui32_t a, vui32_t b)
1044 {
1045 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1046  return vec_vmuleuw (a, b);
1047 #else
1048  return vec_vmulouw (a, b);
1049 #endif
1050 }
1051 
1068 static inline vi32_t
1069 vec_mulhsw (vi32_t vra, vi32_t vrb)
1070 {
1071 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1072  return (vi32_t) vec_mrgahw ((vui64_t) vec_mulosw (vra, vrb),
1073  (vui64_t) vec_mulesw (vra, vrb));
1074 #else
1075  return (vi32_t) vec_mrgahw ((vui64_t) vec_mulesw (vra, vrb),
1076  (vui64_t) vec_mulosw (vra, vrb));
1077 #endif
1078 }
1079 
1102 static inline vui32_t
1103 vec_mulhuw (vui32_t vra, vui32_t vrb)
1104 {
1105 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1106  return vec_mrgahw (vec_mulouw (vra, vrb), vec_muleuw (vra, vrb));
1107 #else
1108  return vec_mrgahw (vec_muleuw (vra, vrb), vec_mulouw (vra, vrb));
1109 #endif
1110 }
1111 
1131 static inline vui32_t
1132 vec_muluwm (vui32_t a, vui32_t b)
1133 {
1134 #if defined __GNUC__ && (__GNUC__ > 7)
1135  return vec_mul (a, b);
1136 #else
1137  vui32_t r;
1138 #ifdef _ARCH_PWR8
1139  __asm__(
1140  "vmuluwm %0,%1,%2;\n"
1141  : "=v" (r)
1142  : "v" (a),
1143  "v" (b)
1144  : );
1145 #else
1146  vui32_t s16 = (vui32_t)vec_vspltisw (-16);
1147  vui32_t z = (vui32_t)vec_vspltisw (0);
1148  vui32_t t4;
1149  vui32_t t2, t3;
1150  vui16_t t1;
1151 
1152  t1 = (vui16_t)vec_vrlw (b, s16);
1153  t2 = vec_vmulouh ((vui16_t)a, (vui16_t)b);
1154  t3 = vec_vmsumuhm ((vui16_t)a, t1, z);
1155  t4 = vec_vslw (t3, s16);
1156  r = (vui32_t)vec_vadduwm (t4, t2);
1157 #endif
1158  return (r);
1159 #endif
1160 }
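In the pre-POWER8 path above, the rotate by 16 swaps the halfwords of b so that vec_vmsumuhm produces (ahi*blo + alo*bhi) for each word in a single instruction; shifting that sum left by 16 and adding the low product alo*blo from vec_vmulouh gives a*b modulo 2^32, since the 2^32*ahi*bhi term wraps to zero.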
1161 
1182 #ifndef vec_popcntw
1183 static inline vui32_t
1184 vec_popcntw (vui32_t vra)
1185 {
1186  vui32_t r;
1187 #ifdef _ARCH_PWR8
1188 #if defined (vec_vpopcntw)
1189  r = vec_vpopcntw (vra);
1190 #elif defined (__clang__)
1191  r = vec_popcnt (vra);
1192 #else
1193  __asm__(
1194  "vpopcntw %0,%1;"
1195  : "=v" (r)
1196  : "v" (vra)
1197  : );
1198 #endif
1199 #else
1200 //#warning Implementation pre POWER8
1201  vui32_t z= { 0,0,0,0};
1202  vui8_t x;
1203  x = vec_popcntb ((vui8_t)vra);
1204  r = vec_vsum4ubs (x, z);
1205 #endif
1206  return (r);
1207 }
1208 #else
1209 /* Work around for GCC PR85830. */
1210 #undef vec_popcntw
1211 #define vec_popcntw __builtin_vec_vpopcntw
1212 #endif
1213 
1228 static inline vui32_t
1229 vec_revbw (vui32_t vra)
1230 {
1231  vui32_t result;
1232 
1233 #ifdef _ARCH_PWR9
1234 #if defined (vec_revb) || defined (__clang__)
1235  result = vec_revb (vra);
1236 #else
1237  __asm__(
1238  "xxbrw %x0,%x1;"
1239  : "=wa" (result)
1240  : "wa" (vra)
1241  : );
1242 #endif
1243 #else
1244  const vui64_t vconstp =
1245 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1246  CONST_VINT64_DW(0x0302010007060504UL, 0x0B0A09080F0E0D0CUL);
1247 #else
1248  CONST_VINT64_DW(0x0C0D0E0F08090A0BUL, 0x0405060700010203UL);
1249 #endif
1250  result = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
1251 #endif
1252 
1253  return (result);
1254 }
1255 
1272 static inline vb32_t
1273 vec_setb_sw (vi32_t vra)
1274 {
1275  vb32_t result;
1276 
1277 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
1278  __asm__(
1279  "vexpandwm %0,%1"
1280  : "=v" (result)
1281  : "v" (vra)
1282  : );
1283 #else
1284  // Compare signed word less than zero
1285  const vi32_t zero = {0, 0, 0, 0};
1286  result = vec_cmplt (vra, zero);
1287 #endif
1288  return result;
1289 }
1290 
1308 static inline vui32_t
1309 vec_slwi (vui32_t vra, const unsigned int shb)
1310 {
1311  vui32_t lshift;
1312  vui32_t result;
1313 
1314  if (shb < 32)
1315  {
1316  /* Load the shift const in a vector. The element shifts require
1317  a shift amount for each element. For the immediate form the
1318  shift constant is splatted to all elements of the
1319  shift control. */
1320  if (__builtin_constant_p (shb) && (shb < 16))
1321  lshift = (vui32_t) vec_splat_s32(shb);
1322  else
1323  lshift = vec_splats ((unsigned int) shb);
1324 
1325  /* Vector Shift Left Words based on the lower 5-bits of the
1326  corresponding element of lshift. */
1327  result = vec_vslw (vra, lshift);
1328  }
1329  else
1330  { /* shifts greater than 31 bits return zeros. */
1331  result = vec_xor ((vui32_t) vra, (vui32_t) vra);
1332  }
1333 
1334  return (vui32_t) result;
1335 }
1336 
1355 static inline vi32_t
1356 vec_srawi (vi32_t vra, const unsigned int shb)
1357 {
1358  vui32_t lshift;
1359  vi32_t result;
1360 
1361  if (shb < 32)
1362  {
1363  /* Load the shift const in a vector. The element shifts require
1364  a shift amount for each element. For the immediate form the
1365  shift constant is splatted to all elements of the
1366  shift control. */
1367  if (__builtin_constant_p (shb) && (shb < 16))
1368  lshift = (vui32_t) vec_splat_s32(shb);
1369  else
1370  lshift = vec_splats ((unsigned int) shb);
1371 
1372  /* Vector Shift Right Algebraic Words based on the lower 5-bits
1373  of corresponding element of lshift. */
1374  result = vec_vsraw (vra, lshift);
1375  }
1376  else
1377  { /* shifts greater than 31 bits return the sign bit propagated to
1378  all bits. This is equivalent to a shift Right Algebraic of
1379  31 bits. */
1380  lshift = (vui32_t) vec_splats(31);
1381  result = vec_vsraw (vra, lshift);
1382  }
1383 
1384  return (vi32_t) result;
1385 }
1386 
1404 static inline vui32_t
1405 vec_srwi (vui32_t vra, const unsigned int shb)
1406 {
1407  vui32_t lshift;
1408  vui32_t result;
1409 
1410  if (shb < 32)
1411  {
1412  /* Load the shift const in a vector. The element shifts require
1413  a shift amount for each element. For the immediate form the
1414  shift constant is splatted to all elements of the
1415  shift control. */
1416  if (__builtin_constant_p (shb) && (shb < 16))
1417  lshift = (vui32_t) vec_splat_s32(shb);
1418  else
1419  lshift = vec_splats ((unsigned int) shb);
1420 
1421  /* Vector Shift Right Words based on the lower 5-bits of the
1422  corresponding element of lshift. */
1423  result = vec_vsrw (vra, lshift);
1424  }
1425  else
1426  { /* shifts greater than 31 bits return zeros. */
1427  result = vec_xor ((vui32_t) vra, (vui32_t) vra);
1428  }
1429  return (vui32_t) result;
1430 }
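A brief usage sketch for the three immediate shift forms above (illustrative, not part of the header):

#include <pveclib/vec_int32_ppc.h>

void
example_shift_immediates (vui32_t u, vi32_t s)
{
  vui32_t ul = vec_slwi (u, 4);    // each word shifted left 4 bits
  vui32_t ur = vec_srwi (u, 4);    // each word shifted right 4 bits (logical)
  vi32_t  sa = vec_srawi (s, 4);   // each word shifted right 4 bits, sign propagates
  (void) ul; (void) ur; (void) sa;
}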
1431 
1452 static inline vui32_t
1453 vec_vgl4wso (unsigned int *array, const long long offset0,
1454  const long long offset1, const long long offset2,
1455  const long long offset3)
1456 {
1457  vui32_t result;
1458 
1459 #ifdef _ARCH_PWR8
1460  vui64_t re0, re1, re2, re3;
1461  re0 = vec_vlxsiwzx (offset0, array);
1462  re1 = vec_vlxsiwzx (offset1, array);
1463  re2 = vec_vlxsiwzx (offset2, array);
1464  re3 = vec_vlxsiwzx (offset3, array);
1465  /* Need to handle endian as the vec_vlxsiwzx result is always left
1466  * justified in VR, while element [0] may be left or right. */
1467 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1468  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1469  * support doubleword vec_merge. */
1470  re0 = vec_xxpermdi (re0, re2, 3);
1471  re1 = vec_xxpermdi (re1, re3, 3);
1472  result = vec_mergee ((vui32_t) re0, (vui32_t) re1);
1473 #else
1474  re0 = vec_xxpermdi (re0, re2, 0);
1475  re1 = vec_xxpermdi (re1, re3, 0);
1476  result = vec_mergeo ((vui32_t) re0, (vui32_t) re1);
1477 #endif
1478 #else // _ARCH_PWR7
1479  vui32_t xte0, xte1, xte2, xte3;
1480  vui8_t perm0, perm1, perm2, perm3;
1481 
1482  perm0 = vec_lvsl (offset0, array);
1483  xte0 = vec_lde (offset0, array);
1484  xte0 = vec_perm (xte0, xte0, perm0);
1485 
1486  perm1 = vec_lvsl (offset1, array);
1487  xte1 = vec_lde (offset1, array);
1488  xte1 = vec_perm (xte1, xte1, perm1);
1489 
1490  perm2 = vec_lvsl (offset2, array);
1491  xte2 = vec_lde (offset2, array);
1492  xte2 = vec_perm (xte2, xte2, perm2);
1493 
1494  perm3 = vec_lvsl (offset3, array);
1495  xte3 = vec_lde (offset3, array);
1496  xte3 = vec_perm (xte3, xte3, perm3);
1497 
1498  xte0 = vec_mergeh (xte0, xte2);
1499  xte1 = vec_mergeh (xte1, xte3);
1500  result = vec_mergeh (xte0, xte1);
1501 #endif
1502  return result;
1503 }
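Note that the offsets are byte offsets into array; the *wwx / *wwsx variants below convert word indexes to byte offsets internally. A small sketch (hypothetical array, not from the header):

#include <pveclib/vec_int32_ppc.h>

vui32_t
example_gather4 (unsigned int *array)
{
  // Gather array[0], array[2], array[4] and array[6]
  // using byte offsets 0, 8, 16 and 24.
  return vec_vgl4wso (array, 0, 8, 16, 24);
}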
1504 
1521 static inline
1522 vui32_t
1523 vec_vgl4wwo (unsigned int *array, vi32_t vra)
1524 {
1525  vui32_t r;
1526 
1527 #ifdef _ARCH_PWR8
1528 #if 1
1529  vi64_t off01, off23;
1530 
1531  off01 = vec_vupkhsw (vra);
1532  off23 = vec_vupklsw (vra);
1533 
1534  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1535 #else
1536  r = vec_vgl4wso (array, vra[0], vra[1], vra[2], vra[3]);
1537 #endif
1538 #else
1539  // Need to explicitly manage the VR/GPR xfer for PWR7
1540  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1541  signed int off0, off1, off2, off3;
1542 
1543  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
1544  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
1545  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
1546  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
1547 
1548  r = vec_vgl4wso (array, off0, off1, off2, off3);
1549 #endif
1550  return r;
1551 }
1552 
1575 static inline
1576 vui32_t
1577 vec_vgl4wwsx (unsigned int *array, vi32_t vra,
1578  const unsigned char scale)
1579 {
1580  vui32_t r;
1581 
1582 #ifdef _ARCH_PWR8
1583  vi64_t off01, off23;
1584  vi64_t lshift = vec_splats ((long long) (2+ scale));
1585 
1586  off01 = vec_vupkhsw (vra);
1587  off23 = vec_vupklsw (vra);
1588 
1589  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1590  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1591 
1592  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1593 #else
1594  // Need to explicitly manage the VR/GPR xfer for PWR7
1595  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1596  signed long long off0, off1, off2, off3;
1597 
1598  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2+ scale);
1599  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2+ scale);
1600  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2+ scale);
1601  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2+ scale);
1602 
1603  r = vec_vgl4wso (array, off0, off1, off2, off3);
1604 #endif
1605  return r;
1606 }
1607 
1628 static inline
1629 vui32_t
1630 vec_vgl4wwx (unsigned int *array, vi32_t vra)
1631 {
1632  vui32_t r;
1633 
1634 #ifdef _ARCH_PWR8
1635  vi64_t off01, off23;
1636  vi64_t lshift = vec_splats ((long long) (2));
1637 
1638  off01 = vec_vupkhsw (vra);
1639  off23 = vec_vupklsw (vra);
1640 
1641  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1642  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1643 
1644  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1645 #else
1646  // Need to explicitly manage the VR/GPR xfer for PWR7
1647  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1648  signed long long off0, off1, off2, off3;
1649 
1650  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
1651  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
1652  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
1653  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
1654 
1655  r = vec_vgl4wso (array, off0, off1, off2, off3);
1656 #endif
1657  return r;
1658 }
1659 
1678 static inline vi64_t
1679 vec_vglswso (signed int *array, const long long offset0,
1680  const long long offset1)
1681 {
1682  vi64_t re0, re1, result;
1683 
1684  re0 = vec_vlxsiwax (offset0, array);
1685  re1 = vec_vlxsiwax (offset1, array);
1686  /* Need to handle endian as the vec_vlxsiwax result is always left
1687  * justified in VR, while element [0] may be left or right. */
1688 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1689  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1690  * support doubleword vec_merge. */
1691  result = vec_xxpermdi (re0, re1, 3);
1692 #else
1693 #ifdef _ARCH_PWR7
1694  result = vec_xxpermdi (re0, re1, 0);
1695 #else
1696  re0 = (vi64_t) vec_sld (re0, re0, 8);
1697  result = (vi64_t) vec_sld (re0, re1, 8);
1698 #endif
1699 #endif
1700  return result;
1701 }
1702 
1719 static inline
1720 vi64_t
1721 vec_vglswdo (signed int *array, vi64_t vra)
1722 {
1723  vi64_t r;
1724 
1725 #ifdef _ARCH_PWR8
1726  r = vec_vglswso (array, vra[0], vra[1]);
1727 #else
1728  // Need to explicitly manage the VR/GPR xfer for PWR7
1729  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1730 
1731  r = vec_vglswso (array, scalar_extract_uint64_from_high_uint128(gprp),
1732  scalar_extract_uint64_from_low_uint128(gprp));
1733 #endif
1734  return r;
1735 }
1736 
1755 static inline
1756 vi64_t
1757 vec_vglswdsx (signed int *array, vi64_t vra,
1758  const unsigned char scale)
1759 {
1760  vi64_t r;
1761 
1762 #ifdef _ARCH_PWR8
1763  vi64_t lshift = vec_splats ((long long) (2 + scale));
1764  vi64_t offset;
1765 
1766  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1767  r = vec_vglswso (array, offset[0], offset[1]);
1768 #else
1769  long long offset0, offset1;
1770  // Need to explicitly manage the VR/GPR xfer for PWR7
1771  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1772  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1773  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1774 
1775  r = vec_vglswso (array, offset0, offset1);
1776 #endif
1777  return r;
1778 }
1779 
1799 static inline
1800 vi64_t
1801 vec_vglswdx (signed int *array, vi64_t vra)
1802 {
1803  vi64_t r;
1804 
1805 #ifdef _ARCH_PWR8
1806  vi64_t lshift = vec_splats ((long long) 2);
1807  vi64_t offset;
1808 
1809  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1810  r = vec_vglswso (array, offset[0], offset[1]);
1811 #else
1812  long long offset0, offset1;
1813  // Need to explicitly manage the VR/GPR xfer for PWR7
1814  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1815  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1816  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1817 
1818  r = vec_vglswso (array, offset0, offset1);
1819 #endif
1820  return r;
1821 }
1822 
1841 static inline vui64_t
1842 vec_vgluwso (unsigned int *array, const long long offset0,
1843  const long long offset1)
1844 {
1845  vui64_t re0, re1, result;
1846 
1847  re0 = vec_vlxsiwzx (offset0, array);
1848  re1 = vec_vlxsiwzx (offset1, array);
1849  /* Need to handle endian as the vec_vlxsiwzx result is always left
1850  * justified in VR, while element [0] may be left or right. */
1851 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1852  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1853  * support doubleword vec_merge. */
1854  result = vec_xxpermdi (re0, re1, 3);
1855 #else
1856 #ifdef _ARCH_PWR7
1857  result = vec_xxpermdi (re0, re1, 0);
1858 #else
1859  re0 = (vui64_t) vec_sld (re0, re0, 8);
1860  result = (vui64_t) vec_sld (re0, re1, 8);
1861 #endif
1862 #endif
1863  return result;
1864 }
1865 
1882 static inline
1883 vui64_t
1884 vec_vgluwdo (unsigned int *array, vi64_t vra)
1885 {
1886  vui64_t r;
1887 
1888 #ifdef _ARCH_PWR8
1889  r = vec_vgluwso (array, vra[0], vra[1]);
1890 #else
1891  // Need to explicitly manage the VR/GPR xfer for PWR7
1892  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1893 
1894  r = vec_vgluwso (array, scalar_extract_uint64_from_high_uint128(gprp),
1895  scalar_extract_uint64_from_low_uint128(gprp));
1896 #endif
1897  return r;
1898 }
1899 
1918 static inline
1919 vui64_t
1920 vec_vgluwdsx (unsigned int *array, vi64_t vra,
1921  const unsigned char scale)
1922 {
1923  vui64_t r;
1924 
1925 #ifdef _ARCH_PWR8
1926  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
1927  vui64_t offset;
1928 
1929  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1930  r = vec_vgluwso (array, offset[0], offset[1]);
1931 #else
1932  long long offset0, offset1;
1933  // Need to explicitly manage the VR/GPR xfer for PWR7
1934  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1935  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1936  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1937 
1938  r = vec_vgluwso (array, offset0, offset1);
1939 #endif
1940  return r;
1941 }
1942 
1959 static inline
1960 vui64_t
1961 vec_vgluwdx (unsigned int *array, vi64_t vra)
1962 {
1963  vui64_t r;
1964 
1965 #ifdef _ARCH_PWR8
1966  vui64_t lshift = vec_splats ((unsigned long long) 2);
1967  vui64_t offset;
1968 
1969  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1970  r = vec_vgluwso (array, offset[0], offset[1]);
1971 #else
1972  long long offset0, offset1;
1973  // Need to explicitly manage the VR/GPR xfer for PWR7
1974  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1975  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1976  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1977 
1978  r = vec_vgluwso (array, offset0, offset1);
1979 #endif
1980  return r;
1981 }
1982 
2017 static inline vi64_t
2018 vec_vlxsiwax (const signed long long ra, const signed int *rb)
2019 {
2020  vi64_t xt;
2021 
2022 #if (defined(__clang__) && __clang_major__ < 8)
2023  __VEC_U_128 t;
2024 
2025  signed int *p = (signed int *)((char *)rb + ra);
2026  t.ulong.upper = *p;
2027  xt = t.vx2;
2028 #elif _ARCH_PWR8
2029  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2030  {
2031  if (ra == 0)
2032  {
2033  __asm__(
2034  "lxsiwax %x0,%y1;"
2035  : "=wa" (xt)
2036  : "Z" (*rb)
2037  : );
2038  } else {
2039  unsigned long long rt;
2040  __asm__(
2041  "li %0,%1;"
2042  : "=r" (rt)
2043  : "I" (ra)
2044  : );
2045  __asm__(
2046  "lxsiwax %x0,%y1;"
2047  : "=wa" (xt)
2048  : "Z" (*(signed int *)((char *)rb+rt))
2049  : );
2050  }
2051  } else {
2052  __asm__(
2053  "lxsiwax %x0,%y1;"
2054  : "=wa" (xt)
2055  : "Z" (*(signed int *)((char *)rb+ra))
2056  : );
2057  }
2058 #else // _ARCH_PWR7
2059  vui32_t const shb = { 31, 0, 0 ,0 };
2060  vi32_t xte;
2061  vui8_t perm;
2062 
2063  perm = vec_lvsl (ra, rb);
2064  xte = vec_lde (ra, rb);
2065  perm = (vui8_t) vec_mergeh ((vui32_t) perm, (vui32_t) perm);
2066  xte = vec_perm (xte, xte, perm);
2067  xt = (vi64_t) vec_sra (xte, shb);
2068 #endif
2069  return xt;
2070 }
2071 
2108 static inline vui64_t
2109 vec_vlxsiwzx (const signed long long ra, const unsigned int *rb)
2110 {
2111  vui64_t xt;
2112 
2113 #if (defined(__clang__) && __clang_major__ < 8)
2114  __VEC_U_128 t;
2115 
2116  unsigned int *p = (unsigned int *)((char *)rb + ra);
2117  t.ulong.upper = *p;
2118  xt = t.vx2;
2119 #elif _ARCH_PWR8
2120  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2121  {
2122  if (ra == 0)
2123  {
2124  __asm__(
2125  "lxsiwzx %x0,%y1;"
2126  : "=wa" (xt)
2127  : "Z" (*rb)
2128  : );
2129  } else {
2130  unsigned long long rt;
2131  __asm__(
2132  "li %0,%1;"
2133  : "=r" (rt)
2134  : "I" (ra)
2135  : );
2136  __asm__(
2137  "lxsiwzx %x0,%y1;"
2138  : "=wa" (xt)
2139  : "Z" (*(signed int *)((char *)rb+rt))
2140  : );
2141  }
2142  } else {
2143  __asm__(
2144  "lxsiwzx %x0,%y1;"
2145  : "=wa" (xt)
2146  : "Z" (*(signed int *)((char *)rb+ra))
2147  : );
2148  }
2149 #else // _ARCH_PWR7
2150  const vui32_t zero = {0,0,0,0};
2151  vui32_t xte;
2152  vui8_t perm;
2153 
2154  perm = vec_lvsl (ra, rb);
2155  xte = vec_lde (ra, rb);
2156  xte = vec_perm (xte, xte, perm);
2157  xt = (vui64_t) vec_sld (zero, xte, 12);
2158 #endif
2159  return xt;
2160 }
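A short sketch of the two indexed scalar-word loads above (byte offsets again, illustrative and not from the header); vec_vlxsiwax sign-extends and vec_vlxsiwzx zero-extends the loaded word to 64 bits:

#include <pveclib/vec_int32_ppc.h>

void
example_word_loads (signed int *sp, unsigned int *up)
{
  // Byte offset 8 loads sp[2] / up[2] into the left doubleword of the VR.
  vi64_t  sx = vec_vlxsiwax (8, sp);
  vui64_t zx = vec_vlxsiwzx (8, up);
  (void) sx; (void) zx;
}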
2161 
2168 static inline vui64_t
2170 
2177 static inline vui64_t
2179 
2186 static inline vui64_t
2188 
2195 static inline vui64_t
2197 
2204 static inline vui64_t
2206 
2236 static inline vui64_t
2237 vec_vmuleuw (vui32_t vra, vui32_t vrb)
2238 {
2239  vui64_t res;
2240 #ifdef _ARCH_PWR8
2241 #if defined __GNUC__ && (__GNUC__ > 7)
2242 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2243  res = vec_mulo (vra, vrb);
2244 #else
2245  res = vec_mule (vra, vrb);
2246 #endif
2247 #else
2248  __asm__(
2249  "vmuleuw %0,%1,%2;\n"
2250  : "=v" (res)
2251  : "v" (vra),
2252  "v" (vrb)
2253  : );
2254 #endif
2255 #else
2256  const vui32_t zero = {0,0,0,0};
2257  const vui32_t ones = {-1,-1,-1,-1};
2258  vui32_t wmask01;
2259  vui32_t p0, p1, pp10, pp01, resw;
2260  vui16_t m0, m1, mt, mth, mtl;
2261 
2262  /* generate {0,-1,0,-1} mask. */
2263  wmask01 = vec_vmrghw (zero, ones);
2264 
2265  mt = (vui16_t)vrb;
2266  mtl = vec_mergeh (mt, mt);
2267  mth = vec_mergel (mt, mt);
2268 
2269 #ifdef _ARCH_PWR7
2270  m0 = (vui16_t)vec_xxpermdi ((vui64_t)mtl, (vui64_t)mth, 0);
2271 #else
2272  {
2273  vui32_t temp;
2274  temp = vec_sld ((vui32_t) mtl, (vui32_t) mth, 8);
2275  m0 = (vui16_t) vec_sld (temp, (vui32_t) mth, 8);
2276  }
2277 #endif
2278 
2279  resw = vec_sld (vra, vra, 12);
2280  resw = vec_sel (vra, resw, wmask01);
2281  m1 = (vui16_t)resw;
2282 
2283  p0 = vec_vmuleuh (m1, m0);
2284  p1 = vec_vmulouh (m1, m0);
2285  resw = vec_sel (p0, p1, wmask01);
2286  res = (vui64_t)resw;
2287 
2288  pp10 = vec_sld (p1, p1, 12);
2289  pp01 = p0;
2290  /* pp01 = vec_addudm (pp01, pp10). */
2291  {
2292  vui32_t c;
2293  vui32_t xmask;
2294  xmask = vec_sld (wmask01, wmask01, 2);
2295  c = vec_vaddcuw (pp01, pp10);
2296  pp01 = vec_vadduwm (pp01, pp10);
2297  c = vec_sld (c, c, 6);
2298  pp01 = vec_sld (pp01, pp01, 2);
2299  pp01 = vec_sel (c, pp01, xmask);
2300  }
2301  /* res = vec_addudm (pp01, res). */
2302  {
2303  vui32_t c, r;
2304  c = vec_vaddcuw (pp01, (vui32_t)res);
2305  r = vec_vadduwm (pp01, (vui32_t)res);
2306  c = vec_sld (c, zero, 4);
2307  res = (vui64_t)vec_vadduwm (r, c);
2308  }
2309 #endif
2310  return (res);
2311 }
2312 
2339 static inline vui64_t
2340 vec_vmulouw (vui32_t vra, vui32_t vrb)
2341 {
2342  vui64_t res;
2343 #ifdef _ARCH_PWR8
2344 #if defined __GNUC__ && (__GNUC__ > 7)
2345  /* Not supported in GCC yet. ETA GCC-8. */
2346 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2347  res = vec_mule (vra, vrb);
2348 #else
2349  res = vec_mulo (vra, vrb);
2350 #endif
2351 #else
2352  __asm__(
2353  "vmulouw %0,%1,%2;\n"
2354  : "=v" (res)
2355  : "v" (vra),
2356  "v" (vrb)
2357  : );
2358 #endif
2359 #else
2360  const vui32_t zero = {0,0,0,0};
2361  const vui32_t ones = {-1,-1,-1,-1};
2362  vui32_t wmask01;
2363  vui32_t p0, p1, pp10, pp01, resw;
2364  vui16_t m0, m1, mt, mth, mtl;
2365  /* generate {0,-1,0,-1} mask. */
2366  wmask01 = vec_vmrghw (zero, ones);
2367 
2368  mt = (vui16_t)vrb;
2369  mtl = vec_mergel (mt, mt);
2370  mth = vec_mergeh (mt, mt);
2371 #ifdef _ARCH_PWR7
2372  m0 = (vui16_t)vec_xxpermdi ((vui64_t)mth, (vui64_t)mtl, 3);
2373 #else
2374  {
2375  vui32_t temp;
2376  temp = vec_sld ((vui32_t) mtl, (vui32_t) mtl, 8);
2377  m0 = (vui16_t) vec_sld ((vui32_t) mth, temp, 8);
2378  }
2379 #endif
2380 
2381  resw = vec_sld (vra, vra, 4);
2382  m1 = (vui16_t)vec_sel (resw, vra, wmask01);
2383 
2384  p0 = vec_vmuleuh (m1, m0);
2385  p1 = vec_vmulouh (m1, m0);
2386 
2387  resw = vec_sel (p0, p1, wmask01);
2388  res = (vui64_t)resw;
2389 
2390  pp10 = vec_sld (p1, p1, 12);
2391  pp01 = p0;
2392  /* pp01 = vec_addudm (pp01, pp10). */
2393  {
2394  vui32_t c;
2395  vui32_t xmask;
2396  xmask = vec_sld (wmask01, wmask01, 2);
2397  c = vec_vaddcuw (pp01, pp10);
2398  pp01 = vec_vadduwm (pp01, pp10);
2399  c = vec_sld (c, c, 6);
2400  pp01 = vec_sld (pp01, pp01, 2);
2401  pp01 = vec_sel (c, pp01, xmask);
2402  }
2403  /* res = vec_addudm (pp01, res). */
2404  {
2405  vui32_t c, r;
2406  c = vec_vaddcuw (pp01, (vui32_t)res);
2407  r = vec_vadduwm (pp01, (vui32_t)res);
2408  c = vec_sld (c, zero, 4);
2409  res = (vui64_t)vec_vadduwm (r, c);
2410  }
2411 #endif
2412  return (res);
2413 }
2414 
2432 static inline void
2433 vec_vsst4wso (vui32_t xs, unsigned int *array,
2434  const long long offset0, const long long offset1,
2435  const long long offset2, const long long offset3)
2436 {
2437  vui32_t xs0, xs1, xs2, xs3;
2438 
2439  xs0 = vec_splat (xs, 0);
2440  xs1 = vec_splat (xs, 1);
2441  xs2 = vec_splat (xs, 2);
2442  xs3 = vec_splat (xs, 3);
2443  vec_ste (xs0, offset0, array);
2444  vec_ste (xs1, offset1, array);
2445  vec_ste (xs2, offset2, array);
2446  vec_ste (xs3, offset3, array);
2447 }
2448 
2466 static inline void
2467 vec_vsst4wwo (vui32_t xs, unsigned int *array,
2468  vi32_t vra)
2469 {
2470 #ifdef _ARCH_PWR8
2471  vi64_t off01, off23;
2472 
2473  off01 = vec_vupkhsw (vra);
2474  off23 = vec_vupklsw (vra);
2475 
2476  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2477 #else
2478  // Need to explicitly manage the VR/GPR xfer for PWR7
2479  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2480  signed int off0, off1, off2, off3;
2481 
2482  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
2483  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
2484  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
2485  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
2486 
2487  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2488 #endif
2489 }
2490 
2512 static inline void
2513 vec_vsst4wwsx (vui32_t xs, unsigned int *array,
2514  vi32_t vra, const unsigned char scale)
2515 {
2516 #ifdef _ARCH_PWR8
2517  vi64_t off01, off23;
2518  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
2519 
2520  off01 = vec_vupkhsw (vra);
2521  off23 = vec_vupklsw (vra);
2522 
2523  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
2524  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
2525 
2526  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2527 #else
2528  // Need to explicitly manage the VR/GPR xfer for PWR7
2529  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2530  signed int off0, off1, off2, off3;
2531 
2532  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2 + scale);
2533  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2 + scale);
2534  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2 + scale);
2535  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2 + scale);
2536 
2537  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2538 #endif
2539 }
2540 
2559 static inline void
2560 vec_vsst4wwx (vui32_t xs, unsigned int *array,
2561  vi32_t vra)
2562 {
2563 #ifdef _ARCH_PWR8
2564  vi64_t off01, off23;
2565  vui64_t lshift = vec_splats ((unsigned long long) 2);
2566 
2567  off01 = vec_vupkhsw (vra);
2568  off23 = vec_vupklsw (vra);
2569 
2570  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
2571  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
2572 
2573  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2574 #else
2575  // Need to explicitly manage the VR/GPR xfer for PWR7
2576  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2577  signed int off0, off1, off2, off3;
2578 
2579  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
2580  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
2581  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
2582  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
2583 
2584  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2585 #endif
2586 }
2587 
2603 static inline void
2604 vec_vsstwdo (vui64_t xs, unsigned int *array, vi64_t vra)
2605 {
2606 #ifdef _ARCH_PWR8
2607  vec_vsstwso (xs, array, vra[0], vra[1]);
2608 #else
2609  // Need to explicitly manage the VR/GPR xfer for PWR7
2610  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2611 
2612  vec_vsstwso (xs, array,
2613  scalar_extract_uint64_from_high_uint128(gprp),
2614  scalar_extract_uint64_from_low_uint128(gprp));
2615 #endif
2616 }
2617 
2635 static inline void
2636 vec_vsstwdsx (vui64_t xs, unsigned int *array, vi64_t vra,
2637  const unsigned char scale)
2638 {
2639 #ifdef _ARCH_PWR8
2640  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
2641  vui64_t offset;
2642 
2643  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
2644  vec_vsstwso (xs, array, offset[0], offset[1]);
2645 #else
2646  long long offset0, offset1;
2647  // Need to explicitly manage the VR/GPR xfer for PWR7
2648  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2649  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
2650  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
2651 
2652  vec_vsstwso (xs, array, offset0, offset1);
2653 #endif
2654 }
2655 
2671 static inline void
2672 vec_vsstwdx (vui64_t xs, unsigned int *array, vi64_t vra)
2673 {
2674 #ifdef _ARCH_PWR8
2675  vui64_t lshift = vec_splats ((unsigned long long) 2);
2676  vui64_t offset;
2677 
2678  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
2679  vec_vsstwso (xs, array, offset[0], offset[1]);
2680 #else
2681  long long offset0, offset1;
2682  // Need to explicitly manage the VR/GPR xfer for PWR7
2683  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2684  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
2685  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
2686 
2687  vec_vsstwso (xs, array, offset0, offset1);
2688 #endif
2689 }
2690 
2707 static inline void
2708 vec_vsstwso (vui64_t xs, unsigned int *array,
2709  const long long offset0, const long long offset1)
2710 {
2711  vui32_t xs0, xs1;
2712 
2713  xs0 = (vui32_t) xs;
2714  // xs1 = vec_xxswapd (xs);
2715 #ifdef _ARCH_PWR7
2716  xs1 = (vui32_t) vec_xxpermdi (xs, xs, 2);
2717 #else
2718  xs1 = vec_sld (xs0, xs0, 8);
2719 #endif
2720  /* Need to handle endian as vec_vstxsiwx always stores the right word
2721  * from the left doubleword of the VSR, while word element [1] may be
2722  * in the left or right doubleword. */
2723 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2724  vec_vstxsiwx (xs0, offset1, array);
2725  vec_vstxsiwx (xs1, offset0, array);
2726 #else
2727  vec_vstxsiwx (xs0, offset0, array);
2728  vec_vstxsiwx (xs1, offset1, array);
2729 #endif
2730 }
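A small scatter sketch (illustrative, not from the header): the low word of each doubleword element of xs is stored at the given byte offsets.

#include <pveclib/vec_int32_ppc.h>

void
example_scatter_words (vui64_t xs, unsigned int *array)
{
  // Store the (truncated) doubleword elements of xs to
  // array[1] and array[5], i.e. byte offsets 4 and 20.
  vec_vsstwso (xs, array, 4, 20);
}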
2731 
2759 static inline void
2760 vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb)
2761 {
2762 #if (defined(__clang__) && __clang_major__ < 8)
2763  __VEC_U_128 t;
2764  unsigned int *p = (unsigned int *)((char *)rb + ra);
2765  t.vx4 = xs;
2766  *p = t.ulong.upper;
2767 #elif _ARCH_PWR8
2768  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2769  {
2770  if (ra == 0)
2771  {
2772  __asm__(
2773  "stxsiwx %x1,%y0;"
2774  : "=Z" (*rb)
2775  : "wa" (xs)
2776  : );
2777  } else {
2778  unsigned long long rt;
2779  __asm__(
2780  "li %0,%1;"
2781  : "=r" (rt)
2782  : "I" (ra)
2783  : );
2784  __asm__(
2785  "stxsiwx %x1,%y0;"
2786  : "=Z" (*(unsigned int *)((char *)rb+rt))
2787  : "wa" (xs)
2788  : );
2789  }
2790  } else {
2791  __asm__(
2792  "stxsiwx %x1,%y0;"
2793  : "=Z" (*(unsigned int *)((char *)rb+ra))
2794  : "wa" (xs)
2795  : );
2796  }
2797 #else //_ARCH_PWR8
2798  // Splat word element 1 to all elements
2799  vui32_t xss = vec_splat (xs, 1);
2800  // store a word element at the EA (ra+rb)
2801  vec_ste (xss, ra, rb);
2802 #endif
2803 }
2804 
2847 static inline vi32_t
2848 vec_vsum2sw (vi32_t vra, vi32_t vrb)
2849 {
2850  vi32_t res;
2851 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
2852  res = vec_sum2s (vra, vrb);
2853 #else
2854  __asm__(
2855  "vsum2sws %0,%1,%2;\n"
2856  : "=v" (res)
2857  : "v" (vra),
2858  "v" (vrb)
2859  : );
2860 #endif
2861  return ((vi32_t) res);
2862 }
2863 
2906 static inline vi32_t
2907 vec_vsumsw (vi32_t vra, vi32_t vrb)
2908 {
2909  vi32_t res;
2910 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
2911  res = vec_sums (vra, vrb);
2912 #else
2913  __asm__(
2914  "vsumsws %0,%1,%2;\n"
2915  : "=v" (res)
2916  : "v" (vra),
2917  "v" (vrb)
2918  : );
2919 #endif
2920  return ((vi32_t) res);
2921 }
2922 
2945 #ifndef vec_vupkhsw
2946 // May be defined as inline function for clang
2947 // But only for _ARCH_PWR8 or higher.
2948 #if !defined(__clang__) || !defined(_ARCH_PWR8)
2949 static inline vi64_t
2950 vec_vupkhsw (vi32_t vra)
2951 {
2952  vi64_t r;
2953 #ifdef _ARCH_PWR8
2954  __asm__(
2955 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2956  "vupklsw %0,%1;\n"
2957 #else
2958  "vupkhsw %0,%1;\n"
2959 #endif
2960  : "=v" (r)
2961  : "v" (vra)
2962  : );
2963 #else
2964  vui32_t const shb = { 31, 0, 31 ,0 };
2965  vi32_t xra;
2966 
2967  xra = vec_mergeh (vra, vra);
2968  r = (vi64_t) vec_sra (xra, shb);
2969 #endif
2970  return (r);
2971 }
2972 #endif
2973 #endif
2974 
2994 static inline vui64_t
2995 vec_vupkhuw (vui32_t vra)
2996 {
2997  vui32_t const zero = { 0, 0, 0 ,0 };
2998 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2999  return (vui64_t) vec_mergeh (vra, zero);
3000 #else
3001  return (vui64_t) vec_mergeh (zero, vra);
3002 #endif
3003 }
3004 
3023 #ifndef vec_vupklsw
3024 // May be defined as inline function for clang
3025 // But only for _ARCH_PWR8 or higher.
3026 #if !defined(__clang__) || !defined(_ARCH_PWR8)
3027 static inline vi64_t
3028 vec_vupklsw (vi32_t vra)
3029 {
3030  vi64_t r;
3031 #ifdef _ARCH_PWR8
3032  __asm__(
3033 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
3034  "vupkhsw %0,%1;\n"
3035 #else
3036  "vupklsw %0,%1;\n"
3037 #endif
3038  : "=v" (r)
3039  : "v" (vra)
3040  : );
3041 #else
3042  vui32_t const shb = { 31, 0, 31 ,0 };
3043  vi32_t xra;
3044 
3045  xra = vec_mergel (vra, vra);
3046  r = (vi64_t) vec_sra (xra, shb);
3047 #endif
3048  return (r);
3049 }
3050 #endif
3051 #endif
3052 
3072 static inline vui64_t
3073 vec_vupkluw (vui32_t vra)
3074 {
3075  vui32_t const zero = { 0, 0, 0 ,0 };
3076 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
3077  return (vui64_t) vec_mergel (vra, zero);
3078 #else
3079  return (vui64_t) vec_mergel (zero, vra);
3080 #endif
3081 }
3082 
3083 #endif /* VEC_INT32_PPC_H_ */