POWER Vector Library Manual  1.0.4
vec_int64_ppc.h
1 /*
2  Copyright (c) [2018] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_int64_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Mar 29, 2018
21  */
22 
23 #ifndef VEC_INT64_PPC_H_
24 #define VEC_INT64_PPC_H_
25 
26 #include <pveclib/vec_int32_ppc.h>
27 
1199 static inline vb64_t vec_cmpgtsd (vi64_t a, vi64_t b);
1201 static inline vb64_t vec_cmpequd (vui64_t a, vui64_t b);
1202 static inline vb64_t vec_cmpgeud (vui64_t a, vui64_t b);
1203 static inline vb64_t vec_cmpgtud (vui64_t a, vui64_t b);
1204 static inline vb64_t vec_cmpneud (vui64_t a, vui64_t b);
1205 static inline vui64_t vec_sldi (vui64_t vra, const unsigned int shb);
1206 static inline vui64_t vec_maxud (vui64_t vra, vui64_t vrb);
1207 static inline vui64_t vec_minud (vui64_t vra, vui64_t vrb);
1208 static inline vui64_t vec_permdi (vui64_t vra, vui64_t vrb, const int ctl);
1209 #ifndef vec_popcntd
1210 static inline vui64_t vec_popcntd (vui64_t vra);
1211 #else
1212 /* Work around for GCC PR85830. */
1213 #undef vec_popcntd
1214 #define vec_popcntd __builtin_vec_vpopcntd
1215 #endif
1216 static inline vi64_t vec_splat_s64 (const int sim);
1217 static inline vui64_t vec_subudm (vui64_t a, vui64_t b);
1218 static inline vui64_t
1219 vec_vlsidx (const signed long long a, const unsigned long long *b);
1220 static inline void
1221 vec_vstsidx (vui64_t xs, const signed long long ra, unsigned long long *rb);
1222 static inline vui64_t vec_xxspltd (vui64_t vra, const int ctl);
1224 
1240 static inline vui64_t
1241 vec_absdud (vui64_t vra, vui64_t vrb)
1242 {
1243  return vec_subudm (vec_maxud (vra, vrb), vec_minud (vra, vrb));
1244 }
1245 
1259 static inline
1260 vui64_t
1261 vec_addudm (vui64_t a, vui64_t b)
1262 {
1263  vui32_t r;
1264 
1265 #ifdef _ARCH_PWR8
1266 #if defined (vec_vaddudm)
1267  r = (vui32_t) vec_vaddudm (a, b);
1268 #elif defined (__clang__)
1269  r = (vui32_t) vec_add (a, b);
1270 #else
1271  __asm__(
1272  "vaddudm %0,%1,%2;"
1273  : "=v" (r)
1274  : "v" (a),
1275  "v" (b)
1276  : );
1277 #endif
1278 #else
1279  vui32_t c;
1280  vui32_t z= { 0,0,0,0};
1281  vui32_t cm= { 0,1,0,1};
1282 
1283  c = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
1284  r = vec_vadduwm ((vui32_t)a, (vui32_t)b);
1285  c = vec_and (c, cm);
1286  c = vec_sld (c, z, 4);
1287  r = vec_vadduwm (r, c);
1288 #endif
1289  return ((vui64_t) r);
1290 }
1291 
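A minimal usage sketch (made-up values, not part of the original header): the modulo add must carry out of the low word into the high word of each doubleword, which is exactly what the pre-POWER8 path above reconstructs from word adds.

  vui64_t a = { 0x00000000ffffffffUL, 1 };
  vui64_t b = { 1, 2 };
  vui64_t r = vec_addudm (a, b);  /* expected { 0x0000000100000000UL, 3 } */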
1312 static inline vui64_t
1313 vec_clzd (vui64_t vra)
1314 {
1315  vui64_t r;
1316 #ifdef _ARCH_PWR8
1317 #if defined (vec_vclzd)
1318  r = vec_vclzd (vra);
1319 #elif defined (__clang__)
1320  r = vec_cntlz (vra);
1321 #else
1322  __asm__(
1323  "vclzd %0,%1;"
1324  : "=v" (r)
1325  : "v" (vra)
1326  : );
1327 #endif
1328 #else
1329  vui32_t n, nt, y, x, m;
1330  vui32_t z = { 0, 0, 0, 0 };
1331  vui32_t dlwm = { 0, -1, 0, -1 };
1332 
1333  x = (vui32_t) vra;
1334 
1335  m = (vui32_t) vec_cmpgt (x, z);
1336  n = vec_sld (z, m, 12);
1337  y = vec_and (n, dlwm);
1338  nt = vec_or (x, y);
1339 
1340  n = vec_clzw (nt);
1341  r = (vui64_t) vec_vsum2sw ((vi32_t) n, (vi32_t) z);
1342 #endif
1343  return (r);
1344 }
1345 
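A minimal usage sketch (made-up values): counts are per 64-bit element, so a doubleword with only bit 0 set reports 63 leading zeros.

  vui64_t x = { 1UL, 0x8000000000000000UL };
  vui64_t n = vec_clzd (x);  /* expected { 63, 0 } */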
1370 static inline vui64_t
1371 vec_ctzd (vui64_t vra)
1372 {
1373  vui64_t r;
1374 #ifdef _ARCH_PWR9
1375 #if defined (vec_cnttz) || defined (__clang__)
1376  r = vec_cnttz (vra);
1377 #else
1378  __asm__(
1379  "vctzd %0,%1;"
1380  : "=v" (r)
1381  : "v" (vra)
1382  : );
1383 #endif
1384 #else
1385 // For _ARCH_PWR8 and earlier. Generate 1's for the trailing zeros
1386 // and 0's otherwise. Then count (popcnt) the 1's. _ARCH_PWR8 uses
1387 // the hardware vpopcntd instruction. _ARCH_PWR7 and earlier use the
1388 // PVECLIB vec_popcntd implementation which runs ~24-33 instructions.
1389  const vui64_t ones = { -1, -1 };
1390  vui64_t tzmask;
1391  // tzmask = (!vra & (vra - 1))
1392  tzmask = vec_andc (vec_addudm (vra, ones), vra);
1393  // return = vec_popcnt (!vra & (vra - 1))
1394  r = vec_popcntd (tzmask);
1395 #endif
1396  return ((vui64_t) r);
1397 }
1398 
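The trailing-zero identity used above, sketched on a scalar (made-up value, for illustration only):

  unsigned long long v  = 0xb0UL;        /* ...1011 0000 */
  unsigned long long tz = (v - 1) & ~v;  /* 0x0f: ones only in the trailing-zero positions */
  int n = __builtin_popcountll (tz);     /* 4, matching vec_ctzd per element */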
1420 static inline
1421 vb64_t
1422 vec_cmpeqsd (vi64_t a, vi64_t b)
1423 {
1424  /* vcmpequd works for both signed and unsigned compares. */
1425  return vec_cmpequd ((vui64_t) a, (vui64_t) b);
1426 }
1427 
1449 static inline
1450 vb64_t
1451 vec_cmpequd (vui64_t a, vui64_t b)
1452 {
1453  vb64_t result;
1454 #ifdef _ARCH_PWR8
1455 #if __GNUC__ >= 6
1456  result = vec_cmpeq(a, b);
1457 #else
1458  __asm__(
1459  "vcmpequd %0,%1,%2;\n"
1460  : "=v" (result)
1461  : "v" (a),
1462  "v" (b)
1463  : );
1464 #endif
1465 #else
1466  /*
1467  * Don't have vector compare equal unsigned doubleword until power8.
1468  * So we have to compare words and, unless all words are equal, do
1469  * some extra work, i.e. the words may have different truth values. So we
1470  * rotate each doubleword by 32-bits (here we use permute as we don't
1471  * have rotate doubleword either). Then vand the original word
1472  * compare and rotated value to get the final value.
1473  */
1474  vui8_t permute =
1475  { 0x04,0x05,0x6,0x7, 0x00,0x01,0x2,0x03, 0x0C,0x0D,0x0E,0x0F, 0x08,0x09,0x0A,0x0B};
1476  vui32_t r, rr;
1477  r = (vui32_t) vec_cmpeq ((vui32_t) a, (vui32_t) b);
1478  if (vec_any_ne ((vui32_t) a, (vui32_t) b))
1479  {
1480  rr = vec_perm (r, r, permute);
1481  r= vec_and (r, rr);
1482  }
1483  result = (vb64_t)r;
1484 #endif
1485  return (result);
1486 }
1487 
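A minimal usage sketch (made-up values): a doubleword is equal only if both of its words are equal, which is what the rotate-and-AND fixup above enforces on pre-POWER8.

  vui64_t a = { 0x1111111100000000UL, 42 };
  vui64_t b = { 0x2222222200000000UL, 42 };
  vb64_t eq = vec_cmpequd (a, b);  /* expected { all 0s, all 1s } */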
1507 static inline
1508 vb64_t
1509 vec_cmpgesd (vi64_t a, vi64_t b)
1510 {
1511  vb64_t r;
1512  /* vec_cmpge is implemented as the not of vec_cmplt. And vec_cmplt
1513  is implemented as vec_cmpgt with parms reversed. */
1514  r = vec_cmpgtsd (b, a);
1515  return vec_nor (r, r);
1516 }
1517 
1537 static inline
1538 vb64_t
1539 vec_cmpgeud (vui64_t a, vui64_t b)
1540 {
1541  vb64_t r;
1542  /* vec_cmpge is implemented as the not of vec_cmplt. And vec_cmplt
1543  is implemented as vec_cmpgt with parms reversed. */
1544  r = vec_cmpgtud (b, a);
1545  return vec_nor (r, r);
1546 }
1547 
1569 static inline
1570 vb64_t
1571 vec_cmpgtsd (vi64_t a, vi64_t b)
1572 {
1573  vb64_t result;
1574 #ifdef _ARCH_PWR8
1575 #if __GNUC__ >= 6
1576  result = vec_cmpgt(a, b);
1577 #else
1578  __asm__(
1579  "vcmpgtsd %0,%1,%2;\n"
1580  : "=v" (result)
1581  : "v" (a),
1582  "v" (b)
1583  : );
1584 #endif
1585 #else
1586  vui64_t _A, _B;
1587  const vui64_t signmask = CONST_VINT128_DW(0x8000000000000000UL,
1588  0x8000000000000000UL);
1589  /* For a signed compare we can flip the sign bits, which gives
1590  unsigned magnitudes that retain the correct relative order.
1591  */
1592  _A = vec_xor ((vui64_t)a, signmask);
1593  _B = vec_xor ((vui64_t)b, signmask);
1594  result = vec_cmpgtud (_A, _B);
1595 #endif
1596  return (result);
1597 }
1598 
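The sign-bit flip used above, sketched on scalars (made-up values): XORing both operands with 0x8000000000000000 maps signed order onto unsigned order, so the unsigned compare yields the signed result.

  long long sa = -1, sb = 1;
  unsigned long long ua = (unsigned long long) sa ^ 0x8000000000000000UL;  /* 0x7fffffffffffffff */
  unsigned long long ub = (unsigned long long) sb ^ 0x8000000000000000UL;  /* 0x8000000000000001 */
  /* (sa > sb) and (ua > ub) are both false, as required. */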
1620 static inline
1621 vb64_t
1622 vec_cmpgtud (vui64_t a, vui64_t b)
1623 {
1624  vb64_t result;
1625 #ifdef _ARCH_PWR8
1626 #if __GNUC__ >= 6
1627  result = vec_cmpgt(a, b);
1628 #else
1629  __asm__(
1630  "vcmpgtud %0,%1,%2;\n"
1631  : "=v" (result)
1632  : "v" (a),
1633  "v" (b)
1634  : );
1635 #endif
1636 #else
1637  /*
1638  * Don't have vector compare greater than unsigned doubleword until
1639  * power8. So we have to use compare word and logic to compute the
1640  * doubleword truth values.
1641  */
1642  __vector unsigned int r, x, y;
1643  __vector unsigned int c0, c1, c01;
1644  __vector unsigned int eq, gt, a32, b32;
1645 
1646  /* c01 = {0, -1, 0, -1} */
1647  c0 = vec_splat_u32 (0);
1648  c1 = vec_splat_u32 (-1);
1649  c01 = vec_mergeh (c0, c1);
1650 
1651  a32 = (__vector unsigned int)a;
1652  b32 = (__vector unsigned int)b;
1653 
1654  gt = (__vector unsigned int)vec_cmpgt (a32, b32);
1655  eq = (__vector unsigned int)vec_cmpeq (a32, b32);
1656  /* GTxw = GThw | (EQhw & GTlw) */
1657  x = vec_sld (gt, c0, 4);
1658  y = vec_and (eq, x);
1659  x = vec_or (gt, y);
1660  /* Duplicate result word to dword width. */
1661  y = vec_sld (c0, x, 12);
1662  r = vec_sel (x, y, c01);
1663  result = (vb64_t)r;
1664 #endif
1665  return (result);
1666 }
1667 
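A minimal usage sketch (made-up values): the word-wise GTxw = GThw | (EQhw & GTlw) logic must honor the high words, so a larger high word wins even when the low word is smaller.

  vui64_t a = { 0x0000000100000000UL, 5 };
  vui64_t b = { 0x00000000ffffffffUL, 9 };
  vb64_t gt = vec_cmpgtud (a, b);  /* expected { all 1s, all 0s } */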
1687 static inline
1688 vb64_t
1689 vec_cmplesd (vi64_t a, vi64_t b)
1690 {
1691  vb64_t result;
1692  /* vec_cmple is implemented as the not of vec_cmpgt. */
1693  result = vec_cmpgtsd (a, b);
1694  return vec_nor (result, result);
1695 }
1696 
1716 static inline
1717 vb64_t
1718 vec_cmpleud (vui64_t a, vui64_t b)
1719 {
1720  vb64_t result;
1721  /* vec_cmple is implemented as the not of vec_cmpgt. */
1722  result = vec_cmpgtud (a, b);
1723  return vec_nor (result, result);
1724 }
1725 
1744 static inline
1745 vb64_t
1746 vec_cmpltsd (vi64_t a, vi64_t b)
1747 {
1748  return vec_cmpgtsd (b, a);
1749 }
1750 
1769 static inline
1770 vb64_t
1771 vec_cmpltud (vui64_t a, vui64_t b)
1772 {
1773  return vec_cmpgtud (b, a);
1774 }
1775 
1794 static inline
1795 vb64_t
1796 vec_cmpnesd (vi64_t a, vi64_t b)
1797 {
1798  return vec_cmpneud ((vui64_t) a, (vui64_t) b);
1799 }
1800 
1819 static inline
1820 vb64_t
1821 vec_cmpneud (vui64_t a, vui64_t b)
1822 {
1823  vb64_t r;
1824  /* vec_cmpne is implemented as the not of vec_cmpeq. */
1825  r = vec_cmpequd (a, b);
1826  return vec_nor (r, r);
1827 }
1828 
1845 static inline
1846 int
1847 vec_cmpsd_all_eq (vi64_t a, vi64_t b)
1848 {
1849  int result;
1850 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1851  result = vec_all_eq(a, b);
1852 #else
1853  result = vec_all_eq((vui32_t)a, (vui32_t)b);
1854 #endif
1855  return (result);
1856 }
1857 
1875 static inline
1876 int
1877 vec_cmpsd_all_ge (vi64_t a, vi64_t b)
1878 {
1879  int result;
1880 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1881  result = vec_all_ge(a, b);
1882 #else
1883  vui32_t wt = { -1, -1, -1, -1};
1884  vb64_t gt_bool = vec_cmpgesd (a, b);
1885  result = vec_all_eq((vui32_t)gt_bool, wt);
1886 #endif
1887  return (result);
1888 }
1889 
1907 static inline
1908 int
1909 vec_cmpsd_all_gt (vi64_t a, vi64_t b)
1910 {
1911  int result;
1912 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1913  result = vec_all_gt(a, b);
1914 #else
1915  vui32_t wt = { -1, -1, -1, -1};
1916  vb64_t gt_bool = vec_cmpgtsd (a, b);
1917  result = vec_all_eq((vui32_t)gt_bool, wt);
1918 #endif
1919  return (result);
1920 }
1921 
1939 static inline
1940 int
1941 vec_cmpsd_all_le (vi64_t a, vi64_t b)
1942 {
1943  return vec_cmpsd_all_ge (b, a);
1944 }
1945 
1963 static inline
1964 int
1965 vec_cmpsd_all_lt (vi64_t a, vi64_t b)
1966 {
1967  return vec_cmpsd_all_gt (b, a);
1968 }
1969 
1986 static inline
1987 int
1988 vec_cmpsd_all_ne (vi64_t a, vi64_t b)
1989 {
1990  int result;
1991 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1992  result = vec_all_ne(a, b);
1993 #else
1994  vui32_t wt = { -1, -1, -1, -1};
1995  vb64_t gt_bool = vec_cmpneud ((vui64_t)a, (vui64_t)b);
1996  result = vec_all_eq((vui32_t)gt_bool, wt);
1997 #endif
1998  return (result);
1999 }
2000 
2017 static inline
2018 int
2019 vec_cmpsd_any_eq (vi64_t a, vi64_t b)
2020 {
2021  int result;
2022 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2023  result = vec_any_eq(a, b);
2024 #else
2025  vui32_t wt = { -1, -1, -1, -1};
2026  vb64_t gt_bool = vec_cmpequd ((vui64_t)a, (vui64_t)b);
2027  result = vec_any_eq((vui32_t)gt_bool, wt);
2028 #endif
2029  return (result);
2030 }
2031 
2049 static inline
2050 int
2051 vec_cmpsd_any_ge (vi64_t a, vi64_t b)
2052 {
2053  int result;
2054 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2055  result = vec_any_ge(a, b);
2056 #else
2057  vui32_t wt = { -1, -1, -1, -1};
2058  vb64_t gt_bool = vec_cmpgesd (a, b);
2059  result = vec_any_eq((vui32_t)gt_bool, wt);
2060 #endif
2061  return (result);
2062 }
2063 
2081 static inline
2082 int
2083 vec_cmpsd_any_gt (vi64_t a, vi64_t b)
2084 {
2085  int result;
2086 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2087  result = vec_any_gt(a, b);
2088 #else
2089  vui32_t wt = { -1, -1, -1, -1};
2090  vb64_t gt_bool = vec_cmpgtsd (a, b);
2091  result = vec_any_eq((vui32_t)gt_bool, wt);
2092 #endif
2093  return (result);
2094 }
2095 
2113 static inline
2114 int
2115 vec_cmpsd_any_le (vi64_t a, vi64_t b)
2116 {
2117  return vec_cmpsd_any_ge (b, a);
2118 }
2119 
2137 static inline
2138 int
2139 vec_cmpsd_any_lt (vi64_t a, vi64_t b)
2140 {
2141  return vec_cmpsd_any_gt (b, a);
2142 }
2143 
2160 static inline
2161 int
2162 vec_cmpsd_any_ne (vi64_t a, vi64_t b)
2163 {
2164  int result;
2165 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2166  result = vec_any_ne(a, b);
2167 #else
2168  vui32_t wt = { -1, -1, -1, -1};
2169  vb64_t gt_bool = vec_cmpneud ((vui64_t)a, (vui64_t)b);
2170  result = vec_any_eq((vui32_t)gt_bool, wt);
2171 #endif
2172  return (result);
2173 }
2174 
2191 static inline
2192 int
2193 vec_cmpud_all_eq (vui64_t a, vui64_t b)
2194 {
2195  int result;
2196 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2197  result = vec_all_eq(a, b);
2198 #else
2199  result = vec_all_eq((vui32_t)a, (vui32_t)b);
2200 #endif
2201  return (result);
2202 }
2203 
2221 static inline
2222 int
2223 vec_cmpud_all_ge (vui64_t a, vui64_t b)
2224 {
2225  int result;
2226 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2227  result = vec_all_ge(a, b);
2228 #else
2229  vui32_t wt = { -1, -1, -1, -1};
2230  vb64_t gt_bool = vec_cmpgeud (a, b);
2231  result = vec_all_eq((vui32_t)gt_bool, wt);
2232 #endif
2233  return (result);
2234 }
2235 
2253 static inline
2254 int
2255 vec_cmpud_all_gt (vui64_t a, vui64_t b)
2256 {
2257  int result;
2258 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2259  result = vec_all_gt(a, b);
2260 #else
2261  vui32_t wt = { -1, -1, -1, -1};
2262  vb64_t gt_bool = vec_cmpgtud (a, b);
2263  result = vec_all_eq((vui32_t)gt_bool, wt);
2264 #endif
2265  return (result);
2266 }
2267 
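A minimal usage sketch (made-up values) contrasting the all/any predicate forms defined in this group:

  vui64_t a = { 2, 3 };
  vui64_t b = { 1, 1 };
  vui64_t c = { 1, 4 };
  int r1 = vec_cmpud_all_gt (a, b);  /* 1: both elements are greater    */
  int r2 = vec_cmpud_all_gt (a, c);  /* 0: the second element is not    */
  int r3 = vec_cmpud_any_gt (a, c);  /* 1: the first element is greater */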
2285 static inline
2286 int
2287 vec_cmpud_all_le (vui64_t a, vui64_t b)
2288 {
2289  return vec_cmpud_all_ge (b, a);
2290 }
2291 
2309 static inline
2310 int
2311 vec_cmpud_all_lt (vui64_t a, vui64_t b)
2312 {
2313  return vec_cmpud_all_gt (b, a);
2314 }
2315 
2332 static inline
2333 int
2334 vec_cmpud_all_ne (vui64_t a, vui64_t b)
2335 {
2336  int result;
2337 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2338  result = vec_all_ne(a, b);
2339 #else
2340  vui32_t wt = { -1, -1, -1, -1};
2341  vb64_t gt_bool = vec_cmpneud (a, b);
2342  result = vec_all_eq((vui32_t)gt_bool, wt);
2343 #endif
2344  return (result);
2345 }
2346 
2363 static inline
2364 int
2365 vec_cmpud_any_eq (vui64_t a, vui64_t b)
2366 {
2367  int result;
2368 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2369  result = vec_any_eq(a, b);
2370 #else
2371  vui32_t wt = { -1, -1, -1, -1};
2372  vb64_t gt_bool = vec_cmpequd (a, b);
2373  result = vec_any_eq((vui32_t)gt_bool, wt);
2374 #endif
2375  return (result);
2376 }
2377 
2395 static inline
2396 int
2397 vec_cmpud_any_ge (vui64_t a, vui64_t b)
2398 {
2399  int result;
2400 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2401  result = vec_any_ge(a, b);
2402 #else
2403  vui32_t wt = { -1, -1, -1, -1};
2404  vb64_t gt_bool = vec_cmpgeud (a, b);
2405  result = vec_any_eq((vui32_t)gt_bool, wt);
2406 #endif
2407  return (result);
2408 }
2409 
2427 static inline
2428 int
2429 vec_cmpud_any_gt (vui64_t a, vui64_t b)
2430 {
2431  int result;
2432 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2433  result = vec_any_gt(a, b);
2434 #else
2435  vui32_t wt = { -1, -1, -1, -1};
2436  vb64_t gt_bool = vec_cmpgtud (a, b);
2437  result = vec_any_eq((vui32_t)gt_bool, wt);
2438 #endif
2439  return (result);
2440 }
2441 
2459 static inline
2460 int
2461 vec_cmpud_any_le (vui64_t a, vui64_t b)
2462 {
2463  return vec_cmpud_any_ge (b, a);
2464 }
2465 
2483 static inline
2484 int
2485 vec_cmpud_any_lt (vui64_t a, vui64_t b)
2486 {
2487  return vec_cmpud_any_gt (b, a);
2488 }
2489 
2506 static inline
2507 int
2508 vec_cmpud_any_ne (vui64_t a, vui64_t b)
2509 {
2510  int result;
2511 #if defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2512  result = vec_any_ne(a, b);
2513 #else
2514  vui32_t wt = { -1, -1, -1, -1};
2515  vb64_t gt_bool = vec_cmpneud (a, b);
2516  result = vec_any_eq((vui32_t)gt_bool, wt);
2517 #endif
2518  return (result);
2519 }
2520 
2535 static inline
2536 vi64_t
2537 vec_maxsd (vi64_t vra, vi64_t vrb)
2538 {
2539  vi64_t r;
2540 
2541 #ifdef _ARCH_PWR8
2542 #if defined (vec_vmaxsd)
2543  r = vec_vmaxsd (vra, vrb);
2544 #elif defined (__clang__)
2545  r = vec_max (vra, vrb);
2546 #else
2547  __asm__(
2548  "vmaxsd %0,%1,%2;"
2549  : "=v" (r)
2550  : "v" (vra),
2551  "v" (vrb)
2552  : );
2553 #endif
2554 #else
2555  vb64_t maxmask;
2556 
2557  maxmask = vec_cmpgtsd ( vra, vrb );
2558  r = vec_sel (vrb, vra, maxmask);
2559 #endif
2560  return r;
2561 }
2562 
2577 static inline
2578 vui64_t
2579 vec_maxud (vui64_t vra, vui64_t vrb)
2580 {
2581  vui64_t r;
2582 
2583 #ifdef _ARCH_PWR8
2584 #if defined (vec_vmaxud)
2585  r = vec_vmaxud (vra, vrb);
2586 #elif defined (__clang__)
2587  r = vec_max (vra, vrb);
2588 #else
2589  __asm__(
2590  "vmaxud %0,%1,%2;"
2591  : "=v" (r)
2592  : "v" (vra),
2593  "v" (vrb)
2594  : );
2595 #endif
2596 #else
2597  vb64_t maxmask;
2598 
2599  maxmask = vec_cmpgtud ( vra, vrb );
2600  r = vec_sel (vrb, vra, maxmask);
2601 #endif
2602  return r;
2603 }
2604 
2619 static inline
2620 vi64_t
2621 vec_minsd (vi64_t vra, vi64_t vrb)
2622 {
2623  vi64_t r;
2624 
2625 #ifdef _ARCH_PWR8
2626 #if defined (vec_vminsd)
2627  r = vec_vminsd (vra, vrb);
2628 #elif defined (__clang__)
2629  r = vec_min (vra, vrb);
2630 #else
2631  __asm__(
2632  "vminsd %0,%1,%2;"
2633  : "=v" (r)
2634  : "v" (vra),
2635  "v" (vrb)
2636  : );
2637 #endif
2638 #else
2639  vb64_t minmask;
2640 
2641  minmask = vec_cmpgtsd ( vrb, vra );
2642  r = vec_sel (vrb, vra, minmask);
2643 #endif
2644  return r;
2645 }
2646 
2661 static inline
2662 vui64_t
2663 vec_minud (vui64_t vra, vui64_t vrb)
2664 {
2665  vui64_t r;
2666 
2667 #ifdef _ARCH_PWR8
2668 #if defined (vec_vminud)
2669  r = vec_vminud (vra, vrb);
2670 #elif defined (__clang__)
2671  r = vec_min (vra, vrb);
2672 #else
2673  __asm__(
2674  "vminud %0,%1,%2;"
2675  : "=v" (r)
2676  : "v" (vra),
2677  "v" (vrb)
2678  : );
2679 #endif
2680 #else
2681  vb64_t minmask;
2682 
2683  minmask = vec_cmpgtud ( vrb, vra );
2684  r = vec_sel (vrb, vra, minmask);
2685 #endif
2686  return r;
2687 }
2688 
2709 static inline vui64_t
2710 vec_mrgahd (vui128_t vra, vui128_t vrb)
2711 {
2712  return vec_permdi ((vui64_t) vra, (vui64_t) vrb, 0);
2713 }
2714 
2735 static inline vui64_t
2736 vec_mrgald (vui128_t vra, vui128_t vrb)
2737 {
2738  return vec_permdi ((vui64_t) vra, (vui64_t) vrb, 3);
2739 }
2740 
2758 static inline vui64_t
2759 vec_mrged (vui64_t __VA, vui64_t __VB)
2760 {
2761  vui64_t result;
2762  /*
2763  result[0] = __VA[0];
2764  result[1] = __VB[0];
2765  */
2766 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2767  result = vec_permdi (__VB, __VA, 3);
2768 #else
2769  result = vec_permdi (__VA, __VB, 0);
2770 #endif
2771  return (result);
2772 }
2773 
2791 static inline vui64_t
2792 vec_mrghd (vui64_t __VA, vui64_t __VB)
2793 {
2794  vui64_t result;
2795  /*
2796  result[0] = __VA[0];
2797  result[1] = __VB[0];
2798  */
2799 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2800  result = vec_permdi (__VB, __VA, 3);
2801 #else
2802  result = vec_permdi (__VA, __VB, 0);
2803 #endif
2804  return (result);
2805 }
2806 
2824 static inline vui64_t
2825 vec_mrgld (vui64_t __VA, vui64_t __VB)
2826 {
2827  vui64_t result;
2828  /*
2829  result[0] = __VA[1];
2830  result[1] = __VB[1];
2831  */
2832 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2833  result = vec_permdi (__VB, __VA, 0);
2834 #else
2835  result = vec_permdi (__VA, __VB, 3);
2836 #endif
2837 
2838  return (result);
2839 }
2840 
2858 static inline vui64_t
2859 vec_mrgod (vui64_t __VA, vui64_t __VB)
2860 {
2861  vui64_t result;
2862  /*
2863  result[0] = __VA[1];
2864  result[1] = __VB[1];
2865  */
2866 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2867  result = vec_permdi (__VB, __VA, 0);
2868 #else
2869  result = vec_permdi (__VA, __VB, 3);
2870 #endif
2871 
2872  return (result);
2873 }
2874 
2881 static inline vui128_t
2882 vec_msumudm (vui64_t a, vui64_t b, vui128_t c);
2883 
2890 static inline vui128_t
2891 vec_muleud (vui64_t a, vui64_t b);
2892 
2899 static inline vui64_t
2900 vec_mulhud (vui64_t vra, vui64_t vrb);
2901 
2908 static inline vui128_t
2909 vec_muloud (vui64_t a, vui64_t b);
2910 
2917 static inline vui64_t
2918 vec_muludm (vui64_t vra, vui64_t vrb);
2919 
2936 static inline vui64_t
2937 vec_pasted (vui64_t __VH, vui64_t __VL)
2938 {
2939  vui64_t result;
2940  /*
2941  result[1] = __VH[1];
2942  result[0] = __VL[0];
2943  */
2944  result = vec_permdi (__VH, __VL, 1);
2945 
2946  return (result);
2947 }
2948 
2982 static inline vui64_t
2983 vec_permdi (vui64_t vra, vui64_t vrb, const int ctl)
2984 {
2985  vui64_t result;
2986 #ifdef _ARCH_PWR7
2987  switch (ctl & 3)
2988  {
2989 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || (defined (__clang__) && (__clang_major__ < 7))
2990  case 0:
2991  result = vec_xxpermdi (vra, vrb, 0);
2992  break;
2993  case 1:
2994  result = vec_xxpermdi (vra, vrb, 1);
2995  break;
2996  case 2:
2997  result = vec_xxpermdi (vra, vrb, 2);
2998  break;
2999  case 3:
3000  result = vec_xxpermdi (vra, vrb, 3);
3001  break;
3002 #else
3003  case 0:
3004  result = vec_xxpermdi (vrb, vra, 3);
3005  break;
3006  case 1:
3007  result = vec_xxpermdi (vrb, vra, 1);
3008  break;
3009  case 2:
3010  result = vec_xxpermdi (vrb, vra, 2);
3011  break;
3012  case 3:
3013  result = vec_xxpermdi (vrb, vra, 0);
3014  break;
3015 #endif
3016  default:
3017  result = (vui64_t){ 0, 0 };
3018  }
3019 #else
3020  /* Current compilers don't accept vector unsigned long int as vector
3021  * parms to vec_sld, so use vector unsigned int (the vsldoi
3022  * instruction does not care). */
3023  vui32_t temp;
3024  switch (ctl & 3)
3025  {
3026  case 0:
3027  temp = vec_sld ((vui32_t) vra, (vui32_t) vra, 8);
3028  result = (vui64_t) vec_sld (temp, (vui32_t) vrb, 8);
3029  break;
3030  case 1:
3031  temp = vec_sld ((vui32_t) vrb, (vui32_t) vra, 8);
3032  result = (vui64_t) vec_sld (temp, temp, 8);
3033  break;
3034  case 2:
3035  result = (vui64_t) vec_sld ((vui32_t) vra, (vui32_t) vrb, 8);
3036  break;
3037  case 3:
3038  temp = vec_sld ((vui32_t) vrb, (vui32_t) vrb, 8);
3039  result = (vui64_t) vec_sld ((vui32_t) vra, temp, 8);
3040  break;
3041  }
3042 #endif
3043  return (result);
3044 }
3045 
3066 #ifndef vec_popcntd
3067 static inline vui64_t
3068 vec_popcntd (vui64_t vra)
3069 {
3070  vui64_t r;
3071 #ifdef _ARCH_PWR8
3072 #if defined (vec_vpopcntd)
3073  r = vec_vpopcntd (vra);
3074 #elif defined (__clang__)
3075  r = vec_popcnt (vra);
3076 #else
3077  __asm__(
3078  "vpopcntd %0,%1;"
3079  : "=v" (r)
3080  : "v" (vra)
3081  : );
3082 #endif
3083 #else
3084  vui32_t z= { 0,0,0,0};
3085  vui32_t x;
3086  x = vec_popcntw ((vui32_t) vra);
3087  r = (vui64_t) vec_vsum2sw ((vi32_t) x, (vi32_t) z);
3088 #endif
3089  return (r);
3090 }
3091 #else
3092 /* Work around for GCC PR85830. */
3093 #undef vec_popcntd
3094 #define vec_popcntd __builtin_vec_vpopcntd
3095 #endif
3096 
3111 static inline vui64_t
3112 vec_revbd (vui64_t vra)
3113 {
3114  vui64_t result;
3115 
3116 #ifdef _ARCH_PWR9
3117 #if defined (vec_revb) || defined (__clang__)
3118  result = vec_revb (vra);
3119 #else
3120  __asm__(
3121  "xxbrd %x0,%x1;"
3122  : "=wa" (result)
3123  : "wa" (vra)
3124  : );
3125 #endif
3126 #else
3127 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
3128  const vui64_t vconstp =
3129  CONST_VINT64_DW(0x0706050403020100UL, 0x0F0E0D0C0B0A0908UL);
3130 #else
3131  const vui64_t vconstp =
3132  CONST_VINT64_DW(0x08090A0B0C0D0E0FUL, 0x0001020304050607UL);
3133 #endif
3134  result = (vui64_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
3135 #endif
3136 
3137  return (result);
3138 }
3139 
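A minimal usage sketch (made-up values): bytes are reversed within each doubleword element.

  vui64_t x = { 0x0102030405060708UL, 0x1122334455667788UL };
  vui64_t r = vec_revbd (x);  /* expected { 0x0807060504030201UL, 0x8877665544332211UL } */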
3140 #ifndef vec_vsld
3141 static inline vui64_t vec_vrld (vui64_t vra, vui64_t vrb);
3142 static inline vui64_t vec_vsld (vui64_t vra, vui64_t vrb);
3143 static inline vui64_t vec_vsrd (vui64_t vra, vui64_t vrb);
3144 static inline vi64_t vec_vsrad (vi64_t vra, vui64_t vrb);
3145 #endif
3146 
3163 static inline vb64_t
3164 vec_setb_sd (vi64_t vra)
3165 {
3166  vb64_t result;
3167 
3168 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
3169  __asm__(
3170  "vexpanddm %0,%1;\n"
3171  : "=v" (result)
3172  : "v" (vra)
3173  : );
3174 #elif defined (_ARCH_PWR8)
3175  // Compare signed doubleword less than zero
3176  const vi64_t zero = {0, 0};
3177  result = vec_cmpltsd (vra, zero);
3178 #else // ARCH_PWR7 or older, without compare signed doubleword
3179  const vui8_t rshift = vec_splat_u8( 7 );
3180  const vui8_t sperm = { 0,0,0,0, 0,0,0,0, 8,8,8,8, 8,8,8,8 };
3181  // Splat the high byte of each doubleword across.
3182  vui8_t splat = vec_perm ((vui8_t) vra, (vui8_t) vra, sperm);
3183  // Vector Shift Right Algebraic Bytes 7-bits.
3184  result = (vb64_t) vec_sra (splat, rshift);
3185 #endif
3186  return result;
3187 }
3188 
3206 static inline vui64_t
3207 vec_rldi (vui64_t vra, const unsigned int shb)
3208 {
3209  vui64_t lshift;
3210  vui64_t result;
3211 
3212  if ((shb%64) != 0)
3213  {
3214  /* Load the rotate const in a vector. The element rotates require
3215  a rotate amount for each element. For the immediate form the
3216  rotate constant is splatted to all elements of the
3217  rotate control. */
3218  if (__builtin_constant_p (shb) && (shb < 16))
3219  lshift = (vui64_t) vec_splat_s32(shb);
3220  else
3221  lshift = vec_splats ((unsigned long long) shb);
3222 
3223  /* Vector Rotate Left Doubleword based on the lower 6-bits of
3224  the corresponding element of lshift. */
3225  result = vec_vrld (vra, lshift);
3226  }
3227  else
3228  { /* Rotation of 0 bits returns vra unchanged. */
3229  result = vra;
3230  }
3231 
3232  return (vui64_t) result;
3233 }
3234 
3252 static inline vui64_t
3253 vec_sldi (vui64_t vra, const unsigned int shb)
3254 {
3255  vui64_t result;
3256 
3257  if (shb < 64)
3258  {
3259  /* Load the shift const in a vector. The element shifts require
3260  a shift amount for each element. For the immediate form the
3261  shift constant is splatted to all elements of the
3262  shift control. */
3263 #ifdef _ARCH_PWR8
3264  vui64_t lshift;
3265 
3266  if (__builtin_constant_p (shb) && (shb < 16))
3267  lshift = (vui64_t) vec_splat_s32(shb);
3268  else
3269  lshift = vec_splats ((unsigned long long) shb);
3270  /* Vector Shift left doubleword from the lower 6-bits of
3271  corresponding element of lshift. */
3272  result = vec_vsld (vra, lshift);
3273 #else
3274  /*
3275  * POWER7 and earlier do not have vsld. So use the vector shift
3276  * left bit/octet instructions. But these may shift bits from
3277  * element 1 in the low bits of element 0. So generate a mask of
3278  * '1's, shifted left by the same shb and rotated into the
3279  * element 0 position.
3280  */
3281  vui8_t lshift;
3282 
3283  if (__builtin_constant_p (shb) && (shb < 16))
3284  lshift = vec_splat_u8(shb);
3285  else
3286  lshift = vec_splats ((unsigned char) shb);
3287 
3288  {
3289  vui8_t sl_a;
3290  vui8_t sl_m = (vui8_t) vec_splat_s8(-1);
3291 
3292  sl_a = ((vui8_t) vra);
3293  if (shb > 7)
3294  {
3295  /* Vector Shift Left By Octet by bits 121-124 of lshift. */
3296  sl_m = vec_slo (sl_m, lshift);
3297  sl_a = vec_slo ((vui8_t) vra, lshift);
3298  }
3299  if ((shb & 7) != 0)
3300  {
3301  /* Vector Shift Left by bits 125-127 of lshift. */
3302  sl_m = vec_sll (sl_m, lshift);
3303  sl_a = vec_sll (sl_a, lshift);
3304  }
3305  /* Rotate mask and clear low order bits of Element 0. */
3306  sl_m = vec_sld (sl_m, sl_m, 8);
3307  result = (vui64_t) vec_and (sl_a, sl_m);
3308  }
3309 #endif
3310  }
3311  else
3312  { /* shifts greater than 63 bits return zeros. */
3313  result = vec_xor ((vui64_t) vra, (vui64_t) vra);
3314  }
3315 
3316  return (vui64_t) result;
3317 }
3318 
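A minimal usage sketch (made-up values): the shift is independent per doubleword and bits shifted out of an element are lost.

  vui64_t x = { 1, 0x8000000000000000UL };
  vui64_t r = vec_sldi (x, 8);  /* expected { 0x100, 0 } */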
3333 static inline vi64_t
3334 vec_selsd (vi64_t vra, vi64_t vrb, vb64_t vrc)
3335 {
3336  return (vi64_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);
3337 }
3338 
3353 static inline vui64_t
3354 vec_selud (vui64_t vra, vui64_t vrb, vb64_t vrc)
3355 {
3356  return (vui64_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);
3357 }
3358 
3381 static inline vui64_t
3382 vec_splatd (vui64_t vra, const int ctl)
3383 {
3384  vui64_t result;
3385  switch (ctl & 1)
3386  {
3387  case 0:
3388  /*
3389  result[1] = vra[0];
3390  result[0] = vra[0];
3391  */
3392  result = vec_mrged (vra, vra);
3393  break;
3394  case 1:
3395  /*
3396  result[1] = vra[1];
3397  result[0] = vra[1];
3398  */
3399  result = vec_mrgod (vra, vra);
3400  break;
3401  }
3402 
3403  return (result);
3404 }
3405 
3428 static inline vi64_t
3429 vec_splat_s64 (const int sim)
3430 {
3431  vi64_t result;
3432 #ifdef _ARCH_PWR9
3433  result = vec_splats ((signed long long) sim);
3434 #else
3435  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))
3436  {
3437  vi32_t vwi = vec_splat_s32 (sim);
3438 
3439  if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
3440  {
3441  // Special case for -1 and 0. Skip vec_unpackl().
3442  result = (vi64_t) vwi;
3443  } else {
3444  // For P8 can use either vupklsh or vupklsw but P7 only has
3445  // vupklsh. Given the reduced range, either works here.
3446  // Unpack signed HW works here because immediate value fits
3447  // into the low HW and sign extends to high HW of each word.
3448  // Unpack will expand the low HW to low word and high HW
3449  // (sign extend) into the high word of each DW.
3450  // Unpack low/high (or endian) will not change the result.
3451 #if defined (__GNUC__) && (__GNUC__ == 8)
3452  // GCC 8 (AT12) handles this correctly.
3453  result = (vi64_t) vec_vupklsh ((vi16_t) vwi);
3454 #else
3455  // But GCC 9+ optimized the above to be a load from .rodata.
3456  // With a little register pressure it adds some gratuitous store/reloads.
3457  // So the following work-around is required.
3458  __asm__(
3459  "vupklsh %0,%1;"
3460  : "=v" (result)
3461  : "v" (vwi)
3462  : );
3463 #endif
3464  }
3465  }
3466  else
3467  result = vec_splats ((signed long long) sim);
3468 #endif
3469  return (result);
3470 }
3471 
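A minimal usage sketch: for small constants the splat is generated from vector immediates (splat plus unpack) rather than a .rodata load.

  vi64_t s5 = vec_splat_s64 (-5);  /* { -5, -5 } */
  vi64_t s0 = vec_splat_s64 (0);   /* { 0, 0 }, the special case above */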
3494 static inline vui64_t
3495 vec_splat_u64 (const int sim)
3496 {
3497  vui64_t result;
3498 #ifdef _ARCH_PWR9
3499  result = vec_splats ((unsigned long long) sim);
3500 #else
3501  if (__builtin_constant_p (sim) && ((sim >= 0) && (sim < 16)))
3502  {
3503  vui32_t vwi = vec_splat_u32 (sim);
3504 
3505  if (__builtin_constant_p (sim) && (sim == 0))
3506  {
3507  // Special case for 0. Skip vec_unpackl().
3508  result = (vui64_t) vwi;
3509  } else {
3510  // For P8 can use either vupklsh or vupklsw but P7 only has
3511  // vupklsh. Given the reduced range, either works here.
3512  // Unpack unsigned HW works here because immediate value fits
3513  // into the low HW and zero extends to high HW of each word.
3514  // Unpack will expand the low HW to low word and high HW
3515  // (zero extended) into the high word of each DW.
3516  // Unpack low/high (or endian) will not change the result.
3517 #if defined (__GNUC__) && (__GNUC__ == 8)
3518  // GCC 8 (AT12) handles this correctly.
3519  result = (vui64_t) vec_vupklsh ((vi16_t) vwi);
3520 #else
3521  // But GCC 9+ optimized the above to be a load from .rodata.
3522  // With a little register pressure it adds some gratuitous store/reloads.
3523  // So the following work-around is required.
3524  __asm__(
3525  "vupklsh %0,%1;"
3526  : "=v" (result)
3527  : "v" (vwi)
3528  : );
3529 #endif
3530  }
3531  }
3532  else
3533  result = vec_splats ((unsigned long long) sim);
3534 #endif
3535  return (result);
3536 }
3537 
3559 static inline vui64_t
3560 vec_spltd (vui64_t vra, const int ctl)
3561 {
3562  vui64_t result;
3563  /* Don't need to reverse the cases for LE because vec_permdi handles
3564  that. */
3565  switch (ctl & 1)
3566  {
3567  case 0:
3568  /*
3569  result[1] = vra[0];
3570  result[0] = vra[0];
3571  */
3572  result = vec_permdi (vra, vra, 0);
3573  break;
3574  case 1:
3575  /*
3576  result[1] = vra[1];
3577  result[0] = vra[1];
3578  */
3579  result = vec_permdi (vra, vra, 3);
3580  break;
3581  }
3582 
3583  return (result);
3584 }
3585 
3603 static inline vui64_t
3604 vec_srdi (vui64_t vra, const unsigned int shb)
3605 {
3606  vui64_t result;
3607 
3608  if (shb < 64)
3609  {
3610  /* Load the shift const in a vector. The element shifts require
3611  a shift amount for each element. For the immediate form the
3612  shift constant is splatted to all elements of the
3613  shift control. */
3614 #ifdef _ARCH_PWR8
3615  vui64_t rshift;
3616 
3617 #if defined (__GNUC__) && (__GNUC__ < 8)
3618  if (__builtin_constant_p (shb) && (shb < 16))
3619  rshift = (vui64_t) vec_splat_s32(shb);
3620  else
3621  rshift = vec_splats ((unsigned long long) shb);
3622 #else
3623  rshift = CONST_VINT128_DW (shb, shb);
3624 #endif
3625  /* Vector Shift Right Doubleword based on the lower 6-bits of
3626  the corresponding element of rshift. */
3627  result = vec_vsrd (vra, rshift);
3628 #else
3629  /*
3630  * POWER7 and earlier do not have vsrd. So use the vector shift
3631  * right bit/octet instructions. But these may shift bits from
3632  * element 0 in the high bits of element 1. So generate a mask of
3633  * '1's, shifted right by the same shb and rotated into the
3634  * element 1 position.
3635  */
3636  vui8_t rshift;
3637 
3638  if (__builtin_constant_p (shb) && (shb < 16))
3639  rshift = vec_splat_u8(shb);
3640  else
3641  rshift = vec_splats ((unsigned char) shb);
3642 
3643  {
3644  vui8_t sr_a;
3645  vui8_t sr_m = (vui8_t) vec_splat_s8(-1);
3646 
3647  sr_a = ((vui8_t) vra);
3648  if (shb > 7)
3649  {
3650  /* Vector Shift Right By Octet by bits 121-124 of rshift. */
3651  sr_m = vec_sro (sr_m, rshift);
3652  sr_a = vec_sro ((vui8_t) vra, rshift);
3653  }
3654  if ((shb & 7) != 0)
3655  {
3656  /* Vector Shift Right by bits 125-127 of rshift. */
3657  sr_m = vec_srl (sr_m, rshift);
3658  sr_a = vec_srl (sr_a, rshift);
3659  }
3660  /* Rotate mask and clear high order bits of Element 1. */
3661  sr_m = vec_sld (sr_m, sr_m, 8);
3662  result = (vui64_t) vec_and (sr_a, sr_m);
3663  }
3664 #endif
3665  }
3666  else
3667  { /* shifts greater than 63 bits return zeros. */
3668  result = vec_xor ((vui64_t) vra, (vui64_t) vra);
3669  }
3670  return (vui64_t) result;
3671 }
3672 
3691 static inline vi64_t
3692 vec_sradi (vi64_t vra, const unsigned int shb)
3693 {
3694  vui64_t rshift;
3695  vi64_t result;
3696 
3697  if (shb < 64)
3698  {
3699  /* Load the shift const in a vector. The element shifts require
3700  a shift amount for each element. For the immediate form the
3701  shift constant is splatted to all elements of the
3702  shift control. */
3703 #if defined (__GNUC__) && (__GNUC__ < 8)
3704  if (__builtin_constant_p (shb) && (shb < 16))
3705  rshift = (vui64_t) vec_splat_s32(shb);
3706  else
3707  rshift = vec_splats ((unsigned long long) shb);
3708 #else
3709  rshift = CONST_VINT128_DW (shb, shb);
3710 #endif
3711  /* Vector Shift Right Algebraic Doublewords based on the lower 6-bits
3712  of corresponding element of rshift. */
3713  result = vec_vsrad (vra, rshift);
3714  }
3715  else
3716  { /* shifts greater than 63 bits return the sign bit propagated to
3717  all bits. This is equivalent to a Shift Right Algebraic of
3718  63 bits. */
3719  rshift = (vui64_t) vec_splats(63);
3720  result = vec_vsrad (vra, rshift);
3721  }
3722 
3723  return (vi64_t) result;
3724 }
3725 
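A minimal usage sketch (made-up values): sign bits shift in from the left, per element.

  vi64_t x = { -8, 8 };
  vi64_t r = vec_sradi (x, 2);  /* expected { -2, 2 } */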
3745 static inline vui64_t
3746 vec_subudm (vui64_t a, vui64_t b)
3747 {
3748  vui32_t r;
3749 
3750 #ifdef _ARCH_PWR8
3751 #if defined (vec_vsubudm)
3752  r = (vui32_t) vec_vsubudm (a, b);
3753 #elif defined (__clang__)
3754  r = (vui32_t) vec_sub (a, b);
3755 #else
3756  __asm__(
3757  "vsubudm %0,%1,%2;"
3758  : "=v" (r)
3759  : "v" (a),
3760  "v" (b)
3761  : );
3762 #endif
3763 #else
3764  vui32_t c;
3765  vui32_t z= { 0,0,0,0};
3766  vui32_t cm= { 0,1,0,1};
3767 
3768  c = vec_vsubcuw ((vui32_t)a, (vui32_t)b);
3769  r = vec_vsubuwm ((vui32_t)a, (vui32_t)b);
3770  c = vec_andc (cm, c);
3771  c = vec_sld (c, z, 4);
3772  r = vec_vsubuwm (r, c);
3773 #endif
3774  return ((vui64_t) r);
3775 }
3776 
3788 static inline vui64_t
3789 vec_swapd (vui64_t vra)
3790 {
3791  vui64_t result;
3792  /*
3793  result[1] = vra[0];
3794  result[0] = vra[1];
3795  */
3796  result = vec_permdi (vra, vra, 2);
3797 
3798  return (result);
3799 }
3800 
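A minimal usage sketch (made-up values): the two doubleword elements simply trade places.

  vui64_t x = { 1, 2 };
  vui64_t r = vec_swapd (x);  /* expected { 2, 1 } */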
3822 static inline
3823 vui64_t
3824 vec_vgluddo (unsigned long long *array, vi64_t vra)
3825 {
3826  vui64_t rese0, rese1;
3827 
3828 #ifdef _ARCH_PWR8
3829  rese0 = vec_vlsidx (vra[VEC_DW_H], array);
3830  rese1 = vec_vlsidx (vra[VEC_DW_L], array);
3831 #else
3832  // Need to explicitly manage the VR/GPR xfer for PWR7
3833  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
3834 
3835  rese0 = vec_vlsidx (scalar_extract_uint64_from_high_uint128(gprp), array);
3836  rese1 = vec_vlsidx (scalar_extract_uint64_from_low_uint128(gprp), array);
3837 #endif
3838  return vec_permdi (rese0, rese1, 0);
3839 }
3840 
3865 static inline vui64_t
3866 vec_vgluddsx (unsigned long long *array, vi64_t vra,
3867  const unsigned char scale)
3868 {
3869  vi64_t offset;
3870 
3871  offset = (vi64_t) vec_sldi ((vui64_t) vra, (3 + scale));
3872  return vec_vgluddo (array, offset);
3873 }
3874 
3896 static inline
3897 vui64_t
3898 vec_vgluddx (unsigned long long *array, vi64_t vra)
3899 {
3900  vi64_t offset;
3901 
3902  offset = (vi64_t) vec_sldi ((vui64_t) vra, 3);
3903  return vec_vgluddo (array, offset);
3904 }
3905 
3925 static inline vui64_t
3926 vec_vgludso (unsigned long long *array, const long long offset0,
3927  const long long offset1)
3928 {
3929  vui64_t re0, re1, result;
3930 
3931  re0 = vec_vlsidx (offset0, array);
3932  re1 = vec_vlsidx (offset1, array);
3933  /* Need to handle endian as the vec_vlsidx result is always left
3934  * justified in VSR, while element [0] may be left or right. */
3935 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
3936  result = vec_permdi (re1, re0, 0);
3937 #else
3938  result = vec_permdi (re0, re1, 0);
3939 #endif
3940  return result;
3941 }
3942 
3978 static inline vui64_t
3979 vec_vlsidx (const signed long long ra, const unsigned long long *rb)
3980 {
3981  vui64_t xt;
3982 
3983 #if (defined(__clang__) && __clang_major__ < 8)
3984  __VEC_U_128 t;
3985  unsigned long long *p = (unsigned long long *)((char *)rb + ra);
3986  t.ulong.upper = *p;
3987  xt = t.vx1;
3988 #else
3989  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768)
3990  && ((ra & 3) == 0))
3991  {
3992 #if defined (_ARCH_PWR9)
3993  __asm__(
3994  "lxsd%X1 %0,%1;"
3995  : "=v" (xt)
3996  : "m" (*(unsigned long long *)((char *)rb + ra))
3997  : );
3998 #else
3999  if (ra == 0)
4000  {
4001  __asm__(
4002  "lxsdx %x0,%y1;"
4003  : "=wa" (xt)
4004  : "Z" (*rb)
4005  : );
4006  } else {
4007  unsigned long long rt;
4008 #if defined (_ARCH_PWR8)
4009  // For P8 better if li and lxsdx shared a single asm block
4010  // (enforcing consecutive instructions).
4011  // This enables instruction fusion for P8.
4012  __asm__(
4013  "li %0,%2;"
4014  "lxsdx %x1,%3,%0;"
4015  : "=&r" (rt), "=wa" (xt)
4016  : "I" (ra), "b" (rb), "Z" (*(unsigned long long *)((char *)rb+ra))
4017  : );
4018 #else // _ARCH_PWR7
4019  // This generates operationally the same code, but the
4020  // compiler may rearrange/schedule the code.
4021  __asm__(
4022  "li %0,%1;"
4023  : "=r" (rt)
4024  : "I" (ra)
4025  : );
4026  __asm__(
4027  "lxsdx %x0,%y1;"
4028  : "=wa" (xt)
4029  : "Z" (*(unsigned long long *)((char *)rb+rt))
4030  : );
4031 #endif
4032  }
4033 #endif
4034  } else {
4035  __asm__(
4036  "lxsdx %x0,%y1;"
4037  : "=wa" (xt)
4038  : "Z" (*(unsigned long long *)((char *)rb+ra))
4039  : );
4040  }
4041 #endif
4042  return xt;
4043 }
4044 
4051 static inline vui128_t
4053 
4060 static inline vui128_t
4062 
4069 static inline vui128_t
4071 
4078 static inline vui128_t
4080 
4087 static inline vui128_t
4088 vec_vmuleud (vui64_t a, vui64_t b);
4089 
4096 static inline vui128_t
4097 vec_vmuloud (vui64_t a, vui64_t b);
4098 
4105 static inline vui128_t
4106 vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c);
4107 
4114 static inline vui128_t
4115 vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c);
4116 
4136 #ifndef vec_vpkudum
4137 // May be defined as inline function for clang
4138 // But only for _ARCH_PWR8 or higher.
4139 #if !defined(__clang__) || !defined(_ARCH_PWR8)
4140 static inline vui32_t
4141 vec_vpkudum (vui64_t vra, vui64_t vrb)
4142 {
4143  vui32_t r;
4144 #ifdef _ARCH_PWR8
4145  __asm__(
4146 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4147  "vpkudum %0,%2,%1;\n"
4148 #else
4149  "vpkudum %0,%1,%2;\n"
4150 #endif
4151  : "=v" (r)
4152  : "v" (vra),
4153  "v" (vrb)
4154  : );
4155 #else
4156  const vui32_t vconstp =
4157  CONST_VINT128_W(0x04050607, 0x0c0d0e0f, 0x14151617, 0x1c1d1e1f);
4158 
4159  r = vec_perm ((vui32_t) vra, (vui32_t) vrb, (vui8_t) vconstp);
4160 #endif
4161  return (r);
4162 }
4163 #endif
4164 #endif
4165 
4183 #ifndef vec_vrld
4184 static inline vui64_t
4185 vec_vrld (vui64_t vra, vui64_t vrb)
4186 {
4187  vui64_t r;
4188 #ifdef _ARCH_PWR8
4189 #ifdef __clang__
4190  r = vec_rl (vra, vrb);
4191 #else
4192  __asm__(
4193  "vrld %0,%1,%2;"
4194  : "=v" (r)
4195  : "v" (vra),
4196  "v" (vrb)
4197  : );
4198 #endif
4199 #else
4200  vui64_t hd, ld;
4201  vui32_t t1, t2;
4202  vui8_t shh, shl;
4203 
4204  shh = vec_splat ((vui8_t) vrb, VEC_BYTE_L_DWH);
4205  shl = vec_splat ((vui8_t) vrb, VEC_BYTE_L_DWL);
4206  hd = vec_xxspltd (vra, VEC_DW_H);
4207  ld = vec_xxspltd (vra, VEC_DW_L);
4208  t1 = vec_vslo ((vui32_t)hd, shh);
4209  t2 = vec_vslo ((vui32_t)ld, shl);
4210  t1 = vec_vsl (t1, shh);
4211  t2 = vec_vsl (t2, shl);
4212  r = vec_mrghd ((vui64_t)t1, (vui64_t)t2);
4213 #endif
4214  return (r);
4215 }
4216 #endif
4217 
4236 #ifndef vec_vsld
4237 static inline vui64_t
4238 vec_vsld (vui64_t vra, vui64_t vrb)
4239 {
4240  vui64_t result;
4241 
4242 #ifdef _ARCH_PWR8
4243 #ifdef __clang__
4244  result = vec_sl (vra, vrb);
4245 #else
4246  __asm__(
4247  "vsld %0,%1,%2;"
4248  : "=v" (result)
4249  : "v" (vra),
4250  "v" (vrb)
4251  : );
4252 #endif
4253 #else
4254  vui8_t vsh_h, vsh_l;
4255  vui8_t vr_h, vr_l;
4256  vui64_t sel_mask = CONST_VINT128_DW(0, -1LL);
4257  vui64_t shft_mask = CONST_VINT128_DW(63, 63);
4258 
4259  /* constrain the dword shift amounts to 0-63. */
4260  vsh_l = vec_and ((vui8_t) vrb, (vui8_t) shft_mask);
4261  /* Isolate the high dword so that bits from the low dword
4262  * do not contaminate the result. */
4263  vr_h = vec_andc ((vui8_t) vra, (vui8_t) sel_mask);
4264  /* The low dword is just vra as the 128-bit shift left generates
4265  * '0's on the right and the final merge (vec_sel)
4266  * cleans up 64-bit overflow on the left. */
4267  vr_l = (vui8_t) vra;
4268  /* The vsl instruction only works correctly if the bit shift
4269  * value is splatted to each byte of the vector. */
4270  vsh_h = vec_splat (vsh_l, VEC_BYTE_L_DWH);
4271  vsh_l = vec_splat (vsh_l, VEC_BYTE_L_DWL);
4272  /* Shift the high dword by vsh_h. */
4273  vr_h = vec_vslo (vr_h, vsh_h);
4274  vr_h = vec_vsl (vr_h, vsh_h);
4275  /* Shift the low dword by vsh_l. */
4276  vr_l = vec_vslo (vr_l, vsh_l);
4277  vr_l = vec_vsl (vr_l, vsh_l);
4278  /* Merge the dwords after shift. */
4279  result = (vui64_t) vec_sel (vr_h, vr_l, (vui8_t) sel_mask);
4280 #endif
4281  return ((vui64_t) result);
4282 }
4283 #endif
4284 
4303 #ifndef vec_vsrad
4304 static inline vi64_t
4305 vec_vsrad (vi64_t vra, vui64_t vrb)
4306 {
4307  vi64_t result;
4308 
4309 #ifdef _ARCH_PWR8
4310 #ifdef __clang__bad
4311 // clang8/9 has code gen bug here, disabled for now
4312  result = vec_sra (vra, vrb);
4313 #else
4314  __asm__(
4315  "vsrad %0,%1,%2;"
4316  : "=v" (result)
4317  : "v" (vra),
4318  "v" (vrb)
4319  : );
4320 #endif
4321 #else
4322  vui8_t vsh_h, vsh_l;
4323  vui8_t vr_h, vr_l;
4324  vi32_t exsa;
4325  vui32_t shw31 = CONST_VINT128_W (-1, -1, -1, -1);
4326  vui64_t exsah, exsal;
4327  vui64_t shft_mask = CONST_VINT128_DW(63, 63);
4328 
4329  /* Need to extend each signed long int to __int128. So the unsigned
4330  * (128-bit) shift right behaves as an arithmetic (64-bit) shift. */
4331  exsa = vec_vsraw ((vi32_t) vra, shw31);
4332  exsah = (vui64_t) vec_vmrghw (exsa, exsa);
4333  exsal = (vui64_t) vec_vmrglw (exsa, exsa);
4334  /* constrain the dword shift amounts to 0-63. */
4335  vsh_l = vec_and ((vui8_t) vrb, (vui8_t) shft_mask);
4336  /* The vsr instruction only works correctly if the bit shift
4337  * value is splatted to each byte of the vector. */
4338  vsh_h = vec_splat (vsh_l, VEC_BYTE_L_DWH);
4339  vsh_l = vec_splat (vsh_l, VEC_BYTE_L_DWL);
4340  /* Merge the extended sign with high dword. */
4341  exsah = vec_mrghd (exsah, (vui64_t) vra);
4342  /* Shift the high dword by vsh_h. */
4343  vr_h = vec_vsro ((vui8_t) exsah, vsh_h);
4344  vr_h = vec_vsr (vr_h, vsh_h);
4345  /* Merge the extended sign with the low dword. */
4346  exsal = vec_pasted (exsal, (vui64_t) vra);
4347  /* Shift the low dword by vsh_l. */
4348  vr_l = vec_vsro ((vui8_t) exsal, vsh_l);
4349  vr_l = vec_vsr (vr_l, vsh_l);
4350  /* Merge the dwords after shift. */
4351  result = (vi64_t) vec_mrgld ((vui64_t) vr_h, (vui64_t) vr_l);
4352 #endif
4353  return ((vi64_t) result);
4354 }
4355 #endif
4356 
4375 #ifndef vec_vsrd
4376 static inline vui64_t
4377 vec_vsrd (vui64_t vra, vui64_t vrb)
4378 {
4379  vui64_t result;
4380 
4381 #ifdef _ARCH_PWR8
4382 #ifdef __clang__
4383  result = vec_sr (vra, vrb);
4384 #else
4385  __asm__(
4386  "vsrd %0,%1,%2;"
4387  : "=v" (result)
4388  : "v" (vra),
4389  "v" (vrb)
4390  : );
4391 #endif
4392 #else
4393  vui8_t vsh_h, vsh_l;
4394  vui8_t vr_h, vr_l;
4395  vui64_t sel_mask = CONST_VINT128_DW(0, -1LL);
4396  vui64_t shft_mask = CONST_VINT128_DW(63, 63);
4397 
4398  /* constrain the dword shift amounts to 0-63. */
4399  vsh_l = vec_and ((vui8_t) vrb, (vui8_t) shft_mask);
4400  /* Isolate the low dword so that bits from the high dword,
4401  * do not contaminate the result. */
4402  vr_l = vec_and ((vui8_t) vra, (vui8_t) sel_mask);
4403  /* The vsr instruction only works correctly if the bit shift
4404  * value is splatted to each byte of the vector. */
4405  vsh_h = vec_splat (vsh_l, VEC_BYTE_L_DWH);
4406  vsh_l = vec_splat (vsh_l, VEC_BYTE_L_DWL);
4407  /* Shift the high dword by vsh_h. */
4408  vr_h = vec_vsro ((vui8_t) vra, vsh_h);
4409  vr_h = vec_vsr (vr_h, vsh_h);
4410  /* Shift the low dword by vsh_l. */
4411  vr_l = vec_vsro (vr_l, vsh_l);
4412  vr_l = vec_vsr (vr_l, vsh_l);
4413  /* Merge the dwords after shift. */
4414  result = (vui64_t) vec_sel (vr_h, vr_l, (vui8_t) sel_mask);
4415 #endif
4416  return ((vui64_t) result);
4417 }
4418 #endif
4419 
4436 static inline void
4437 vec_vsstuddo (vui64_t xs, unsigned long long *array, vi64_t vra)
4438 {
4439  vui64_t xs1;
4440 
4441  xs1 = vec_xxspltd (xs, 1);
4442 #ifdef _ARCH_PWR8
4443  vec_vstsidx (xs, vra[VEC_DW_H], array);
4444  vec_vstsidx (xs1, vra[VEC_DW_L], array);
4445 #else
4446  // Need to explicitly manage the VR/GPR xfer for PWR7
4447  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
4448  vec_vstsidx (xs, scalar_extract_uint64_from_high_uint128(gprp), array);
4449  vec_vstsidx (xs1, scalar_extract_uint64_from_low_uint128(gprp), array);
4450 #endif
4451 }
4452 
4472 static inline void
4473 vec_vsstuddsx (vui64_t xs, unsigned long long *array,
4474  vi64_t vra, const unsigned char scale)
4475 {
4476  vi64_t offset;
4477 
4478  offset = (vi64_t) vec_sldi ((vui64_t) vra, (3 + scale));
4479  vec_vsstuddo (xs, array, offset);
4480 }
4481 
4498 static inline void
4499 vec_vsstuddx (vui64_t xs, unsigned long long *array,
4500  vi64_t vra)
4501 {
4502  vi64_t offset;
4503 
4504  offset = (vi64_t) vec_sldi ((vui64_t) vra, 3);
4505  vec_vsstuddo (xs, array, offset);
4506 }
4507 
4525 static inline void
4526 vec_vsstudso (vui64_t xs, unsigned long long *array,
4527  const long long offset0, const long long offset1)
4528 {
4529  vui64_t xs1;
4530 
4531  xs1 = vec_xxspltd (xs, 1);
4532  /* Need to handle endian as vec_vstsidx always stores the left side
4533  * of the VR, while element [0] may be in the left or right dword. */
4534 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
4535  vec_vstsidx (xs, offset1, array);
4536  vec_vstsidx (xs1, offset0, array);
4537 #else
4538  vec_vstsidx (xs, offset0, array);
4539  vec_vstsidx (xs1, offset1, array);
4540 #endif
4541 }
4542 
4570 static inline void
4571 vec_vstsidx (vui64_t xs, const signed long long ra, unsigned long long *rb)
4572 {
4573 #if (defined(__clang__) && __clang_major__ < 8)
4574  __VEC_U_128 t;
4575  unsigned long long *p = (unsigned long long *)((char *)rb + ra);
4576  t.vx1 = xs;
4577  *p = t.ulong.upper;
4578 #else
4579  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768)
4580  && ((ra & 3) == 0))
4581  {
4582 #if defined (_ARCH_PWR9)
4583  __asm__(
4584  "stxsd%X0 %1,%0;"
4585  : "=m" (*((char *)rb + ra))
4586  : "v" (xs)
4587  : );
4588 #else
4589  if (ra == 0)
4590  {
4591  __asm__(
4592  "stxsdx %x1,%y0;"
4593  : "=Z" (*rb)
4594  : "wa" (xs)
4595  : );
4596  } else {
4597  unsigned long long rt;
4598  __asm__(
4599  "li %0,%1;"
4600  : "=r" (rt)
4601  : "I" (ra)
4602  : );
4603  __asm__(
4604  "stxsdx %x1,%y0;"
4605  : "=Z" (*((char *)rb+rt))
4606  : "wa" (xs)
4607  : );
4608  }
4609 #endif
4610  } else {
4611  __asm__(
4612  "stxsdx %x1,%y0;"
4613  : "=Z" (*((char *)rb+ra))
4614  : "wa" (xs)
4615  : );
4616  }
4617 #endif
4618 }
4619 
4646 static inline vui64_t
4647 vec_xxspltd (vui64_t vra, const int ctl)
4648 {
4649  vui64_t result;
4650  /* Don't need to reverse the cases for LE because vec_permdi handles
4651  that. */
4652  switch (ctl & 1)
4653  {
4654  case 0:
4655  result = vec_permdi (vra, vra, 0);
4656  break;
4657  case 1:
4658  result = vec_permdi (vra, vra, 3);
4659  break;
4660  }
4661 
4662  return (result);
4663 }
4664 
4687 static inline vui64_t
4688 vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
4689 {
4690  const vui32_t zero = { 0, 0, 0, 0 };
4691  vui64_t res;
4692  vui32_t c_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) c);
4693  res = vec_vmuleuw (a, b);
4694  return vec_addudm (res, (vui64_t) c_euw);
4695 }
4696 
4722 static inline vui64_t
4723 vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
4724 {
4725  const vui32_t zero = { 0, 0, 0, 0 };
4726  vui64_t res, sum;
4727  vui32_t c_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) c);
4728  vui32_t d_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) d);
4729  res = vec_vmuleuw (a, b);
4730  sum = vec_addudm ( (vui64_t) c_euw, (vui64_t) d_euw);
4731  return vec_addudm (res, sum);
4732 }
4733 
4756 static inline vui64_t
4757 vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c)
4758 {
4759  const vui32_t zero = { 0, 0, 0, 0 };
4760  vui64_t res;
4761  vui32_t c_ouw = vec_mrgalw ((vui64_t) zero, (vui64_t) c);
4762  res = vec_vmulouw (a, b);
4763  return vec_addudm (res, (vui64_t) c_ouw);
4764 }
4765 
4791 static inline vui64_t
4792 vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
4793 {
4794  const vui32_t zero = { 0, 0, 0, 0 };
4795  vui64_t res, sum;
4796  vui32_t c_ouw = vec_mrgalw ((vui64_t) zero, (vui64_t) c);
4797  vui32_t d_ouw = vec_mrgalw ((vui64_t) zero, (vui64_t) d);
4798  res = vec_vmulouw (a, b);
4799  sum = vec_addudm ((vui64_t) c_ouw, (vui64_t) d_ouw);
4800  return vec_addudm (res, sum);
4801 }
4802 
4828 static inline vui64_t
4829 vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
4830 {
4831  vui64_t peven, podd, psum;
4832 
4833  peven = vec_muleuw (vra, vrb);
4834  podd = vec_mulouw (vra, vrb);
4835  psum = vec_addudm (peven, podd);
4836 
4837  return vec_addudm (psum, vrc);
4838 }
4839 
4840 #endif /* VEC_INT64_PPC_H_ */
vec_cmpud_all_ne
static int vec_cmpud_all_ne(vui64_t a, vui64_t b)
Vector Compare all Not Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2334
vec_vmsumoud
static vui128_t vec_vmsumoud(vui64_t a, vui64_t b, vui128_t c)
Vector Multiply-Sum Odd Unsigned Doublewords.
vec_mrgahw
static vui32_t vec_mrgahw(vui64_t vra, vui64_t vrb)
Vector Merge Algebraic High Words.
Definition: vec_int32_ppc.h:653
vec_vmadd2euw
static vui64_t vec_vmadd2euw(vui32_t a, vui32_t b, vui32_t c, vui32_t d)
Vector Multiply-Add2 Even Unsigned Words.
Definition: vec_int64_ppc.h:4723
vec_vgluddo
static vui64_t vec_vgluddo(unsigned long long *array, vi64_t vra)
Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets.
Definition: vec_int64_ppc.h:3824
vec_cmpud_all_le
static int vec_cmpud_all_le(vui64_t a, vui64_t b)
Vector Compare all Less than equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2287
scalar_extract_uint64_from_high_uint128
static unsigned long long scalar_extract_uint64_from_high_uint128(unsigned __int128 gprp)
Extract the high doubleword from a __int128 scalar.
Definition: vec_common_ppc.h:503
vec_vmaddouw
static vui64_t vec_vmaddouw(vui32_t a, vui32_t b, vui32_t c)
Vector Multiply-Add Odd Unsigned Words.
Definition: vec_int64_ppc.h:4757
vec_muleuw
static vui64_t vec_muleuw(vui32_t a, vui32_t b)
Vector multiply even unsigned words.
Definition: vec_int32_ppc.h:1007
vec_vmuloud
static vui128_t vec_vmuloud(vui64_t a, vui64_t b)
Vector Multiply Odd Unsigned Doublewords.
vec_vmsumeud
static vui128_t vec_vmsumeud(vui64_t a, vui64_t b, vui128_t c)
Vector Multiply-Sum Even Unsigned Doublewords.
vec_xxspltd
static vui64_t vec_xxspltd(vui64_t vra, const int ctl)
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of ...
Definition: vec_int64_ppc.h:4647
vec_vgluddsx
static vui64_t vec_vgluddsx(unsigned long long *array, vi64_t vra, const unsigned char scale)
Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes.
Definition: vec_int64_ppc.h:3866
vec_cmpud_any_ge
static int vec_cmpud_any_ge(vui64_t a, vui64_t b)
Vector Compare any Greater Than or Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2397
vec_vpkudum
static vui32_t vec_vpkudum(vui64_t vra, vui64_t vrb)
Vector Pack Unsigned Doubleword Unsigned Modulo.
Definition: vec_int64_ppc.h:4141
vec_muleud
static vui128_t vec_muleud(vui64_t a, vui64_t b)
Vector Multiply Even Unsigned Doublewords.
vec_vmaddeuw
static vui64_t vec_vmaddeuw(vui32_t a, vui32_t b, vui32_t c)
Vector Multiply-Add Even Unsigned Words.
Definition: vec_int64_ppc.h:4688
vec_cmpud_any_le
static int vec_cmpud_any_le(vui64_t a, vui64_t b)
Vector Compare any Less than equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2461
vec_cmpsd_any_eq
static int vec_cmpsd_any_eq(vi64_t a, vi64_t b)
Vector Compare any Equal Signed Doubleword.
Definition: vec_int64_ppc.h:2019
vec_vstsidx
static void vec_vstsidx(vui64_t xs, const signed long long ra, unsigned long long *rb)
Vector Store Scalar Integer Doubleword Indexed.
Definition: vec_int64_ppc.h:4571
vec_cmpud_all_lt
static int vec_cmpud_all_lt(vui64_t a, vui64_t b)
Vector Compare all Less than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2311
vec_minud
static vui64_t vec_minud(vui64_t vra, vui64_t vrb)
Vector Minimum Unsigned Doubleword.
Definition: vec_int64_ppc.h:2663
vec_vmuleud
static vui128_t vec_vmuleud(vui64_t a, vui64_t b)
Vector Multiply Even Unsigned Doublewords.
CONST_VINT128_W
#define CONST_VINT128_W(__w0, __w1, __w2, __w3)
Arrange word elements of a unsigned int initializer in high->low order. May require an explicit cast.
Definition: vec_common_ppc.h:304
vec_cmpsd_all_ge
static int vec_cmpsd_all_ge(vi64_t a, vi64_t b)
Vector Compare all Greater Than or Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1877
vec_vsstuddsx
static void vec_vsstuddsx(vui64_t xs, unsigned long long *array, vi64_t vra, const unsigned char scale)
Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes.
Definition: vec_int64_ppc.h:4473
vec_selsd
static vi64_t vec_selsd(vi64_t vra, vi64_t vrb, vb64_t vrc)
Vector Select Signed Doubleword.
Definition: vec_int64_ppc.h:3334
CONST_VINT64_DW
#define CONST_VINT64_DW(__dw0, __dw1)
Arrange elements of dword initializer in high->low order.
Definition: vec_common_ppc.h:295
vec_transfer_vui128t_to_uint128
static unsigned __int128 vec_transfer_vui128t_to_uint128(vui128_t vra)
Transfer a vector unsigned __int128 to __int128 scalar.
Definition: vec_common_ppc.h:420
vec_srdi
static vui64_t vec_srdi(vui64_t vra, const unsigned int shb)
Vector Shift Right Doubleword Immediate.
Definition: vec_int64_ppc.h:3604
vec_mrghd
static vui64_t vec_mrghd(vui64_t __VA, vui64_t __VB)
Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and l...
Definition: vec_int64_ppc.h:2792
vec_cmpgesd
static vb64_t vec_cmpgesd(vi64_t a, vi64_t b)
Vector Compare Greater Than or Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1509
vec_popcntw
static vui32_t vec_popcntw(vui32_t vra)
Vector Population Count word.
Definition: vec_int32_ppc.h:1184
vec_mrgld
static vui64_t vec_mrgld(vui64_t __VA, vui64_t __VB)
Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low...
Definition: vec_int64_ppc.h:2825
vec_vsstudso
static void vec_vsstudso(vui64_t xs, unsigned long long *array, const long long offset0, const long long offset1)
Vector Scatter-Store Integer Doublewords to Scalar Offsets.
Definition: vec_int64_ppc.h:4526
vec_cmpud_all_eq
static int vec_cmpud_all_eq(vui64_t a, vui64_t b)
Vector Compare all Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2193
VEC_BYTE_L_DWL
#define VEC_BYTE_L_DWL
Element index for lowest order byte of the low dword.
Definition: vec_common_ppc.h:348
vui64_t
__vector unsigned long long vui64_t
vector of 64-bit unsigned long long elements.
Definition: vec_common_ppc.h:208
vec_rldi
static vui64_t vec_rldi(vui64_t vra, const unsigned int shb)
Vector Rotate left Doubleword Immediate.
Definition: vec_int64_ppc.h:3207
vec_splatd
static vui64_t vec_splatd(vui64_t vra, const int ctl)
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of ...
Definition: vec_int64_ppc.h:3382
vec_vmuleuw
static vui64_t vec_vmuleuw(vui32_t vra, vui32_t vrb)
Vector Multiply Even Unsigned words.
Definition: vec_int32_ppc.h:2237
vec_sldi
static vui64_t vec_sldi(vui64_t vra, const unsigned int shb)
Vector Shift left Doubleword Immediate.
Definition: vec_int64_ppc.h:3253
vui8_t
__vector unsigned char vui8_t
vector of 8-bit unsigned char elements.
Definition: vec_common_ppc.h:202
vi32_t
__vector int vi32_t
vector of 32-bit signed int elements.
Definition: vec_common_ppc.h:215
vec_cmpud_all_gt
static int vec_cmpud_all_gt(vui64_t a, vui64_t b)
Vector Compare all Greater Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2255
vec_vmulouw
static vui64_t vec_vmulouw(vui32_t vra, vui32_t vrb)
Vector Multiply Odd Unsigned Words.
Definition: vec_int32_ppc.h:2340
vec_vsrad
static vi64_t vec_vsrad(vi64_t vra, vui64_t vrb)
Vector Shift Right Algebraic Doubleword.
Definition: vec_int64_ppc.h:4305
vec_spltd
static vui64_t vec_spltd(vui64_t vra, const int ctl)
Definition: vec_int64_ppc.h:3560
vec_cmpud_any_lt
static int vec_cmpud_any_lt(vui64_t a, vui64_t b)
Vector Compare any Less than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2485
vec_subudm
static vui64_t vec_subudm(vui64_t a, vui64_t b)
Vector Subtract Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:3746
vec_vrld
static vui64_t vec_vrld(vui64_t vra, vui64_t vrb)
Vector Rotate Left Doubleword.
Definition: vec_int64_ppc.h:4185
vec_vsld
static vui64_t vec_vsld(vui64_t vra, vui64_t vrb)
Vector Shift Left Doubleword.
Definition: vec_int64_ppc.h:4238
vec_muloud
static vui128_t vec_muloud(vui64_t a, vui64_t b)
Vector Multiply Odd Unsigned Doublewords.
vec_minsd
static vi64_t vec_minsd(vi64_t vra, vi64_t vrb)
Vector Minimum Signed Doubleword.
Definition: vec_int64_ppc.h:2621
__VEC_U_128::ulong
struct __VEC_U_128::@0 ulong
Struct of two unsigned long int (64-bit GPR) fields.
vec_vsstuddx
static void vec_vsstuddx(vui64_t xs, unsigned long long *array, vi64_t vra)
Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes.
Definition: vec_int64_ppc.h:4499
vec_mrgod
static vui64_t vec_mrgod(vui64_t __VA, vui64_t __VB)
Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low...
Definition: vec_int64_ppc.h:2859
vec_cmpsd_any_ge
static int vec_cmpsd_any_ge(vi64_t a, vi64_t b)
Vector Compare any Greater Than or Equal Signed Doubleword.
Definition: vec_int64_ppc.h:2051
vec_maxud
static vui64_t vec_maxud(vui64_t vra, vui64_t vrb)
Vector Maximum Unsigned Doubleword.
Definition: vec_int64_ppc.h:2579
vec_cmpsd_any_ne
static int vec_cmpsd_any_ne(vi64_t a, vi64_t b)
Vector Compare any Not Equal Signed Doubleword.
Definition: vec_int64_ppc.h:2162
vec_mrgalw
static vui32_t vec_mrgalw(vui64_t vra, vui64_t vrb)
Vector Merge Algebraic Low Words.
Definition: vec_int32_ppc.h:703
vec_msumudm
static vui128_t vec_msumudm(vui64_t a, vui64_t b, vui128_t c)
Vector Multiply-Sum Unsigned Doubleword Modulo.
vec_cmpnesd
static vb64_t vec_cmpnesd(vi64_t a, vi64_t b)
Vector Compare Not Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1796
vec_cmpud_any_eq
static int vec_cmpud_any_eq(vui64_t a, vui64_t b)
Vector Compare any Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2365
vec_vlsidx
static vui64_t vec_vlsidx(const signed long long ra, const unsigned long long *rb)
Vector Load Scalar Integer Doubleword Indexed.
Definition: vec_int64_ppc.h:3979
vec_cmpneud
static vb64_t vec_cmpneud(vui64_t a, vui64_t b)
Vector Compare Not Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:1821
vec_mulhud
static vui64_t vec_mulhud(vui64_t vra, vui64_t vrb)
Vector Multiply High Unsigned Doubleword.
vec_vsum2sw
static vi32_t vec_vsum2sw(vi32_t vra, vi32_t vrb)
Vector Sum-across Half Signed Word Saturate.
Definition: vec_int32_ppc.h:2848
__VEC_U_128::vx1
vui128_t vx1
128-bit vector of 1 unsigned __int128 element.
Definition: vec_common_ppc.h:277
vec_maxsd
static vi64_t vec_maxsd(vi64_t vra, vi64_t vrb)
Vector Maximum Signed Doubleword.
Definition: vec_int64_ppc.h:2537
vui128_t
__vector unsigned __int128 vui128_t
vector of one 128-bit unsigned __int128 element.
Definition: vec_common_ppc.h:237
vb64_t
__vector __bool long long vb64_t
vector of 64-bit bool long long elements.
Definition: vec_common_ppc.h:230
vec_cmplesd
static vb64_t vec_cmplesd(vi64_t a, vi64_t b)
Vector Compare Less Than Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1689
vec_vgludso
static vui64_t vec_vgludso(unsigned long long *array, const long long offset0, const long long offset1)
Vector Gather-Load Integer Doublewords from Scalar Offsets.
Definition: vec_int64_ppc.h:3926
vec_cmpeqsd
static vb64_t vec_cmpeqsd(vi64_t a, vi64_t b)
Vector Compare Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1422
vec_permdi
static vui64_t vec_permdi(vui64_t vra, vui64_t vrb, const int ctl)
Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a d...
Definition: vec_int64_ppc.h:2983
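A hedged sketch of vec_permdi; it assumes ctl follows the xxpermdi DM encoding (high control bit selects vra's doubleword, low control bit selects vrb's), which should be verified against the full operation description before relying on it:

#include <pveclib/vec_int64_ppc.h>

/* Concatenate the high doubleword of vra with the low doubleword of
   vrb.  Assumes ctl = 1 selects { vra[0], vrb[1] } per the xxpermdi
   DM-field convention; vec_pasted provides this combination directly.  */
static vui64_t
example_permdi_paste (vui64_t vra, vui64_t vrb)
{
  return vec_permdi (vra, vrb, 1);
}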
vec_int32_ppc.h
Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements.
__VEC_U_128
Union used to transfer 128-bit data between vector and non-vector types.
Definition: vec_common_ppc.h:256
vec_clzd
static vui64_t vec_clzd(vui64_t vra)
Vector Count Leading Zeros Doubleword for unsigned long long elements.
Definition: vec_int64_ppc.h:1313
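A sketch using the leading-zero count to compute a per-element integer log2; the helper name and the zero-element caveat are illustrative:

#include <pveclib/vec_int64_ppc.h>

/* floor(log2(x)) for each nonzero doubleword element, computed as
   63 - clz(x).  A zero element yields 63 - 64, which wraps to
   2**64 - 1, so callers should screen out zeros first.  */
static vui64_t
example_ilog2 (vui64_t vra)
{
  vui64_t c63 = CONST_VINT64_DW (63, 63);
  return vec_subudm (c63, vec_clzd (vra));
}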
vi64_t
__vector long long vi64_t
vector of 64-bit signed long long elements.
Definition: vec_common_ppc.h:217
CONST_VINT128_DW
#define CONST_VINT128_DW(__dw0, __dw1)
Initializer for a 128-bit vector, as two unsigned long long elements in high->low order....
Definition: vec_common_ppc.h:298
vec_mulouw
static vui64_t vec_mulouw(vui32_t a, vui32_t b)
Vector multiply odd unsigned words.
Definition: vec_int32_ppc.h:1043
vec_vsrd
static vui64_t vec_vsrd(vui64_t vra, vui64_t vrb)
Vector Shift Right Doubleword.
Definition: vec_int64_ppc.h:4377
vec_cmpgtsd
static vb64_t vec_cmpgtsd(vi64_t a, vi64_t b)
Vector Compare Greater Than Signed Doubleword.
Definition: vec_int64_ppc.h:1571
VEC_DW_L
#define VEC_DW_L
Element index for low order dword.
Definition: vec_common_ppc.h:324
vec_revbd
static vui64_t vec_revbd(vui64_t vra)
Byte reverse each doubleword for a vector unsigned long int.
Definition: vec_int64_ppc.h:3112
vec_selud
static vui64_t vec_selud(vui64_t vra, vui64_t vrb, vb64_t vrc)
Vector Select Unsigned Doubleword.
Definition: vec_int64_ppc.h:3354
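A sketch composing compare and select into a branchless per-element maximum; it assumes vec_selud follows the usual vec_sel convention (result bits come from vrb where the mask vrc is 1). pveclib's vec_maxud already provides this result directly; the point is only to show how the operations compose:

#include <pveclib/vec_int64_ppc.h>

/* Per-element unsigned maximum built from compare + select.  */
static vui64_t
example_max_by_select (vui64_t a, vui64_t b)
{
  vb64_t a_gt_b = vec_cmpgtud (a, b);   /* all-ones where a > b  */
  return vec_selud (b, a, a_gt_b);      /* take a where a > b    */
}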
vec_cmpsd_any_gt
static int vec_cmpsd_any_gt(vi64_t a, vi64_t b)
Vector Compare any Greater Than Signed Doubleword.
Definition: vec_int64_ppc.h:2083
vec_pasted
static vui64_t vec_pasted(vui64_t __VH, vui64_t __VL)
Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low doubleword o...
Definition: vec_int64_ppc.h:2937
vec_cmpud_any_ne
static int vec_cmpud_any_ne(vui64_t a, vui64_t b)
Vector Compare any Not Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2508
vec_mrgald
static vui64_t vec_mrgald(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic Low Doublewords.
Definition: vec_int64_ppc.h:2736
vec_vmadd2eud
static vui128_t vec_vmadd2eud(vui64_t a, vui64_t b, vui64_t c, vui64_t d)
Vector Multiply-Add2 Even Unsigned Doublewords.
vec_cmpsd_all_ne
static int vec_cmpsd_all_ne(vi64_t a, vi64_t b)
Vector Compare all Not Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1988
vec_mrgahd
static vui64_t vec_mrgahd(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic High Doublewords.
Definition: vec_int64_ppc.h:2710
vec_cmpsd_all_le
static int vec_cmpsd_all_le(vi64_t a, vi64_t b)
Vector Compare all Less than equal Signed Doubleword.
Definition: vec_int64_ppc.h:1941
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206
vec_splat_u64
static vui64_t vec_splat_u64(const int sim)
Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword...
Definition: vec_int64_ppc.h:3495
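A minimal sketch of generating a small vector constant without a memory load; it assumes the value 1 is within the range of compile-time immediates vec_splat_u64 supports:

#include <pveclib/vec_int64_ppc.h>

/* Generate the doubleword constant { 1, 1 } in registers.  */
static vui64_t
example_splat_one (void)
{
  return vec_splat_u64 (1);
}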
vec_mrged
static vui64_t vec_mrged(vui64_t __VA, vui64_t __VB)
Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and l...
Definition: vec_int64_ppc.h:2759
vec_setb_sd
static vb64_t vec_setb_sd(vi64_t vra)
Vector Set Bool from Signed Doubleword.
Definition: vec_int64_ppc.h:3164
vec_vmaddeud
static vui128_t vec_vmaddeud(vui64_t a, vui64_t b, vui64_t c)
Vector Multiply-Add Even Unsigned Doublewords.
vec_cmpltud
static vb64_t vec_cmpltud(vui64_t a, vui64_t b)
Vector Compare Less Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:1771
vec_cmpsd_all_eq
static int vec_cmpsd_all_eq(vi64_t a, vi64_t b)
Vector Compare all Equal Signed Doubleword.
Definition: vec_int64_ppc.h:1847
scalar_extract_uint64_from_low_uint128
static unsigned long long scalar_extract_uint64_from_low_uint128(unsigned __int128 gprp)
Extract the low doubleword from an unsigned __int128 scalar.
Definition: vec_common_ppc.h:490
vi16_t
__vector short vi16_t
vector of 16-bit signed short elements.
Definition: vec_common_ppc.h:213
vec_cmpud_all_ge
static int vec_cmpud_all_ge(vui64_t a, vui64_t b)
Vector Compare all Greater Than or Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2223
vec_splat_s64
static vi64_t vec_splat_s64(const int sim)
Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword ele...
Definition: vec_int64_ppc.h:3429
vec_vmsumuwm
static vui64_t vec_vmsumuwm(vui32_t vra, vui32_t vrb, vui64_t vrc)
Vector Multiply-Sum Unsigned Word Modulo.
Definition: vec_int64_ppc.h:4829
vec_cmpgtud
static vb64_t vec_cmpgtud(vui64_t a, vui64_t b)
Vector Compare Greater Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:1622
vec_vsstuddo
static void vec_vsstuddo(vui64_t xs, unsigned long long *array, vi64_t vra)
Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets.
Definition: vec_int64_ppc.h:4437
vec_addudm
static vui64_t vec_addudm(vui64_t a, vui64_t b)
Vector Add Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:1261
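A short sketch of modulo doubleword add and subtract together (vec_subudm also appears in this index); the round-trip framing is illustrative:

#include <pveclib/vec_int64_ppc.h>

/* (a + b) - b per doubleword element; both operations wrap
   modulo 2**64, so the result equals a for every element.  */
static vui64_t
example_add_sub_roundtrip (vui64_t a, vui64_t b)
{
  vui64_t sum = vec_addudm (a, b);
  return vec_subudm (sum, b);
}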
vec_vmadd2oud
static vui128_t vec_vmadd2oud(vui64_t a, vui64_t b, vui64_t c, vui64_t d)
Vector Multiply-Add2 Odd Unsigned Doublewords.
vec_absdud
static vui64_t vec_absdud(vui64_t vra, vui64_t vrb)
Vector Absolute Difference Unsigned Doubleword.
Definition: vec_int64_ppc.h:1241
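A minimal sketch contrasting the absolute difference with a plain modulo subtract; the wrapper is illustrative:

#include <pveclib/vec_int64_ppc.h>

/* |a - b| per unsigned doubleword element.  Unlike vec_subudm (a, b),
   which wraps modulo 2**64 when b > a, this always returns the
   unsigned distance between the two elements.  */
static vui64_t
example_abs_diff (vui64_t a, vui64_t b)
{
  return vec_absdud (a, b);
}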
vec_cmpsd_all_gt
static int vec_cmpsd_all_gt(vi64_t a, vi64_t b)
Vector Compare all Greater Than Signed Doubleword.
Definition: vec_int64_ppc.h:1909
vec_popcntd
static vui64_t vec_popcntd(vui64_t vra)
Vector Population Count Doubleword.
Definition: vec_int64_ppc.h:3068
vec_cmpltsd
static vb64_t vec_cmpltsd(vi64_t a, vi64_t b)
Vector Compare Less Than Signed Doubleword.
Definition: vec_int64_ppc.h:1746
vec_cmpsd_all_lt
static int vec_cmpsd_all_lt(vi64_t a, vi64_t b)
Vector Compare all Less than Signed Doubleword.
Definition: vec_int64_ppc.h:1965
vec_swapd
static vui64_t vec_swapd(vui64_t vra)
Vector doubleword swap. Exchange the high and low doubleword elements of a vector.
Definition: vec_int64_ppc.h:3789
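A sketch using the doubleword swap for a horizontal (across-element) sum; the helper name is illustrative:

#include <pveclib/vec_int64_ppc.h>

/* After the add, both doubleword elements of the result hold
   vra[0] + vra[1] (modulo 2**64).  */
static vui64_t
example_horizontal_sum (vui64_t vra)
{
  return vec_addudm (vra, vec_swapd (vra));
}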
vec_sradi
static vi64_t vec_sradi(vi64_t vra, const unsigned int shb)
Vector Shift Right Algebraic Doubleword Immediate.
Definition: vec_int64_ppc.h:3692
vec_vgluddx
static vui64_t vec_vgluddx(unsigned long long *array, vi64_t vra)
Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes.
Definition: vec_int64_ppc.h:3898
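A hedged sketch of the gather-load by vector indexes; it assumes the doubleword "indexes" in vra are element indexes into array (scaled to byte offsets internally), which should be confirmed against the full operation description:

#include <pveclib/vec_int64_ppc.h>

/* Gather two non-contiguous doublewords, array[2] and array[5],
   into one vector.  The index values illustrate the element-index
   interpretation assumed above.  */
static vui64_t
example_gather (unsigned long long *array)
{
  vi64_t ix = CONST_VINT64_DW (2, 5);
  return vec_vgluddx (array, ix);
}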
vec_ctzd
static vui64_t vec_ctzd(vui64_t vra)
Vector Count Trailing Zeros Doubleword for unsigned long long elements.
Definition: vec_int64_ppc.h:1371
vec_clzw
static vui32_t vec_clzw(vui32_t vra)
Vector Count Leading Zeros Word.
Definition: vec_int32_ppc.h:503
vec_vmaddoud
static vui128_t vec_vmaddoud(vui64_t a, vui64_t b, vui64_t c)
Vector Multiply-Add Odd Unsigned Doublewords.
vec_cmpleud
static vb64_t vec_cmpleud(vui64_t a, vui64_t b)
Vector Compare Less Than Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:1718
vec_muludm
static vui64_t vec_muludm(vui64_t vra, vui64_t vrb)
Vector Multiply Unsigned Doubleword Modulo.
vec_cmpequd
static vb64_t vec_cmpequd(vui64_t a, vui64_t b)
Vector Compare Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:1451
vec_cmpgeud
static vb64_t vec_cmpgeud(vui64_t a, vui64_t b)
Vector Compare Greater Than or Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:1539
VEC_DW_H
#define VEC_DW_H
Element index for high order dword.
Definition: vec_common_ppc.h:322
vec_vmadd2ouw
static vui64_t vec_vmadd2ouw(vui32_t a, vui32_t b, vui32_t c, vui32_t d)
Vector Multiply-Add2 Odd Unsigned Words.
Definition: vec_int64_ppc.h:4792
vec_cmpud_any_gt
static int vec_cmpud_any_gt(vui64_t a, vui64_t b)
Vector Compare any Greater Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2429
vec_cmpsd_any_le
static int vec_cmpsd_any_le(vi64_t a, vi64_t b)
Vector Compare any Less than equal Signed Doubleword.
Definition: vec_int64_ppc.h:2115
vec_cmpsd_any_lt
static int vec_cmpsd_any_lt(vi64_t a, vi64_t b)
Vector Compare any Less than Signed Doubleword.
Definition: vec_int64_ppc.h:2139
VEC_BYTE_L_DWH
#define VEC_BYTE_L_DWH
Element index for lowest order byte of the high dword.
Definition: vec_common_ppc.h:346