POWER Vector Library Manual  1.0.4
vec_int128_ppc.h
1 /*
2  Copyright (c) [2017, 2018] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_int128_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: May 10, 2015
21  Steven Munroe, additional contributions for POWER9.
22  */
23 
24 #ifndef VEC_INT128_PPC_H_
25 #define VEC_INT128_PPC_H_
26 
27 #include <pveclib/vec_common_ppc.h>
28 #include <pveclib/vec_int64_ppc.h>
29 
2337 #ifndef PVECLIB_DISABLE_CONSTINT128
2338 #define CONST_VUINT128_QxW(__q0, __q1, __q2, __q3) ( (vui128_t) \
2339  (((unsigned __int128) __q0) << 96) \
2340  + (((unsigned __int128) __q1) << 64) \
2341  + (((unsigned __int128) __q2) << 32) \
2342  + ((unsigned __int128) __q3) )
2343 #else
2344 // clang does not handle constant folding for __int128
2345 #define CONST_VUINT128_QxW(__q0, __q1, __q2, __q3) ( (vui128_t) \
2346  CONST_VINT128_W(__q0, __q1, __q2, __q3) )
2347 #endif
2348 
2364 #define CONST_VUINT128_QxD(__q0, __q1) ( (vui128_t) \
2365  (((unsigned __int128) __q0) << 64) \
2366  + ((unsigned __int128) __q1) )
2367 
2385 #define CONST_VUINT128_Qx19d(__q0, __q1) ( (vui128_t) \
2386  (((unsigned __int128) __q0) * 10000000000000000000UL) \
2387  + ((unsigned __int128) __q1) )
2388 
2405 #define CONST_VUINT128_Qx18d(__q0, __q1) ( (vui128_t) \
2406  (((unsigned __int128) __q0) * 1000000000000000000UL) \
2407  + ((unsigned __int128) __q1) )
2408 
2425 #define CONST_VUINT128_Qx16d(__q0, __q1) ( (vui128_t) \
2426  (((unsigned __int128) __q0) * 10000000000000000UL) \
2427  + ((unsigned __int128) __q1) )
2428 
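A brief usage sketch (not part of the original header; the constant names k_ten16, k_ten16d and k_ten32 are illustrative only) showing how the three macro forms compose quadword constants:

  // 10**16 given as four 32-bit words, then as two 64-bit doublewords,
  // and 10**32 given as two groups of up to 19 decimal digits.
  const vui128_t k_ten16  = CONST_VUINT128_QxW (0, 0, 0x002386f2, 0x6fc10000);
  const vui128_t k_ten16d = CONST_VUINT128_QxD (0, 10000000000000000UL);
  const vui128_t k_ten32  = CONST_VUINT128_Qx19d (10000000000000UL, 0UL);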
2430 static inline vui128_t vec_addecuq (vui128_t a, vui128_t b, vui128_t ci);
2431 static inline vui128_t vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci);
2432 static inline vb128_t vec_cmpequq (vui128_t vra, vui128_t vrb);
2433 static inline vb128_t vec_cmpgeuq (vui128_t vra, vui128_t vrb);
2434 static inline vb128_t vec_cmpgtuq (vui128_t vra, vui128_t vrb);
2435 static inline vb128_t vec_cmpleuq (vui128_t vra, vui128_t vrb);
2436 static inline vb128_t vec_cmpltuq (vui128_t vra, vui128_t vrb);
2437 static inline vb128_t vec_cmpneuq (vui128_t vra, vui128_t vrb);
2438 static inline vui128_t vec_divuq_10e31 (vui128_t vra);
2439 static inline vui128_t vec_divuq_10e32 (vui128_t vra);
2440 static inline vui128_t vec_maxuq (vui128_t a, vui128_t b);
2441 static inline vui128_t vec_minuq (vui128_t a, vui128_t b);
2442 static inline vui128_t vec_moduq_10e31 (vui128_t vra, vui128_t q);
2443 static inline vui128_t vec_moduq_10e32 (vui128_t vra, vui128_t q);
2444 static inline vui128_t vec_muleud (vui64_t a, vui64_t b);
2445 static inline vui128_t vec_mulhuq (vui128_t a, vui128_t b);
2446 static inline vui128_t vec_mulluq (vui128_t a, vui128_t b);
2447 static inline vui128_t vec_muloud (vui64_t a, vui64_t b);
2448 static inline vui128_t vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b);
2449 static inline vi128_t vec_negsq (vi128_t int128);
2450 static inline vui128_t vec_popcntq (vui128_t vra);
2451 static inline vb128_t vec_setb_cyq (vui128_t vcy);
2452 static inline vb128_t vec_setb_ncq (vui128_t vcy);
2453 static inline vb128_t vec_setb_sq (vi128_t vra);
2454 static inline vi128_t vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc);
2455 static inline vui128_t vec_sldq (vui128_t vrw, vui128_t vrx,
2456  vui128_t vrb);
2457 static inline vui128_t vec_sldqi (vui128_t vrw, vui128_t vrx,
2458  const unsigned int shb);
2459 static inline vui128_t vec_srqi (vui128_t vra, const unsigned int shb);
2460 static inline vui128_t vec_subcuq (vui128_t vra, vui128_t vrb);
2461 static inline vui128_t vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc);
2462 static inline vui128_t vec_subuqm (vui128_t vra, vui128_t vrb);
2463 static inline vui128_t vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c);
2464 static inline vui128_t vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c);
2465 static inline vui128_t vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c);
2466 static inline vui128_t vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c);
2467 static inline vui128_t vec_vmuleud (vui64_t a, vui64_t b);
2468 static inline vui128_t vec_vmuloud (vui64_t a, vui64_t b);
2469 static inline vui128_t vec_vsldbi (vui128_t vra, vui128_t vrb,
2470  const unsigned int shb);
2472 
2488 static inline vui128_t
2489 vec_absduq (vui128_t vra, vui128_t vrb)
2490 {
2491 #ifdef _ARCH_PWR8
2492  vui128_t tmp1, tmp2;
2493  vb128_t cmpbool;
2494  cmpbool = vec_cmpgtuq ( vra, vrb );
2495  tmp1 = vec_subuqm ( vra, vrb );
2496  tmp2 = vec_subuqm ( vrb, vra );
2497  return (vui128_t) vec_sel ((vui32_t) tmp2, (vui32_t) tmp1, (vui32_t) cmpbool);
2498 #else
2499  return vec_subuqm (vec_maxuq (vra, vrb), vec_minuq (vra, vrb));
2500 #endif
2501 }
2502 
2515 static inline vi128_t
2516 vec_abssq (vi128_t vra)
2517 {
2518  vi128_t q_neg;
2519  vb128_t b_sign;
2520  // Convert 2s complement to unsigned magnitude form.
2521  q_neg = vec_negsq (vra);
2522  b_sign = vec_setb_sq (vra);
2523  return vec_selsq (vra, q_neg, b_sign);
2524 }
2525 
2540 static inline vui128_t
2541 vec_avguq (vui128_t vra, vui128_t vrb)
2542 {
2543  vui128_t result, tmp1, tmp2;
2544  const vui128_t qu1 = (vui128_t) CONST_VINT128_W(0, 0, 0, 1);
2545  // Compute (vra + vrb + 1) with carry
2546  tmp1 = vec_addeuqm (vra, vrb, qu1);
2547  tmp2 = vec_addecuq (vra, vrb, qu1);
2548  // shift sum with carry, right 1 bit
2549  result = vec_sldqi (tmp2, tmp1, 127);
2550 
2551  return result;
2552 }
2553 
2567 static inline vui128_t
2568 vec_addcuq (vui128_t a, vui128_t b)
2569 {
2570  vui32_t co;
2571 #ifdef _ARCH_PWR8
2572 #if defined (vec_vaddcuq)
2573  co = (vui32_t) vec_vaddcuq (a, b);
2574 #elif defined (__clang__)
2575  co = (vui32_t) vec_addc (a, b);
2576 #else
2577  __asm__(
2578  "vaddcuq %0,%1,%2;"
2579  : "=v" (co)
2580  : "v" (a),
2581  "v" (b)
2582  : );
2583 #endif
2584 #else
2585  vui32_t c, c2, t;
2586  vui32_t z= { 0,0,0,0};
2587 
2588  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
2589  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);
2590  c = vec_sld (co, z, 4);
2591  c2 = vec_vaddcuw (t, c);
2592  t = vec_vadduwm (t, c);
2593  co = vec_vor (co, c2);
2594  c = vec_sld (c2, z, 4);
2595  c2 = vec_vaddcuw (t, c);
2596  t = vec_vadduwm (t, c);
2597  co = vec_vor (co, c2);
2598  c = vec_sld (c2, z, 4);
2599  c2 = vec_vaddcuw (t, c);
2600  co = vec_vor (co, c2);
2601  co = vec_sld (z, co, 4);
2602 #endif
2603  return ((vui128_t) co);
2604 }
2605 
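A minimal sketch (assumed usage, not from the original source; example_add256 is an illustrative name) of how vec_addcuq combines with vec_adduqm and vec_addeuqm to build a 256-bit add from two quadword pairs:

  static inline void
  example_add256 (vui128_t *sumh, vui128_t *suml,
                  vui128_t ah, vui128_t al, vui128_t bh, vui128_t bl)
  {
    // carry out of the low quadword add
    vui128_t cl = vec_addcuq (al, bl);
    // low 128-bit sum, modulo 2**128
    *suml = vec_adduqm (al, bl);
    // high 128-bit sum with the carry-in from the low quadword
    *sumh = vec_addeuqm (ah, bh, cl);
  }

A carry out of the high quadword, if needed, would come from vec_addecuq (ah, bh, cl).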
2621  static inline vui128_t
2622  vec_addecuq (vui128_t a, vui128_t b, vui128_t ci)
2623  {
2624  vui32_t co;
2625  #ifdef _ARCH_PWR8
2626  #if defined (vec_vaddcuq)
2627  co = (vui32_t) vec_vaddecuq (a, b, ci);
2628  #elif defined (__clang__)
2629  co = (vui32_t) vec_addec (a, b, ci);
2630 # else
2631  __asm__(
2632  "vaddecuq %0,%1,%2,%3;"
2633  : "=v" (co)
2634  : "v" (a),
2635  "v" (b),
2636  "v" (ci)
2637  : );
2638  #endif
2639  #else
2640  vui32_t c, c2, t;
2641  vui32_t z = { 0, 0, 0, 0 };
2642  co = (vui32_t){ 1, 1, 1, 1 };
2643 
2644  c2 = vec_and ((vui32_t) ci, co);
2645  c2 = vec_sld ((vui32_t) c2, z, 12);
2646  co = vec_vaddcuw ((vui32_t) a, (vui32_t) b);
2647  t = vec_vadduwm ((vui32_t) a, (vui32_t) b);
2648  c = vec_sld (co, c2, 4);
2649  c2 = vec_vaddcuw (t, c);
2650  t = vec_vadduwm (t, c);
2651  co = vec_vor (co, c2);
2652  c = vec_sld (c2, z, 4);
2653  c2 = vec_vaddcuw (t, c);
2654  t = vec_vadduwm (t, c);
2655  co = vec_vor (co, c2);
2656  c = vec_sld (c2, z, 4);
2657  c2 = vec_vaddcuw (t, c);
2658  t = vec_vadduwm (t, c);
2659  co = vec_vor (co, c2);
2660  c = vec_sld (c2, z, 4);
2661  c2 = vec_vaddcuw (t, c);
2662  co = vec_vor (co, c2);
2663  co = vec_sld (z, co, 4);
2664  #endif
2665  return ((vui128_t) co);
2666  }
2667 
2683 static inline vui128_t
2684 vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci)
2685 {
2686  vui32_t t;
2687 #ifdef _ARCH_PWR8
2688 #if defined (vec_vaddeuqm)
2689  t = (vui32_t) vec_vaddeuqm (a, b, ci);
2690 #elif defined (__clang__)
2691  t = (vui32_t) vec_adde (a, b, ci);
2692 #else
2693  __asm__(
2694  "vaddeuqm %0,%1,%2,%3;"
2695  : "=v" (t)
2696  : "v" (a),
2697  "v" (b),
2698  "v" (ci)
2699  : );
2700 #endif
2701 #else
2702  vui32_t c2, c;
2703  vui32_t z = { 0,0,0,0};
2704  vui32_t co = { 1,1,1,1};
2705 
2706  c2 = vec_and ((vui32_t)ci, co);
2707  c2 = vec_sld ((vui32_t)c2, z, 12);
2708  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
2709  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);
2710  c = vec_sld (co, c2, 4);
2711  c2 = vec_vaddcuw (t, c);
2712  t = vec_vadduwm (t, c);
2713  c = vec_sld (c2, z, 4);
2714  c2 = vec_vaddcuw (t, c);
2715  t = vec_vadduwm (t, c);
2716  c = vec_sld (c2, z, 4);
2717  c2 = vec_vaddcuw (t, c);
2718  t = vec_vadduwm (t, c);
2719  c = vec_sld (c2, z, 4);
2720  t = vec_vadduwm (t, c);
2721 #endif
2722  return ((vui128_t) t);
2723 }
2724 
2738 static inline vui128_t
2739 vec_adduqm (vui128_t a, vui128_t b)
2740 {
2741  vui32_t t;
2742 #ifdef _ARCH_PWR8
2743 #if defined (vec_vadduqm)
2744  t = (vui32_t) vec_vadduqm (a, b);
2745 #elif defined (__clang__)
2746  t = (vui32_t) vec_add (a, b);
2747 #else
2748  __asm__(
2749  "vadduqm %0,%1,%2;"
2750  : "=v" (t)
2751  : "v" (a),
2752  "v" (b)
2753  : );
2754 #endif
2755 #else
2756  vui32_t c, c2;
2757  vui32_t z= { 0,0,0,0};
2758 
2759  c = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
2760  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);
2761  c = vec_sld (c, z, 4);
2762  c2 = vec_vaddcuw (t, c);
2763  t = vec_vadduwm (t, c);
2764  c = vec_sld (c2, z, 4);
2765  c2 = vec_vaddcuw (t, c);
2766  t = vec_vadduwm (t, c);
2767  c = vec_sld (c2, z, 4);
2768  t = vec_vadduwm (t, c);
2769 #endif
2770  return ((vui128_t) t);
2771 }
2772 
2787 static inline vui128_t
2788 vec_addcq (vui128_t *cout, vui128_t a, vui128_t b)
2789 {
2790  vui32_t t, co;
2791 #ifdef _ARCH_PWR8
2792 #if defined (vec_vadduqm) && defined (vec_vaddcuq)
2793  t = (vui32_t) vec_vadduqm (a, b);
2794  co = (vui32_t) vec_vaddcuq (a, b);
2795 #elif defined (__clang__)
2796  t = (vui32_t) vec_add (a, b);
2797  co = (vui32_t) vec_addc (a, b);
2798 #else
2799  __asm__(
2800  "vadduqm %0,%2,%3;\n"
2801  "\tvaddcuq %1,%2,%3;"
2802  : "=&v" (t),
2803  "=v" (co)
2804  : "v" (a),
2805  "v" (b)
2806  : );
2807 #endif
2808 #else
2809  vui32_t c, c2;
2810  vui32_t z= { 0,0,0,0};
2811 
2812  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
2813  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);
2814  c = vec_sld (co, z, 4);
2815  c2 = vec_vaddcuw (t, c);
2816  t = vec_vadduwm (t, c);
2817  co = vec_vor (co, c2);
2818  c = vec_sld (c2, z, 4);
2819  c2 = vec_vaddcuw (t, c);
2820  t = vec_vadduwm (t, c);
2821  co = vec_vor (co, c2);
2822  c = vec_sld (c2, z, 4);
2823  c2 = vec_vaddcuw (t, c);
2824  t = vec_vadduwm (t, c);
2825  co = vec_vor (co, c2);
2826  co = vec_sld (z, co, 4);
2827 #endif
2828  *cout = (vui128_t) co;
2829  return ((vui128_t) t);
2830 }
2831 
2848 static inline vui128_t
2849 vec_addeq (vui128_t *cout, vui128_t a, vui128_t b, vui128_t ci)
2850 {
2851  vui32_t t, co;
2852 #ifdef _ARCH_PWR8
2853 #if defined (vec_vaddeuqm) && defined (vec_vaddecuq)
2854  t = (vui32_t) vec_vaddeuqm (a, b, ci);
2855  co = (vui32_t) vec_vaddecuq (a, b, ci);
2856 #elif defined (__clang__)
2857  t = (vui32_t) vec_adde (a, b, ci);
2858  co = (vui32_t) vec_addec (a, b, ci);
2859 #else
2860  __asm__(
2861  "vaddeuqm %0,%2,%3,%4;\n"
2862  "\tvaddecuq %1,%2,%3,%4;"
2863  : "=&v" (t),
2864  "=v" (co)
2865  : "v" (a),
2866  "v" (b),
2867  "v" (ci)
2868  : );
2869 #endif
2870 #else
2871  vui32_t c, c2;
2872  vui32_t z= { 0,0,0,0};
2873  co = (vui32_t){ 1,1,1,1};
2874 
2875  c2 = vec_and ((vui32_t)ci, co);
2876  c2 = vec_sld ((vui32_t)c2, z, 12);
2877  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);
2878  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);
2879  c = vec_sld (co, c2, 4);
2880  c2 = vec_vaddcuw (t, c);
2881  t = vec_vadduwm (t, c);
2882  co = vec_vor (co, c2);
2883  c = vec_sld (c2, z, 4);
2884  c2 = vec_vaddcuw (t, c);
2885  t = vec_vadduwm (t, c);
2886  co = vec_vor (co, c2);
2887  c = vec_sld (c2, z, 4);
2888  c2 = vec_vaddcuw (t, c);
2889  t = vec_vadduwm (t, c);
2890  co = vec_vor (co, c2);
2891  c = vec_sld (c2, z, 4);
2892  c2 = vec_vaddcuw (t, c);
2893  t = vec_vadduwm (t, c);
2894  co = vec_vor (co, c2);
2895  co = vec_sld (z, co, 4);
2896 #endif
2897  *cout = (vui128_t) co;
2898  return ((vui128_t) t);
2899 }
2900 
2917 static inline vui128_t
2918 vec_clzq (vui128_t vra)
2919 {
2920  vui64_t result;
2921 
2922 #ifdef _ARCH_PWR8
2923  /*
2924  * Use the Vector Count Leading Zeros Double Word instruction to get
2925  * the count for the left and right vector halves. If the left vector
2926  * doubleword of the input is nonzero then only the left count is
2927  * included and we need to mask off the right count.
2928  * Otherwise the left count is 64 and we need to add 64 to the right
2929  * count.
2930  * After masking we sum across the left and right counts to
2931  * get the final 128-bit vector count (0-128).
2932  */
2933  vui64_t vt1, vt2, vt3, h64, l64;
2934  const vui64_t vzero = { 0, 0 };
2935 
2936  vt1 = vec_clzd ((vui64_t) vra);
2937  vt2 = (vui64_t) vec_cmpequd((vui64_t) vra, vzero);
2938  vt3 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt2);
2939  h64 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt1);
2940  l64 = vec_and (vt1, vt3);
2941  result = vec_addudm (h64, l64);
2942 #else
2943  /* vector clz instructions were introduced in power8. For power7 and
2944  * earlier, use the pveclib vec_clzw implementation. For a quadword
2945  * clz, this requires pre-conditioning the input before computing the
2946  * word clz and sum across. */
2947  vui32_t c0, clz;
2948  vui32_t r32, gt32, gt32sr32, gt64sr64;
2949 
2950  c0 = vec_splat_u32 (0);
2951  gt32 = (vui32_t) vec_cmpgt ((vui32_t) vra, c0);
2952  gt32sr32 = vec_sld (c0, gt32, 12);
2953  gt64sr64 = vec_sld (c0, gt32, 8);
2954  gt32 = vec_sld (c0, gt32, 4);
2955 
2956  gt32sr32 = vec_or (gt32sr32, gt32);
2957  gt64sr64 = vec_or (gt64sr64, (vui32_t) vra);
2958  r32 = vec_or (gt32sr32, gt64sr64);
2959 
2960  clz = vec_clzw (r32);
2961  result = (vui64_t) vec_sums ((vi32_t) clz, (vi32_t) c0);
2962 #endif
2963 
2964  return ((vui128_t) result);
2965 }
2966 
2983 static inline vui128_t
2984 vec_ctzq (vui128_t vra)
2985 {
2986  const vui128_t ones = (vui128_t) vec_splat_s32(-1);
2987  vui128_t tzmask;
2988 
2989  // tzmask = (!vra & (vra - 1))
2990  tzmask = (vui128_t) vec_andc ((vui64_t) vec_adduqm (vra, ones),
2991  (vui64_t) vra);
2992  // return = vec_popcnt (!vra & (vra - 1))
2993  return vec_popcntq (tzmask);
2994 }
2995 
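The same trailing-zero identity on a 64-bit scalar, as an illustrative sketch (example_ctz64 is not part of pveclib): ~x & (x - 1) sets exactly the bits below the least significant set bit of x, so its population count is the trailing-zero count (64 when x == 0).

  static inline int
  example_ctz64 (unsigned long long x)
  {
    // GCC/Clang builtin population count over 64 bits
    return __builtin_popcountll (~x & (x - 1ULL));
  }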
3012 static inline vb128_t
3013 vec_cmpeqsq (vi128_t vra, vi128_t vrb)
3014 {
3015  /* vec_cmpequq works for both signed and unsigned compares. */
3016  return vec_cmpequq ((vui128_t) vra, (vui128_t) vrb);
3017 }
3018 
3042 static inline vb128_t
3043 vec_cmpequq (vui128_t vra, vui128_t vrb)
3044 {
3045 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3046 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3047  return vec_cmpeq (vra, vrb);
3048 #else
3049  vb128_t vrt;
3050  __asm__(
3051  "vcmpequq %0,%1,%2;\n"
3052  : "=v" (vrt)
3053  : "v" (vra), "v" (vrb)
3054  : );
3055  return vrt;
3056 #endif
3057 #elif defined (_ARCH_PWR8)
3058  vui64_t equd, swapd;
3059 
3060  equd = (vui64_t) vec_cmpequd ((vui64_t) vra, (vui64_t) vrb);
3061  swapd = vec_swapd (equd);
3062  return (vb128_t) vec_and (equd, swapd);
3063 #else
3064  if (vec_all_eq ((vui32_t) vra, (vui32_t) vrb))
3065  return (vb128_t) vec_cmpeq ((vui32_t) vra, (vui32_t) vrb);
3066  else
3067  return (vb128_t) vec_splat_u32 (0);
3068 #endif
3069 }
3070 
3088 static inline vb128_t
3089 vec_cmpgesq (vi128_t vra, vi128_t vrb)
3090 {
3091 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3092 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3093  return vec_cmpge (vra, vrb);
3094 #else
3095  vb128_t vrt;
3096  __asm__(
3097  "vcmpgtsq %0,%2,%1;\n"
3098  : "=v" (vrt)
3099  : "v" (vra), "v" (vrb)
3100  : );
3101  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);
3102 #endif
3103 #else
3104  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3105  vui32_t _a, _b;
3106 
3107  _a = vec_xor ((vui32_t) vra, signbit);
3108  _b = vec_xor ((vui32_t) vrb, signbit);
3109  return vec_cmpgeuq ((vui128_t) _a, (vui128_t) _b);
3110 #endif
3111 }
3112 
3137 static inline vb128_t
3138 vec_cmpgeuq (vui128_t vra, vui128_t vrb)
3139 {
3140 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3141 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3142  return vec_cmpge (vra, vrb);
3143 #else
3144  vb128_t vrt;
3145  __asm__(
3146  "vcmpgtuq %0,%2,%1;\n"
3147  : "=v" (vrt)
3148  : "v" (vra), "v" (vrb)
3149  : );
3150  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);
3151 #endif
3152 #else
3153  vui128_t a_b;
3154 
3155  a_b = vec_subcuq (vra, vrb);
3156  return vec_setb_cyq (a_b);
3157 #endif
3158 }
3159 
3177 static inline vb128_t
3178 vec_cmpgtsq (vi128_t vra, vi128_t vrb)
3179 {
3180 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3181 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3182  return vec_cmpgt (vra, vrb);
3183 #else
3184  vb128_t vrt;
3185  __asm__(
3186  "vcmpgtsq %0,%1,%2;\n"
3187  : "=v" (vrt)
3188  : "v" (vra), "v" (vrb)
3189  : );
3190  return vrt;
3191 #endif
3192 #else
3193  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3194  vui32_t _a, _b;
3195 
3196  _a = vec_xor ((vui32_t) vra, signbit);
3197  _b = vec_xor ((vui32_t) vrb, signbit);
3198  return vec_cmpgtuq ((vui128_t) _a, (vui128_t) _b);
3199 #endif
3200 }
3201 
3226 static inline vb128_t
3227 vec_cmpgtuq (vui128_t vra, vui128_t vrb)
3228 {
3229 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3230 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3231  return vec_cmpgt (vra, vrb);
3232 #else
3233  vb128_t vrt;
3234  __asm__(
3235  "vcmpgtuq %0,%1,%2;\n"
3236  : "=v" (vrt)
3237  : "v" (vra), "v" (vrb)
3238  : );
3239  return vrt;
3240 #endif
3241 #else
3242  vui128_t b_a;
3243 
3244  b_a = vec_subcuq (vrb, vra);
3245  return vec_setb_ncq (b_a);
3246 #endif
3247 }
3248 
3266 static inline vb128_t
3267 vec_cmplesq (vi128_t vra, vi128_t vrb)
3268 {
3269 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3270 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3271  return vec_cmple (vra, vrb);
3272 #else
3273  vb128_t vrt;
3274  __asm__(
3275  "vcmpgtsq %0,%1,%2;\n"
3276  : "=v" (vrt)
3277  : "v" (vra), "v" (vrb)
3278  : );
3279  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);
3280 #endif
3281 #else
3282  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3283  vui32_t _a, _b;
3284 
3285  _a = vec_xor ((vui32_t) vra, signbit);
3286  _b = vec_xor ((vui32_t) vrb, signbit);
3287  return vec_cmpleuq ((vui128_t) _a, (vui128_t) _b);
3288 #endif
3289 }
3290 
3315 static inline vb128_t
3316 vec_cmpleuq (vui128_t vra, vui128_t vrb)
3317 {
3318 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3319 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3320  return vec_cmple (vra, vrb);
3321 #else
3322  vb128_t vrt;
3323  __asm__(
3324  "vcmpgtuq %0,%1,%2;\n"
3325  : "=v" (vrt)
3326  : "v" (vra), "v" (vrb)
3327  : );
3328  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);
3329 #endif
3330 #else
3331  vui128_t b_a;
3332 
3333  b_a = vec_subcuq (vrb, vra);
3334  return vec_setb_cyq (b_a);
3335 #endif
3336 }
3337 
3338 
3356 static inline vb128_t
3357 vec_cmpltsq (vi128_t vra, vi128_t vrb)
3358 {
3359 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3360 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3361  return vec_cmplt (vra, vrb);
3362 #else
3363  vb128_t vrt;
3364  __asm__(
3365  "vcmpgtsq %0,%2,%1;\n"
3366  : "=v" (vrt)
3367  : "v" (vra), "v" (vrb)
3368  : );
3369  return vrt;
3370 #endif
3371 #else
3372  const vui32_t signbit = CONST_VINT128_W(0x80000000, 0, 0, 0);
3373  vui32_t _a, _b;
3374 
3375  _a = vec_xor ((vui32_t) vra, signbit);
3376  _b = vec_xor ((vui32_t) vrb, signbit);
3377  return vec_cmpltuq ((vui128_t) _a, (vui128_t) _b);
3378 #endif
3379 }
3380 
3405 static inline vb128_t
3406 vec_cmpltuq (vui128_t vra, vui128_t vrb)
3407 {
3408 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3409 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3410  return vec_cmplt (vra, vrb);
3411 #else
3412  vb128_t vrt;
3413  __asm__(
3414  "vcmpgtuq %0,%2,%1;\n"
3415  : "=v" (vrt)
3416  : "v" (vra), "v" (vrb)
3417  : );
3418  return vrt;
3419 #endif
3420 #else
3421  vui128_t a_b;
3422 
3423  a_b = vec_subcuq (vra, vrb);
3424  return vec_setb_ncq (a_b);
3425 #endif
3426 }
3427 
3444 static inline vb128_t
3445 vec_cmpnesq (vi128_t vra, vi128_t vrb)
3446 {
3447  /* vec_cmpneuq works for both signed and unsigned compares. */
3448  return vec_cmpneuq ((vui128_t) vra, (vui128_t) vrb);
3449 }
3450 
3474 static inline vb128_t
3475 vec_cmpneuq (vui128_t vra, vui128_t vrb)
3476 {
3477 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3478 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3479  return vec_cmpne (vra, vrb);
3480 #else
3481  vb128_t vrt;
3482  __asm__(
3483  "vcmpequq %0,%1,%2;\n"
3484  : "=v" (vrt)
3485  : "v" (vra), "v" (vrb)
3486  : );
3487  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);
3488 #endif
3489 #elif defined (_ARCH_PWR8)
3490  __vector unsigned long long equd, swapd;
3491 
3492  equd = (vui64_t) vec_cmpequd ((vui64_t) vra, (vui64_t) vrb);
3493  swapd = vec_swapd (equd);
3494  return (vb128_t) vec_nand (equd, swapd);
3495 #else
3496  if (vec_any_ne ((vui32_t) vra, (vui32_t) vrb))
3497  return (vb128_t) vec_splat_s32 (-1);
3498  else
3499  return (vb128_t) vec_splat_u32 (0);
3500 #endif
3501 }
3502 
3520 static inline
3521 int
3522 vec_cmpsq_all_eq (vi128_t vra, vi128_t vrb)
3523 {
3524  int result;
3525 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3526 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3527  return vec_all_eq (vra, vrb);
3528 #else
3529  vb128_t vrt;
3530  int u, r;
3531  __asm__(
3532  "vcmpequq. %0,%3,%4;\n"
3533  "mfocrf %1,2;\n"
3534  "rlwinm %2,%1,25,1"
3535  : "=v" (vrt), "=&r" (u), "=r" (r)
3536  : "v" (vra), "v" (vrb)
3537  : "cr6");
3538  return r;
3539 #endif
3540 #elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
3541  result = vec_all_eq((vui64_t)vra, (vui64_t)vrb);
3542 #else
3543  result = vec_all_eq((vui32_t)vra, (vui32_t)vrb);
3544 #endif
3545  return (result);
3546 }
3547 
3565 static inline int
3566 vec_cmpsq_all_ge (vi128_t vra, vi128_t vrb)
3567 {
3568 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3569 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3570  return vec_all_ge (vra, vrb);
3571 #else
3572  vb128_t vrt;
3573  int u, r;
3574  __asm__(
3575  "vcmpgtsq. %0,%4,%3;\n"
3576  "mfocrf %1,2;\n"
3577  "rlwinm %2,%1,27,1"
3578  : "=v" (vrt), "=&r" (u), "=r" (r)
3579  : "v" (vra), "v" (vrb)
3580  : "cr6");
3581  return r;
3582 #endif
3583 #else
3584  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);
3585  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3586  vui128_t a_b, _a, _b;
3587 
3588  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);
3589  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);
3590 
3591  a_b = vec_subcuq (_a, _b);
3592  return vec_all_eq((vui32_t)a_b, carry128);
3593 #endif
3594 }
3595 
3613 static inline int
3614 vec_cmpsq_all_gt (vi128_t vra, vi128_t vrb)
3615 {
3616 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3617 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3618  return vec_all_ge (vra, vrb);
3619 #else
3620  vb128_t vrt;
3621  int u, r;
3622  __asm__(
3623  "vcmpgtsq. %0,%3,%4;\n"
3624  "mfocrf %1,2;\n"
3625  "rlwinm %2,%1,25,1"
3626  : "=v" (vrt), "=&r" (u), "=r" (r)
3627  : "v" (vra), "v" (vrb)
3628  : "cr6");
3629  return r;
3630 #endif
3631 #else
3632  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);
3633  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3634  vui128_t b_a, _a, _b;
3635 
3636  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);
3637  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);
3638 
3639  b_a = vec_subcuq (_b, _a);
3640  return vec_all_eq((vui32_t)b_a, ncarry128);
3641 #endif
3642 }
3643 
3661 static inline int
3662 vec_cmpsq_all_le (vi128_t vra, vi128_t vrb)
3663 {
3664 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3665 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3666  return vec_all_le (vra, vrb);
3667 #else
3668  vb128_t vrt;
3669  int u, r;
3670  __asm__(
3671  "vcmpgtsq. %0,%3,%4;\n"
3672  "mfocrf %1,2;\n"
3673  "rlwinm %2,%1,27,1"
3674  : "=v" (vrt), "=&r" (u), "=r" (r)
3675  : "v" (vra), "v" (vrb)
3676  : "cr6");
3677  return r;
3678 #endif
3679 #else
3680  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);
3681  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3682  vui128_t b_a, _a, _b;
3683 
3684  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);
3685  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);
3686 
3687  b_a = vec_subcuq (_b, _a);
3688  return vec_all_eq((vui32_t)b_a, carry128);
3689 #endif
3690 }
3691 
3709 static inline int
3710 vec_cmpsq_all_lt (vi128_t vra, vi128_t vrb)
3711 {
3712 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3713 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3714  return vec_all_lt (vra, vrb);
3715 #else
3716  vb128_t vrt;
3717  int u, r;
3718  __asm__(
3719  "vcmpgtsq. %0,%4,%3;\n"
3720  "mfocrf %1,2;\n"
3721  "rlwinm %2,%1,25,1"
3722  : "=v" (vrt), "=&r" (u), "=r" (r)
3723  : "v" (vra), "v" (vrb)
3724  : "cr6");
3725  return r;
3726 #endif
3727 #else
3728  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);
3729  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);
3730  vui128_t a_b, _a, _b;
3731 
3732  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);
3733  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);
3734 
3735  a_b = vec_subcuq (_a, _b);
3736  return vec_all_eq((vui32_t)a_b, ncarry128);
3737 #endif
3738 }
3739 
3757 static inline
3758 int
3759 vec_cmpsq_all_ne (vi128_t vra, vi128_t vrb)
3760 {
3761  int result;
3762 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3763 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3764  return vec_all_ne (vra, vrb);
3765 #else
3766  vb128_t vrt;
3767  int u, r;
3768  __asm__(
3769  "vcmpequq. %0,%3,%4;\n"
3770  "mfocrf %1,2;\n"
3771  "rlwinm %2,%1,27,1"
3772  : "=v" (vrt), "=&r" (u), "=r" (r)
3773  : "v" (vra), "v" (vrb)
3774  : "cr6");
3775  return r;
3776 #endif
3777 #elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
3778  result = !vec_all_eq ((vui64_t) vra, (vui64_t) vrb);
3779 #else
3780  result = !vec_all_eq ((vui32_t) vra, (vui32_t) vrb);
3781 #endif
3782  return (result);
3783 }
3784 
3802 static inline
3803 int
3804 vec_cmpuq_all_eq (vui128_t vra, vui128_t vrb)
3805 {
3806  int result;
3807 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3808 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3809  return vec_all_eq (vra, vrb);
3810 #else
3811  vb128_t vrt;
3812  int u, r;
3813  __asm__(
3814  "vcmpequq. %0,%3,%4;\n"
3815  "mfocrf %1,2;\n"
3816  "rlwinm %2,%1,25,1"
3817  : "=v" (vrt), "=&r" (u), "=r" (r)
3818  : "v" (vra), "v" (vrb)
3819  : "cr6");
3820  return r;
3821 #endif
3822 #elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
3823  result = vec_all_eq ((vui64_t) vra, (vui64_t) vrb);
3824 #else
3825  result = vec_all_eq ((vui32_t) vra, (vui32_t) vrb);
3826 #endif
3827  return (result);
3828 }
3829 
3847 static inline int
3848 vec_cmpuq_all_ge (vui128_t vra, vui128_t vrb)
3849 {
3850 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3851 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3852  return vec_all_ge (vra, vrb);
3853 #else
3854  vb128_t vrt;
3855  int u, r;
3856  __asm__(
3857  "vcmpgtuq. %0,%4,%3;\n"
3858  "mfocrf %1,2;\n"
3859  "rlwinm %2,%1,27,1"
3860  : "=v" (vrt), "=&r" (u), "=r" (r)
3861  : "v" (vra), "v" (vrb)
3862  : "cr6");
3863  return r;
3864 #endif
3865 #else
3866  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);
3867  vui128_t a_b;
3868 
3869  a_b = vec_subcuq (vra, vrb);
3870  return vec_all_eq ((vui32_t) a_b, carry128);
3871 #endif
3872 }
3873 
3891 static inline int
3892 vec_cmpuq_all_gt (vui128_t vra, vui128_t vrb)
3893 {
3894 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3895 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3896  return vec_all_ge (vra, vrb);
3897 #else
3898  vb128_t vrt;
3899  int u, r;
3900  __asm__(
3901  "vcmpgtuq. %0,%3,%4;\n"
3902  "mfocrf %1,2;\n"
3903  "rlwinm %2,%1,25,1"
3904  : "=v" (vrt), "=&r" (u), "=r" (r)
3905  : "v" (vra), "v" (vrb)
3906  : "cr6");
3907  return r;
3908 #endif
3909 #else
3910  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);
3911  vui128_t b_a;
3912 
3913  b_a = vec_subcuq (vrb, vra);
3914  return vec_all_eq ((vui32_t) b_a, ncarry128);
3915 #endif
3916 }
3917 
3935 static inline int
3936 vec_cmpuq_all_le (vui128_t vra, vui128_t vrb)
3937 {
3938 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3939 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3940  return vec_all_le (vra, vrb);
3941 #else
3942  vb128_t vrt;
3943  int u, r;
3944  __asm__(
3945  "vcmpgtuq. %0,%3,%4;\n"
3946  "mfocrf %1,2;\n"
3947  "rlwinm %2,%1,27,1"
3948  : "=v" (vrt), "=&r" (u), "=r" (r)
3949  : "v" (vra), "v" (vrb)
3950  : "cr6");
3951  return r;
3952 #endif
3953 #else
3954  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);
3955  vui128_t b_a;
3956 
3957  b_a = vec_subcuq (vrb, vra);
3958  return vec_all_eq ((vui32_t) b_a, carry128);
3959 #endif
3960 }
3961 
3979 static inline int
3980 vec_cmpuq_all_lt (vui128_t vra, vui128_t vrb)
3981 {
3982 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
3983 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
3984  return vec_all_lt (vra, vrb);
3985 #else
3986  vb128_t vrt;
3987  int u, r;
3988  __asm__(
3989  "vcmpgtuq. %0,%4,%3;\n"
3990  "mfocrf %1,2;\n"
3991  "rlwinm %2,%1,25,1"
3992  : "=v" (vrt), "=&r" (u), "=r" (r)
3993  : "v" (vra), "v" (vrb)
3994  : "cr6");
3995  return r;
3996 #endif
3997 #else
3998  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);
3999  vui128_t a_b;
4000 
4001  a_b = vec_subcuq (vra, vrb);
4002  return vec_all_eq ((vui32_t) a_b, ncarry128);
4003 #endif
4004 }
4005 
4023 static inline
4024 int
4025 vec_cmpuq_all_ne (vui128_t vra, vui128_t vrb)
4026 {
4027  int result;
4028 #if defined (_ARCH_PWR10) && defined (__VSX__) && (__GNUC__ >= 10)
4029 #if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))
4030  return vec_all_ne (vra, vrb);
4031 #else
4032  vb128_t vrt;
4033  int u, r;
4034  __asm__(
4035  "vcmpequq. %0,%3,%4;\n"
4036  "mfocrf %1,2;\n"
4037  "rlwinm %2,%1,27,1"
4038  : "=v" (vrt), "=&r" (u), "=r" (r)
4039  : "v" (vra), "v" (vrb)
4040  : "cr6");
4041  return r;
4042 #endif
4043 #elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
4044  result = !vec_all_eq ((vui64_t) vra, (vui64_t) vrb);
4045 #else
4046  result = !vec_all_eq ((vui32_t) vra, (vui32_t) vrb);
4047 #endif
4048  return (result);
4049 }
4050 
4066 static inline vui128_t
4067 vec_cmul10ecuq (vui128_t *cout, vui128_t a, vui128_t cin)
4068 {
4069  vui32_t t;
4070  vui32_t t_carry;
4071 #ifdef _ARCH_PWR9
4072  __asm__(
4073  "vmul10ecuq %0,%2,%3;\n"
4074  "vmul10euq %1,%2,%3;\n"
4075  : "=&v" (t_carry),
4076  "=v" (t)
4077  : "v" (a),
4078  "v" (cin)
4079  : );
4080 #else
4081  vui16_t ts = (vui16_t) a;
4082  vui32_t tc;
4083  vui16_t t10;
4084  vui32_t t_odd, t_even, t_high;
4085  vui32_t z = { 0, 0, 0, 0 };
4086  t10 = vec_splat_u16(10);
4087 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4088  t_even = vec_vmulouh (ts, t10);
4089  t_odd = vec_vmuleuh (ts, t10);
4090 #else
4091  t_even = vec_vmuleuh (ts, t10);
4092  t_odd = vec_vmulouh (ts, t10);
4093 #endif
4094  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
4095  t_high = vec_sld (z, t_even, 2);
4096  /* Shift cin left 112 bits. */
4097  tc = vec_sld ((vui32_t) cin, z, 14);
4098  /* Shift t_even left 16 bits, merging the carry into the low bits. */
4099  t_even = vec_sld (t_even, tc, 2);
4100  /* then add the even/odd sub-products to generate the final product */
4101 #ifdef _ARCH_PWR8
4102  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
4103  t_carry = t_high; /* there is no carry into high */
4104  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
4105 #else
4106  t_carry = t_high; /* there is no carry into high */
4107  /* Use pveclib adduqm implementation for pre _ARCH_PWR8. */
4108  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
4109 #endif
4110 #endif
4111  *cout = (vui128_t) t_carry;
4112  return ((vui128_t) t);
4113 }
4114 
4129 static inline vui128_t
4130 vec_cmul10cuq (vui128_t *cout, vui128_t a)
4131 {
4132  vui32_t t;
4133  vui32_t t_carry;
4134 #ifdef _ARCH_PWR9
4135  __asm__(
4136  "vmul10cuq %0,%2;\n"
4137  "vmul10uq %1,%2;\n"
4138  : "=&v" (t_carry),
4139  "=v" (t)
4140  : "v" (a)
4141  : );
4142 #else
4143  vui16_t ts = (vui16_t) a;
4144  vui16_t t10;
4145  vui32_t t_odd, t_even, t_high;
4146  vui32_t z = { 0, 0, 0, 0 };
4147  t10 = vec_splat_u16(10);
4148 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4149  t_even = vec_vmulouh (ts, t10);
4150  t_odd = vec_vmuleuh (ts, t10);
4151 #else
4152  t_even = vec_vmuleuh(ts, t10);
4153  t_odd = vec_vmulouh(ts, t10);
4154 #endif
4155  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
4156  t_high = vec_sld (z, t_even, 2);
4157  /* Shift t_even left 16 bits to align for lower 128-bits. */
4158  t_even = vec_sld (t_even, z, 2);
4159  /* then add the even/odd sub-products to generate the final product */
4160 #ifdef _ARCH_PWR8
4161  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
4162  t_carry = t_high; /* there is no carry into high */
4163  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
4164 #else
4165  t_carry = t_high; /* there is no carry into high */
4166  /* Use pveclib adduqm implementation for pre _ARCH_PWR8. */
4167  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
4168 #endif
4169 #endif
4170  *cout = (vui128_t) t_carry;
4171  return ((vui128_t) t);
4172 }
4173 
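A minimal sketch (assumed usage, not from the original source; example_mul10_256 is an illustrative name) of chaining the carrying multiply-by-10 operations across a 256-bit value held in two quadwords:

  static inline vui128_t
  example_mul10_256 (vui128_t *high, vui128_t *low)
  {
    vui128_t c0, c1;
    // low quadword * 10, decimal carry out in c0
    *low  = vec_cmul10cuq (&c0, *low);
    // high quadword * 10 + c0, next carry out in c1
    *high = vec_cmul10ecuq (&c1, *high, c0);
    // c1 would carry into a third quadword for wider values
    return c1;
  }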
4192 static inline vi128_t
4193 vec_divsq_10e31 (vi128_t vra)
4194 {
4195  const vui128_t zero = (vui128_t) { (__int128) 0 };
4196  /* ten31 = +10000000000000000000000000000000UQ */
4197  const vui128_t ten31 = (vui128_t)
4198  { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
4199  /* Magic numbers for multiplicative inverse to divide by 10**31
4200  are 4804950418589725908363185682083061167, corrective add,
4201  and shift right 103 bits. */
4202  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(
4203  0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);
4204  const int shift_ten31 = 103;
4205  vui128_t result, t, q, uvra;
4206  vb128_t negbool;
4207 
4208  negbool = vec_setb_sq (vra);
4209  uvra = (vui128_t) vec_sel ((vui32_t) vra,
4210  (vui32_t) vec_subuqm (zero, (vui128_t) vra),
4211  (vb32_t) negbool);
4212 
4213  if (vec_cmpuq_all_ge (uvra, ten31))
4214  {
4215  q = vec_mulhuq (uvra, mul_invs_ten31);
4216  // Need corrective add but want to avoid carry & double quad shift
4217  // The following avoids the carry and uses fewer instructions
4218  t = vec_subuqm (uvra, q);
4219  t = vec_srqi (t, 1);
4220  t = vec_adduqm (t, q);
4221  result = vec_srqi (t, shift_ten31-1);
4222  result = (vui128_t) vec_sel ((vui32_t) result,
4223  (vui32_t) vec_subuqm (zero, (vui128_t) result),
4224  (vb32_t) negbool);
4225  }
4226  else
4227  result = zero;
4228 
4229  return (vi128_t) result;
4230 }
4231 
4256 static inline vui128_t
4257 vec_divudq_10e31 (vui128_t *qh, vui128_t vra, vui128_t vrb)
4258 {
4259  const vui128_t ten31 = (vui128_t)
4260  { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
4261  const vui128_t zero = (vui128_t) { (__int128) 0UL };
4262  /* Magic numbers for multiplicative inverse to divide by 10**31
4263  are 4804950418589725908363185682083061167, corrective add,
4264  and shift right 103 bits. */
4265  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(
4266  0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);
4267  const int shift_ten31 = 103;
4268  vui128_t result, r2, t, q, q1, q2, c;
4269 
4270  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31))
4271  {
4272  // Multiply high [vra||vrb] * mul_invs_ten31
4273  q = vec_mulhuq (vrb, mul_invs_ten31);
4274  q1 = vec_muludq (&t, vra, mul_invs_ten31);
4275  c = vec_addcuq (q1, q);
4276  q = vec_adduqm (q1, q);
4277  q1 = vec_adduqm (t, c);
4278  // corrective add [q2||q1||q] = [q1||q] + [vra||vrb]
4279  c = vec_addcuq (vrb, q);
4280  q = vec_adduqm (vrb, q);
4281  // q2 is the carry-out from the corrective add
4282  q2 = vec_addecuq (q1, vra, c);
4283  q1 = vec_addeuqm (q1, vra, c);
4284  // shift 384-bits (including the carry) right 103 bits
4285  // Using shift left double quadword shift by (128-103)-bits
4286  r2 = vec_sldqi (q2, q1, (128 - shift_ten31));
4287  result = vec_sldqi (q1, q, (128 - shift_ten31));
4288  }
4289  else
4290  {
4291  // Dividend less than divisor then return zero quotient
4292  r2 = zero;
4293  result = zero;
4294  }
4295 
4296  // return 256-bit quotient
4297  *qh = r2;
4298  return result;
4299 }
4300 
4301 
4326 static inline vui128_t
4327 vec_divudq_10e32 (vui128_t *qh, vui128_t vra, vui128_t vrb)
4328 {
4329  /* ten32 = +100000000000000000000000000000000UQ */
4330  const vui128_t ten32 = (vui128_t)
4331  { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };
4332  const vui128_t zero = (vui128_t) { (__int128) 0UL };
4333  /* Magic numbers for multiplicative inverse to divide by 10**32
4334  are 211857340822306639531405861550393824741, corrective add,
4335  and shift right 107 bits. */
4336  const vui128_t mul_invs_ten32 = (vui128_t) CONST_VINT128_DW(
4337  0x9f623d5a8a732974UL, 0xcfbc31db4b0295e5UL);
4338  const int shift_ten32 = 107;
4339  vui128_t result, r2, t, q, q1, q2, c;
4340 
4341  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten32))
4342  {
4343  // Multiply high [vra||vrb] * mul_invs_ten32
4344  q = vec_mulhuq (vrb, mul_invs_ten32);
4345  q1 = vec_muludq (&t, vra, mul_invs_ten32);
4346  c = vec_addcuq (q1, q);
4347  q = vec_adduqm (q1, q);
4348  q1 = vec_adduqm (t, c);
4349  // corrective add [q2||q1||q] = [q1||q] + [vra||vrb]
4350  c = vec_addcuq (vrb, q);
4351  q = vec_adduqm (vrb, q);
4352  // q2 is the carry-out from the corrective add
4353  q2 = vec_addecuq (q1, vra, c);
4354  q1 = vec_addeuqm (q1, vra, c);
4355  // shift 384-bits (including the carry) right 107 bits
4356  // Using shift left double quadword shift by (128-107)-bits
4357  r2 = vec_sldqi (q2, q1, (128 - shift_ten32));
4358  result = vec_sldqi (q1, q, (128 - shift_ten32));
4359  }
4360  else
4361  {
4362  // Dividend less than divisor then return zero quotient
4363  r2 = zero;
4364  result = zero;
4365  }
4366 
4367  // return 256-bit quotient
4368  *qh = r2;
4369  return result;
4370 }
4371 
4390 static inline vui128_t
4391 vec_divuq_10e31 (vui128_t vra)
4392 {
4393  /* ten31 = +10000000000000000000000000000000UQ */
4394  const vui128_t ten31 = (vui128_t)
4395  { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
4396  /* Magic numbers for multiplicative inverse to divide by 10**31
4397  are 4804950418589725908363185682083061167, corrective add,
4398  and shift right 103 bits. */
4399  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(
4400  0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);
4401  const int shift_ten31 = 103;
4402  vui128_t result, t, q;
4403 
4404  if (vec_cmpuq_all_ge (vra, ten31))
4405  {
4406  q = vec_mulhuq (vra, mul_invs_ten31);
4407  // Need corrective add but want to avoid carry & double quad shift
4408  // The following avoids the carry and uses fewer instructions
4409  t = vec_subuqm (vra, q);
4410  t = vec_srqi (t, 1);
4411  t = vec_adduqm (t, q);
4412  result = vec_srqi (t, shift_ten31-1);
4413  }
4414  else
4415  result = (vui128_t) { (__int128) 0 };
4416 
4417  return result;
4418 }
4419 
4438 static inline vui128_t
4439 vec_divuq_10e32 (vui128_t vra)
4440 {
4441  /* ten32 = +100000000000000000000000000000000UQ */
4442  const vui128_t ten32 = (vui128_t)
4443  { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };
4444  /* Magic numbers for multiplicative inverse to divide by 10**32
4445  are 211857340822306639531405861550393824741, corrective add,
4446  and shift right 107 bits. */
4447  const vui128_t mul_invs_ten32 = (vui128_t) CONST_VINT128_DW(
4448  0x9f623d5a8a732974UL, 0xcfbc31db4b0295e5UL);
4449  const int shift_ten32 = 107;
4450  vui128_t result, t, q;
4451 
4452  if (vec_cmpuq_all_ge (vra, ten32))
4453  {
4454  q = vec_mulhuq (vra, mul_invs_ten32);
4455  // Need corrective add but want to avoid carry & double quad shift
4456  // The following avoids the carry and uses fewer instructions
4457  t = vec_subuqm (vra, q);
4458  t = vec_srqi (t, 1);
4459  t = vec_adduqm (t, q);
4460  result = vec_srqi (t, shift_ten32-1);
4461  }
4462  else
4463  result = (vui128_t) { (__int128) 0 };
4464 
4465  return result;
4466 }
4467 
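A minimal sketch (assumed usage, not from the original source; example_split_10e32 is an illustrative name) of the intended use of the 10**32 divide/modulo pair as a first step in decimal conversion of a quadword:

  static inline vui128_t
  example_split_10e32 (vui128_t *rem, vui128_t value)
  {
    // high-order decimal digits: value / 10**32
    vui128_t q = vec_divuq_10e32 (value);
    // low-order 32 decimal digits: value % 10**32
    *rem = vec_moduq_10e32 (value, q);
    return q;
  }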
4482 static inline
4483 vi128_t
4484 vec_maxsq (vi128_t vra, vi128_t vrb)
4485 {
4486  vb32_t maxmask;
4487 
4488  maxmask = (vb32_t) vec_cmpgtsq ( vra, vrb );
4489  return (vi128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, maxmask);
4490 }
4491 
4506 static inline
4507 vui128_t
4508 vec_maxuq (vui128_t vra, vui128_t vrb)
4509 {
4510  vb32_t maxmask;
4511 
4512  maxmask = (vb32_t) vec_cmpgtuq ( vra, vrb );
4513  return (vui128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, maxmask);
4514 }
4515 
4530 static inline
4531 vi128_t
4532 vec_minsq (vi128_t vra, vi128_t vrb)
4533 {
4534  vb32_t minmask;
4535 
4536  minmask = (vb32_t) vec_cmpgtsq ( vrb, vra );
4537  return (vi128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, minmask);
4538 }
4539 
4554 static inline
4555 vui128_t
4556 vec_minuq (vui128_t vra, vui128_t vrb)
4557 {
4558  vb32_t minmask;
4559 
4560  minmask = (vb32_t) vec_cmpgtuq ( vrb, vra );
4561  return (vui128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, minmask);
4562 }
4563 
4577 static inline vi128_t
4578 vec_modsq_10e31 (vi128_t vra, vi128_t q)
4579 {
4580  const vui128_t zero = (vui128_t) { (__int128) 0 };
4581  /* ten31 = +10000000000000000000000000000000UQ */
4582  const vui128_t ten31 = (vui128_t)
4583  { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
4584  vui128_t result, t;
4585 
4586  // multiply low and subtract modulo are the same for signed/unsigned
4587  // But now it is easier to compare q for zero than to do a signed compare to vra
4588  if (vec_cmpuq_all_ne ((vui128_t) vra, zero))
4589  {
4590  t = vec_mulluq ((vui128_t) q, ten31);
4591  result = vec_subuqm ((vui128_t) vra, (vui128_t) t);
4592  }
4593  else
4594  result = (vui128_t) vra;
4595 
4596  return (vi128_t) result;
4597 }
4598 
4619 static inline vui128_t
4620 vec_modudq_10e31 (vui128_t vra, vui128_t vrb, vui128_t *ql)
4621 {
4622  /* ten31 = +10000000000000000000000000000000UQ */
4623  const vui128_t ten31 = (vui128_t)
4624  { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
4625  const vui128_t zero = (vui128_t) { (__int128) 0UL };
4626  const vui128_t minus_one = (vui128_t) { (__int128) -1L };
4627  vui128_t result, t, th, c;
4628 
4629  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31))
4630  {
4631  t = vec_muludq (&th, *ql, ten31);
4632  c = vec_subcuq (vrb, t);
4633  t = vec_subuqm (vrb, t);
4634  th = vec_subeuqm (vra, th, c);
4635  // The remainder should be less than the divisor
4636  if (vec_cmpuq_all_ne (th, zero) && vec_cmpuq_all_ge (t, ten31))
4637  {
4638  // If not, the estimated quotient is off by 1
4639  *ql = vec_adduqm (*ql, minus_one);
4640  // And the remainder is negative, so add the divisor
4641  t = vec_adduqm (t, ten31);
4642  }
4643  result = t;
4644  }
4645  else
4646  result = vrb;
4647 
4648  return result;
4649 }
4650 
4651 
4672 static inline vui128_t
4673 vec_modudq_10e32 (vui128_t vra, vui128_t vrb, vui128_t *ql)
4674 {
4675  /* ten32 = +100000000000000000000000000000000UQ */
4676  const vui128_t ten32 = (vui128_t)
4677  { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };
4678  const vui128_t zero = (vui128_t) { (__int128) 0UL };
4679  const vui128_t minus_one = (vui128_t) { (__int128) -1L };
4680  vui128_t result, t, th, c;
4681 
4682  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten32))
4683  {
4684  t = vec_muludq (&th, *ql, ten32);
4685  c = vec_subcuq (vrb, t);
4686  t = vec_subuqm (vrb, t);
4687  th = vec_subeuqm (vra, th, c);
4688  // The remainder should be less than the divisor
4689  if (vec_cmpuq_all_ne (th, zero) && vec_cmpuq_all_ge (t, ten32))
4690  {
4691  // If not, the estimated quotient is off by 1
4692  *ql = vec_adduqm (*ql, minus_one);
4693  // And the remainder is negative, so add the divisor
4694  t = vec_adduqm (t, ten32);
4695  }
4696  result = t;
4697  }
4698  else
4699  result = vrb;
4700 
4701  return result;
4702 }
4703 
4717 static inline vui128_t
4718 vec_moduq_10e31 (vui128_t vra, vui128_t q)
4719 {
4720  /* ten31 = +10000000000000000000000000000000UQ */
4721  const vui128_t ten31 = (vui128_t)
4722  { (__int128) 1000000000000000UL
4723  * (__int128) 10000000000000000UL };
4724  vui128_t result, t;
4725 
4726  if (vec_cmpuq_all_ge (vra, ten31))
4727  {
4728  t = vec_mulluq (q, ten31);
4729  result = vec_subuqm (vra, t);
4730  }
4731  else
4732  result = vra;
4733 
4734  return result;
4735 }
4736 
4750 static inline vui128_t
4751 vec_moduq_10e32 (vui128_t vra, vui128_t q)
4752 {
4753  /* ten32 = +100000000000000000000000000000000UQ */
4754  const vui128_t ten32 = (vui128_t)
4755  { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };
4756  vui128_t result, t;
4757 
4758  if (vec_cmpuq_all_ge (vra, ten32))
4759  {
4760  t = vec_mulluq (q, ten32);
4761  result = vec_subuqm (vra, t);
4762  }
4763  else
4764  result = vra;
4765 
4766  return result;
4767 }
4768 
4784 static inline vui128_t
4785 vec_mul10cuq (vui128_t a)
4786 {
4787  vui32_t t_carry;
4788 #ifdef _ARCH_PWR9
4789  __asm__(
4790  "vmul10cuq %0,%1;\n"
4791  : "=v" (t_carry)
4792  : "v" (a)
4793  : );
4794 #else
4795  vui16_t ts = (vui16_t) a;
4796  vui16_t t10;
4797  vui32_t t_even, t_odd, t_high;
4798  vui32_t z = { 0, 0, 0, 0 };
4799  t10 = vec_splat_u16(10);
4800 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4801  t_even = vec_vmulouh (ts, t10);
4802  t_odd = vec_vmuleuh (ts, t10);
4803 #else
4804  t_even = vec_vmuleuh(ts, t10);
4805  t_odd = vec_vmulouh(ts, t10);
4806 #endif
4807  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
4808  t_high = vec_sld (z, t_even, 2);
4809  /* Shift t_even left 16 bits to align for lower 128-bits. */
4810  t_even = vec_sld (t_even, z, 2);
4811  /* then add the even/odd sub-products to generate the final product */
4812 #ifdef _ARCH_PWR8
4813  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
4814  t_carry = (vui32_t) vec_vaddcuq ((vui128_t) t_even, (vui128_t) t_odd);
4815  t_carry = (vui32_t) vec_vadduqm ((vui128_t) t_carry, (vui128_t) t_high);
4816 #else
4817  /* Use pveclib addcuq implementation for pre _ARCH_PWR8. */
4818  t_carry = (vui32_t) vec_addcuq ((vui128_t) t_even, (vui128_t) t_odd);
4819  /* The final carry is small (0-9) so use word add, ignore carries. */
4820  t_carry = vec_vadduwm (t_carry, t_high);
4821 #endif
4822 #endif
4823  return ((vui128_t) t_carry);
4824 }
4825 
4840 static inline vui128_t
4841 vec_mul10ecuq (vui128_t a, vui128_t cin)
4842 {
4843 // vui32_t t;
4844  vui32_t t_carry;
4845 #ifdef _ARCH_PWR9
4846  __asm__(
4847  "vmul10ecuq %0,%1,%2;\n"
4848  : "=&v" (t_carry)
4849  : "v" (a),
4850  "v" (cin)
4851  : );
4852 #else
4853  vui16_t ts = (vui16_t) a;
4854  vui32_t tc;
4855  vui16_t t10;
4856  vui32_t t_odd;
4857  vui32_t t_even, t_high;
4858  vui32_t z = { 0, 0, 0, 0 };
4859  t10 = vec_splat_u16(10);
4860 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4861  t_even = vec_vmulouh (ts, t10);
4862  t_odd = vec_vmuleuh (ts, t10);
4863 #else
4864  t_even = vec_vmuleuh(ts, t10);
4865  t_odd = vec_vmulouh(ts, t10);
4866 #endif
4867  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
4868  t_high = vec_sld (z, t_even, 2);
4869  /* Shift cin left 112 bits. */
4870  tc = vec_sld ((vui32_t) cin, z, 14);
4871  /* Shift t_even left 16 bits, merging the carry into the low bits. */
4872  t_even = vec_sld (t_even, tc, 2);
4873  /* then add the even/odd sub-products to generate the final product */
4874 #ifdef _ARCH_PWR8
4875  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
4876  t_carry = (vui32_t) vec_vaddcuq ((vui128_t) t_even, (vui128_t) t_odd);
4877  t_carry = (vui32_t) vec_vadduqm ((vui128_t) t_carry, (vui128_t) t_high);
4878 #else
4879  /* Use pveclib addcuq implementation for pre _ARCH_PWR8. */
4880  t_carry = (vui32_t) vec_addcuq ((vui128_t) t_even, (vui128_t) t_odd);
4881  /* The final carry is small (0-9) so use word add, ignore carries. */
4882  t_carry = vec_vadduwm (t_carry, t_high);
4883 #endif
4884 #endif
4885  return ((vui128_t) t_carry);
4886 }
4887 
4902 static inline vui128_t
4903 vec_mul10euq (vui128_t a, vui128_t cin)
4904 {
4905  vui32_t t;
4906 #ifdef _ARCH_PWR9
4907  __asm__(
4908  "vmul10euq %0,%1,%2;\n"
4909  : "=v" (t)
4910  : "v" (a),
4911  "v" (cin)
4912  : );
4913 #else
4914  vui16_t ts = (vui16_t) a;
4915  vui32_t tc;
4916  vui16_t t10;
4917  vui32_t t_odd, t_even;
4918  vui32_t z = { 0, 0, 0, 0 };
4919  t10 = vec_splat_u16(10);
4920 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4921  t_even = vec_vmulouh (ts, t10);
4922  t_odd = vec_vmuleuh (ts, t10);
4923 #else
4924  t_even = vec_vmuleuh(ts, t10);
4925  t_odd = vec_vmulouh(ts, t10);
4926 #endif
4927  /* Shift cin left 112 bits. */
4928  tc = vec_sld ((vui32_t) cin, z, 14);
4929  /* Shift t_even left 16 bits, merging the carry into the low bits. */
4930  t_even = vec_sld (t_even, tc, 2);
4931  /* then add the even/odd sub-products to generate the final product. */
4932 #ifdef _ARCH_PWR8
4933  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
4934 #else
4935  /* Use pveclib addcuq implementation for pre _ARCH_PWR8. */
4936  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
4937 #endif
4938 #endif
4939  return ((vui128_t) t);
4940 }
4941 
4955 static inline vui128_t
4956 vec_mul10uq (vui128_t a)
4957 {
4958  vui32_t t;
4959 #ifdef _ARCH_PWR9
4960  __asm__(
4961  "vmul10uq %0,%1;\n"
4962  : "=v" (t)
4963  : "v" (a)
4964  : );
4965 #else
4966  vui16_t ts = (vui16_t) a;
4967  vui16_t t10;
4968  vui32_t t_odd, t_even;
4969  vui32_t z = { 0, 0, 0, 0 };
4970  t10 = vec_splat_u16(10);
4971 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4972  t_even = vec_vmulouh (ts, t10);
4973  t_odd = vec_vmuleuh (ts, t10);
4974 #else
4975  t_even = vec_vmuleuh(ts, t10);
4976  t_odd = vec_vmulouh(ts, t10);
4977 #endif
4978  /* Shift t_even left 16 bits */
4979  t_even = vec_sld (t_even, z, 2);
4980  /* then add the even/odd sub-products to generate the final product */
4981 #ifdef _ARCH_PWR8
4982  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
4983 #else
4984  /* Use pveclib addcuq implementation for pre _ARCH_PWR8. */
4985  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
4986 #endif
4987 #endif
4988  return ((vui128_t) t);
4989 }
4990 
5005 static inline vui128_t
5006 vec_cmul100cuq (vui128_t *cout, vui128_t a)
5007 {
5008  vui32_t t;
5009  vui32_t t_carry;
5010 #ifdef _ARCH_PWR9
5011  vui128_t t0, t1, tc0, tc1;
5012  /* Times 10 with 1st carry. */
5013  tc0 = vec_mul10cuq (a);
5014  t0 = vec_mul10uq (a);
5015  /* Times 10 again with 2nd carry. */
5016  tc1 = vec_mul10cuq (t0);
5017  t1 = vec_mul10uq (t0);
5018  /* 1st carry times 10 plus 2nd carry. */
5019  t_carry = (vui32_t) vec_mul10euq (tc0, tc1);
5020  t = (vui32_t)t1;
5021 #else
5022  vui16_t ts = (vui16_t) a;
5023  vui16_t t100 = (vui16_t ) { 100, 100, 100, 100, 100, 100, 100, 100 };
5024  vui32_t t_odd, t_even, t_high;
5025  vui32_t z = { 0, 0, 0, 0 };
5026  //t100 = vec_splat_u16 (100);
5027 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
5028  t_even = vec_vmulouh (ts, t100);
5029  t_odd = vec_vmuleuh (ts, t100);
5030 #else
5031  t_even = vec_vmuleuh(ts, t100);
5032  t_odd = vec_vmulouh(ts, t100);
5033 #endif
5034  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
5035  t_high = vec_sld (z, t_even, 2);
5036  /* Shift t_even left 16 bits to align for lower 128-bits. */
5037  t_even = vec_sld (t_even, z, 2);
5038  /* then add the even/odd sub-products to generate the final product */
5039 #ifdef _ARCH_PWR8
5040  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
5041  t_carry = t_high; /* there is no carry into high */
5042  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
5043 #else
5044  t_carry = t_high; /* there is no carry into high */
5045  /* Use pveclib adduqm implementation for pre _ARCH_PWR8. */
5046  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5047 #endif
5048 #endif
5049  *cout = (vui128_t) t_carry;
5050  return ((vui128_t) t);
5051 }
5052 
5070 static inline vui128_t
5071 vec_cmul100ecuq (vui128_t *cout, vui128_t a, vui128_t cin)
5072 {
5073  vui32_t t;
5074  vui32_t t_carry;
5075 #ifdef _ARCH_PWR9
5076  vui128_t t0, t1, tc0, tc1;
5077  /* Times 10 with 1st carry. */
5078  tc0 = vec_mul10cuq (a);
5079  t0 = vec_mul10uq (a);
5080  /* Times 10 again with 2nd carry. No carry in yet. */
5081  tc1 = vec_mul10cuq (t0);
5082  t1 = vec_mul10uq (t0);
5083  /* 1st carry times 10 plus 2nd carry. */
5084  t_carry = (vui32_t) vec_mul10euq (tc0, tc1);
5085  /* Add cin to the low bits of a * 100. If cin is in valid range
5086  * (0-99) then can not generate carry out of low 128-bits. */
5087  t = (vui32_t) vec_vadduqm ((vui128_t) t1, cin);
5088 #else
5089  vui16_t ts = (vui16_t) a;
5090  vui32_t tc;
5091  vui16_t t100 = (vui16_t ) { 100, 100, 100, 100, 100, 100, 100, 100 };
5092  vui32_t t_odd, t_even, t_high;
5093  vui32_t z = { 0, 0, 0, 0 };
5094  //t100 = vec_splat_u16 (100);
5095 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
5096  t_even = vec_vmulouh (ts, t100);
5097  t_odd = vec_vmuleuh (ts, t100);
5098 #else
5099  t_even = vec_vmuleuh (ts, t100);
5100  t_odd = vec_vmulouh (ts, t100);
5101 #endif
5102  /* Shift t_even left 16-bits (right 112-bits) for the partial carry. */
5103  t_high = vec_sld (z, t_even, 2);
5104  /* Shift cin left 112 bits. */
5105  tc = vec_sld ((vui32_t) cin, z, 14);
5106  /* Shift t_even left 16 bits, merging the carry into the low bits. */
5107  t_even = vec_sld (t_even, tc, 2);
5108  /* then add the even/odd sub-products to generate the final product */
5109 #ifdef _ARCH_PWR8
5110  /* Any compiler that supports ARCH_PWR8 should support these builtins. */
5111  t_carry = t_high; /* there is no carry into high */
5112  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);
5113 #else
5114  t_carry = t_high; /* there is no carry into high */
5115  /* Use pveclib adduqm implementation for pre _ARCH_PWR8. */
5116  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5117 #endif
5118 #endif
5119  *cout = (vui128_t) t_carry;
5120  return ((vui128_t) t);
5121 }
5122 
5144 static inline vui128_t
5145 vec_msumcud (vui64_t a, vui64_t b, vui128_t c)
5146 {
5147  vui128_t res;
5148 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
5149  __asm__(
5150  "vmsumcud %0,%1,%2,%3;\n"
5151  : "=v" (res)
5152  : "v" (a), "v" (b), "v" (c)
5153  : );
5154 #else
5155  vui128_t p_even, p_odd, p_sum1, p_cry1, p_cry2;
5156  // Generate separate 128-bit even/odd products to isolate the carries
5157  p_even = vec_muleud (a, b);
5158  p_odd = vec_muloud (a, b);
5159  // Sum the products and generate the carry
5160 #ifdef _ARCH_PWR8
5161  p_sum1 = vec_adduqm (p_even, p_odd);
5162  p_cry1 = vec_addcuq (p_even, p_odd);
5163 #else
5164  p_sum1 = vec_addcq (&p_cry1, p_even, p_odd);
5165 #endif
5166  // Generate the carry from the sum (p_even + p_odd + c)
5167  p_cry2 = vec_addcuq (p_sum1, c);
5168  // Sum the two carries
5169 #ifdef _ARCH_PWR9
5170  res = vec_adduqm (p_cry2, p_cry1);
5171 #else
5172  /* Results can be 0-2, So Add Word will do. */
5173  res = (vui128_t) vec_add ((vui32_t) p_cry2, (vui32_t) p_cry1);
5174 #endif
5175 #endif
5176  return (res);
5177 }
5178 
5201 static inline vui128_t
5202 vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
5203 {
5204  vui128_t res;
5205 #if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
5206  __asm__(
5207  "vmsumudm %0,%1,%2,%3;\n"
5208  : "=v" (res)
5209  : "v" (a), "v" (b), "v" (c)
5210  : );
5211 #else
5212  vui128_t p_even, p_odd, p_sum;
5213 
5214  p_even = vec_muleud (a, b);
5215  p_odd = vec_muloud (a, b);
5216  p_sum = vec_adduqm (p_even, p_odd);
5217  res = vec_adduqm (p_sum, c);
5218 #endif
5219 
5220  return (res);
5221 }
5222 
5243 static inline vui128_t
5244 vec_muleud (vui64_t a, vui64_t b)
5245 {
5246 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
5247  return vec_vmuloud (a, b);
5248 #else
5249  return vec_vmuleud (a, b);
5250 #endif
5251 }
5252 
5276 static inline vui64_t
5277 vec_mulhud (vui64_t vra, vui64_t vrb)
5278 {
5279 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
5280  vui64_t res;
5281  __asm__(
5282  "vmulhud %0,%1,%2;\n"
5283  : "=v" (res)
5284  : "v" (vra), "v" (vrb)
5285  : );
5286  return res;
5287 #else
5288  return vec_mrgahd (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));
5289 #endif
5290 }
5291 
5312 static inline vui128_t
5313 vec_muloud (vui64_t a, vui64_t b)
5314 {
5315 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
5316  return vec_vmuleud (a, b);
5317 #else
5318  return vec_vmuloud (a, b);
5319 #endif
5320 }
5321 
5343 static inline vui64_t
5344 vec_mulld (vui64_t vra, vui64_t vrb)
5345 {
5346 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
5347  vui64_t res;
5348  __asm__(
5349  "vmulld %0,%1,%2;\n"
5350  : "=v" (res)
5351  : "v" (vra), "v" (vrb)
5352  : );
5353  return res;
5354 #elif defined (_ARCH_PWR9)
5355  return vec_mrgald (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));
5356 #elif defined (_ARCH_PWR8)
5357  vui64_t s32 = { 32, 32 }; // shift / rotate amount.
5358  vui64_t z = { 0, 0 };
5359  vui64_t t2, t3, t4;
5360  vui32_t t1;
5361 
5362  t1 = (vui32_t) vec_vrld (vrb, s32);
5363  t2 = vec_vmulouw ((vui32_t)vra, (vui32_t)vrb);
5364  t3 = vec_vmsumuwm ((vui32_t)vra, t1, z);
5365  t4 = vec_vsld (t3, s32);
5366  return (vui64_t) vec_addudm (t4, t2);
5367 #else
5368  return vec_mrgald (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));
5369 #endif
5370 }
5371 
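A minimal sketch (assumed usage, not from the original source; example_mulfull_ud is an illustrative name) that pairs vec_mulhud and vec_mulld to produce the full 128-bit product of each doubleword element:

  static inline void
  example_mulfull_ud (vui64_t *ph, vui64_t *pl, vui64_t vra, vui64_t vrb)
  {
    // high 64 bits of each 64 x 64 product
    *ph = vec_mulhud (vra, vrb);
    // low 64 bits of each 64 x 64 product
    *pl = vec_mulld (vra, vrb);
  }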
5386 static inline vui128_t
5387 vec_mulhuq (vui128_t a, vui128_t b)
5388 {
5389  vui32_t t;
5390  /* compute the 256 bit product of two 128 bit values a, b.
5391  * The high 128 bits are accumulated in t and the low 128-bits
5392  * in tmq. The high 128-bits are the return value.
5393  */
5394 #ifdef _ARCH_PWR9
5395  const vui64_t zero = { 0, 0 };
5396  vui64_t b_swap = vec_swapd ((vui64_t) b);
5397  vui128_t tmh, tab, tba, tb0, tc1, tc2, tmq;
5398  /* multiply the low 64-bits of a and b. For PWR9 this is just
5399  * vmsumudm with conditioned inputs. */
5400  tmq = vec_vmuloud ((vui64_t) a, (vui64_t) b);
5401  /* compute the 2 middle partial products. Can't directly use
5402  * vmsumudm here because the sum of partial products can overflow. */
5403  tab = vec_vmuloud ((vui64_t) a, b_swap);
5404  tba = vec_vmuleud ((vui64_t) a, b_swap);
5405  t = (vui32_t) vec_adduqm (tab, tba);
5406  tc1 = vec_addcuq (tab, tba);
5407  tmh = (vui128_t) vec_mrgahd ((vui128_t) zero, (vui128_t) tmq);
5408  t = (vui32_t ) vec_adduqm ((vui128_t) t, tmh);
5409  tc2 = vec_addcuq ((vui128_t) t, tmh);
5410  tc1 = (vui128_t) vec_vadduwm ((vui32_t) tc1, (vui32_t) tc2);
5411  /* result = t[l] || tmq[l]. */
5412  tmq = (vui128_t) vec_mrgald ((vui128_t) t, (vui128_t) tmq);
5413  /* we can use multiply sum here because the high product plus the
5414  * high sum of middle partial products can't overflow. */
5415  t = (vui32_t) vec_permdi ((vui64_t) tc1, (vui64_t) t, 2);
5416  tb0 = (vui128_t) vec_mrgahd ((vui128_t) b, (vui128_t) zero);
5417  /* sum = (a[h] * b[h]) + (a[l] * 0) + (tc1[l] || t[h]). */
5418  t = (vui32_t) vec_msumudm ((vui64_t) a, (vui64_t) tb0, (vui128_t) t);
5419 #else
5420 #ifdef _ARCH_PWR8
5421  vui32_t tsw;
5422  vui32_t t_odd, t_even;
5423  vui32_t z = { 0, 0, 0, 0 };
5424  /* We use Vector Multiply Even/Odd Unsigned Word to compute
5425  * the 128 x 32 partial (160-bit) product of vector a with a
5426  * word element of b. The (for each word of vector b) 4 X 160-bit
5427  * partial products are summed to produce the full 256-bit product.
5428  * See the comment in vec_muludq for details.
5429  */
5430  tsw = vec_splat ((vui32_t) b, VEC_WE_3);
5431  t_even = (vui32_t) vec_vmuleuw ((vui32_t) a, tsw);
5432  t_odd = (vui32_t) vec_vmulouw ((vui32_t) a, tsw);
5433  /* shift the low 128 bits of partial product right 32-bits */
5434  t_odd = vec_sld (z, t_odd, 12);
5435  /* add the high 128 bits of even / odd partial products */
5436  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5437 
5438  tsw = vec_splat ((vui32_t) b, VEC_WE_2);
5439  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5440  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5441  /* shift the low 128 bits of partial product right 32-bits */
5442  t_odd = vec_sld (z, t_odd, 12);
5443  /* add the top 128 bits of even / odd partial products */
5444  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5445 
5446  tsw = vec_splat ((vui32_t) b, VEC_WE_1);
5447  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5448  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5449  /* shift the low 128 bits of partial product right 32-bits */
5450  t_odd = vec_sld (z, t_odd, 12);
5451  /* add the top 128 bits of even / odd partial products */
5452  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5453 
5454  tsw = vec_splat ((vui32_t) b, VEC_WE_0);
5455  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5456  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5457  /* shift the low 128 bits of partial product right 32-bits */
5458  t_odd = vec_sld (z, t_odd, 12);
5459  /* add the top 128 bits of even / odd partial products */
5460  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5461 #else // _ARCH_PWR7 or earlier and Big Endian only.
5462  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute
5463  * the 128 x 16 partial (144-bit) product of vector a with a
5464  * halfword element of b. The (for each halfword of vector b)
5465  * 8 X 144-bit partial products are summed to produce the full
5466  * 256-bit product. */
5467  vui16_t tsw;
5468  vui16_t t_odd, t_even;
5469  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };
5470 
5471  tsw = vec_splat ((vui16_t) b, 7);
5472  t_even = (vui16_t) vec_vmuleuh ((vui16_t) a, tsw);
5473  t_odd = (vui16_t) vec_vmulouh ((vui16_t) a, tsw);
5474 
5475  /* shift the low 128 bits of partial product right 16-bits */
5476  t_odd = vec_sld (z, t_odd, 14);
5477  /* add the high 128 bits of even / odd partial products */
5478  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5479 
5480  tsw = vec_splat ((vui16_t) b, 6);
5481  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5482  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5483  /* shift the low 128 bits of partial product right 16-bits */
5484  t_odd = vec_sld (z, t_odd, 14);
5485  /* add the top 128 bits of even / odd partial products */
5486  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5487 
5488  tsw = vec_splat ((vui16_t) b, 5);
5489  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5490  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5491  /* shift the low 128 bits of partial product right 16-bits */
5492  t_odd = vec_sld (z, t_odd, 14);
5493  /* add the top 128 bits of even / odd partial products */
5494  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5495 
5496  tsw = vec_splat ((vui16_t) b, 4);
5497  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5498  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5499  /* shift the low 128 bits of partial product right 16-bits */
5500  t_odd = vec_sld (z, t_odd, 14);
5501  /* add the top 128 bits of even / odd partial products */
5502  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5503 
5504  tsw = vec_splat ((vui16_t) b, 3);
5505  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5506  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5507  /* shift the low 128 bits of partial product right 16-bits */
5508  t_odd = vec_sld (z, t_odd, 14);
5509  /* add the top 128 bits of even / odd partial products */
5510  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5511 
5512  tsw = vec_splat ((vui16_t) b, 2);
5513  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5514  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5515  /* shift the low 128 bits of partial product right 16-bits */
5516  t_odd = vec_sld (z, t_odd, 14);
5517  /* add the top 128 bits of even / odd partial products */
5518  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5519 
5520  tsw = vec_splat ((vui16_t) b, 1);
5521  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5522  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5523  /* shift the low 128 bits of partial product right 16-bits */
5524  t_odd = vec_sld (z, t_odd, 14);
5525  /* add the top 128 bits of even / odd partial products */
5526  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5527 
5528  tsw = vec_splat ((vui16_t) b, 0);
5529  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5530  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5531  /* shift the low 128 bits of partial product right 16-bits */
5532  t_odd = vec_sld (z, t_odd, 14);
5533  /* add the top 128 bits of even / odd partial products */
5534  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5535 #endif
5536 #endif
5537  return ((vui128_t) t);
5538 }
5539 
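/* Quarter-scale scalar model (sketch, not pveclib code) of the
 * high-product scheme above: the high half of a 64 x 64 product from
 * four 32 x 32 partial products, capturing the carry out of the middle
 * column separately, which is the role tc1 plays in the vector code. */
static inline unsigned long long
example_mulh_model (unsigned long long a, unsigned long long b)
{
  unsigned long long a_h = a >> 32, a_l = (unsigned int) a;
  unsigned long long b_h = b >> 32, b_l = (unsigned int) b;
  unsigned long long ll = a_l * b_l, lh = a_l * b_h;
  unsigned long long hl = a_h * b_l, hh = a_h * b_h;
  // sum the middle column plus the high half of the low product,
  // tracking the carry out of this 64-bit sum
  unsigned long long mid = lh + hl;
  unsigned long long cry = (mid < lh) ? 1ULL : 0ULL;
  mid += (ll >> 32);
  cry += (mid < (ll >> 32)) ? 1ULL : 0ULL;
  // high partial product plus the middle column carried into bits 63:32
  return hh + (mid >> 32) + (cry << 32);
}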
5554 static inline vui128_t
5555 vec_mulluq (vui128_t a, vui128_t b)
5556 {
5557  vui32_t t, tmq;
5558  /* compute the 256 bit product of two 128 bit values a, b.
5559  * The high 128 bits are accumulated in t and the low 128-bits
5560  * in tmq. Only the low order 128 bits of the product are
5561  * returned.
5562  */
5563 #ifdef _ARCH_PWR9
5564  const vui64_t zero = { 0, 0 };
5565  vui64_t b_swap = vec_swapd ((vui64_t) b);
5566  /* multiply the low 64-bits of a and b. For PWR9 this is just
5567  * vmsumudm with conditioned inputs. */
5568  tmq = (vui32_t) vec_vmuloud ((vui64_t) a, (vui64_t) b);
5569  /* we can use multiply sum here because we only need the low 64-bits
5570  * and don't care if we lose the carry / overflow. */
5571  t = (vui32_t) vec_mrgahd ((vui128_t) zero, (vui128_t) tmq);
5572  /* sum = (a[h] * b[l]) + (a[l] * b[h]) + (zero || tmq[h]). */
5573  t = (vui32_t) vec_msumudm ((vui64_t) a, b_swap, (vui128_t) t);
5574  /* result = t[l] || tmq[l]. */
5575  tmq = (vui32_t) vec_mrgald ((vui128_t) t, (vui128_t) tmq);
5576 #else
5577 #ifdef _ARCH_PWR8
5578  /* We use Vector Multiply Even/Odd Unsigned Word to compute
5579  * the 128 x 32 partial (160-bit) product of vector a with a
5580  * word element of b. The (for each word of vector b) 4 X 160-bit
5581  * partial products are summed to produce the full 256-bit product.
5582  * See the comment in vec_muludq for details.
5583  */
5584  vui32_t tsw;
5585  vui32_t t_odd, t_even;
5586  vui32_t z = { 0, 0, 0, 0 };
5587 
5588  tsw = vec_splat ((vui32_t) b, VEC_WE_3);
5589  t_even = (vui32_t) vec_vmuleuw ((vui32_t) a, tsw);
5590  t_odd = (vui32_t) vec_vmulouw ((vui32_t) a, tsw);
5591  /* Rotate the low 32-bits (right) into tmq. This is actually
5592  * implemented as 96-bit (12-byte) shift left. */
5593  tmq = vec_sld (t_odd, z, 12);
5594  /* shift the low 128 bits of partial product right 32-bits */
5595  t_odd = vec_sld (z, t_odd, 12);
5596  /* add the high 128 bits of even / odd partial products */
5597  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5598 
5599  tsw = vec_splat ((vui32_t) b, VEC_WE_2);
5600  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5601  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5602  /* rotate right the low 32-bits into tmq */
5603  tmq = vec_sld (t_odd, tmq, 12);
5604  /* shift the low 128 bits of partial product right 32-bits */
5605  t_odd = vec_sld (z, t_odd, 12);
5606  /* add the top 128 bits of even / odd partial products */
5607  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5608 
5609  tsw = vec_splat ((vui32_t) b, VEC_WE_1);
5610  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5611  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5612  /* rotate right the low 32-bits into tmq */
5613  tmq = vec_sld (t_odd, tmq, 12);
5614  /* shift the low 128 bits of partial product right 32-bits */
5615  t_odd = vec_sld (z, t_odd, 12);
5616  /* add the top 128 bits of even / odd partial products */
5617  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5618 
5619  tsw = vec_splat ((vui32_t) b, VEC_WE_0);
5620  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5621  /* rotate right the low 32-bits into tmq */
5622  tmq = vec_sld (t_odd, tmq, 12);
5623  // don't need the high 128-bits of the 160-bit product.
5624 #else
5625  // _ARCH_PWR7 or earlier and Big Endian only.
5626  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute
5627  * the 128 x 16 partial (144-bit) product of vector a with a
5628  * halfword element of b. The (for each halfword of vector b)
5629  * 8 X 144-bit partial products are summed to produce the full
5630  * 256-bit product. */
5631  vui16_t tsw;
5632  vui16_t t_odd, t_even;
5633  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };
5634 
5635  tsw = vec_splat ((vui16_t) b, 7);
5636  t_even = (vui16_t) vec_vmuleuh ((vui16_t) a, tsw);
5637  t_odd = (vui16_t) vec_vmulouh ((vui16_t) a, tsw);
5638 
5639  /* Rotate the low 16-bits (right) into tmq. This is actually
5640  * implemented as 112-bit (14-byte) shift left. */
5641  tmq = (vui32_t) vec_sld (t_odd, z, 14);
5642  /* shift the low 128 bits of partial product right 16-bits */
5643  t_odd = vec_sld (z, t_odd, 14);
5644  /* add the high 128 bits of even / odd partial products */
5645  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5646 
5647  tsw = vec_splat ((vui16_t) b, 6);
5648  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5649  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5650  /* rotate right the low 16-bits into tmq */
5651  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5652  /* shift the low 128 bits of partial product right 16-bits */
5653  t_odd = vec_sld (z, t_odd, 14);
5654  /* add the top 128 bits of even / odd partial products */
5655  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5656 
5657  tsw = vec_splat ((vui16_t) b, 5);
5658  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5659  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5660  /* rotate right the low 16-bits into tmq */
5661  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5662  /* shift the low 128 bits of partial product right 16-bits */
5663  t_odd = vec_sld (z, t_odd, 14);
5664  /* add the top 128 bits of even / odd partial products */
5665  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5666 
5667  tsw = vec_splat ((vui16_t) b, 4);
5668  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5669  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5670  /* rotate right the low 16-bits into tmq */
5671  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5672  /* shift the low 128 bits of partial product right 16-bits */
5673  t_odd = vec_sld (z, t_odd, 14);
5674  /* add the top 128 bits of even / odd partial products */
5675  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5676 
5677  tsw = vec_splat ((vui16_t) b, 3);
5678  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5679  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5680  /* rotate right the low 16-bits into tmq */
5681  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5682  /* shift the low 128 bits of partial product right 16-bits */
5683  t_odd = vec_sld (z, t_odd, 14);
5684  /* add the top 128 bits of even / odd partial products */
5685  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5686 
5687  tsw = vec_splat ((vui16_t) b, 2);
5688  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5689  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5690  /* rotate right the low 16-bits into tmq */
5691  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5692  /* shift the low 128 bits of partial product right 16-bits */
5693  t_odd = vec_sld (z, t_odd, 14);
5694  /* add the top 128 bits of even / odd partial products */
5695  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5696 
5697  tsw = vec_splat ((vui16_t) b, 1);
5698  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5699  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5700  /* rotate right the low 16-bits into tmq */
5701  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5702  /* shift the low 128 bits of partial product right 16-bits */
5703  t_odd = vec_sld (z, t_odd, 14);
5704  /* add the top 128 bits of even / odd partial products */
5705  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5706 
5707  tsw = vec_splat ((vui16_t) b, 0);
5708  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5709  /* rotate right the low 16-bits into tmq */
5710  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5711 #endif
5712 #endif
5713  return ((vui128_t) tmq);
5714 }
5715 
5733 static inline vui128_t
5734 vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b)
5735 {
5736  vui32_t t, tmq;
5737  /* compute the 256 bit product of two 128 bit values a, b.
5738  * The high 128 bits are accumulated in t and the low 128-bits
5739  * in tmq. The high 128-bits of the product are returned to the
5740  * address of the 1st parm. The low 128-bits are the return
5741  * value.
5742  */
5743 #ifdef _ARCH_PWR9
5744  const vui64_t zero = { 0, 0 };
5745  vui64_t a_swap = vec_swapd ((vui64_t) a);
5746  vui128_t thq, tlq, tx;
5747  vui128_t t0l, tc1;
5748  vui128_t thh, thl, tlh, tll;
5749  /* multiply the low 64-bits of a and b. For PWR9 this is just
5750  * vmsumudm with conditioned inputs. */
5751  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);
5752  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);
5753  thl = vec_vmuloud (a_swap, (vui64_t)b);
5754  tlh = vec_vmuleud (a_swap, (vui64_t)b);
5755  /* sum the two middle products (plus the high 64-bits of the low
5756  * product). This will generate a carry that we need to capture. */
5757  t0l = (vui128_t) vec_mrgahd ( (vui128_t) zero, tll);
5758  tc1 = vec_addcuq (thl, tlh);
5759  tx = vec_adduqm (thl, tlh);
5760  tx = vec_adduqm (tx, t0l);
5761  /* result = t[l] || tll[l]. */
5762  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);
5763  /* Sum the high product plus the high sum (with carry) of middle
5764  * partial products. This can't overflow. */
5765  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);
5766  thq = vec_adduqm ( thh, thq);
5767 
5768  t = (vui32_t) thq;
5769  tmq = (vui32_t) tlq;
5770 #else
5771 #ifdef _ARCH_PWR8
5772  vui32_t tsw;
5773  vui32_t t_odd, t_even;
5774  vui32_t z = { 0, 0, 0, 0 };
5775  /* We use the Vector Multiply Even/Odd Unsigned Word to compute
5776  * the 128 x 32 partial (160-bit) product of value a with the
5777  * word splat of b. This produces four 64-bit (32 x 32)
5778  * partial products in two vector registers.
5779  *
5780  * These results
5781  * are not aligned for summation as is. So the odd result is
5782  * shifted right 32-bits before it is summed (via Vector Add
5783  * Unsigned Quadword Modulo) with the even result.
5784  * The low order 32-bits, of the 160-bit product
5785  * is shifted (right) in to a separate vector (tmq).
5786  *
5787  * This is repeated for each (low to high order) words of b.
5788  * After the first (160-bit) partial product, the high 128-bits
5789  * (t) of the previous partial product is summed with the current
5790  * odd multiply result, before this sum (including any carry out)
5791  * is shifted right 32-bits. Bits shifted out of this sum
5792  * are shifted (32-bits at a time) into the low order 128-bits
5793  * of the product (tmq). The shifted odd sum is then added to the
5794  * current even product. After the 4th step this sum is the
5795  * final high order 128-bits of the quadword product. */
5796  tsw = vec_splat ((vui32_t) b, VEC_WE_3);
5797  t_even = (vui32_t)vec_vmuleuw((vui32_t)a, tsw);
5798  t_odd = (vui32_t)vec_vmulouw((vui32_t)a, tsw);
5799  /* Rotate the low 32-bits (right) into tmq. This is actually
5800  * implemented as 96-bit (12-byte) shift left. */
5801  tmq = vec_sld (t_odd, z, 12);
5802  /* shift the low 128 bits of partial product right 32-bits */
5803  t_odd = vec_sld (z, t_odd, 12);
5804  /* add the high 128 bits of even / odd partial products */
5805  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5806 
5807  tsw = vec_splat ((vui32_t) b, VEC_WE_2);
5808  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5809  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5810  /* rotate right the low 32-bits into tmq */
5811  tmq = vec_sld (t_odd, tmq, 12);
5812  /* shift the low 128 bits of partial product right 32-bits */
5813  t_odd = vec_sld (z, t_odd, 12);
5814  /* add the top 128 bits of even / odd partial products */
5815  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5816 
5817  tsw = vec_splat ((vui32_t) b, VEC_WE_1);
5818  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5819  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5820  /* rotate right the low 32-bits into tmq */
5821  tmq = vec_sld (t_odd, tmq, 12);
5822  /* shift the low 128 bits of partial product right 32-bits */
5823  t_odd = vec_sld (z, t_odd, 12);
5824  /* add the top 128 bits of even / odd partial products */
5825  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5826 
5827  tsw = vec_splat ((vui32_t) b, VEC_WE_0);
5828  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
5829  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
5830  /* rotate right the low 32-bits into tmq */
5831  tmq = vec_sld (t_odd, tmq, 12);
5832  /* shift the low 128 bits of partial product right 32-bits */
5833  t_odd = vec_sld (z, t_odd, 12);
5834  /* add the top 128 bits of even / odd partial products */
5835  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5836 #else // _ARCH_PWR7 or earlier and Big Endian only.
5837  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute
5838  * the 128 x 16 partial (144-bit) product of vector a with a
5839  * halfword element of b. The (for each halfword of vector b)
5840  * 8 X 144-bit partial products are summed to produce the full
5841  * 256-bit product. */
5842  vui16_t tsw;
5843  vui16_t t_odd, t_even;
5844  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };
5845 
5846  tsw = vec_splat ((vui16_t) b, 7);
5847  t_even = (vui16_t)vec_vmuleuh((vui16_t)a, tsw);
5848  t_odd = (vui16_t)vec_vmulouh((vui16_t)a, tsw);
5849 
5850  /* Rotate the low 16-bits (right) into tmq. This is actually
5851  * implemented as 112-bit (14-byte) shift left. */
5852  tmq = (vui32_t)vec_sld (t_odd, z, 14);
5853  /* shift the low 128 bits of partial product right 16-bits */
5854  t_odd = vec_sld (z, t_odd, 14);
5855  /* add the high 128 bits of even / odd partial products */
5856  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5857 
5858  tsw = vec_splat ((vui16_t) b, 6);
5859  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5860  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5861  /* rotate right the low 16-bits into tmq */
5862  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5863  /* shift the low 128 bits of partial product right 16-bits */
5864  t_odd = vec_sld (z, t_odd, 14);
5865  /* add the top 128 bits of even / odd partial products */
5866  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5867 
5868  tsw = vec_splat ((vui16_t) b, 5);
5869  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5870  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5871  /* rotate right the low 16-bits into tmq */
5872  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5873  /* shift the low 128 bits of partial product right 16-bits */
5874  t_odd = vec_sld (z, t_odd, 14);
5875  /* add the top 128 bits of even / odd partial products */
5876  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5877 
5878  tsw = vec_splat ((vui16_t) b, 4);
5879  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5880  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5881  /* rotate right the low 16-bits into tmq */
5882  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5883  /* shift the low 128 bits of partial product right 16-bits */
5884  t_odd = vec_sld (z, t_odd, 14);
5885  /* add the top 128 bits of even / odd partial products */
5886  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5887 
5888  tsw = vec_splat ((vui16_t) b, 3);
5889  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5890  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5891  /* rotate right the low 16-bits into tmq */
5892  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5893  /* shift the low 128 bits of partial product right 16-bits */
5894  t_odd = vec_sld (z, t_odd, 14);
5895  /* add the top 128 bits of even / odd partial products */
5896  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5897 
5898  tsw = vec_splat ((vui16_t) b, 2);
5899  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5900  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5901  /* rotate right the low 16-bits into tmq */
5902  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5903  /* shift the low 128 bits of partial product right 16-bits */
5904  t_odd = vec_sld (z, t_odd, 14);
5905  /* add the top 128 bits of even / odd partial products */
5906  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5907 
5908  tsw = vec_splat ((vui16_t) b, 1);
5909  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5910  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5911  /* rotate right the low 16-bits into tmq */
5912  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5913  /* shift the low 128 bits of partial product right 16-bits */
5914  t_odd = vec_sld (z, t_odd, 14);
5915  /* add the top 128 bits of even / odd partial products */
5916  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5917 
5918  tsw = vec_splat ((vui16_t) b, 0);
5919  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
5920  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
5921  /* rotate right the low 16-bits into tmq */
5922  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
5923  /* shift the low 128 bits of partial product right 16-bits */
5924  t_odd = vec_sld (z, t_odd, 14);
5925  /* add the top 128 bits of even / odd partial products */
5926  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
5927 #endif
5928 #endif
5929  *mulu = (vui128_t) t;
5930  return ((vui128_t) tmq);
5931 }
5932 
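/* Usage sketch: vec_muludq returns the low 128 bits of the product and
 * stores the high 128 bits through the first parameter; vec_mulhuq and
 * vec_mulluq above return the same halves individually. The constant
 * and helper name below are illustrative only. */
static inline vui128_t
example_muludq_usage (vui128_t *high)
{
  // 10**19 as a quadword constant
  vui128_t ten19 = CONST_VUINT128_QxD (0UL, 10000000000000000000UL);
  // 10**38 still fits in 128 bits, so *high is zero here and the
  // return value holds the whole product
  return vec_muludq (high, ten19, ten19);
}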
5955 static inline vui128_t
5956 vec_madduq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c)
5957 {
5958  vui128_t ph, pl;
5959 #ifdef _ARCH_PWR9
5960  vui64_t a_swap = vec_swapd ((vui64_t) a);
5961  vui128_t thq, tlq, tx;
5962  vui128_t t0l, tc1, tcl;
5963  vui128_t thh, thl, tlh, tll;
5964  /* multiply the low 64-bits of a and b. For PWR9 this is just
5965  * vmsumudm with conditioned inputs. */
5966  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);
5967  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);
5968  thl = vec_vmuloud (a_swap, (vui64_t)b);
5969  tlh = vec_vmuleud (a_swap, (vui64_t)b);
5970  /* Add c to lower 128-bits of the partial product. */
5971  tcl = vec_addcuq (tll, c);
5972  tll = vec_adduqm (tll, c);
5973  t0l = (vui128_t) vec_permdi ((vui64_t) tcl, (vui64_t) tll, 2);
5974  /* sum the two middle products (plus the high 64-bits of the low
5975  * product). This will generate a carry that we need to capture. */
5976  tc1 = vec_addcuq (thl, tlh);
5977  tx = vec_adduqm (thl, tlh);
5978  tx = vec_adduqm (tx, t0l);
5979  /* result = t[l] || tll[l]. */
5980  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);
5981  /* Sum the high product plus the high sum (with carry) of middle
5982  * partial products. This can't overflow. */
5983  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);
5984  thq = vec_adduqm ( thh, thq);
5985 
5986  pl = tlq;
5987  ph = thq;
5988 #else
5989 #if _ARCH_PWR8
5990  vui32_t t, tmq;
5991  vui32_t tsw;
5992  vui32_t t_odd, t_even;
5993  vui32_t z = { 0, 0, 0, 0 };
5994  /* We use Vector Multiply Even/Odd Unsigned Word to compute
5995  * a 128 x 32 partial (160-bit) product of value a with the
5996  * word splat [3,2,1,0] of b in 4 steps. Each step produces
5997  * four 64-bit (32 x 32) partial products in two vector registers.
5998  * These must be shifted for alignment and summed (128-bit add)
5999  * to produce the 160-bit partial product.
6000  *
6001  * These results
6002  * are not aligned for summation as is. So the odd result is
6003  * shifted right 32-bits before it is summed (via Vector Add
6004  * Unsigned Quadword Modulo) with the even result.
6005  * The low order 32-bits, of the 160-bit product
6006  * is shifted (right) in to a separate vector (tmq).
6007  * This is repeated for each stage of the multiply, so that tmq
6008  * accumulates the low order 128-bits of the 256-bit product.
6009  *
6010  * This is repeated for each (low to high order) words of b.
6011  * After the first (160-bit) partial product, the high 128-bits
6012  * (t) of the previous partial product is summed with the current
6013  * odd multiply result, before this sum (including any carry out)
6014  * is shifted right 32-bits. Bits shifted out of this sum
6015  * are shifted (32-bits at a time) into the low order 128-bits
6016  * of the product (tmq). The shifted odd sum is then added to the
6017  * current even product. After the 4th step this sum is the
6018  * final high order 128-bits of the quadword product. */
6019  tsw = vec_splat ((vui32_t) b, VEC_WE_3);
6020  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw,(vui32_t)c);
6021  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, (vui32_t)c);
6022  /* Rotate the low 32-bits (right) into tmq. This is actually
6023  * implemented as 96-bit (12-byte) shift left. */
6024  tmq = vec_sld (t_odd, z, 12);
6025  /* shift the low 128 bits of partial product right 32-bits */
6026  t_odd = vec_sld (z, t_odd, 12);
6027  /* add the high 128 bits of even / odd partial products */
6028  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6029 
6030  tsw = vec_splat ((vui32_t) b, VEC_WE_2);
6031  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
6032  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
6033  /* rotate right the low 32-bits into tmq */
6034  tmq = vec_sld (t_odd, tmq, 12);
6035  /* shift the low 128 bits of partial product right 32-bits */
6036  t_odd = vec_sld (z, t_odd, 12);
6037  /* add the top 128 bits of even / odd partial products */
6038  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6039 
6040  tsw = vec_splat ((vui32_t) b, VEC_WE_1);
6041  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
6042  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
6043  /* rotate right the low 32-bits into tmq */
6044  tmq = vec_sld (t_odd, tmq, 12);
6045  /* shift the low 128 bits of partial product right 32-bits */
6046  t_odd = vec_sld (z, t_odd, 12);
6047  /* add the top 128 bits of even / odd partial products */
6048  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6049 
6050  tsw = vec_splat ((vui32_t) b, VEC_WE_0);
6051  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
6052  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
6053  /* rotate right the low 32-bits into tmq */
6054  tmq = vec_sld (t_odd, tmq, 12);
6055  /* shift the low 128 bits of partial product right 32-bits */
6056  t_odd = vec_sld (z, t_odd, 12);
6057  /* add the top 128 bits of even / odd partial products */
6058  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6059  ph = (vui128_t) t;
6060  pl = (vui128_t) tmq;
6061 #else // _ARCH_PWR7 or earlier and Big Endian only.
6062  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute
6063  * the 128 x 16 partial (144-bit) product of vector a with a
6064  * halfword element of b. The (for each halfword of vector b)
6065  * 8 X 144-bit partial products are summed to produce the full
6066  * 256-bit product. */
6067  vui32_t t, tmq;
6068  vui16_t tsw;
6069  vui16_t t_odd, t_even;
6070  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };
6071 
6072  tsw = vec_splat ((vui16_t) b, 7);
6073  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) c);
6074  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) c);
6075  /* Rotate the low 16-bits (right) into tmq. This is actually
6076  * implemented as 112-bit (14-byte) shift left. */
6077  tmq = (vui32_t)vec_sld (t_odd, z, 14);
6078  /* shift the low 128 bits of partial product right 16-bits */
6079  t_odd = vec_sld (z, t_odd, 14);
6080  /* add the high 128 bits of even / odd partial products */
6081  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6082 
6083  tsw = vec_splat ((vui16_t) b, 6);
6084  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6085  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6086  /* rotate right the low 16-bits into tmq */
6087  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6088  /* shift the low 128 bits of partial product right 16-bits */
6089  t_odd = vec_sld (z, t_odd, 14);
6090  /* add the top 128 bits of even / odd partial products */
6091  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6092 
6093  tsw = vec_splat ((vui16_t) b, 5);
6094  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6095  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6096  /* rotate right the low 16-bits into tmq */
6097  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6098  /* shift the low 128 bits of partial product right 16-bits */
6099  t_odd = vec_sld (z, t_odd, 14);
6100  /* add the top 128 bits of even / odd partial products */
6101  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6102 
6103  tsw = vec_splat ((vui16_t) b, 4);
6104  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6105  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6106  /* rotate right the low 16-bits into tmq */
6107  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6108  /* shift the low 128 bits of partial product right 16-bits */
6109  t_odd = vec_sld (z, t_odd, 14);
6110  /* add the top 128 bits of even / odd partial products */
6111  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6112 
6113  tsw = vec_splat ((vui16_t) b, 3);
6114  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6115  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6116  /* rotate right the low 16-bits into tmq */
6117  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6118  /* shift the low 128 bits of partial product right 16-bits */
6119  t_odd = vec_sld (z, t_odd, 14);
6120  /* add the top 128 bits of even / odd partial products */
6121  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6122 
6123  tsw = vec_splat ((vui16_t) b, 2);
6124  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6125  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6126  /* rotate right the low 16-bits into tmq */
6127  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6128  /* shift the low 128 bits of partial product right 16-bits */
6129  t_odd = vec_sld (z, t_odd, 14);
6130  /* add the top 128 bits of even / odd partial products */
6131  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6132 
6133  tsw = vec_splat ((vui16_t) b, 1);
6134  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6135  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6136  /* rotate right the low 16-bits into tmq */
6137  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6138  /* shift the low 128 bits of partial product right 16-bits */
6139  t_odd = vec_sld (z, t_odd, 14);
6140  /* add the top 128 bits of even / odd partial products */
6141  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6142 
6143  tsw = vec_splat ((vui16_t) b, 0);
6144  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);
6145  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);
6146  /* rotate right the low 16-bits into tmq */
6147  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);
6148  /* shift the low 128 bits of partial product right 16-bits */
6149  t_odd = vec_sld (z, t_odd, 14);
6150  /* add the top 128 bits of even / odd partial products */
6151  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);
6152  ph = (vui128_t) t;
6153  pl = (vui128_t) tmq;
6154 #endif
6155 #endif
6156  *mulu = ph;
6157  return (pl);
6158 }
6159 
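/* Usage sketch (illustrative helper, not a pveclib API): a 256-bit by
 * 128-bit multiply built from quadword columns, where vec_madduq folds
 * the carry (high half) of the low column into the high column. The
 * product is returned as three quadwords, most significant first. */
static inline void
example_mul_256x128 (vui128_t p[3], vui128_t a_high, vui128_t a_low,
		     vui128_t b)
{
  vui128_t c0, c1;
  // low column: a_low * b
  p[2] = vec_muludq (&c0, a_low, b);
  // high column: a_high * b plus the carry from the low column
  p[1] = vec_madduq (&c1, a_high, b, c0);
  p[0] = c1;
}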
6183 static inline vui128_t
6184 vec_madd2uq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c1, vui128_t c2)
6185 {
6186  vui128_t ph, pl, cs;
6187 #ifdef _ARCH_PWR9
6188  vui128_t cl;
6189  // P9 has a 3 cycle vadduqm, so sum C1/C2 early
6190  cl = vec_addcuq (c1, c2);
6191  cs = vec_adduqm (c1, c2);
6192  // Pass the low 128-bits of the C1/C2 sum into madduq
6193  pl = vec_madduq (&ph, a, b, cs);
6194  // Deal with C1/C2 carry last
6195  *mulu = vec_adduqm (ph, cl);
6196 #else
6197 #ifdef _ARCH_PWR8
6198  vui128_t cl, cl2;
6199  // P8 has vadduqm but it is slower, so sum C1/C2 early
6200  cl = vec_addcuq (c1, c2);
6201  cs = vec_adduqm (c1, c2);
6202  // Overlapping execution of vaddcuq/vadduqm with muludq
6203  pl = vec_muludq (&ph, a, b);
6204  // Delay sum of product plus (c1 + c2) + (cl + cl2)
6205  cl2 = vec_addcuq (pl, cs);
6206  pl = vec_adduqm (pl, cs);
6207 
6208  *mulu = vec_addeuqm (ph, cl, cl2);
6209 #else
6210  // P7 and earlier do not have vadduqm, must use vaddcuw/vadduwm
6211  // so leverage madduq to sum (a * b) + c1
6212  pl = vec_madduq (&ph, a, b, c1);
6213  // Then add c2 to the madd sum as last stage.
6214  pl = vec_addcq (&cs, pl, c2);
6215  *mulu = vec_adduqm (ph, cs);
6216 #endif
6217 #endif
6218  return (pl);
6219 }
6220 
6233 static inline vi128_t
6234 vec_negsq (vi128_t int128)
6235 {
6236  const vui128_t q_zero = (vui128_t) { 0 };
6237  // Negate 2s complement quadword integer.
6238  return (vi128_t) vec_subuqm (q_zero, (vui128_t)int128);
6239 }
6240 
6253 static inline vui128_t
6254 vec_neguq (vui128_t int128)
6255 {
6256  const vui128_t q_zero = (vui128_t) { 0 };
6257  // Negate 2s complement quadword integer.
6258  return vec_subuqm (q_zero, int128);
6259 }
6260 
6276 static inline vui128_t
6277 vec_popcntq (vui128_t vra)
6278 {
6279  vui64_t result;
6280 
6281 #ifdef _ARCH_PWR9
6282  /*
6283  * Use the Vector Population Count Doubleword instruction to get
6284  * the count for the left and right vector halves. Then sum across
6285  * the left and right counts to get the final 128-bit vector count
6286  * (0-128).
6287  */
6288  vui64_t vt1, h64, l64;
6289  const vui64_t vzero = { 0, 0 };
6290 
6291  vt1 = vec_popcntd ((vui64_t) vra);
6292  h64 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt1);
6293  l64 = vec_mrgald ((vui128_t)vzero, (vui128_t)vt1);
6294  result = vec_addudm (h64, l64);
6295 #elif defined(_ARCH_PWR8)
6296  /*
6297  * Use the Vector Population Count Word instruction to get
6298  * the count for each word. Then sum across the words
6299  * to get the final 128-bit vector count (0-128).
6300  * For P8 popcntw is 2 cycles faster than popcntd but requires
6301  * vsumsws (7 cycles) as the best option to sum across words.
6302  */
6303  vui32_t vt1;
6304  const vui64_t vzero = { 0, 0 };
6305 
6306  vt1 = vec_popcntw ((vui32_t) vra);
6307  result = (vui64_t) vec_vsumsw ((vi32_t) vt1,
6308  (vi32_t) vzero);
6309 #else
6310  //#warning Implementation pre power8
6311  vui32_t z= { 0,0,0,0};
6312  vui32_t x;
6313  x = vec_popcntw ((vui32_t)vra);
6314  result = (vui64_t) vec_sums ((vi32_t) x, (vi32_t) z);
6315 #endif
6316  return ((vui128_t) result);
6317 }
6318 
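/* Usage sketch: the quadword population count above returns a vui128_t
 * value in the range 0-128. The constant below has 32 + 16 + 1 = 49
 * bits set. */
static inline vui128_t
example_popcntq_usage (void)
{
  vui128_t x = CONST_VUINT128_QxW (0xffffffff, 0, 0xf0f0f0f0, 1);
  return vec_popcntq (x); // 49
}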
6331 static inline vui128_t
6332 vec_revbq (vui128_t vra)
6333 {
6334  vui128_t result;
6335 
6336 #ifdef _ARCH_PWR9
6337 #if defined (vec_revb) || defined (__clang__)
6338  result = vec_revb (vra);
6339 #else
6340  __asm__(
6341  "xxbrq %x0,%x1;"
6342  : "=wa" (result)
6343  : "wa" (vra)
6344  : );
6345 #endif
6346 #else
6347 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
6348  const vui64_t vconstp =
6349  CONST_VINT64_DW(0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL);
6350 #else
6351  const vui64_t vconstp =
6352  CONST_VINT64_DW(0x0001020304050607UL, 0x08090A0B0C0D0E0FUL);
6353 #endif
6354  result = (vui128_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
6355 #endif
6356 
6357  return (result);
6358 }
6359 
6374 static inline vui128_t
6375 vec_rlq (vui128_t vra, vui128_t vrb)
6376 {
6377  vui128_t result;
6378 
6379 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6380  // vrlq takes the shift count from bits 57:63
6381  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);
6382  __asm__(
6383  "vrlq %0,%1,%2;\n"
6384  : "=v" (result)
6385  : "v" (vra), "v" (vrb)
6386  : );
6387 #else
6388  result = vec_sldq (vra, vra, vrb);
6389 #endif
6390  return ((vui128_t) result);
6391 }
6392 
6407 static inline vui128_t
6408 vec_rlqi (vui128_t vra, const unsigned int shb)
6409 {
6410  vui8_t result;
6411 
6412 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6413  if (__builtin_constant_p (shb) && (shb < 8))
6414  {
6415  /* When shifting by a constant less than 8, can use bit immediate
6416  * vec_vsldbi (vra, vra, shb) as rotate left. */
6417  result = (vui8_t) vec_vsldbi (vra, vra, shb);
6418  }
6419  else
6420  {
6421  vui32_t lshift = vec_splats((unsigned int) shb);
6422  __asm__(
6423  "vrlq %0,%1,%2;\n"
6424  : "=v" (result)
6425  : "v" (vra), "v" (lshift)
6426  : );
6427  }
6428 #else
6429  if (__builtin_constant_p (shb) && ((shb % 8) == 0))
6430  {
6431  /* When shifting a multiple of 8 bits (octet), use Vector
6432  Shift Left Double By Octet Immediate. This eliminates
6433  loading the shift const into a VR. */
6434  if (shb > 0)
6435  result = vec_sld ((vui8_t) vra, (vui8_t) vra, ((shb / 8) & 15));
6436  else
6437  result = (vui8_t) vra;
6438  }
6439  else
6440  {
6441  result = (vui8_t) vec_sldqi (vra, vra, shb);
6442  }
6443 #endif
6444  return ((vui128_t) result);
6445 }
6446 
6461 static inline vi128_t
6462 vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc)
6463 {
6464  return (vi128_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);
6465 }
6466 
6481 static inline vui128_t
6482 vec_seluq (vui128_t vra, vui128_t vrb, vb128_t vrc)
6483 {
6484  return (vui128_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);
6485 }
6486 
6508 static inline vb128_t
6509 vec_setb_cyq (vui128_t vcy)
6510 {
6511 #ifdef _ARCH_PWR9
6512  const vui128_t zero = (vui128_t) vec_splat_u32(0);
6513 
6514  return (vb128_t) vec_vsubuqm (zero, vcy);
6515 #else
6516  const vui32_t ones = vec_splat_u32(1);
6517  vui32_t rcy;
6518 
6519  rcy = vec_splat ((vui32_t) vcy, VEC_W_L);
6520  return (vb128_t) vec_cmpeq (rcy, ones);
6521 #endif
6522 }
6523 
6545 static inline vb128_t
6546 vec_setb_ncq (vui128_t vcy)
6547 {
6548 #ifdef _ARCH_PWR9
6549  const vui128_t zero = (vui128_t) vec_splat_u32(0);
6550 
6551  return (vb128_t) vec_vsubeuqm (zero, zero, vcy);
6552 #else
6553  const vui32_t zero = CONST_VINT128_W(0, 0, 0, 0);
6554  vui32_t rcy;
6555 
6556  rcy = vec_splat ((vui32_t) vcy, VEC_W_L);
6557  return (vb128_t) vec_cmpeq (rcy, zero);
6558 #endif
6559 }
6560 
6575 static inline vb128_t
6576 vec_setb_sq (vi128_t vra)
6577 {
6578  vb128_t result;
6579 
6580 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6581  __asm__(
6582  "vexpandqm %0,%1"
6583  : "=v" (result)
6584  : "v" (vra)
6585  : );
6586 #else
6587  const vui8_t shift = vec_splat_u8 (7);
6588  vui8_t splat = vec_splat ((vui8_t) vra, VEC_BYTE_H);
6589 
6590  result = (vb128_t) vec_sra (splat, shift);
6591 #endif
6592  return result;
6593 }
6594 
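/* Usage sketch (illustrative helper, not a pveclib API): combining the
 * sign-bit mask above with quadword negate and select (assuming the
 * select form above is vec_seluq) to form a 128-bit absolute value. */
static inline vui128_t
example_abssq (vi128_t x)
{
  vb128_t sign = vec_setb_sq (x);
  vui128_t neg = (vui128_t) vec_negsq (x);
  // pick the negated value where the sign mask is all ones
  return vec_seluq ((vui128_t) x, neg, sign);
}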
6612 static inline vui128_t
6613 vec_sldq (vui128_t vrw, vui128_t vrx, vui128_t vrb)
6614 {
6615  vui8_t result, vt1, vt2, vt3, vbs;
6616  const vui8_t vzero = vec_splat_u8 (0);
6617 
6618  vt1 = vec_slo ((vui8_t) vrw, (vui8_t) vrb);
6619  /* The vsl/vsr instruction only works correctly if the bit shift
6620  value is splatted to each byte of the vector. */
6621  vbs = vec_splat ((vui8_t) vrb, VEC_BYTE_L);
6622  vt1 = vec_sll (vt1, vbs);
6623  vt3 = vec_sub (vzero, vbs);
6624  vt2 = vec_sro ((vui8_t) vrx, vt3);
6625  vt2 = vec_srl (vt2, vt3);
6626  result = vec_or (vt1, vt2);
6627 
6628  return ((vui128_t) result);
6629 }
6630 
6648 static inline vui128_t
6649 vec_sldqi (vui128_t vrw, vui128_t vrx, const unsigned int shb)
6650 {
6651  vui128_t result;
6652 
6653  if (__builtin_constant_p(shb))
6654  {
6655  if ((shb % 8) == 0)
6656  /* When shifting a multiple of 8 bits (octet), use Vector
6657  Shift Left Double By Octet Immediate. This eliminates
6658  loading the shift const into a VR. */
6659  if (shb > 0)
6660  result = (vui128_t) vec_sld ((vui8_t) vrw, (vui8_t) vrx, (shb / 8));
6661  else
6662  result = vrw;
6663  else // Not just an immediate octet shift
6664  if (shb < 8)
6665  // Special case for 0-7 shifts, use vec_vsldbi to exploit P10.
6666  result = vec_vsldbi (vrw, vrx, shb);
6667  else
6668  {
6669 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6670  // Special case of P10.
6671  vui8_t h, l;
6672  // Shift left double quad (256-bits) by Octet
6673  h = vec_sld ((vui8_t) vrw, (vui8_t) vrx, (shb / 8));
6674  l = vec_sld ((vui8_t) vrx, (vui8_t) vrx, (shb / 8));
6675  // Then Shift Left Double by Bit to complete the shift.
6676  result = vec_vsldbi ((vui128_t) h, (vui128_t) l, (shb % 8));
6677 #else // Load shb as vector and use general vec_sldq case.
6678  const vui8_t vrb = vec_splats ((unsigned char) shb);
6679  result = vec_sldq (vrw, vrx, (vui128_t) vrb);
6680 #endif
6681  }
6682  }
6683  else
6684  {
6685  const vui8_t vrb = vec_splats ((unsigned char) shb);
6686  result = vec_sldq (vrw, vrx, (vui128_t) vrb);
6687  }
6688 
6689  return ((vui128_t) result);
6690 }
6691 
6706 static inline vui128_t
6707 vec_slq (vui128_t vra, vui128_t vrb)
6708 {
6709  vui8_t result;
6710 
6711 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6712  // vslq takes the shift count from bits 57:63
6713  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);
6714  __asm__(
6715  "vslq %0,%1,%2;\n"
6716  : "=v" (result)
6717  : "v" (vra), "v" (vrb)
6718  : );
6719 #else
6720  vui8_t vshift_splat;
6721  /* For some reason, the vsl instruction only works
6722  * correctly if the bit shift value is splatted to each byte
6723  * of the vector. */
6724  vshift_splat = vec_splat ((vui8_t) vrb, VEC_BYTE_L);
6725  result = vec_slo ((vui8_t) vra, (vui8_t) vrb);
6726  result = vec_sll (result, vshift_splat);
6727 #endif
6728  return ((vui128_t) result);
6729 }
6730 
6747 static inline vui128_t
6748 vec_slqi (vui128_t vra, const unsigned int shb)
6749 {
6750  vui8_t result;
6751 
6752  if (shb < 128)
6753  {
6754  vui8_t lshift;
6755 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6756  lshift = (vui8_t) vec_splats((unsigned int) shb);
6757  __asm__(
6758  "vslq %0,%1,%2;\n"
6759  : "=v" (result)
6760  : "v" (vra), "v" (lshift)
6761  : );
6762 #else
6763  if (__builtin_constant_p (shb) && ((shb % 8) == 0))
6764  {
6765  /* When shifting a multiple of 8 bits (octet), use Vector
6766  Shift Left Double By Octet Immediate. This eliminates
6767  loading the shift const into a VR, but requires an
6768  explicit vector of zeros. */
6769  vui8_t zero =
6770  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
6771  if (shb > 0)
6772  result = vec_sld ((vui8_t) vra, zero, (shb / 8));
6773  else
6774  result = (vui8_t) vra;
6775  }
6776  else
6777  {
6778  /* Load the shift const in a vector. The bit level shifts
6779  require the shift amount is splatted to all 16-bytes of
6780  the shift control. */
6781  if (__builtin_constant_p (shb) && (shb < 16))
6782  lshift = (vui8_t) vec_splat_s8(shb);
6783  else
6784  lshift = vec_splats ((unsigned char) shb);
6785 
6786  if (shb > 7)
6787  /* Vector Shift Left By Octet by bits 121-124 of lshift. */
6788  result = vec_slo ((vui8_t) vra, lshift);
6789  else
6790  result = ((vui8_t) vra);
6791 
6792  /* Vector Shift Left by bits 125-127 of lshift. */
6793  result = vec_sll (result, lshift);
6794  }
6795 #endif
6796  }
6797  else
6798  { /* shifts greater than 127 bits return zeros. */
6799  result = vec_xor ((vui8_t) vra, (vui8_t) vra);
6800  }
6801  return (vui128_t) result;
6802 }
6803 
6828 static inline vi128_t
6829 vec_splat_s128 (const int sim)
6830 {
6831  vi128_t result;
6832 #ifdef _ARCH_PWR9
6833  // TBD! No Vector Extend Sign Byte To Qword
6834  // But does have VSX Vector Splat Immediate Byte (-128 -> 127)
6835  if (__builtin_constant_p (sim) && ((sim >= -128) && (sim < 128)))
6836  {
6837  // Expect the compiler to generate a single xxspltib for this.
6838  vi8_t vbi = vec_splats ((signed char) sim);
6839 
6840  if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
6841  {
6842  // Special case for -1 and 0. Skip vec_sld().
6843  result = (vi128_t) vbi;
6844  }
6845  else
6846  {
6847  if (__builtin_constant_p (sim) && (sim > 0))
6848  {
6849  const vui32_t q_zero = {0, 0, 0, 0};
6850  result = (vi128_t) vec_sld ((vi8_t) q_zero, vbi, 1);
6851  }
6852  else
6853  {
6854  const vui32_t q_ones = {-1, -1, -1, -1};
6855  result = (vi128_t) vec_sld ((vi8_t) q_ones, vbi, 1);
6856  }
6857  }
6858  }
6859  else
6860  result = vec_splats ((signed __int128) sim);
6861 #else
6862  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))
6863  {
6864  vui32_t vwi = (vui32_t) vec_splat_s32(sim);
6865 
6866  if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
6867  {
6868  // Special case for -1 and 0. Skip vec_sld().
6869  result = (vi128_t) vwi;
6870  }
6871  else
6872  {
6873  if (__builtin_constant_p (sim) && (sim > 0))
6874  {
6875  const vui32_t q_zero = {0, 0, 0, 0};
6876  result = (vi128_t) vec_sld (q_zero, vwi, 4);
6877  }
6878  else
6879  {
6880  const vui32_t q_ones = {-1, -1, -1, -1};
6881  result = (vi128_t) vec_sld (q_ones, vwi, 4);
6882  }
6883  }
6884  }
6885  else
6886  result = vec_splats ((signed __int128) sim);
6887 #endif
6888  return (result);
6889 }
6890 
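/* Usage sketch: constants within the immediate range expand to a short
 * splat/shift sequence with no vector load from memory; values outside
 * that range fall back to vec_splats() of a full __int128 constant. */
static inline vi128_t
example_splat_s128_usage (void)
{
  vi128_t ten = vec_splat_s128 (10);  // within the immediate range
  vi128_t neg = vec_splat_s128 (-5);  // also within range
  return (vi128_t) vec_adduqm ((vui128_t) ten, (vui128_t) neg);
}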
6913 static inline vui128_t
6914 vec_splat_u128 (const int sim)
6915 {
6916  vui128_t result;
6917 #ifdef _ARCH_PWR9
6918  // No Vector Extend Sign Byte To Qword
6919  // But does have VSX Vector Splat Immediate Byte (0 -> 255)
6920  if (__builtin_constant_p (sim) && ((sim >= 0) && (sim < 256)))
6921  {
6922  // Expect the compiler to generate a single xxspltib for this.
6923  vui8_t vbi = vec_splats ((unsigned char) sim);
6924 
6925  if (__builtin_constant_p (sim) && (sim == 0))
6926  {
6927  // Special case for 0. Skip vec_sld().
6928  result = (vui128_t) vbi;
6929  }
6930  else
6931  {
6932  if (__builtin_constant_p (sim) && (sim < 256))
6933  {
6934  const vui32_t q_zero = {0, 0, 0, 0};
6935  result = (vui128_t) vec_sld ((vui8_t) q_zero, vbi, 1);
6936  }
6937  else
6938  result = vec_splats ((unsigned __int128) sim);
6939  }
6940  }
6941  else
6942  result = vec_splats ((unsigned __int128) sim);
6943 #else
6944  if (__builtin_constant_p (sim) && ((sim >= 0) && (sim < 16)))
6945  {
6946  const vui32_t q_zero = {0, 0, 0, 0};
6947  vui32_t vwi = vec_splat_u32 (sim);
6948 
6949  if (__builtin_constant_p (sim) && (sim == 0))
6950  {
6951  // Special case for 0. Skip vec_sld().
6952  result = (vui128_t) vwi;
6953  } else {
6954  result = (vui128_t) vec_sld (q_zero, vwi, 4);
6955  }
6956  }
6957  else if (__builtin_constant_p (sim) && (sim == 128))
6958  {
6959  // Expect the compiler to generate vspltisw/vslb here.
6960  vui8_t vbi = vec_splats ((unsigned char) 128);
6961  // Extend left with 120-bits of 0
6962  const vui32_t q_zero = {0, 0, 0, 0};
6963  result = (vui128_t) vec_sld ((vui8_t) q_zero, vbi, 1);
6964  }
6965  else
6966  result = vec_splats ((unsigned __int128) sim);
6967 #endif
6968  return (result);
6969 }
6970 
6985 static inline vi128_t
6986 vec_sraq (vi128_t vra, vui128_t vrb)
6987 {
6988  vui8_t result;
6989 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
6990  // vsraq takes the shift count from bits 57:63
6991  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);
6992  __asm__(
6993  "vsraq %0,%1,%2;\n"
6994  : "=v" (result)
6995  : "v" (vra), "v" (vrb)
6996  : );
6997 #else
6998  vui8_t vsht;
6999  vui128_t vsgn;
7000  const vui8_t zero = vec_splat_u8 (0);
7001 
7002  /* For some reason the vsr instruction only works
7003  * correctly if the bit shift value is splatted to each byte
7004  * of the vector. */
7005  vsgn = (vui128_t) vec_setb_sq (vra);
7006  vsht = vec_sub (zero, (vui8_t) vrb);
7007  result = (vui8_t) vec_sldq (vsgn, (vui128_t) vra, (vui128_t) vsht);
7008 #endif
7009  return ((vi128_t) result);
7010 }
7011 
7035 static inline vi128_t
7036 vec_sraqi (vi128_t vra, const unsigned int shb)
7037 {
7038  vui8_t result;
7039 
7040  if (shb < 127)
7041  {
7042 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7043  vui32_t rshift = vec_splats((unsigned int) shb);
7044  __asm__(
7045  "vsraq %0,%1,%2;\n"
7046  : "=v" (result)
7047  : "v" (vra), "v" (rshift)
7048  : );
7049 #else
7050  vui8_t lshift;
7051  vui128_t vsgn;
7052  if (__builtin_constant_p (shb) && ((shb % 8) == 0))
7053  {
7054  if (shb > 0)
7055  {
7056  vsgn = (vui128_t) vec_setb_sq (vra);
7057  result = vec_sld ((vui8_t) vsgn, (vui8_t) vra, 16 - (shb / 8));
7058  }
7059  else
7060  result = (vui8_t) vra;
7061  }
7062  else
7063  {
7064 #ifdef _ARCH_PWR8
7065  if (shb < 64)
7066  {
7067  vui128_t vrshq;
7068  vi64_t vrshd;
7069  vrshq = vec_srqi ((vui128_t) vra, shb);
7070  vrshd = vec_sradi ((vi64_t) vra, shb);
7071  result = (vui8_t) vec_pasted ((vui64_t) vrshd, (vui64_t) vrshq);
7072  }
7073  else
7074  {
7075 #endif
7076  const unsigned int lshb = 128 - shb;
7077  if (__builtin_constant_p (shb) && (lshb < 16))
7078  lshift = (vui8_t) vec_splat_s8(lshb);
7079  else
7080  lshift = vec_splats ((unsigned char) lshb);
7081 
7082  vsgn = (vui128_t) vec_setb_sq (vra);
7083  result = (vui8_t) vec_sldq (vsgn, (vui128_t) vra,
7084  (vui128_t) lshift);
7085 #ifdef _ARCH_PWR8
7086  }
7087 #endif
7088  }
7089 #endif
7090  }
7091  else
7092  { /* shifts greater than 126 bits return the sign bit. */
7093  result = (vui8_t) vec_setb_sq (vra);
7094  }
7095 
7096  return ((vi128_t) result);
7097 }
7098 
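/* Usage sketch: arithmetic (sign propagating) right shift of a signed
 * quadword by a constant, equivalent to dividing by 16 and rounding
 * toward negative infinity. */
static inline vi128_t
example_sraqi_usage (vi128_t x)
{
  return vec_sraqi (x, 4);
}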
7113 static inline vui128_t
7114 vec_srq (vui128_t vra, vui128_t vrb)
7115 {
7116  vui8_t result;
7117 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7118  // vsrq takes the shift count from bits 57:63
7119  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);
7120  __asm__(
7121  "vsrq %0,%1,%2;\n"
7122  : "=v" (result)
7123  : "v" (vra), "v" (vrb)
7124  : );
7125 #else
7126  vui8_t vsht_splat;
7127  /* For some reason the vsr instruction only works
7128  * correctly if the bit shift value is splatted to each byte
7129  * of the vector. */
7130  vsht_splat = vec_splat ((vui8_t) vrb, VEC_BYTE_L);
7131  result = vec_sro ((vui8_t) vra, (vui8_t) vrb);
7132  result = vec_srl (result, vsht_splat);
7133 #endif
7134  return ((vui128_t) result);
7135 }
7136 
7153 static inline vui128_t
7154 vec_srqi (vui128_t vra, const unsigned int shb)
7155 {
7156  vui8_t result;
7157 
7158  if (shb < 128)
7159  {
7160 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7161  vui32_t rshift = vec_splats((unsigned int) shb);
7162  __asm__(
7163  "vsrq %0,%1,%2;\n"
7164  : "=v" (result)
7165  : "v" (vra), "v" (rshift)
7166  : );
7167 #else
7168  vui8_t lshift;
7169  if (__builtin_constant_p (shb) && ((shb % 8)) == 0)
7170  {
7171  /* When shifting a multiple of 8 bits (octet), use Vector
7172  Shift Left Double By Octet Immediate. This eliminates
7173  loading the shift const into a VR, but requires an
7174  explicit vector of zeros. */
7175  vui8_t zero =
7176  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
7177  /* The compiler needs to know at compile time that
7178  0 < shb < 128 is true to ensure the constraint (4 bit
7179  immediate field) of vsldoi is met. So the following if
7180  is required but should not generate any branch code. */
7181  if (shb > 0)
7182  result = vec_sld (zero, (vui8_t) vra, (16 - (shb / 8)));
7183  else
7184  result = (vui8_t) vra;
7185  }
7186  else
7187  {
7188  /* Load the shift const in a vector. The bit level shifts
7189  require the shift amount is splatted to all 16-bytes of
7190  the shift control. */
7191  if ((__builtin_constant_p (shb) && (shb < 16)))
7192  lshift = (vui8_t) vec_splat_s8(shb);
7193  else
7194  lshift = vec_splats ((unsigned char) shb);
7195 
7196  if (shb > 7)
7197  /* Vector Shift right By Octet based on the bits 121-124 of
7198  lshift. */
7199  result = vec_sro ((vui8_t) vra, lshift);
7200  else
7201  result = ((vui8_t) vra);
7202 
7203  /* Vector Shift right based on the lower 3-bits of lshift. */
7204  result = vec_srl (result, lshift);
7205  }
7206 #endif
7207  }
7208  else
7209  { /* shifts greater than 127 bits return zeros. */
7210  result = vec_xor ((vui8_t) vra, (vui8_t) vra);
7211  }
7212  return (vui128_t) result;
7213 }
7214 
7224 static inline vui128_t
7225 vec_slq4 (vui128_t vra)
7226 {
7227  __vector unsigned char result, vsht_splat;
7228 
7229  /* The vsl instruction only works correctly if the bit shift value
7230  * is splatted to each byte of the vector. */
7231  vsht_splat = vec_splat_u8(4);
7232  result = vec_sll ((__vector unsigned char) vra, vsht_splat);
7233 
7234  return ((vui128_t) result);
7235 }
7236 
7246 static inline vui128_t
7247 vec_slq5 (vui128_t vra)
7248 {
7249  __vector unsigned char result, vsht_splat;
7250 
7251  /* The vsl instruction only works correctly if the bit shift value
7252  * is splatted to each byte of the vector. */
7253  vsht_splat = vec_splat_u8(5);
7254  result = vec_sll ((__vector unsigned char) vra, vsht_splat);
7255 
7256  return ((vui128_t) result);
7257 }
7258 
7268 static inline vui128_t
7269 vec_srq4 (vui128_t vra)
7270 {
7271  __vector unsigned char result, vsht_splat;
7272 
7273  /* The vsr instruction only works correctly if the bit shift value
7274  * is splatted to each byte of the vector. */
7275  vsht_splat = vec_splat_u8(4);
7276  result = vec_srl ((__vector unsigned char) vra, vsht_splat);
7277 
7278  return ((vui128_t) result);
7279 }
7280 
7290 static inline vui128_t
7291 vec_srq5 (vui128_t vra)
7292 {
7293  __vector unsigned char result, vsht_splat;
7294 
7295  /* The vsr instruction only works correctly if the bit shift value
7296  * is splatted to each byte of the vector. */
7297  vsht_splat = vec_splat_u8(5);
7298  result = vec_srl ((__vector unsigned char) vra, vsht_splat);
7299 
7300  return ((vui128_t) result);
7301 }
7302 
7316 static inline vui128_t
7317 vec_subcuq (vui128_t vra, vui128_t vrb)
7318 {
7319  vui32_t t;
7320 #ifdef _ARCH_PWR8
7321 #if defined (vec_vsubcuq)
7322  t = (vui32_t) vec_vsubcuq (vra, vrb);
7323 #elif defined (__clang__)
7324  t = (vui32_t) vec_subc (vra, vrb);
7325 # else
7326  __asm__(
7327  "vsubcuq %0,%1,%2;"
7328  : "=v" (t)
7329  : "v" (vra),
7330  "v" (vrb)
7331  : );
7332 #endif
7333 #else
7334  /* vsubcuq is defined as (vra + NOT(vrb) + 1) >> 128. */
7335  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);
7336  const vui32_t ci= { 0,0,0,1 };
7337 
7338  t = (vui32_t) vec_addecuq (vra, (vui128_t) _b, (vui128_t) ci);
7339 #endif
7340  return ((vui128_t) t);
7341 }
7342 
7357 static inline vui128_t
7358 vec_subecuq (vui128_t vra, vui128_t vrb, vui128_t vrc)
7359 {
7360  vui32_t t;
7361 #ifdef _ARCH_PWR8
7362 #if defined (vec_vsubecuq)
7363  t = (vui32_t) vec_vsubecuq (vra, vrb, vrc);
7364 #elif defined (__clang__)
7365  t = (vui32_t) vec_subec (vra, vrb, vrc);
7366 # else
7367  __asm__(
7368  "vsubecuq %0,%1,%2,%3;"
7369  : "=v" (t)
7370  : "v" (vra),
7371  "v" (vrb),
7372  "v" (vrc)
7373  : );
7374 #endif
7375 #else
7376  /* vsubecuq is defined as (vra + NOT(vrb) + vrc.bit[127]) >> 128. */
7377  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);
7378 
7379  t = (vui32_t) vec_addecuq (vra, (vui128_t) _b, vrc);
7380 #endif
7381  return ((vui128_t) t);
7382 }
7383 
7398 static inline vui128_t
7399 vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc)
7400 {
7401  vui32_t t;
7402 #ifdef _ARCH_PWR8
7403 #if defined (vec_vsubeuqm)
7404  t = (vui32_t) vec_vsubeuqm (vra, vrb, vrc);
7405 #elif defined (__clang__)
7406  t = (vui32_t) vec_sube (vra, vrb, vrc);
7407 # else
7408  __asm__(
7409  "vsubeuqm %0,%1,%2,%3;"
7410  : "=v" (t)
7411  : "v" (vra),
7412  "v" (vrb),
7413  "v" (vrc)
7414  : );
7415 #endif
7416 #else
7417  /* vsubeuqm is defined as vra + NOT(vrb) + vrc.bit[127]. */
7418  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);
7419 
7420  t = (vui32_t) vec_addeuqm (vra, (vui128_t) _b, vrc);
7421 #endif
7422  return ((vui128_t) t);
7423 }
7424 
7438 static inline vui128_t
7439 vec_subuqm (vui128_t vra, vui128_t vrb)
7440 {
7441  vui32_t t;
7442 #ifdef _ARCH_PWR8
7443 #if defined (vec_vsubuqm)
7444  t = (vui32_t) vec_vsubuqm (vra, vrb);
7445 #elif defined (__clang__)
7446  t = (vui32_t) vec_sub (vra, vrb);
7447 # else
7448  __asm__(
7448  "vsubuqm %0,%1,%2;"
7449  : "=v" (t)
7450  : "v" (vra),
7451  "v" (vrb)
7452  : );
7453 #endif
7454 #else
7455  /* vsubuqm is defined as vra + NOT(vrb) + 1. */
7456  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);
7457  const vui32_t ci= { 0,0,0,1 };
7458 
7459  t = (vui32_t) vec_addeuqm (vra, (vui128_t) _b, (vui128_t) ci);
7460 #endif
7461  return ((vui128_t) t);
7462 }
7463 
7464 
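A sketch of a 256-bit subtract composed from the quadword subtract operations above; it is not part of the header and the names are illustrative. Operands and the result are held as {high, low} pairs of vui128_t, and vec_subecuq would propagate the carry further for wider precision.

static inline void
example_sub256 (vui128_t *dh, vui128_t *dl,
                vui128_t ah, vui128_t al,
                vui128_t bh, vui128_t bl)
{
  vui128_t borrow;
  // Low quadword: difference modulo 2**128 plus the carry (no-borrow) bit.
  borrow = vec_subcuq (al, bl);
  *dl = vec_subuqm (al, bl);
  // High quadword: the extended subtract consumes the carry from below.
  *dh = vec_subeuqm (ah, bh, borrow);
}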
7486 static inline vui128_t
7487 vec_vmuleud (vui64_t a, vui64_t b)
7488 {
7489  vui64_t res;
7490 
7491 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7492  __asm__(
7493  "vmuleud %0,%1,%2;\n"
7494  : "=v" (res)
7495  : "v" (a), "v" (b)
7496  : );
7497 #elif defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
7498  const vui64_t zero = { 0, 0 };
7499  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
7500  __asm__(
7501  "vmsumudm %0,%1,%2,%3;\n"
7502  : "=v" (res)
7503  : "v" (a), "v" (b_eud), "v" (zero)
7504  : );
7505 #elif defined (_ARCH_PWR8)
7506  const vui64_t zero = { 0, 0 };
7507  vui64_t p0, p1, pp10, pp01;
7508  vui32_t m0, m1;
7509 
7510 // Need the endian invariant merge word high here
7511 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
7512 // Nullify the little endian transform
7513  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);
7514 #else
7515  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);
7516 #endif
7517  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 0);
7518 
7519  // Need the endian invariant multiply even/odd word here
7520  p1 = vec_vmulouw (m1, m0);
7521  p0 = vec_vmuleuw (m1, m0);
7522  /* res[1] = p1[1]; res[0] = p0[0]; */
7523  res = vec_pasted (p0, p1);
7524  /*
7525  pp10[1] = p1[0]; pp10[0] = 0;
7526  pp01[1] = p0[1]; pp01[0] = 0;
7527  */
7528  // Need the endian invariant merge algebraic high/low here
7529  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);
7530  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);
7531  /* pp01 = pp01 + pp10. */
7532  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);
7533 
7534  /* res = res + (pp01 << 32) */
7535  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);
7536  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);
7537 #else
7538  const vui32_t zero = {0,0,0,0};
7539  vui32_t p0, p1;
7540  vui32_t resw;
7541  vui16_t m0, m1, mm;
7542 
7543  m0 = (vui16_t) vec_mergeh (a, (vui64_t) zero);
7544  mm = (vui16_t) vec_mergeh (b, (vui64_t) zero);
7545 
7546  m1 = vec_splat (mm, 3);
7547 
7548  p0 = vec_vmuleuh (m0, m1);
7549  p1 = vec_vmulouh (m0, m1);
7550 
7551  resw = vec_sld (zero, p1, 14);
7552  {
7553  vui32_t c;
7554  c = vec_vaddcuw (resw, p0);
7555  resw = vec_vadduwm (resw, p0);
7556  c = vec_sld (c, c, 4);
7557  resw = vec_vadduwm (resw, c);
7558  }
7559 
7560  m1 = vec_splat (mm, 2);
7561  p0 = vec_vmuleuh (m0, m1);
7562  p1 = vec_vmulouh (m0, m1);
7563 
7564  {
7565  vui32_t c;
7566  c = vec_vaddcuw (resw, p1);
7567  resw = vec_vadduwm (resw, p1);
7568  c = vec_sld (c, c, 4);
7569  resw = vec_vadduwm (resw, c);
7570  resw = vec_sld (c, resw, 14);
7571  }
7572 
7573  {
7574  vui32_t c;
7575  c = vec_vaddcuw (resw, p0);
7576  resw = vec_vadduwm (resw, p0);
7577  c = vec_sld (c, c, 4);
7578  resw = vec_vadduwm (resw, c);
7579  }
7580 
7581  m1 = vec_splat (mm, 1);
7582  p0 = vec_vmuleuh (m0, m1);
7583  p1 = vec_vmulouh (m0, m1);
7584 
7585  {
7586  vui32_t c;
7587  c = vec_vaddcuw (resw, p1);
7588  resw = vec_vadduwm (resw, p1);
7589  c = vec_sld (c, c, 4);
7590  resw = vec_vadduwm (resw, c);
7591  resw = vec_sld (c, resw, 14);
7592  }
7593 
7594  {
7595  vui32_t c;
7596  c = vec_vaddcuw (resw, p0);
7597  resw = vec_vadduwm (resw, p0);
7598  c = vec_sld (c, c, 4);
7599  resw = vec_vadduwm (resw, c);
7600  }
7601 
7602  m1 = vec_splat (mm, 0);
7603  p0 = vec_vmuleuh (m0, m1);
7604  p1 = vec_vmulouh (m0, m1);
7605 
7606  {
7607  vui32_t c;
7608  c = vec_vaddcuw (resw, p1);
7609  resw = vec_vadduwm (resw, p1);
7610  c = vec_sld (c, c, 4);
7611  resw = vec_vadduwm (resw, c);
7612  resw = vec_sld (c, resw, 14);
7613  }
7614 
7615  {
7616  vui32_t c;
7617  c = vec_vaddcuw (resw, p0);
7618  resw = vec_vadduwm (resw, p0);
7619  c = vec_sld (c, c, 4);
7620  resw = vec_vadduwm (resw, c);
7621  }
7622 
7623  res = (vui64_t)resw;
7624 #endif
7625  return ((vui128_t) res);
7626 }
7627 
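A usage sketch, not part of the header and with an illustrative helper name: a full 64 x 64 -> 128-bit multiply of two scalars, placed into the doubleword lanes with vec_splats so the even (high-order) doubleword multiply above sees them.

static inline vui128_t
example_mulu64x64 (unsigned long long x, unsigned long long y)
{
  vui64_t vx = vec_splats (x);
  vui64_t vy = vec_splats (y);
  // Both doublewords hold the scalar, so the even multiply returns x * y
  // as a single 128-bit product.
  return vec_vmuleud (vx, vy);
}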
7650 static inline vui128_t
7651 vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)
7652 {
7653  const vui64_t zero = { 0, 0 };
7654 #ifdef _ARCH_PWR9
7655  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
7656  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);
7657  return vec_msumudm(a, b_eud, (vui128_t) c_eud);
7658 #else
7659  vui128_t res;
7660  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);
7661  res = vec_vmuleud (a, b);
7662  return vec_adduqm (res, (vui128_t) c_eud);
7663 #endif
7664 }
7665 
7689 static inline vui128_t
7690 vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
7691 {
7692  const vui64_t zero = { 0, 0 };
7693 #ifdef _ARCH_PWR9
7694  vui128_t cd_sum;
7695  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
7696  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);
7697  vui64_t d_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) d);
7698  cd_sum = vec_adduqm ((vui128_t) c_eud, (vui128_t) d_eud);
7699  return vec_msumudm(a, b_eud, (vui128_t) cd_sum);
7700 #else
7701  vui128_t res, cd_sum;
7702  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);
7703  vui64_t d_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) d);
7704  cd_sum = vec_adduqm ((vui128_t) c_eud, (vui128_t) d_eud);
7705  res = vec_vmuleud (a, b);
7706  return vec_adduqm (res, (vui128_t) cd_sum);
7707 #endif
7708 }
7709 
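A short sketch of the fused even-doubleword forms above; not part of the header, names are illustrative. The addends c and d contribute only their even doublewords, zero extended, so the sum cannot overflow 128 bits.

static inline void
example_madd_even (vui128_t *mad, vui128_t *mad2,
                   vui64_t a, vui64_t b, vui64_t c, vui64_t d)
{
  // a[even] * b[even] + EXT(c[even])
  *mad = vec_vmaddeud (a, b, c);
  // a[even] * b[even] + EXT(c[even]) + EXT(d[even])
  *mad2 = vec_vmadd2eud (a, b, c, d);
}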
7732 static inline vui128_t
7733 vec_vmuloud (vui64_t a, vui64_t b)
7734 {
7735  vui64_t res;
7736 
7737 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7738  __asm__(
7739  "vmuloud %0,%1,%2;\n"
7740  : "=v" (res)
7741  : "v" (a), "v" (b)
7742  : );
7743 #elif defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
7744  const vui64_t zero = { 0, 0 };
7745  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t)b);
7746  __asm__(
7747  "vmsumudm %0,%1,%2,%3;\n"
7748  : "=v" (res)
7749  : "v" (a), "v" (b_oud), "v" (zero)
7750  : );
7751 #elif defined (_ARCH_PWR8)
7752  const vui64_t zero = { 0, 0 };
7753  vui64_t p0, p1, pp10, pp01;
7754  vui32_t m0, m1;
7755 
7756  // Need the endian invariant merge word low here
7757 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
7758  // Nullify the little endian transform
7759  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);
7760 #else
7761  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);
7762 #endif
7763  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 1);
7764 
7765  // Need the endian invariant multiply even/odd word here
7766  p0 = vec_vmuleuw (m1, m0);
7767  p1 = vec_vmulouw (m1, m0);
7768 
7769  /* res[1] = p1[1]; res[0] = p0[0]; */
7770  res = vec_pasted (p0, p1);
7771  /*
7772  pp10[0] = p1[0]; pp10[1] = 0;
7773  pp01[0] = p0[1]; pp01[1] = 0;
7774  */
7775  // Need the endian invariant merge algebraic high/low here
7776  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);
7777  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);
7778 
7779  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);
7780 
7781  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);
7782 
7783  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);
7784 #else
7785 // POWER7 and earlier are big Endian only
7786  const vui32_t zero = {0,0,0,0};
7787  vui32_t p0, p1;
7788  vui32_t resw;
7789  vui16_t m0, m1, mm;
7790 
7791  m0 = (vui16_t) vec_mergel (a, (vui64_t) zero);
7792  mm = (vui16_t) vec_mergel (b, (vui64_t) zero);
7793 
7794  m1 = vec_splat (mm, 3);
7795 
7796  p0 = vec_vmuleuh (m0, m1);
7797  p1 = vec_vmulouh (m0, m1);
7798 
7799  resw = vec_sld (zero, p1, 14);
7800 
7801  {
7802  vui32_t c;
7803  c = vec_vaddcuw (resw, p0);
7804  resw = vec_vadduwm (resw, p0);
7805  c = vec_sld (c, c, 4);
7806  resw = vec_vadduwm (resw, c);
7807  }
7808 
7809  m1 = vec_splat (mm, 2);
7810 
7811  p0 = vec_vmuleuh (m0, m1);
7812  p1 = vec_vmulouh (m0, m1);
7813  {
7814  vui32_t c;
7815  c = vec_vaddcuw (resw, p1);
7816  resw = vec_vadduwm (resw, p1);
7817 
7818  c = vec_sld (c, c, 4);
7819  resw = vec_vadduwm (resw, c);
7820  resw = vec_sld (c, resw, 14);
7821  }
7822 
7823  {
7824  vui32_t c;
7825  c = vec_vaddcuw (resw, p0);
7826  resw = vec_vadduwm (resw, p0);
7827  c = vec_sld (c, c, 4);
7828  resw = vec_vadduwm (resw, c);
7829  }
7830 
7831  m1 = vec_splat (mm, 1);
7832 
7833  p0 = vec_vmuleuh (m0, m1);
7834  p1 = vec_vmulouh (m0, m1);
7835 
7836  {
7837  vui32_t c;
7838  c = vec_vaddcuw (resw, p1);
7839  resw = vec_vadduwm (resw, p1);
7840 
7841  c = vec_sld (c, c, 4);
7842  resw = vec_vadduwm (resw, c);
7843  resw = vec_sld (c, resw, 14);
7844  }
7845 
7846  {
7847  vui32_t c;
7848  c = vec_vaddcuw (resw, p0);
7849  resw = vec_vadduwm (resw, p0);
7850  c = vec_sld (c, c, 4);
7851  resw = vec_vadduwm (resw, c);
7852  }
7853 
7854  m1 = vec_splat (mm, 0);
7855 
7856  p0 = vec_vmuleuh (m0, m1);
7857  p1 = vec_vmulouh (m0, m1);
7858 
7859  {
7860  vui32_t c;
7861  c = vec_vaddcuw (resw, p1);
7862  resw = vec_vadduwm (resw, p1);
7863 
7864  c = vec_sld (c, c, 4);
7865  resw = vec_vadduwm (resw, c);
7866  resw = vec_sld (c, resw, 14);
7867  }
7868 
7869  {
7870  vui32_t c;
7871  c = vec_vaddcuw (resw, p0);
7872  resw = vec_vadduwm (resw, p0);
7873  c = vec_sld (c, c, 4);
7874  resw = vec_vadduwm (resw, c);
7875  }
7876 
7877  res = (vui64_t)resw;
7878 #endif
7879  return ((vui128_t) res);
7880 }
7881 
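As a sketch (not part of the header, helper name illustrative), the even and odd doubleword multiplies together yield both 64 x 64 -> 128-bit partial products of a doubleword pair, which is how the wider quadword multiplies in this header are composed.

static inline void
example_prod_pair (vui128_t *even, vui128_t *odd, vui64_t a, vui64_t b)
{
  *even = vec_vmuleud (a, b); // product of the high-order doublewords
  *odd  = vec_vmuloud (a, b); // product of the low-order doublewords
}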
7904 static inline vui128_t
7905 vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)
7906 {
7907  const vui64_t zero = { 0, 0 };
7908 #ifdef _ARCH_PWR9
7909  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);
7910  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);
7911  return vec_msumudm(a, b_oud, (vui128_t) c_oud);
7912 #else
7913  vui128_t res;
7914  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);
7915  res = vec_vmuloud (a, b);
7916  return vec_adduqm (res, (vui128_t) c_oud);
7917 #endif
7918 }
7919 
7943 static inline vui128_t
7944 vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
7945 {
7946  const vui64_t zero = { 0, 0 };
7947 #ifdef _ARCH_PWR9
7948  vui128_t cd_sum;
7949  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);
7950  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);
7951  vui64_t d_oud = vec_mrgald ((vui128_t) zero, (vui128_t) d);
7952  cd_sum = vec_adduqm ((vui128_t) c_oud, (vui128_t) d_oud);
7953  return vec_msumudm(a, b_oud, (vui128_t) cd_sum);
7954 #else
7955  vui128_t res, cd_sum;
7956  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);
7957  vui64_t d_oud = vec_mrgald ((vui128_t) zero, (vui128_t) d);
7958  cd_sum = vec_adduqm ((vui128_t) c_oud, (vui128_t) d_oud);
7959  res = vec_vmuloud (a, b);
7960  return vec_adduqm (res, (vui128_t) cd_sum);
7961 #endif
7962 }
7963 
7986 static inline vui128_t
7987 vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c)
7988 {
7989 #ifdef _ARCH_PWR9
7990  const vui64_t zero = { 0, 0 };
7991  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
7992  return vec_msumudm(a, b_eud, c);
7993 #else
7994  vui128_t res;
7995  res = vec_vmuleud (a, b);
7996  return vec_adduqm (res, c);
7997 #endif
7998 }
7999 
8022 static inline vui128_t
8023 vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c)
8024 {
8025 #ifdef _ARCH_PWR9
8026  const vui64_t zero = { 0, 0 };
8027  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);
8028  return vec_msumudm(a, b_oud, (vui128_t) c);
8029 #else
8030  vui128_t res;
8031  res = vec_vmuloud (a, b);
8032  return vec_adduqm (res, c);
8033 #endif
8034 }
8035 
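A chaining sketch, not part of the header and with an illustrative helper name: applying the even then the odd multiply-sum adds one 64 x 64 product at a time to the running 128-bit sum, so the result should match vec_msumudm modulo 2**128.

static inline vui128_t
example_msum_chain (vui64_t a, vui64_t b, vui128_t c)
{
  vui128_t even_sum = vec_vmsumeud (a, b, c); // a[even]*b[even] + c
  return vec_vmsumoud (a, b, even_sum);       // + a[odd]*b[odd]
}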
8052 static inline vui128_t
8053 vec_vsldbi (vui128_t vra, vui128_t vrb, const unsigned int shb)
8054 {
8055  vui128_t result;
8056 
8057  if (__builtin_constant_p (shb) && (shb < 8))
8058  {
8059 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
8060  __asm__(
8061  "vsldbi %0,%1,%2,%3;\n"
8062  : "=v" (result)
8063  : "v" (vra), "v" (vrb), "K" (shb)
8064  : );
8065 #else
8066  /* For Power7/8/9 the quadword bit shift left/right instructions
8067  * only handle 128-bits.
8068  * So shift vra and vrb separately then combine those into
8069  * a single 128-bit result.
8070  */
8071  if (shb > 0)
8072  {
8073  const vui8_t vshl = vec_splat_u8 (shb);
8074  const vui8_t vshr = vec_splat_u8 (8 - shb);
8075  const vui8_t zero = vec_splat_u8 (0);
8076  vui8_t lowbits, highbits;
8077 
8078  /* Shift left double ('zero' || vrb) by 1 octet to isolate
8079  * the high order byte of vrb into the low 8-bits. Then right
8080  * shift this (8-shb) bits. This provides (128-shb) bits of
8081  * leading '0's. */
8082  lowbits = vec_sld (zero, (vui8_t) vrb, 1);
8083  lowbits = vec_vsrb (lowbits, vshr);
8084  /* Left shift the quadword vra shifting in shb '0' bits. */
8085  highbits = vec_sll ((vui8_t) vra, vshl);
8086  /* Combine left shifted bits from vra, vrb. */
8087  result = (vui128_t) vec_or (highbits, lowbits);
8088  }
8089  else
8090  result = vra;
8091 #endif
8092  }
8093  else
8094  {
8095  result = vec_sldqi (vra, vrb, (shb & 7));
8096  }
8097 
8098  return ((vui128_t) result);
8099 }
8100 
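A usage sketch, not part of the header and with an illustrative helper name: vec_vsldbi returns the high 128 bits of the 256-bit value (vra || vrb) shifted left by 0-7 bits, so a double quadword held as a {high, low} pair can be shifted left a few bits like this.

static inline void
example_sl256_3 (vui128_t *rh, vui128_t *rl, vui128_t vh, vui128_t vl)
{
  const vui64_t zero = { 0, 0 };
  // High half: bits shifted in from the top of the low half.
  *rh = vec_vsldbi (vh, vl, 3);
  // Low half: zeros shifted in from the right.
  *rl = vec_vsldbi (vl, (vui128_t) zero, 3);
}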
8117 static inline vui128_t
8118 vec_vsrdbi (vui128_t vra, vui128_t vrb, const unsigned int shb)
8119 {
8120  vui128_t result;
8121 
8122  if (__builtin_constant_p (shb) && (shb < 8))
8123  {
8124 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
8125  __asm__(
8126  "vsrdbi %0,%1,%2,%3;\n"
8127  : "=v" (result)
8128  : "v" (vra), "v" (vrb), "K" (shb)
8129  : );
8130 #else
8131  /* For Power7/8/9 the quadword bit shift left/right instructions
8132  * only handle 128-bits.
8133  * So shift vra and vrb separately then combine those into
8134  * a single 128-bit result.
8135  */
8136  if (shb > 0)
8137  {
8138  const vui8_t vshl = vec_splat_u8 (8 - shb);
8139  const vui8_t vshr = vec_splat_u8 (shb);
8140  const vui8_t zero = vec_splat_u8 (0);
8141  vui8_t lowbits, highbits;
8142 
8143  /* Shift left double (vra || 'zero') by 15 octets to isolate
8144  * the low order byte of vra into the high 8-bits. Then left
8145  * shift this (8-shb) bits. This provides (128-shb) bits of
8146  * trailing '0's. */
8147  highbits = vec_sld ((vui8_t) vra, zero, 15);
8148  highbits = vec_vslb (highbits, vshl);
8149  /* right shift the quadword vrb shifting in shb '0' bits. */
8150  lowbits = vec_srl ((vui8_t) vrb, vshr);
8151  /* Combine right shifted bits from vra, vrb. */
8152  result = (vui128_t) vec_or (highbits, lowbits);
8153  }
8154  else
8155  result = vrb;
8156 #endif
8157  }
8158  else
8159  {
8160 #if defined (__clang__) && (__clang_major__ < 6)
8161  // A workaround for a constant propagation bug in clang-5
8162  if (shb == 0)
8163  result = vrb;
8164  else
8165 #endif
8166  result = vec_sldqi (vra, vrb, (128 - (shb & 7)));
8167  }
8168 
8169  return ((vui128_t) result);
8170 }
8171 #endif /* VEC_INT128_PPC_H_ */