vec__int128__ppc_8h_source.html

/*

 Copyright (c) [2017, 2018] IBM Corporation.


 Licensed under the Apache License, Version 2.0 (the "License");

 you may not use this file except in compliance with the License.

 You may obtain a copy of the License at


    http://www.apache.org/licenses/LICENSE-2.0


 Unless required by applicable law or agreed to in writing, software

 distributed under the License is distributed on an "AS IS" BASIS,

 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 See the License for the specific language governing permissions and

 limitations under the License.


 vec_int128_ppc.h


 Contributors:

      IBM Corporation, Steven Munroe

      Created on: May 10, 2015

      Steven Munroe, additional contributions for POWER9.

 */


#ifndef VEC_INT128_PPC_H_

#define VEC_INT128_PPC_H_


#include <pveclib/vec_common_ppc.h>

#include <pveclib/vec_int64_ppc.h>


#ifndef PVECLIB_DISABLE_CONSTINT128

#define CONST_VUINT128_QxW(__q0, __q1, __q2, __q3) ( (vui128_t) \

      (((unsigned __int128) __q0) << 96) \

    + (((unsigned __int128) __q1) << 64) \

    + (((unsigned __int128) __q2) << 32) \

    +  ((unsigned __int128) __q3) )

#else

// clang does not handle constant folding for __int128

#define CONST_VUINT128_QxW(__q0, __q1, __q2, __q3) ( (vui128_t) \

        CONST_VINT128_W(__q0, __q1, __q2, __q3) )

#endif


#define CONST_VUINT128_QxD(__q0, __q1) ( (vui128_t) \

    (((unsigned __int128) __q0) << 64) \

    + ((unsigned __int128) __q1) )


#define CONST_VUINT128_Qx19d(__q0, __q1) ( (vui128_t) \

    (((unsigned __int128) __q0) * 10000000000000000000UL) \

    + ((unsigned __int128) __q1) )


#define CONST_VUINT128_Qx18d(__q0, __q1) ( (vui128_t) \

    (((unsigned __int128) __q0) * 1000000000000000000UL) \

    + ((unsigned __int128) __q1) )


#define CONST_VUINT128_Qx16d(__q0, __q1) ( (vui128_t) \

    (((unsigned __int128) __q0) * 10000000000000000UL) \

    + ((unsigned __int128) __q1) )


static inline vui128_t vec_addecuq (vui128_t a, vui128_t b, vui128_t ci);

static inline vui128_t vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci);

static inline vb128_t vec_cmpequq (vui128_t vra, vui128_t vrb);

static inline vb128_t vec_cmpgeuq (vui128_t vra, vui128_t vrb);

static inline vb128_t vec_cmpgtuq (vui128_t vra, vui128_t vrb);

static inline vb128_t vec_cmpleuq (vui128_t vra, vui128_t vrb);

static inline vb128_t vec_cmpltuq (vui128_t vra, vui128_t vrb);

static inline vb128_t vec_cmpneuq (vui128_t vra, vui128_t vrb);

static inline vui128_t vec_divuq_10e31 (vui128_t vra);

static inline vui128_t vec_divuq_10e32 (vui128_t vra);

static inline vui128_t vec_maxuq (vui128_t a, vui128_t b);

static inline vui128_t vec_minuq (vui128_t a, vui128_t b);

static inline vui128_t vec_moduq_10e31 (vui128_t vra, vui128_t q);

static inline vui128_t vec_moduq_10e32 (vui128_t vra, vui128_t q);

static inline vui128_t vec_muleud (vui64_t a, vui64_t b);

static inline vui128_t vec_mulhuq (vui128_t a, vui128_t b);

static inline vui128_t vec_mulluq (vui128_t a, vui128_t b);

static inline vui128_t vec_muloud (vui64_t a, vui64_t b);

static inline vui128_t vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b);

static inline vi128_t vec_negsq (vi128_t int128);

static inline vui128_t vec_popcntq (vui128_t vra);

static inline vb128_t vec_setb_cyq (vui128_t vcy);

static inline vb128_t vec_setb_ncq (vui128_t vcy);

static inline vb128_t vec_setb_sq (vi128_t vra);

static inline vi128_t vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc);

static inline vui128_t vec_sldq (vui128_t vrw, vui128_t vrx,

                                 vui128_t vrb);

static inline vui128_t vec_sldqi (vui128_t vrw, vui128_t vrx,

                                  const unsigned int shb);

static inline vui128_t vec_srqi (vui128_t vra, const unsigned int shb);

static inline vui128_t vec_subcuq (vui128_t vra, vui128_t vrb);

static inline vui128_t vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc);

static inline vui128_t vec_subuqm (vui128_t vra, vui128_t vrb);

static inline vui128_t vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c);

static inline vui128_t vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c);

static inline vui128_t vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c);

static inline vui128_t vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c);

static inline vui128_t vec_vmuleud (vui64_t a, vui64_t b);

static inline vui128_t vec_vmuloud (vui64_t a, vui64_t b);

static inline vui128_t vec_vsldbi (vui128_t vra, vui128_t vrb,

                                   const unsigned int shb);


static inline vui128_t

vec_absduq (vui128_t vra, vui128_t vrb)

{

#ifdef _ARCH_PWR8

  vui128_t tmp1, tmp2;

  vb128_t cmpbool;

  cmpbool = vec_cmpgtuq ( vra, vrb );

  tmp1 = vec_subuqm ( vra, vrb );

  tmp2 = vec_subuqm ( vrb, vra );

  return (vui128_t) vec_sel ((vui32_t) tmp2, (vui32_t) tmp1, (vui32_t) cmpbool);

#else

  return vec_subuqm (vec_maxuq (vra, vrb), vec_minuq (vra, vrb));

#endif

}


static inline vi128_t

vec_abssq (vi128_t vra)

{

  vi128_t q_neg;

  vb128_t b_sign;

  // Convert 2s complement to unsigned magnitude form.

  q_neg  = vec_negsq (vra);

  b_sign = vec_setb_sq (vra);

  return vec_selsq (vra, q_neg, b_sign);

}


static inline vui128_t

vec_avguq (vui128_t vra, vui128_t vrb)

{

  vui128_t result, tmp1, tmp2;

  const vui128_t qu1 = (vui128_t) CONST_VINT128_W(0, 0, 0, 1);

  // Compute (vra + vrb + 1) with carry

  tmp1 = vec_addeuqm (vra, vrb, qu1);

  tmp2 = vec_addecuq (vra, vrb, qu1);

  // shift sum with carry, right 1 bit

  result = vec_sldqi (tmp2, tmp1, 127);


  return result;

}


static inline vui128_t

vec_addcuq (vui128_t a, vui128_t b)

{

  vui32_t co;

#ifdef _ARCH_PWR8

#if defined (vec_vaddcuq)

  co = (vui32_t) vec_vaddcuq (a, b);

#elif defined (__clang__)

  co = (vui32_t) vec_addc (a, b);

#else

  __asm__(

      "vaddcuq %0,%1,%2;"

      : "=v" (co)

      : "v" (a),

      "v" (b)

      : );

#endif

#else

  vui32_t c, c2, t;

  vui32_t z= { 0,0,0,0};


  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);

  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);

  c = vec_sld (co, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  co = vec_vor (co, c2);

  co = vec_sld (z, co, 4);

#endif

  return ((vui128_t) co);

}


 static inline vui128_t

 vec_addecuq (vui128_t a, vui128_t b, vui128_t ci)

 {

   vui32_t co;

 #ifdef _ARCH_PWR8

 #if defined (vec_vaddcuq)

   co = (vui32_t) vec_vaddecuq (a, b, ci);

 #elif defined (__clang__)

   co = (vui32_t) vec_addec (a, b, ci);

# else

   __asm__(

       "vaddecuq %0,%1,%2,%3;"

       : "=v" (co)

       : "v" (a),

       "v" (b),

       "v" (ci)

       : );

 #endif

 #else

   vui32_t c, c2, t;

   vui32_t z = { 0, 0, 0, 0 };

   co = (vui32_t){ 1, 1, 1, 1 };


   c2 = vec_and ((vui32_t) ci, co);

   c2 = vec_sld ((vui32_t) c2, z, 12);

   co = vec_vaddcuw ((vui32_t) a, (vui32_t) b);

   t = vec_vadduwm ((vui32_t) a, (vui32_t) b);

   c = vec_sld (co, c2, 4);

   c2 = vec_vaddcuw (t, c);

   t = vec_vadduwm (t, c);

   co = vec_vor (co, c2);

   c = vec_sld (c2, z, 4);

   c2 = vec_vaddcuw (t, c);

   t = vec_vadduwm (t, c);

   co = vec_vor (co, c2);

   c = vec_sld (c2, z, 4);

   c2 = vec_vaddcuw (t, c);

   t = vec_vadduwm (t, c);

   co = vec_vor (co, c2);

   c = vec_sld (c2, z, 4);

   c2 = vec_vaddcuw (t, c);

   co = vec_vor (co, c2);

   co = vec_sld (z, co, 4);

 #endif

   return ((vui128_t) co);

 }


static inline vui128_t

vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vaddeuqm)

  t = (vui32_t) vec_vaddeuqm (a, b, ci);

#elif defined (__clang__)

  t = (vui32_t) vec_adde (a, b, ci);

#else

  __asm__(

      "vaddeuqm %0,%1,%2,%3;"

      : "=v" (t)

      : "v" (a),

      "v" (b),

      "v" (ci)

      : );

#endif

#else

  vui32_t c2, c;

  vui32_t z  = { 0,0,0,0};

  vui32_t co = { 1,1,1,1};


  c2 = vec_and ((vui32_t)ci, co);

  c2 = vec_sld ((vui32_t)ci, z, 12);

  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);

  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);

  c = vec_sld (co, c2, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  c = vec_sld (c2, z, 4);

  t = vec_vadduwm (t, c);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_adduqm (vui128_t a, vui128_t b)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vadduqm)

  t = (vui32_t) vec_vadduqm (a, b);

#elif defined (__clang__)

  t = (vui32_t) vec_add (a, b);

#else

  __asm__(

      "vadduqm %0,%1,%2;"

      : "=v" (t)

      : "v" (a),

      "v" (b)

      : );

#endif

#else

  vui32_t c, c2;

  vui32_t z= { 0,0,0,0};


  c = vec_vaddcuw ((vui32_t)a, (vui32_t)b);

  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);

  c = vec_sld (c, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  c = vec_sld (c2, z, 4);

  t = vec_vadduwm (t, c);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_addcq (vui128_t *cout, vui128_t a, vui128_t b)

{

  vui32_t t, co;

#ifdef _ARCH_PWR8

#if defined (vec_vadduqm) && defined (vec_vaddcuq)

  t = (vui32_t) vec_vadduqm (a, b);

  co = (vui32_t) vec_vaddcuq (a, b);

#elif defined (__clang__)

  t = (vui32_t) vec_add (a, b);

  co = (vui32_t) vec_addc (a, b);

#else

  __asm__(

      "vadduqm %0,%2,%3;\n"

      "\tvaddcuq %1,%2,%3;"

      : "=&v" (t),

      "=v" (co)

      : "v" (a),

      "v" (b)

      : );

#endif

#else

  vui32_t c, c2;

  vui32_t z= { 0,0,0,0};


  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);

  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);

  c = vec_sld (co, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  co = vec_sld (z, co, 4);

#endif

  *cout = (vui128_t) co;

  return ((vui128_t) t);

}


static inline vui128_t

vec_addeq (vui128_t *cout, vui128_t a, vui128_t b, vui128_t ci)

{

  vui32_t t, co;

#ifdef _ARCH_PWR8

#if defined (vec_vaddeuqm) && defined (vec_vaddecuq)

  t = (vui32_t) vec_vaddeuqm (a, b, ci);

  co = (vui32_t) vec_vaddecuq (a, b, ci);

#elif defined (__clang__)

  t = (vui32_t) vec_adde (a, b, ci);

  co = (vui32_t) vec_addec (a, b, ci);

#else

  __asm__(

      "vaddeuqm %0,%2,%3,%4;\n"

      "\tvaddecuq %1,%2,%3,%4;"

      : "=&v" (t),

      "=v" (co)

      : "v" (a),

      "v" (b),

      "v" (ci)

      : );

#endif

#else

  vui32_t c, c2;

  vui32_t z= { 0,0,0,0};

  co = (vui32_t){ 1,1,1,1};


  c2 = vec_and ((vui32_t)ci, co);

  c2 = vec_sld ((vui32_t)c2, z, 12);

  co = vec_vaddcuw ((vui32_t)a, (vui32_t)b);

  t = vec_vadduwm ((vui32_t)a, (vui32_t)b);

  c = vec_sld (co, c2, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  c = vec_sld (c2, z, 4);

  c2 = vec_vaddcuw (t, c);

  t = vec_vadduwm (t, c);

  co = vec_vor (co, c2);

  co = vec_sld (z, co, 4);

#endif

  *cout = (vui128_t) co;

  return ((vui128_t) t);

}


static inline vui128_t

vec_clzq (vui128_t vra)

{

  vui64_t result;


#ifdef _ARCH_PWR8

  /*

   * Use the Vector Count Leading Zeros Double Word instruction to get

   * the count for the left and right vector halves.  If the left vector

   * doubleword of the input is nonzero then only the left count is

   * included and we need to mask off the right count.

   * Otherwise the left count is 64 and we need to add 64 to the right

   * count.

   * After masking we sum across the left and right counts to

   * get the final 128-bit vector count (0-128).

   */

  vui64_t vt1, vt2, vt3, h64, l64;

  const vui64_t vzero = { 0, 0 };


  vt1 = vec_clzd ((vui64_t) vra);

  vt2 = (vui64_t) vec_cmpequd((vui64_t) vra, vzero);

  vt3 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt2);

  h64 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt1);

  l64 = vec_and (vt1, vt3);

  result = vec_addudm (h64, l64);

#else

  /* vector clz instructions were introduced in power8. For power7 and

   * earlier, use the pveclib vec_clzw implementation.  For a quadword

   * clz, this requires pre-conditioning the input before computing the

   * the word clz and sum across.   */

  vui32_t c0, clz;

  vui32_t r32, gt32, gt32sr32, gt64sr64;


  c0 = vec_splat_u32 (0);

  gt32 = (vui32_t) vec_cmpgt ((vui32_t) vra, c0);

  gt32sr32 = vec_sld (c0, gt32, 12);

  gt64sr64 = vec_sld (c0, gt32, 8);

  gt32 = vec_sld (c0, gt32, 4);


  gt32sr32 = vec_or (gt32sr32, gt32);

  gt64sr64 = vec_or (gt64sr64, (vui32_t) vra);

  r32 = vec_or (gt32sr32, gt64sr64);


  clz = vec_clzw (r32);

  result = (vui64_t) vec_sums ((vi32_t) clz, (vi32_t) c0);

#endif


  return ((vui128_t) result);

}


static inline vui128_t

vec_ctzq (vui128_t vra)

{

  const vui128_t ones = (vui128_t) vec_splat_s32(-1);

  vui128_t tzmask;


  // tzmask = (!vra & (vra - 1))

  tzmask = (vui128_t) vec_andc ((vui64_t) vec_adduqm (vra, ones),

                                (vui64_t) vra);

  // return = vec_popcnt (!vra & (vra - 1))

  return vec_popcntq (tzmask);

}


static inline vb128_t

vec_cmpeqsq (vi128_t vra, vi128_t vrb)

{

  /* vec_cmpequq works for both signed and unsigned compares.  */

  return vec_cmpequq ((vui128_t) vra, (vui128_t) vrb);

}


static inline vb128_t

vec_cmpequq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpeq (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpequq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return vrt;

#endif

#elif defined (_ARCH_PWR8)

  vui64_t equd, swapd;


  equd = (vui64_t) vec_cmpequd ((vui64_t) vra, (vui64_t) vrb);

  swapd = vec_swapd (equd);

  return (vb128_t) vec_and (equd, swapd);

#else

  if (vec_all_eq ((vui32_t) vra, (vui32_t) vrb))

    return (vb128_t) vec_cmpeq ((vui32_t) vra, (vui32_t) vrb);

  else

    return (vb128_t) vec_splat_u32 (0);

#endif

}


static inline vb128_t

vec_cmpgesq (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpge (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtsq %0,%2,%1;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);

#endif

#else

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui32_t _a, _b;


  _a = vec_xor ((vui32_t) vra, signbit);

  _b = vec_xor ((vui32_t) vrb, signbit);

  return vec_cmpgeuq ((vui128_t) _a, (vui128_t) _b);

#endif

}


static inline vb128_t

vec_cmpgeuq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpge (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtuq %0,%2,%1;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);

#endif

#else

  vui128_t a_b;


  a_b = vec_subcuq (vra, vrb);

  return vec_setb_cyq (a_b);

#endif

}


static inline vb128_t

vec_cmpgtsq (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpgt (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtsq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return vrt;

#endif

#else

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui32_t _a, _b;


  _a = vec_xor ((vui32_t) vra, signbit);

  _b = vec_xor ((vui32_t) vrb, signbit);

  return vec_cmpgtuq ((vui128_t) _a, (vui128_t) _b);

#endif

}


static inline vb128_t

vec_cmpgtuq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpgt (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtuq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return vrt;

#endif

#else

  vui128_t b_a;


  b_a = vec_subcuq (vrb, vra);

  return vec_setb_ncq (b_a);

#endif

}


static inline vb128_t

vec_cmplesq (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmple (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtsq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);

#endif

#else

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui32_t _a, _b;


  _a = vec_xor ((vui32_t) vra, signbit);

  _b = vec_xor ((vui32_t) vrb, signbit);

  return vec_cmpleuq ((vui128_t) _a, (vui128_t) _b);

#endif

}


static inline vb128_t

vec_cmpleuq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmple (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtuq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);

#endif

#else

  vui128_t b_a;


  b_a = vec_subcuq (vrb, vra);

  return vec_setb_cyq (b_a);

#endif

}


static inline vb128_t

vec_cmpltsq (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmplt (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtsq %0,%2,%1;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return vrt;

#endif

#else

  const vui32_t signbit = CONST_VINT128_W(0x80000000, 0, 0, 0);

  vui32_t _a, _b;


  _a = vec_xor ((vui32_t) vra, signbit);

  _b = vec_xor ((vui32_t) vrb, signbit);

  return vec_cmpltuq ((vui128_t) _a, (vui128_t) _b);

#endif

}


static inline vb128_t

vec_cmpltuq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmplt (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpgtuq %0,%2,%1;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return vrt;

#endif

#else

  vui128_t  a_b;


  a_b = vec_subcuq (vra, vrb);

  return vec_setb_ncq (a_b);

#endif

}


static inline vb128_t

vec_cmpnesq (vi128_t vra, vi128_t vrb)

{

  /* vec_cmpneuq works for both signed and unsigned compares.  */

  return vec_cmpneuq ((vui128_t) vra, (vui128_t) vrb);

}


static inline vb128_t

vec_cmpneuq (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_cmpne (vra, vrb);

#else

  vb128_t vrt;

  __asm__(

      "vcmpequq %0,%1,%2;\n"

      : "=v" (vrt)

      : "v" (vra), "v" (vrb)

      : );

  return (vb128_t) vec_nor ((vui64_t) vrt, (vui64_t) vrt);

#endif

#elif defined (_ARCH_PWR8)

  __vector unsigned long long equd, swapd;


  equd = (vui64_t) vec_cmpequd ((vui64_t) vra, (vui64_t) vrb);

  swapd = vec_swapd (equd);

  return (vb128_t) vec_nand (equd, swapd);

#else

  if (vec_any_ne ((vui32_t) vra, (vui32_t) vrb))

    return (vb128_t) vec_splat_s32 (-1);

  else

    return (vb128_t) vec_splat_u32 (0);

#endif

}


static inline

int

vec_cmpsq_all_eq (vi128_t vra, vi128_t vrb)

{

  int result;

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_eq (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpequq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

  result = vec_all_eq((vui64_t)vra, (vui64_t)vrb);

#else

  result = vec_all_eq((vui32_t)vra, (vui32_t)vrb);

#endif

  return (result);

}


static inline int

vec_cmpsq_all_ge (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ge (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtsq. %0,%4,%3;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui128_t a_b, _a, _b;


  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);

  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);


  a_b = vec_subcuq (_a, _b);

  return vec_all_eq((vui32_t)a_b, carry128);

#endif

}


static inline int

vec_cmpsq_all_gt (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ge (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtsq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui128_t b_a, _a, _b;


  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);

  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);


  b_a = vec_subcuq (_b, _a);

  return vec_all_eq((vui32_t)b_a, ncarry128);

#endif

}


static inline int

vec_cmpsq_all_le (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_le (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtsq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui128_t b_a, _a, _b;


  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);

  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);


  b_a = vec_subcuq (_b, _a);

  return vec_all_eq((vui32_t)b_a, carry128);

#endif

}


static inline int

vec_cmpsq_all_lt (vi128_t vra, vi128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_lt (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtsq. %0,%4,%3;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);

  const vui32_t signbit = CONST_VINT128_W (0x80000000, 0, 0, 0);

  vui128_t a_b, _a, _b;


  _a = (vui128_t) vec_xor ((vui32_t) vra, signbit);

  _b = (vui128_t) vec_xor ((vui32_t) vrb, signbit);


  a_b = vec_subcuq (_a, _b);

  return vec_all_eq((vui32_t)a_b, ncarry128);

#endif

}


static inline

int

vec_cmpsq_all_ne (vi128_t vra, vi128_t vrb)

{

  int result;

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ne (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpequq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

  result = !vec_all_eq ((vui64_t) vra, (vui64_t) vrb);

#else

  result = !vec_all_eq ((vui32_t) vra, (vui32_t) vrb);

#endif

  return (result);

}


static inline

int

vec_cmpuq_all_eq (vui128_t vra, vui128_t vrb)

{

  int result;

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_eq (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpequq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

  result = vec_all_eq ((vui64_t) vra, (vui64_t) vrb);

#else

  result = vec_all_eq ((vui32_t) vra, (vui32_t) vrb);

#endif

  return (result);

}


static inline int

vec_cmpuq_all_ge (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ge (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtuq. %0,%4,%3;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);

  vui128_t a_b;


  a_b = vec_subcuq (vra, vrb);

  return vec_all_eq ((vui32_t) a_b, carry128);

#endif

}


static inline int

vec_cmpuq_all_gt (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ge (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtuq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);

  vui128_t b_a;


  b_a = vec_subcuq (vrb, vra);

  return vec_all_eq ((vui32_t) b_a, ncarry128);

#endif

}


static inline int

vec_cmpuq_all_le (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_le (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtuq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t carry128 = CONST_VINT128_W (0, 0, 0, 1);

  vui128_t b_a;


  b_a = vec_subcuq (vrb, vra);

  return vec_all_eq ((vui32_t) b_a, carry128);

#endif

}


static inline int

vec_cmpuq_all_lt (vui128_t vra, vui128_t vrb)

{

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_lt (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpgtuq. %0,%4,%3;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,25,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#else

  const vui32_t ncarry128 = CONST_VINT128_W (0, 0, 0, 0);

  vui128_t  a_b;


  a_b = vec_subcuq (vra, vrb);

  return vec_all_eq ((vui32_t) a_b, ncarry128);

#endif

}


static inline

int

vec_cmpuq_all_ne (vui128_t vra, vui128_t vrb)

{

  int result;

#if defined (_ARCH_PWR10) && defined (__VSX__)  && (__GNUC__ >= 10)

#if (__GNUC__ > 11) || ((__GNUC__ == 11) && (__GNUC_MINOR__ >= 2))

  return vec_all_ne (vra, vrb);

#else

  vb128_t vrt;

  int u, r;

  __asm__(

      "vcmpequq. %0,%3,%4;\n"

      "mfocrf    %1,2;\n"

      "rlwinm    %2,%1,27,1"

      : "=v" (vrt), "=&r" (u), "=r" (r)

      : "v" (vra), "v" (vrb)

      : "cr6");

  return r;

#endif

#elif defined (_ARCH_PWR8) && (__GNUC__ >= 6) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

  result = !vec_all_eq ((vui64_t) vra, (vui64_t) vrb);

#else

  result = !vec_all_eq ((vui32_t) vra, (vui32_t) vrb);

#endif

  return (result);

}


static inline vui128_t

vec_cmul10ecuq (vui128_t *cout, vui128_t a, vui128_t cin)

{

  vui32_t t;

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10ecuq %0,%2,%3;\n"

      "vmul10euq %1,%2,%3;\n"

      : "=&v" (t_carry),

      "=v" (t)

      : "v" (a),

      "v" (cin)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui32_t tc;

  vui16_t t10;

  vui32_t t_odd, t_even, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh (ts, t10);

  t_odd = vec_vmulouh (ts, t10);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift cin left 112 bits.  */

  tc = vec_sld ((vui32_t) cin, z, 14);

  /* Shift t_even left 16 bits, merging the carry into the low bits.  */

  t_even = vec_sld (t_even, tc, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = t_high; /* there is not carry into high */

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  t_carry = t_high; /* there is no carry into high */

  /* Use pveclib adduqm implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  *cout = (vui128_t) t_carry;

  return ((vui128_t) t);

}


static inline vui128_t

vec_cmul10cuq (vui128_t *cout, vui128_t a)

{

  vui32_t t;

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10cuq %0,%2;\n"

      "vmul10uq %1,%2;\n"

      : "=&v" (t_carry),

      "=v" (t)

      : "v" (a)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui16_t t10;

  vui32_t t_odd, t_even, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh(ts, t10);

  t_odd = vec_vmulouh(ts, t10);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift t_even left 16 bits to align for lower 128-bits. */

  t_even = vec_sld (t_even, z, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = t_high; /* there is no carry into high */

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  t_carry = t_high; /* there is no carry into high */

  /* Use pveclib adduqm implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  *cout = (vui128_t) t_carry;

  return ((vui128_t) t);

}


static inline vi128_t

vec_divsq_10e31 (vi128_t vra)

{

  const vui128_t zero = (vui128_t) { (__int128) 0 };

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  /* Magic numbers for multiplicative inverse to divide by 10**31

   are 4804950418589725908363185682083061167, corrective add,

   and shift right 107 bits.  */

  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(

      0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);

  const int shift_ten31 = 103;

  vui128_t result, t, q, uvra;

  vb128_t negbool;


  negbool = vec_setb_sq (vra);

  uvra = (vui128_t) vec_sel ((vui32_t) vra,

                            (vui32_t) vec_subuqm (zero, (vui128_t) vra),

                            (vb32_t) negbool);


  if (vec_cmpuq_all_ge (uvra, ten31))

    {

      q = vec_mulhuq (uvra, mul_invs_ten31);

      // Need corrective add but want to avoid carry & double quad shift

      // The following avoids the carry and less instructions

      t = vec_subuqm (uvra, q);

      t = vec_srqi (t, 1);

      t = vec_adduqm (t, q);

      result = vec_srqi (t, shift_ten31-1);

      result = (vui128_t) vec_sel ((vui32_t) result,

                            (vui32_t) vec_subuqm (zero, (vui128_t) result),

                            (vb32_t) negbool);

    }

  else

    result = zero;


  return (vi128_t) result;

}


static inline vui128_t

vec_divudq_10e31 (vui128_t *qh, vui128_t vra, vui128_t vrb)

{

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  const vui128_t zero = (vui128_t) { (__int128) 0UL };

  /* Magic numbers for multiplicative inverse to divide by 10**31

   are 4804950418589725908363185682083061167, corrective add,

   and shift right 103 bits.  */

  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(

      0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);

  const int shift_ten31 = 103;

  vui128_t result, r2, t, q, q1, q2, c;


  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31))

    {

      // Multiply high [vra||vrb] * mul_invs_ten31

      q = vec_mulhuq (vrb, mul_invs_ten31);

      q1 = vec_muludq (&t, vra, mul_invs_ten31);

      c = vec_addcuq (q1, q);

      q = vec_adduqm (q1, q);

      q1 = vec_adduqm (t, c);

      // corrective add [q2||q1||q] = [q1||q] + [vra||vrb]

      c = vec_addcuq (vrb, q);

      q = vec_adduqm (vrb, q);

      // q2 is the carry-out from the corrective add

      q2 = vec_addecuq (q1, vra, c);

      q1 = vec_addeuqm (q1, vra, c);

      // shift 384-bits (including the carry) right 107 bits

      // Using shift left double quadword shift by (128-107)-bits

      r2 = vec_sldqi (q2, q1, (128 - shift_ten31));

      result = vec_sldqi (q1, q, (128 - shift_ten31));

    }

  else

    {

      // Dividend less than divisor then return zero quotient

      r2 = zero;

      result = zero;

    }


  // return 256-bit quotient

  *qh = r2;

  return result;

}


static inline vui128_t

vec_divudq_10e32 (vui128_t *qh, vui128_t vra, vui128_t vrb)

{

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten32 = (vui128_t)

          { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };

  const vui128_t zero = (vui128_t) { (__int128) 0UL };

  /* Magic numbers for multiplicative inverse to divide by 10**32

   are 211857340822306639531405861550393824741, corrective add,

   and shift right 107 bits.  */

  const vui128_t mul_invs_ten32 = (vui128_t) CONST_VINT128_DW(

      0x9f623d5a8a732974UL, 0xcfbc31db4b0295e5UL);

  const int shift_ten32 = 107;

  vui128_t result, r2, t, q, q1, q2, c;


  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten32))

    {

      // Multiply high [vra||vrb] * mul_invs_ten31

      q = vec_mulhuq (vrb, mul_invs_ten32);

      q1 = vec_muludq (&t, vra, mul_invs_ten32);

      c = vec_addcuq (q1, q);

      q = vec_adduqm (q1, q);

      q1 = vec_adduqm (t, c);

      // corrective add [q2||q1||q] = [q1||q] + [vra||vrb]

      c = vec_addcuq (vrb, q);

      q = vec_adduqm (vrb, q);

      // q2 is the carry-out from the corrective add

      q2 = vec_addecuq (q1, vra, c);

      q1 = vec_addeuqm (q1, vra, c);

      // shift 384-bits (including the carry) right 107 bits

      // Using shift left double quadword shift by (128-107)-bits

      r2 = vec_sldqi (q2, q1, (128 - shift_ten32));

      result = vec_sldqi (q1, q, (128 - shift_ten32));

    }

  else

    {

      // Dividend less than divisor then return zero quotient

      r2 = zero;

      result = zero;

    }


  // return 256-bit quotient

  *qh = r2;

  return result;

}


static inline vui128_t

vec_divuq_10e31 (vui128_t vra)

{

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  /* Magic numbers for multiplicative inverse to divide by 10**31

   are 4804950418589725908363185682083061167, corrective add,

   and shift right 103 bits.  */

  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(

      0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);

  const int shift_ten31 = 103;

  vui128_t result, t, q;


  if (vec_cmpuq_all_ge (vra, ten31))

    {

      q = vec_mulhuq (vra, mul_invs_ten31);

      // Need corrective add but want to avoid carry & double quad shift

      // The following avoids the carry and less instructions

      t = vec_subuqm (vra, q);

      t = vec_srqi (t, 1);

      t = vec_adduqm (t, q);

      result = vec_srqi (t, shift_ten31-1);

    }

  else

    result = (vui128_t) { (__int128) 0 };


  return result;

}


static inline vui128_t

vec_divuq_10e32 (vui128_t vra)

{

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten32 = (vui128_t)

          { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };

  /* Magic numbers for multiplicative inverse to divide by 10**32

   are 211857340822306639531405861550393824741, corrective add,

   and shift right 107 bits.  */

  const vui128_t mul_invs_ten32 = (vui128_t) CONST_VINT128_DW(

      0x9f623d5a8a732974UL, 0xcfbc31db4b0295e5UL);

  const int shift_ten32 = 107;

  vui128_t result, t, q;


  if (vec_cmpuq_all_ge (vra, ten32))

    {

      q = vec_mulhuq (vra, mul_invs_ten32);

      // Need corrective add but want to avoid carry & double quad shift

      // The following avoids the carry and less instructions

      t = vec_subuqm (vra, q);

      t = vec_srqi (t, 1);

      t = vec_adduqm (t, q);

      result = vec_srqi (t, shift_ten32-1);

    }

  else

    result = (vui128_t) { (__int128) 0 };


  return result;

}


static inline

vi128_t

vec_maxsq(vi128_t vra, vi128_t vrb)

{

  vb32_t maxmask;


  maxmask = (vb32_t) vec_cmpgtsq ( vra, vrb );

  return (vi128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, maxmask);

}


static inline

vui128_t

vec_maxuq(vui128_t vra, vui128_t vrb)

{

  vb32_t maxmask;


  maxmask = (vb32_t) vec_cmpgtuq ( vra, vrb );

  return (vui128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, maxmask);

}


static inline

vi128_t

vec_minsq(vi128_t vra, vi128_t vrb)

{

  vb32_t minmask;


  minmask = (vb32_t) vec_cmpgtsq ( vrb, vra );

  return (vi128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, minmask);

}


static inline

vui128_t

vec_minuq(vui128_t vra, vui128_t vrb)

{

  vb32_t minmask;


  minmask = (vb32_t) vec_cmpgtuq ( vrb, vra );

  return (vui128_t) vec_sel ((vui32_t) vrb, (vui32_t) vra, minmask);

}


static inline vi128_t

vec_modsq_10e31 (vi128_t vra, vi128_t q)

{

  const vui128_t zero = (vui128_t) { (__int128) 0 };

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  vui128_t result, t;


  // multiply low and subtract modulo are the same for signed/unsigned

  // But now easier to compare q for zero than signed compare to vra

  if (vec_cmpuq_all_ne ((vui128_t) vra, zero))

    {

      t = vec_mulluq ((vui128_t) q, ten31);

      result = vec_subuqm ((vui128_t) vra, (vui128_t) t);

    }

  else

    result = (vui128_t) vra;


  return (vi128_t) result;

}


static inline vui128_t

vec_modudq_10e31 (vui128_t vra, vui128_t vrb, vui128_t *ql)

{

  /* ten31  = +100000000000000000000000000000000UQ  */

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  const vui128_t zero = (vui128_t) { (__int128) 0UL };

  const vui128_t minus_one = (vui128_t) { (__int128) -1L };

  vui128_t result, t, th, c;


  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31))

    {

      t = vec_muludq (&th, *ql, ten31);

      c = vec_subcuq (vrb, t);

      t = vec_subuqm (vrb, t);

      th = vec_subeuqm (vra, th, c);

      // The remainder should be less than the divisor

      if (vec_cmpuq_all_ne (th, zero) && vec_cmpuq_all_ge (t, ten31))

        {

          // If not the estimated quotient is off by 1

          *ql = vec_adduqm (*ql, minus_one);

          // And the remainder is negative, so add the divisor

          t = vec_adduqm (t, ten31);

        }

      result = t;

    }

  else

    result = vrb;


  return result;

}


static inline vui128_t

vec_modudq_10e32 (vui128_t vra, vui128_t vrb, vui128_t *ql)

{

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten32 = (vui128_t)

          { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };

  const vui128_t zero = (vui128_t) { (__int128) 0UL };

  const vui128_t minus_one = (vui128_t) { (__int128) -1L };

  vui128_t result, t, th, c;


  if (vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten32))

    {

      t = vec_muludq (&th, *ql, ten32);

      c = vec_subcuq (vrb, t);

      t = vec_subuqm (vrb, t);

      th = vec_subeuqm (vra, th, c);

      // The remainder should be less than the divisor

      if (vec_cmpuq_all_ne (th, zero) && vec_cmpuq_all_ge (t, ten32))

        {

          // If not the estimated quotient is off by 1

          *ql = vec_adduqm (*ql, minus_one);

          // And the remainder is negative, so add the divisor

          t = vec_adduqm (t, ten32);

        }

      result = t;

    }

  else

    result = vrb;


  return result;

}


static inline vui128_t

vec_moduq_10e31 (vui128_t vra, vui128_t q)

{

  /* ten31  = +100000000000000000000000000000000UQ  */

  const vui128_t ten31 = (vui128_t)

          { (__int128) 1000000000000000UL

         * (__int128) 10000000000000000UL };

  vui128_t result, t;


  if (vec_cmpuq_all_ge (vra, ten31))

    {

      t = vec_mulluq (q, ten31);

      result = vec_subuqm (vra, t);

    }

  else

    result = vra;


  return result;

}


static inline vui128_t

vec_moduq_10e32 (vui128_t vra, vui128_t q)

{

  /* ten32  = +100000000000000000000000000000000UQ  */

  const vui128_t ten32 = (vui128_t)

          { (__int128) 10000000000000000UL * (__int128) 10000000000000000UL };

  vui128_t result, t;


  if (vec_cmpuq_all_ge (vra, ten32))

    {

      t = vec_mulluq (q, ten32);

      result = vec_subuqm (vra, t);

    }

  else

    result = vra;


  return result;

}


static inline vui128_t

vec_mul10cuq (vui128_t a)

{

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10cuq %0,%1;\n"

      : "=v" (t_carry)

      : "v" (a)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui16_t t10;

  vui32_t t_even, t_odd, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh(ts, t10);

  t_odd = vec_vmulouh(ts, t10);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift t_even left 16 bits to align for lower 128-bits. */

  t_even = vec_sld (t_even, z, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = (vui32_t) vec_vaddcuq ((vui128_t) t_even, (vui128_t) t_odd);

  t_carry = (vui32_t) vec_vadduqm ((vui128_t) t_carry, (vui128_t) t_high);

#else

  /* Use pveclib addcuq implementation for pre _ARCH_PWR8.  */

  t_carry = (vui32_t) vec_addcuq ((vui128_t) t_even, (vui128_t) t_odd);

  /* The final carry is small (0-9) so use word add, ignore carries.  */

  t_carry = vec_vadduwm (t_carry, t_high);

#endif

#endif

  return ((vui128_t) t_carry);

}


static inline vui128_t

vec_mul10ecuq (vui128_t a, vui128_t cin)

{

//        vui32_t  t;

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10ecuq %0,%1,%2;\n"

      : "=&v" (t_carry)

      : "v" (a),

      "v" (cin)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui32_t tc;

  vui16_t t10;

  vui32_t t_odd;

  vui32_t t_even, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh(ts, t10);

  t_odd = vec_vmulouh(ts, t10);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift cin left 112 bits.  */

  tc = vec_sld ((vui32_t) cin, z, 14);

  /* Shift t_even left 16 bits, merging the carry into the low bits.  */

  t_even = vec_sld (t_even, tc, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = (vui32_t) vec_vaddcuq ((vui128_t) t_even, (vui128_t) t_odd);

  t_carry = (vui32_t) vec_vadduqm ((vui128_t) t_carry, (vui128_t) t_high);

#else

  /* Use pveclib addcuq implementation for pre _ARCH_PWR8.  */

  t_carry = (vui32_t) vec_addcuq ((vui128_t) t_even, (vui128_t) t_odd);

  /* The final carry is small (0-9) so use word add, ignore carries.  */

  t_carry = vec_vadduwm (t_carry, t_high);

#endif

#endif

  return ((vui128_t) t_carry);

}


static inline vui128_t

vec_mul10euq (vui128_t a, vui128_t cin)

{

  vui32_t t;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10euq %0,%1,%2;\n"

      : "=v" (t)

      : "v" (a),

      "v" (cin)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui32_t tc;

  vui16_t t10;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh(ts, t10);

  t_odd = vec_vmulouh(ts, t10);

#endif

  /* Shift cin left 112 bits.  */

  tc = vec_sld ((vui32_t) cin, z, 14);

  /* Shift t_even left 16 bits, merging the carry into the low bits.  */

  t_even = vec_sld (t_even, tc, 2);

  /* then add the even/odd sub-products to generate the final product.  */

#ifdef _ARCH_PWR8

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  /* Use pveclib addcuq implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_mul10uq (vui128_t a)

{

  vui32_t t;

#ifdef _ARCH_PWR9

  __asm__(

      "vmul10uq %0,%1;\n"

      : "=v" (t)

      : "v" (a)

      : );

#else

  vui16_t ts = (vui16_t) a;

  vui16_t t10;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };

  t10 = vec_splat_u16(10);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t10);

  t_odd = vec_vmuleuh (ts, t10);

#else

  t_even = vec_vmuleuh(ts, t10);

  t_odd = vec_vmulouh(ts, t10);

#endif

  /* Shift t_even left 16 bits */

  t_even = vec_sld (t_even, z, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  /* Use pveclib addcuq implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_cmul100cuq (vui128_t *cout, vui128_t a)

{

  vui32_t t;

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  vui128_t t0, t1, tc0, tc1;

  /* Times 10 with 1st carry.  */

  tc0 = vec_mul10cuq (a);

  t0  = vec_mul10uq (a);

  /* Times 10 again with 2nd carry.  */

  tc1 = vec_mul10cuq (t0);

  t1  = vec_mul10uq (t0);

  /* 1st carry times 10 plus 2nd carry.  */

  t_carry  = (vui32_t) vec_mul10euq (tc0, tc1);

  t = (vui32_t)t1;

#else

  vui16_t ts = (vui16_t) a;

  vui16_t t100 = (vui16_t ) { 100, 100, 100, 100, 100, 100, 100, 100 };

  vui32_t t_odd, t_even, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  //t100 = vec_splat_u16 (100);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t100);

  t_odd = vec_vmuleuh (ts, t100);

#else

  t_even = vec_vmuleuh(ts, t100);

  t_odd = vec_vmulouh(ts, t100);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift t_even left 16 bits to align for lower 128-bits. */

  t_even = vec_sld (t_even, z, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = t_high; /* there is no carry into high */

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  t_carry = t_high; /* there is no carry into high */

  /* Use pveclib adduqm implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  *cout = (vui128_t) t_carry;

  return ((vui128_t) t);

}


static inline vui128_t

vec_cmul100ecuq (vui128_t *cout, vui128_t a, vui128_t cin)

{

  vui32_t t;

  vui32_t t_carry;

#ifdef _ARCH_PWR9

  vui128_t t0, t1, tc0, tc1;

  /* Times 10 with 1st carry.  */

  tc0 = vec_mul10cuq (a);

  t0  = vec_mul10uq (a);

  /* Times 10 again with 2nd carry.  No carry in yet.  */

  tc1 = vec_mul10cuq (t0);

  t1  = vec_mul10uq (t0);

  /* 1st carry times 10 plus 2nd carry.  */

  t_carry  = (vui32_t) vec_mul10euq (tc0, tc1);

  /* Add cin to the low bits of a * 100.  If cin is in valid range

   * (0-99) then can not generate carry out of low 128-bits.  */

  t = (vui32_t) vec_vadduqm ((vui128_t) t1, cin);

#else

  vui16_t ts = (vui16_t) a;

  vui32_t tc;

  vui16_t t100 = (vui16_t ) { 100, 100, 100, 100, 100, 100, 100, 100 };

  vui32_t t_odd, t_even, t_high;

  vui32_t z = { 0, 0, 0, 0 };

  //t100 = vec_splat_u16 (100);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  t_even = vec_vmulouh (ts, t100);

  t_odd = vec_vmuleuh (ts, t100);

#else

  t_even = vec_vmuleuh (ts, t100);

  t_odd = vec_vmulouh (ts, t100);

#endif

  /* Shift t_even left 16-bits (right 112-bits) for the partial carry.  */

  t_high = vec_sld (z, t_even, 2);

  /* Shift cin left 112 bits.  */

  tc = vec_sld ((vui32_t) cin, z, 14);

  /* Shift t_even left 16 bits, merging the carry into the low bits.  */

  t_even = vec_sld (t_even, tc, 2);

  /* then add the even/odd sub-products to generate the final product */

#ifdef _ARCH_PWR8

  /* Any compiler that supports ARCH_PWR8 should support these builtins.  */

  t_carry = t_high; /* there is no carry into high */

  t = (vui32_t) vec_vadduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else

  t_carry = t_high; /* there is no carry into high */

  /* Use pveclib adduqm implementation for pre _ARCH_PWR8.  */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  *cout = (vui128_t) t_carry;

  return ((vui128_t) t);

}


static inline vui128_t

vec_msumcud (vui64_t a, vui64_t b, vui128_t c)

{

  vui128_t res;

#if defined (_ARCH_PWR10) && (__GNUC__ >= 10)

  __asm__(

      "vmsumcud %0,%1,%2,%3;\n"

      : "=v" (res)

      : "v" (a), "v" (b), "v" (c)

      : );

#else

  vui128_t p_even, p_odd, p_sum1, p_cry1, p_cry2;

  // Generate separate 128-bit even/odd products to isolate the carries

  p_even = vec_muleud (a, b);

  p_odd  = vec_muloud (a, b);

  // Sum the products and generate the carry

#ifdef _ARCH_PWR8

  p_sum1 = vec_adduqm (p_even, p_odd);

  p_cry1 = vec_addcuq (p_even, p_odd);

#else

  p_sum1 = vec_addcq (&p_cry1, p_even, p_odd);

#endif

  // Generate the carry from the sum (p_even + p_odd + c)

  p_cry2 = vec_addcuq (p_sum1, c);

  // Sum the two carries

#ifdef _ARCH_PWR9

  res    = vec_adduqm (p_cry2, p_cry1);

#else

  /* Results can be 0-2, So Add Word will do.  */

  res    = (vui128_t) vec_add ((vui32_t) p_cry2, (vui32_t) p_cry1);

#endif

#endif

  return (res);

}


static inline vui128_t

vec_msumudm (vui64_t a, vui64_t b, vui128_t c)

{

  vui128_t res;

#if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))

  __asm__(

      "vmsumudm %0,%1,%2,%3;\n"

      : "=v" (res)

      : "v" (a), "v" (b), "v" (c)

      : );

#else

  vui128_t p_even, p_odd, p_sum;


  p_even = vec_muleud (a, b);

  p_odd  = vec_muloud (a, b);

  p_sum  = vec_adduqm (p_even, p_odd);

  res    = vec_adduqm (p_sum, c);

#endif


  return (res);

}


static inline vui128_t

vec_muleud (vui64_t a, vui64_t b)

{

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  return vec_vmuloud (a, b);

#else

  return vec_vmuleud (a, b);

#endif

}


static inline vui64_t

vec_mulhud (vui64_t vra, vui64_t vrb)

{

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  vui64_t res;

  __asm__(

      "vmulhud %0,%1,%2;\n"

      : "=v" (res)

      : "v" (vra), "v" (vrb)

      : );

  return res;

#else

  return vec_mrgahd (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));

#endif

}


static inline vui128_t

vec_muloud (vui64_t a, vui64_t b)

{

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  return vec_vmuleud (a, b);

#else

  return vec_vmuloud (a, b);

#endif

}


static inline vui64_t

vec_muludm (vui64_t vra, vui64_t vrb)

{

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  vui64_t res;

  __asm__(

      "vmulld %0,%1,%2;\n"

      : "=v" (res)

      : "v" (vra), "v" (vrb)

      : );

  return res;

#elif defined (_ARCH_PWR9)

  return vec_mrgald (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));

#elif defined (_ARCH_PWR8)

  vui64_t s32 = { 32, 32 }; // shift / rotate amount.

  vui64_t z = { 0, 0 };

  vui64_t t2, t3, t4;

  vui32_t t1;


  t1 = (vui32_t) vec_vrld (vrb, s32);

  t2 = vec_vmulouw ((vui32_t)vra, (vui32_t)vrb);

  t3 = vec_vmsumuwm ((vui32_t)vra, t1, z);

  t4 = vec_vsld (t3, s32);

  return (vui64_t) vec_addudm (t4, t2);

#else

  return vec_mrgald (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));

#endif

}


static inline vui128_t

vec_mulhuq (vui128_t a, vui128_t b)

{

  vui32_t t;

  /* compute the 256 bit product of two 128 bit values a, b.

   * The high 128 bits are accumulated in t and the low 128-bits

   * in tmq. The high 128-bits are the return value.

   */

#ifdef _ARCH_PWR9

  const vui64_t zero = { 0, 0 };

  vui64_t b_swap = vec_swapd ((vui64_t) b);

  vui128_t tmh, tab, tba, tb0, tc1, tc2, tmq;

  /* multiply the low 64-bits of a and b.  For PWR9 this is just

   * vmsumudm with conditioned inputs.  */

  tmq = vec_vmuloud ((vui64_t) a, (vui64_t) b);

  /* compute the 2 middle partial projects.  Can't directly use

   * vmsumudm here because the sum of partial products can overflow.  */

  tab = vec_vmuloud ((vui64_t) a, b_swap);

  tba = vec_vmuleud ((vui64_t) a, b_swap);

  t   = (vui32_t) vec_adduqm (tab, tba);

  tc1 = vec_addcuq (tab, tba);

  tmh = (vui128_t) vec_mrgahd ((vui128_t) zero, (vui128_t) tmq);

  t   = (vui32_t ) vec_adduqm ((vui128_t) t, tmh);

  tc2 = vec_addcuq ((vui128_t) t, tmh);

  tc1 = (vui128_t) vec_vadduwm ((vui32_t) tc1, (vui32_t) tc2);

  /* result = t[l] || tmq[l].  */

  tmq = (vui128_t) vec_mrgald ((vui128_t) t, (vui128_t) tmq);

  /* we can use multiply sum here because the high product plus the

   * high sum of middle partial products can't overflow.  */

  t   = (vui32_t) vec_permdi ((vui64_t) tc1, (vui64_t) t, 2);

  tb0 = (vui128_t) vec_mrgahd ((vui128_t) b, (vui128_t) zero);

  /* sum = (a[h] * b[h]) + (a[l] * 0) + (tc1[l] || t[h]).  */

  t   = (vui32_t) vec_msumudm ((vui64_t) a, (vui64_t) tb0, (vui128_t) t);

#else

#ifdef _ARCH_PWR8

  vui32_t tsw;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };

  /* We use Vector Multiply Even/Odd Unsigned Word to compute

   * the 128 x 32 partial (160-bit) product of vector a with a

   * word element of b. The (for each word of vector b) 4 X 160-bit

   * partial products are  summed to produce the full 256-bit product.

   * See the comment in vec_muludq for details.

   */

  tsw = vec_splat ((vui32_t) b, VEC_WE_3);

  t_even = (vui32_t) vec_vmuleuw ((vui32_t) a, tsw);

  t_odd = (vui32_t) vec_vmulouw ((vui32_t) a, tsw);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_2);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_1);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_0);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else // _ARCH_PWR7 or earlier and Big Endian only.  */

  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute

   * the 128 x 16 partial (144-bit) product of vector a with a

   * halfword element of b. The (for each halfword of vector b)

   * 8 X 144-bit partial products are  summed to produce the full

   * 256-bit product. */

  vui16_t tsw;

  vui16_t t_odd, t_even;

  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };


  tsw = vec_splat ((vui16_t) b, 7);

  t_even = (vui16_t) vec_vmuleuh ((vui16_t) a, tsw);

  t_odd = (vui16_t) vec_vmulouh ((vui16_t) a, tsw);


  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 6);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 5);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 4);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 3);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 2);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 1);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 0);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_mulluq (vui128_t a, vui128_t b)

{

  vui32_t t, tmq;

  /* compute the 256 bit product of two 128 bit values a, b.

   * The high 128 bits are accumulated in t and the low 128-bits

   * in tmq.  Only the low order 128 bits of the product are

   * returned.

   */

#ifdef _ARCH_PWR9

  const vui64_t zero = { 0, 0 };

  vui64_t b_swap = vec_swapd ((vui64_t) b);

  /* multiply the low 64-bits of a and b.  For PWR9 this is just

   * vmsumudm with conditioned inputs.  */

  tmq = (vui32_t) vec_vmuloud ((vui64_t) a, (vui64_t) b);

  /* we can use multiply sum here because we only need the low 64-bits

   * and don't care if we lose the carry / overflow.  */

  t   = (vui32_t) vec_mrgahd ((vui128_t) zero, (vui128_t) tmq);

  /* sum = (a[h] * b[l]) + (a[l] * b[h]) + (zero || tmq[h]).  */

  t   = (vui32_t) vec_msumudm ((vui64_t) a, b_swap, (vui128_t) t);

  /* result = t[l] || tmq[l].  */

  tmq = (vui32_t) vec_mrgald ((vui128_t) t, (vui128_t) tmq);

#else

#ifdef _ARCH_PWR8

  /* We use Vector Multiply Even/Odd Unsigned Word to compute

   * the 128 x 32 partial (160-bit) product of vector a with a

   * word element of b. The (for each word of vector b) 4 X 160-bit

   * partial products are  summed to produce the full 256-bit product.

   * See the comment in vec_muludq for details.

   */

  vui32_t tsw;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };


  tsw = vec_splat ((vui32_t) b, VEC_WE_3);

  t_even = (vui32_t) vec_vmuleuw ((vui32_t) a, tsw);

  t_odd = (vui32_t) vec_vmulouw ((vui32_t) a, tsw);

  /* Rotate the low 32-bits (right) into tmq. This is actually

   * implemented as 96-bit (12-byte) shift left. */

  tmq = vec_sld (t_odd, z, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_2);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_1);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_0);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  // dont need the high 128-bits of 160-bits.

#else

  // _ARCH_PWR7 or earlier and Big Endian only.

  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute

   * the 128 x 16 partial (144-bit) product of vector a with a

   * halfword element of b. The (for each halfword of vector b)

   * 8 X 144-bit partial products are  summed to produce the full

   * 256-bit product. */

  vui16_t tsw;

  vui16_t t_odd, t_even;

  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };


  tsw = vec_splat ((vui16_t) b, 7);

  t_even = (vui16_t) vec_vmuleuh ((vui16_t) a, tsw);

  t_odd = (vui16_t) vec_vmulouh ((vui16_t) a, tsw);


  /* Rotate the low 16-bits (right) into tmq. This is actually

   * implemented as 112-bit (14-byte) shift left. */

  tmq = (vui32_t) vec_sld (t_odd, z, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 6);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 5);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 4);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 3);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 2);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 1);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 0);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

#endif

#endif

  return ((vui128_t) tmq);

}


static inline vui128_t

vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b)

{

  vui32_t t, tmq;

  /* compute the 256 bit product of two 128 bit values a, b.

   * The high 128 bits are accumulated in t and the low 128-bits

   * in tmq. The high 128-bits of the product are returned to the

   * address of the 1st parm. The low 128-bits are the return

   * value.

   */

#ifdef _ARCH_PWR9

  const vui64_t zero = { 0, 0 };

  vui64_t a_swap = vec_swapd ((vui64_t) a);

  vui128_t thq, tlq, tx;

  vui128_t t0l, tc1;

  vui128_t thh, thl, tlh, tll;

  /* multiply the low 64-bits of a and b.  For PWR9 this is just

   * vmsumudm with conditioned inputs.  */

  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);

  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);

  thl = vec_vmuloud (a_swap, (vui64_t)b);

  tlh = vec_vmuleud (a_swap, (vui64_t)b);

  /* sum the two middle products (plus the high 64-bits of the low

   * product.  This will generate a carry that we need to capture.  */

  t0l   = (vui128_t) vec_mrgahd ( (vui128_t) zero, tll);

  tc1 = vec_addcuq (thl, tlh);

  tx   = vec_adduqm (thl, tlh);

  tx   = vec_adduqm (tx, t0l);

  /* result = t[l] || tll[l].  */

  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);

  /* Sum the high product plus the high sum (with carry) of middle

   * partial products.  This can't overflow.  */

  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);

  thq = vec_adduqm ( thh, thq);


  t = (vui32_t) thq;

  tmq = (vui32_t) tlq;

#else

#ifdef _ARCH_PWR8

  vui32_t tsw;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };

  /* We use the Vector Multiple Even/Odd Unsigned Word to compute

   * the 128 x 32 partial (160-bit) product of value a with the

   * word splat of b. This produces four 64-bit (32 x 32)

   * partial products in two vector registers.

   *

   * These results

   * are not aligned for summation as is. So the odd result is

   * shifted right 32-bits before it is summed (via Vector Add

   * Unsigned Quadword Modulo) with the the even result.

   * The low order 32-bits, of the 160-bit product

   * is shifted (right) in to a separate vector (tmq).

   *

   * This is repeated for each (low to high order) words of b.

   * After the first (160-bit) partial product, the high 128-bits

   * (t) of the previous partial product is summed with the current

   * odd multiply result, before this sum (including any carry out)

   * is shifted right 32-bits.  Bits shifted out of the of this sum

   * are shifted (32-bits at a time) into the low order 128-bits

   * of the product (tmq). The shifted odd sum is then added to the

   * current even product, After the 4th step this sum is the

   * final high order 128-bits of the quadword product. */

  tsw = vec_splat ((vui32_t) b, VEC_WE_3);

  t_even = (vui32_t)vec_vmuleuw((vui32_t)a, tsw);

  t_odd = (vui32_t)vec_vmulouw((vui32_t)a, tsw);

  /* Rotate the low 32-bits (right) into tmq. This is actually

   * implemented as 96-bit (12-byte) shift left. */

  tmq = vec_sld (t_odd, z, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_2);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_1);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_0);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#else // _ARCH_PWR7 or earlier and Big Endian only.  */

  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute

   * the 128 x 16 partial (144-bit) product of vector a with a

   * halfword element of b. The (for each halfword of vector b)

   * 8 X 144-bit partial products are  summed to produce the full

   * 256-bit product. */

  vui16_t tsw;

  vui16_t t_odd, t_even;

  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };


  tsw = vec_splat ((vui16_t) b, 7);

  t_even = (vui16_t)vec_vmuleuh((vui16_t)a, tsw);

  t_odd = (vui16_t)vec_vmulouh((vui16_t)a, tsw);


  /* Rotate the low 16-bits (right) into tmq. This is actually

   * implemented as 112-bit (14-byte) shift left. */

  tmq = (vui32_t)vec_sld (t_odd, z, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 6);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 5);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 4);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 3);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 2);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 1);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 0);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

#endif

#endif

  *mulu = (vui128_t) t;

  return ((vui128_t) tmq);

}


static inline vui128_t

vec_madduq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c)

{

  vui128_t ph, pl;

#ifdef _ARCH_PWR9

  vui64_t a_swap = vec_swapd ((vui64_t) a);

  vui128_t thq, tlq, tx;

  vui128_t t0l, tc1, tcl;

  vui128_t thh, thl, tlh, tll;

  /* multiply the low 64-bits of a and b.  For PWR9 this is just

   * vmsumudm with conditioned inputs.  */

  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);

  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);

  thl = vec_vmuloud (a_swap, (vui64_t)b);

  tlh = vec_vmuleud (a_swap, (vui64_t)b);

  /* Add c to lower 128-bits of the partial product. */

  tcl = vec_addcuq (tll, c);

  tll = vec_adduqm (tll, c);

  t0l = (vui128_t) vec_permdi ((vui64_t) tcl, (vui64_t) tll, 2);

  /* sum the two middle products (plus the high 64-bits of the low

   * product.  This will generate a carry that we need to capture.  */

  tc1 = vec_addcuq (thl, tlh);

  tx  = vec_adduqm (thl, tlh);

  tx  = vec_adduqm (tx, t0l);

  /* result = t[l] || tll[l].  */

  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);

  /* Sum the high product plus the high sum (with carry) of middle

   * partial products.  This can't overflow.  */

  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);

  thq = vec_adduqm ( thh, thq);


  pl = tlq;

  ph = thq;

#else

#if _ARCH_PWR8

  vui32_t t, tmq;

  vui32_t tsw;

  vui32_t t_odd, t_even;

  vui32_t z = { 0, 0, 0, 0 };

  /* We use Vector Multiple Even/Odd Unsigned Word to compute

   * a 128 x 32 partial (160-bit) product of value a with the

   * word splat [3,2,1,0] of b in 4 steps. Each step produces

   * four 64-bit (32 x 32) partial products in two vector registers.

   * These must be shifted for alignment and summed (128-bit add)

   * to product the 160-bit partial product.

   *

   * These results

   * are not aligned for summation as is. So the odd result is

   * shifted right 32-bits before it is summed (via Vector Add

   * Unsigned Quadword Modulo) with the the even result.

   * The low order 32-bits, of the 160-bit product

   * is shifted (right) in to a separate vector (tmq).

   * This is repeated for each stage of the multiply, so that tmq

   * accumulates the low order 128-bits of the 256-bit product.

   *

   * This is repeated for each (low to high order) words of b.

   * After the first (160-bit) partial product, the high 128-bits

   * (t) of the previous partial product is summed with the current

   * odd multiply result, before this sum (including any carry out)

   * is shifted right 32-bits.  Bits shifted out of the of this sum

   * are shifted (32-bits at a time) into the low order 128-bits

   * of the product (tmq). The shifted odd sum is then added to the

   * current even product, After the 4th step this sum is the

   * final high order 128-bits of the quadword product. */

  tsw = vec_splat ((vui32_t) b, VEC_WE_3);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw,(vui32_t)c);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, (vui32_t)c);

  /* Rotate the low 32-bits (right) into tmq. This is actually

   * implemented as 96-bit (12-byte) shift left. */

  tmq = vec_sld (t_odd, z, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_2);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_1);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui32_t) b, VEC_WE_0);

  t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);

  t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);

  /* rotate right the low 32-bits into tmq */

  tmq = vec_sld (t_odd, tmq, 12);

  /* shift the low 128 bits of partial product right 32-bits */

  t_odd = vec_sld (z, t_odd, 12);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

  ph = (vui128_t) t;

  pl = (vui128_t) tmq;

#else // _ARCH_PWR7 or earlier and Big Endian only.  */

  /* We use Vector Multiply Even/Odd Unsigned Halfword to compute

   * the 128 x 16 partial (144-bit) product of vector a with a

   * halfword element of b. The (for each halfword of vector b)

   * 8 X 144-bit partial products are  summed to produce the full

   * 256-bit product. */

  vui32_t t, tmq;

  vui16_t tsw;

  vui16_t t_odd, t_even;

  vui16_t z = { 0, 0, 0, 0, 0, 0, 0, 0 };


  tsw = vec_splat ((vui16_t) b, 7);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) c);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) c);

  /* Rotate the low 16-bits (right) into tmq. This is actually

   * implemented as 112-bit (14-byte) shift left. */

  tmq = (vui32_t)vec_sld (t_odd, z, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the high 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 6);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 5);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 4);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 3);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 2);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 1);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);


  tsw = vec_splat ((vui16_t) b, 0);

  t_even = (vui16_t)vec_vmaddeuh((vui16_t) a, tsw, (vui16_t) t);

  t_odd = (vui16_t)vec_vmaddouh((vui16_t) a, tsw, (vui16_t) t);

  /* rotate right the low 16-bits into tmq */

  tmq = (vui32_t)vec_sld (t_odd, (vui16_t)tmq, 14);

  /* shift the low 128 bits of partial product right 16-bits */

  t_odd = vec_sld (z, t_odd, 14);

  /* add the top 128 bits of even / odd partial products */

  t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

  ph = (vui128_t) t;

  pl = (vui128_t) tmq;

#endif

#endif

  *mulu = ph;

  return (pl);

}


static inline vui128_t

vec_madd2uq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c1, vui128_t c2)

{

  vui128_t ph, pl, cs;

#ifdef _ARCH_PWR9

  vui128_t cl;

  // P9 has 3 cycles vadduqm so sum C1/C2 early

  cl = vec_addcuq (c1, c2);

  cs = vec_adduqm (c1, c2);

  // Pass the low 128-bit od C1/C2 sum into madduq

  pl = vec_madduq (&ph, a, b, cs);

  // Deal with C1/C2 carry last

  *mulu = vec_adduqm (ph, cl);

#else

#ifdef _ARCH_PWR8

  vui128_t cl, cl2;

  // P8 has vadduqm but slower, so sum C1/C2 early

  cl = vec_addcuq (c1, c2);

  cs = vec_adduqm (c1, c2);

  // Overlapping execution of vaddcuq/vadduqm with muludq

  pl = vec_muludq (&ph, a, b);

  // Delay sum of product plus (c1 + c2) + (cl + cl2)

  cl2 = vec_addcuq (pl, cs);

  pl = vec_adduqm (pl, cs);


  *mulu = vec_addeuqm (ph, cl, cl2);;

#else

  // P7 and earlier do not have vadduqm, must use vaddcuw/vadduwm

  // so leverage madduq to sum (a * b) + c1

  pl = vec_madduq (&ph, a, b, c1);

  // Then add c2 to the madd sum as last stage.

  pl =  vec_addcq (&cs, pl, c2);

  *mulu = vec_adduqm (ph, cs);

#endif

#endif

  return (pl);

}


static inline vi128_t

vec_negsq (vi128_t int128)

{

  const vui128_t q_zero = (vui128_t) { 0 };

  // Negate 2s complement quadword integer.

  return (vi128_t) vec_subuqm (q_zero, (vui128_t)int128);

}


static inline vui128_t

vec_neguq (vui128_t int128)

{

  const vui128_t q_zero = (vui128_t) { 0 };

  // Negate 2s complement quadword integer.

  return vec_subuqm (q_zero, int128);

}


static inline vui128_t

vec_popcntq (vui128_t vra)

{

  vui64_t result;


#ifdef _ARCH_PWR9

  /*

   * Use the Vector Population Count Doubleword instruction to get

   * the count for the left and right vector halves.  Then sum across

   * the left and right counts to get the final 128-bit vector count

   * (0-128).

   */

  vui64_t vt1, h64, l64;

  const vui64_t vzero = { 0, 0 };


  vt1 = vec_popcntd ((vui64_t)  vra);

  h64 = vec_mrgahd ((vui128_t)vzero, (vui128_t)vt1);

  l64 = vec_mrgald ((vui128_t)vzero, (vui128_t)vt1);

  result = vec_addudm (h64, l64);

#elif defined(_ARCH_PWR8)

  /*

   * Use the Vector Population Count Word instruction to get

   * the count for each word.  Then sum across the words

   * to get the final 128-bit vector count (0-128).

   * For P8 popcntw is 2 cycles faster then popcntd but requires

   * vsumsws (7 cycles) as the best option to sum across words.

   */

  vui32_t vt1;

  const vui64_t vzero = { 0, 0 };


  vt1 = vec_popcntw ((vui32_t) vra);

  result = (vui64_t) vec_vsumsw ((vi32_t) vt1,

                                 (vi32_t) vzero);

#else

  //#warning Implememention pre power8

  vui32_t z= { 0,0,0,0};

  vui32_t x;

  x = vec_popcntw ((vui32_t)vra);

  result = (vui64_t) vec_sums ((vi32_t) x, (vi32_t) z);

#endif

  return ((vui128_t) result);

}


static inline vui128_t

vec_revbq (vui128_t vra)

{

  vui128_t result;


#ifdef _ARCH_PWR9

#if defined (vec_revb) || defined (__clang__)

  result = vec_revb (vra);

#else

  __asm__(

      "xxbrq %x0,%x1;"

      : "=wa" (result)

      : "wa" (vra)

      : );

#endif

#else

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

  const vui64_t vconstp =

      CONST_VINT64_DW(0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL);

#else

  const vui64_t vconstp =

      CONST_VINT64_DW(0x0001020304050607UL, 0x08090A0B0C0D0E0FUL);

#endif

  result = (vui128_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);

#endif


  return (result);

}


static inline vui128_t

vec_rlq (vui128_t vra, vui128_t vrb)

{

  vui128_t result;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  // vrlq takes the shift count from bits 57:63

  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);

  __asm__(

      "vrlq %0,%1,%2;\n"

      : "=v" (result)

      : "v" (vra), "v" (vrb)

      : );

#else

  result = vec_sldq (vra, vra, vrb);

#endif

  return ((vui128_t) result);

}


static inline vui128_t

vec_rlqi (vui128_t vra, const unsigned int shb)

{

  vui8_t result;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  if (__builtin_constant_p (shb) && (shb < 8))

    {

      /* When shifting by a constant less then 8, can use bit immediate

       * vec_vsldbi (vra, vra, shb) as rotate left.  */

      result = (vui8_t) vec_vsldbi (vra, vra, shb);

    }

  else

    {

      vui32_t lshift = vec_splats((unsigned int) shb);

      __asm__(

          "vrlq %0,%1,%2;\n"

          : "=v" (result)

          : "v" (vra), "v" (lshift)

          : );

    }

#else

  if (__builtin_constant_p (shb) && ((shb % 8) == 0))

    {

      /* When shifting an multiple of 8 bits (octet), use Vector

       Shift Left Double By Octet Immediate.  This eliminates

       loading the shift const into a VR.  */

      if (shb > 0)

        result = vec_sld ((vui8_t) vra, (vui8_t) vra, ((shb / 8) & 15));

      else

        result = (vui8_t) vra;

    }

  else

    {

      result = (vui8_t) vec_sldqi (vra, vra, shb);

    }

#endif

  return ((vui128_t) result);

}


static inline vi128_t

vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc)

{

  return (vi128_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);

}


static inline vui128_t

vec_seluq (vui128_t vra, vui128_t vrb, vb128_t vrc)

{

  return (vui128_t) vec_sel ((vui32_t) vra, (vui32_t)vrb, (vui32_t)vrc);

}


static inline vb128_t

vec_setb_cyq (vui128_t vcy)

{

#ifdef _ARCH_PWR9

  const vui128_t zero = (vui128_t) vec_splat_u32(0);


  return (vb128_t) vec_vsubuqm (zero, vcy);

#else

  const vui32_t ones =  vec_splat_u32(1);

  vui32_t rcy;


  rcy = vec_splat ((vui32_t) vcy, VEC_W_L);

  return (vb128_t) vec_cmpeq (rcy, ones);

#endif

}


static inline vb128_t

vec_setb_ncq (vui128_t vcy)

{

#ifdef _ARCH_PWR9

  const vui128_t zero = (vui128_t) vec_splat_u32(0);


  return (vb128_t) vec_vsubeuqm (zero, zero, vcy);

#else

  const vui32_t zero =  CONST_VINT128_W(0, 0, 0, 0);

  vui32_t rcy;


  rcy = vec_splat ((vui32_t) vcy, VEC_W_L);

  return (vb128_t) vec_cmpeq (rcy, zero);

#endif

}


static inline vb128_t

vec_setb_sq (vi128_t vra)

{

  vb128_t result;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  __asm__(

      "vexpandqm %0,%1"

      : "=v" (result)

      : "v" (vra)

      : );

#else

  const vui8_t shift = vec_splat_u8 (7);

  vui8_t splat = vec_splat ((vui8_t) vra, VEC_BYTE_H);


  result = (vb128_t) vec_sra (splat, shift);

#endif

  return result;

}


static inline vui128_t

vec_sldq (vui128_t vrw, vui128_t vrx, vui128_t vrb)

{

  vui8_t result, vt1, vt2, vt3, vbs;

  const vui8_t vzero = vec_splat_u8 (0);


  vt1 = vec_slo ((vui8_t) vrw, (vui8_t) vrb);

  /* The vsl/vsr instruction only works correctly if the bit shift

     value is splatted to each byte of the vector.  */

  vbs = vec_splat ((vui8_t) vrb, VEC_BYTE_L);

  vt1 = vec_sll (vt1, vbs);

  vt3 = vec_sub (vzero, vbs);

  vt2 = vec_sro ((vui8_t) vrx, vt3);

  vt2 = vec_srl (vt2, vt3);

  result = vec_or (vt1, vt2);


  return ((vui128_t) result);

}


static inline vui128_t

vec_sldqi (vui128_t vrw, vui128_t vrx, const unsigned int shb)

{

  vui128_t result;


  if (__builtin_constant_p(shb))

    {

      if ((shb % 8) == 0)

        /* When shifting an multiple of 8 bits (octet), use Vector

         Shift Left Double By Octet Immediate.  This eliminates

         loading the shift const into a VR.  */

        if (shb > 0)

          result = (vui128_t) vec_sld ((vui8_t) vrw, (vui8_t) vrx, (shb / 8));

        else

          result = vrw;

      else // Not just an immediate octet shift

        if (shb < 8)

          // Special case for 0-7 shifts, use vec_vsldbi to exploit P10.

          result = vec_vsldbi (vrw, vrx, shb);

        else

          {

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

            // Special case of P10.

            vui8_t h, l;

            // Shift left double quad (256-bits) by Octet

            h = vec_sld ((vui8_t) vrw, (vui8_t) vrx, (shb / 8));

            l = vec_sld ((vui8_t) vrx, (vui8_t) vrx, (shb / 8));

            // Then Shift Left Double by Bit to complete the shift.

            result = vec_vsldbi ((vui128_t) h, (vui128_t) l, (shb % 8));

#else       // Load shb as vector and use general vec_sldq case.

            const vui8_t vrb = vec_splats ((unsigned char) shb);

            result = vec_sldq (vrw, vrx, (vui128_t) vrb);

#endif

          }

    }

  else

    {

      const vui8_t vrb = vec_splats ((unsigned char) shb);

      result = vec_sldq (vrw, vrx, (vui128_t) vrb);

    }


  return ((vui128_t) result);

}


static inline vui128_t

vec_slq (vui128_t vra, vui128_t vrb)

{

  vui8_t result;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  // vslq takes the shift count from bits 57:63

  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);

  __asm__(

      "vslq %0,%1,%2;\n"

      : "=v" (result)

      : "v" (vra), "v" (vrb)

      : );

#else

  vui8_t vshift_splat;

  /* For some reason, the vsl instruction only works

   * correctly if the bit shift value is splatted to each byte

   * of the vector.  */

  vshift_splat = vec_splat ((vui8_t) vrb, VEC_BYTE_L);

  result = vec_slo ((vui8_t) vra, (vui8_t) vrb);

  result = vec_sll (result, vshift_splat);

#endif

  return ((vui128_t) result);

}


static inline vui128_t

vec_slqi (vui128_t vra, const unsigned int shb)

{

  vui8_t result;


  if (shb < 128)

    {

      vui8_t lshift;

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

      lshift = (vui8_t) vec_splats((unsigned int) shb);

      __asm__(

          "vslq %0,%1,%2;\n"

          : "=v" (result)

          : "v" (vra), "v" (lshift)

          : );

#else

      if (__builtin_constant_p (shb) && ((shb % 8) == 0))

        {

          /* When shifting an multiple of 8 bits (octet), use Vector

           Shift Left Double By Octet Immediate.  This eliminates

           loading the shift const into a VR, but requires an

           explicit vector of zeros.  */

          vui8_t zero =

            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

          if (shb > 0)

            result = vec_sld ((vui8_t) vra, zero, (shb / 8));

          else

            result = (vui8_t) vra;

        }

      else

        {

          /* Load the shift const in a vector.  The bit level shifts

           require the shift amount is splatted to all 16-bytes of

           the shift control.  */

          if (__builtin_constant_p (shb) && (shb < 16))

            lshift = (vui8_t) vec_splat_s8(shb);

          else

            lshift = vec_splats ((unsigned char) shb);


          if (shb > 7)

            /* Vector Shift Left By Octet by bits 121-124 of lshift.  */

            result = vec_slo ((vui8_t) vra, lshift);

          else

            result = ((vui8_t) vra);


          /* Vector Shift Left by bits 125-127 of lshift.  */

          result = vec_sll (result, lshift);

        }

#endif

    }

  else

    { /* shifts greater then 127 bits return zeros.  */

      result = vec_xor ((vui8_t) vra, (vui8_t) vra);

    }

  return (vui128_t) result;

}


static inline vi128_t

vec_splat_s128 (const int sim)

{

  vi128_t result;

#ifdef _ARCH_PWR9

  // TBD! No Vector Extend Sign Byte To Qword

  // But does have VSX Vector Splat Immediate Byte (-128 -> 127)

  if (__builtin_constant_p (sim) && ((sim >= -128) && (sim < 128)))

    {

      // Expect the compiler to generate a single xxspltib for this.

      vi8_t vbi = vec_splats ((signed char) sim);


      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))

        {

          // Special case for -1 and 0. Skip vec_sld().

          result = (vi128_t) vbi;

        }

      else

        {

          if (__builtin_constant_p (sim) && (sim > 0))

            {

              const vui32_t q_zero = {0, 0, 0, 0};

              result = (vi128_t) vec_sld ((vi8_t) q_zero, vbi, 1);

            }

          else

            {

              const vui32_t q_ones = {-1, -1, -1, -1};

              result = (vi128_t) vec_sld ((vi8_t) q_ones, vbi, 1);

            }

        }

    }

  else

    result = vec_splats ((signed __int128) sim);

#else

  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))

    {

      vui32_t vwi = (vui32_t) vec_splat_s32(sim);


      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))

        {

          // Special case for -1 and 0. Skip vec_sld().

          result = (vi128_t) vwi;

        }

      else

        {

          if (__builtin_constant_p (sim) && (sim > 0))

            {

              const vui32_t q_zero = {0, 0, 0, 0};

              result = (vi128_t) vec_sld (q_zero, vwi, 4);

            }

          else

            {

              const vui32_t q_ones = {-1, -1, -1, -1};

              result = (vi128_t) vec_sld (q_ones, vwi, 4);

            }

        }

    }

  else

    result = vec_splats ((signed __int128) sim);

#endif

  return (result);

}


static inline vui128_t

vec_splat_u128 (const int sim)

{

  vui128_t result;

#ifdef _ARCH_PWR9

  // No Vector Extend Sign Byte To Qword

  // But does have VSX Vector Splat Immediate Byte (0 -> 255)

  if (__builtin_constant_p (sim) && ((sim >= 0) && (sim < 256)))

    {

      // Expect the compiler to generate a single xxspltib for this.

      vui8_t vbi = vec_splats ((unsigned char) sim);


      if (__builtin_constant_p (sim) && (sim == 0))

        {

          // Special case for 0. Skip vec_sld().

          result = (vui128_t) vbi;

        }

      else

        {

          if (__builtin_constant_p (sim) && (sim < 256))

            {

              const vui32_t q_zero = {0, 0, 0, 0};

              result = (vui128_t) vec_sld ((vui8_t) q_zero, vbi, 1);

            }

          else

            result = vec_splats ((unsigned __int128) sim);

        }

    }

  else

    result = vec_splats ((unsigned __int128) sim);

#else

  if (__builtin_constant_p (sim) && ((sim >= 0) && (sim < 16)))

    {

      const vui32_t q_zero = {0, 0, 0, 0};

      vui32_t vwi = vec_splat_u32 (sim);


      if (__builtin_constant_p (sim) && (sim == 0))

        {

          // Special case for -1 and 0. Skip vec_unpackl().

          result = (vui128_t) vwi;

        } else {

          result = (vui128_t) vec_sld (q_zero, vwi, 4);

        }

    }

  else if (__builtin_constant_p (sim) && (sim == 128))

    {

      // Expect the compiler to generate vspltisw/vslb here.

      vui8_t vbi = vec_splats ((unsigned char) 128);

      // Extent left with 120-bits of 0

      const vui32_t q_zero = {0, 0, 0, 0};

      result = (vui128_t) vec_sld ((vui8_t) q_zero, vbi, 1);

    }

  else

    result = vec_splats ((unsigned __int128) sim);

#endif

  return (result);

}


static inline vi128_t

vec_sraq (vi128_t vra, vui128_t vrb)

{

  vui8_t result;

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  // vsraq takes the shift count from bits 57:63

  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);

  __asm__(

      "vsraq %0,%1,%2;\n"

      : "=v" (result)

      : "v" (vra), "v" (vrb)

      : );

#else

  vui8_t vsht;

  vui128_t vsgn;

  const vui8_t zero = vec_splat_u8 (0);


  /* For some reason the vsr instruction only works

   * correctly if the bit shift value is splatted to each byte

   * of the vector.  */

  vsgn = (vui128_t) vec_setb_sq (vra);

  vsht = vec_sub (zero, (vui8_t) vrb);

  result = (vui8_t) vec_sldq (vsgn, (vui128_t) vra, (vui128_t) vsht);

#endif

  return ((vi128_t) result);

}


static inline vi128_t

vec_sraqi (vi128_t vra, const unsigned int shb)

{

  vui8_t result;


  if (shb < 127)

    {

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  vui32_t rshift = vec_splats((unsigned int) shb);

  __asm__(

          "vsraq %0,%1,%2;\n"

          : "=v" (result)

          : "v" (vra), "v" (rshift)

          : );

#else

      vui8_t lshift;

      vui128_t vsgn;

      if (__builtin_constant_p (shb) && ((shb % 8) == 0))

        {

          if (shb > 0)

            {

              vsgn = (vui128_t) vec_setb_sq (vra);

              result = vec_sld ((vui8_t) vsgn, (vui8_t) vra, 16 - (shb / 8));

            }

          else

            result = (vui8_t) vra;

        }

      else

        {

#ifdef _ARCH_PWR8

          if (shb < 64)

            {

              vui128_t vrshq;

              vi64_t vrshd;

              vrshq = vec_srqi ((vui128_t) vra, shb);

              vrshd = vec_sradi ((vi64_t) vra, shb);

              result = (vui8_t) vec_pasted ((vui64_t) vrshd, (vui64_t) vrshq);

            }

          else

            {

#endif

              const unsigned int lshb = 128 - shb;

              if (__builtin_constant_p (shb) && (lshb < 16))

                lshift = (vui8_t) vec_splat_s8(shb);

              else

                lshift = vec_splats ((unsigned char) lshb);


              vsgn = (vui128_t) vec_setb_sq (vra);

              result = (vui8_t) vec_sldq (vsgn, (vui128_t) vra,

                                          (vui128_t) lshift);

#ifdef _ARCH_PWR8

            }

#endif

        }

#endif

    }

  else

    { /* shifts greater then 126 bits returns the sign bit.  */

      result = (vui8_t) vec_setb_sq (vra);

    }


  return ((vi128_t) result);

}


static inline vui128_t

vec_srq (vui128_t vra, vui128_t vrb)

{

  vui8_t result;

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  // vsrq takes the shift count from bits 57:63

  vrb = (vui128_t) vec_splatd ((vui64_t) vrb, VEC_DW_L);

  __asm__(

      "vsrq %0,%1,%2;\n"

      : "=v" (result)

      : "v" (vra), "v" (vrb)

      : );

#else

  vui8_t vsht_splat;

  /* For some reason the vsr instruction only works

   * correctly if the bit shift value is splatted to each byte

   * of the vector.  */

  vsht_splat = vec_splat ((vui8_t) vrb, VEC_BYTE_L);

  result = vec_sro ((vui8_t) vra, (vui8_t) vrb);

  result = vec_srl (result, vsht_splat);

#endif

  return ((vui128_t) result);

}


static inline vui128_t

vec_srqi (vui128_t vra, const unsigned int shb)

{

  vui8_t result;


  if (shb < 128)

    {

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  vui32_t rshift = vec_splats((unsigned int) shb);

  __asm__(

          "vsrq %0,%1,%2;\n"

          : "=v" (result)

          : "v" (vra), "v" (rshift)

          : );

#else

      vui8_t lshift;

      if (__builtin_constant_p (shb) && ((shb % 8)) == 0)

        {

          /* When shifting an multiple of 8 bits (octet), use Vector

           Shift Left Double By Octet Immediate.  This eliminates

           loading the shift const into a VR, but requires an

           explicit vector of zeros.  */

          vui8_t zero =

            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

          /* The compiler needs to know at compile time that

             0 < shb < 128 is true to insure the constraint (4 bit

             immediate field) of vsldoi is meet.  So the following if

             is required but should not generate any branch code.  */

          if (shb > 0)

            result = vec_sld (zero, (vui8_t) vra, (16 - (shb / 8)));

          else

            result = (vui8_t) vra;

        }

      else

        {

          /* Load the shift const in a vector.  The bit level shifts

           require the shift amount is splatted to all 16-bytes of

           the shift control.  */

          if ((__builtin_constant_p (shb) && (shb < 16)))

            lshift = (vui8_t) vec_splat_s8(shb);

          else

            lshift = vec_splats ((unsigned char) shb);


          if (shb > 7)

            /* Vector Shift right By Octet based on the bits 121-124 of

             lshift.  */

            result = vec_sro ((vui8_t) vra, lshift);

          else

            result = ((vui8_t) vra);


          /* Vector Shift right based on the lower 3-bits of lshift.  */

          result = vec_srl (result, lshift);

        }

#endif

    }

  else

    { /* shifts greater then 127 bits return zeros.  */

      result = vec_xor ((vui8_t) vra, (vui8_t) vra);

    }

  return (vui128_t) result;

}


static inline vui128_t

vec_slq4 (vui128_t vra)

{

  __vector unsigned char result, vsht_splat;


  /* The vsl instruction only works correctly if the bit shift value

   * is splatted to each byte of the vector.  */

  vsht_splat = vec_splat_u8(4);

  result = vec_sll ((__vector unsigned char) vra, vsht_splat);


  return ((vui128_t) result);

}


static inline vui128_t

vec_slq5 (vui128_t vra)

{

  __vector unsigned char result, vsht_splat;


  /* The vsl instruction only works correctly if the bit shift value

   * is splatted to each byte of the vector.  */

  vsht_splat = vec_splat_u8(5);

  result = vec_sll ((__vector unsigned char) vra, vsht_splat);


  return ((vui128_t) result);

}


static inline vui128_t

vec_srq4 (vui128_t vra)

{

  __vector unsigned char result, vsht_splat;


  /* The vsr instruction only works correctly if the bit shift value

   * is splatted to each byte of the vector.  */

  vsht_splat = vec_splat_u8(4);

  result = vec_srl ((__vector unsigned char) vra, vsht_splat);


  return ((vui128_t) result);

}


static inline vui128_t

vec_srq5 (vui128_t vra)

{

  __vector unsigned char result, vsht_splat;


  /* The vsr instruction only works correctly if the bit shift value

   * is splatted to each byte of the vector.  */

  vsht_splat = vec_splat_u8(5);

  result = vec_srl ((__vector unsigned char) vra, vsht_splat);


  return ((vui128_t) result);

}


static inline vui128_t

vec_subcuq (vui128_t vra, vui128_t vrb)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vsubcuq)

  t = (vui32_t) vec_vsubcuq (vra, vrb);

#elif defined (__clang__)

  t = (vui32_t) vec_subc (vra, vrb);

# else

  __asm__(

      "vsubcuq %0,%1,%2;"

      : "=v" (t)

      : "v" (vra),

      "v" (vrb)

      : );

#endif

#else

  /* vsubcuq is defined as (vra + NOT(vrb) + 1) >> 128.  */

  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);

  const vui32_t ci= { 0,0,0,1 };


  t = (vui32_t) vec_addecuq (vra, (vui128_t) _b, (vui128_t) ci);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_subecuq (vui128_t vra, vui128_t vrb, vui128_t vrc)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vsubecuq)

  t = (vui32_t) vec_vsubecuq (vra, vrb, vrc);

#elif defined (__clang__)

  t = (vui32_t) vec_subec (vra, vrb, vrc);

# else

  __asm__(

      "vsubecuq %0,%1,%2,%3;"

      : "=v" (t)

      : "v" (vra),

        "v" (vrb),

        "v" (vrc)

      : );

#endif

#else

  /* vsubcuq is defined as (vra + NOT(vrb) + vrc.bit[127]) >> 128.  */

  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);


  t = (vui32_t) vec_addecuq (vra, (vui128_t) _b, vrc);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vsubeuqm)

  t = (vui32_t) vec_vsubeuqm (vra, vrb, vrc);

#elif defined (__clang__)

  t = (vui32_t) vec_sube (vra, vrb, vrc);

# else

  __asm__(

      "vsubeuqm %0,%1,%2,%3;"

      : "=v" (t)

      : "v" (vra),

        "v" (vrb),

        "v" (vrc)

      : );

#endif

#else

  /* vsubeuqm is defined as vra + NOT(vrb) + vrc.bit[127].  */

  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);


  t = (vui32_t) vec_addeuqm (vra, (vui128_t) _b, vrc);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_subuqm (vui128_t vra, vui128_t vrb)

{

  vui32_t t;

#ifdef _ARCH_PWR8

#if defined (vec_vsubuqm)

  t = (vui32_t) vec_vsubuqm (vra, vrb);

#elif defined (__clang__)

  t = (vui32_t) vec_sub (vra, vrb);

  __asm__(

      "vsubuqm %0,%1,%2;"

      : "=v" (t)

      : "v" (vra),

      "v" (vrb)

      : );

#endif

#else

  /* vsubuqm is defined as vra + NOT(vrb) + 1.  */

  vui32_t _b = vec_nor ((vui32_t) vrb, (vui32_t) vrb);

  const vui32_t ci= { 0,0,0,1 };


  t = (vui32_t) vec_addeuqm (vra, (vui128_t) _b, (vui128_t) ci);

#endif

  return ((vui128_t) t);

}


static inline vui128_t

vec_vmuleud (vui64_t a, vui64_t b)

{

  vui64_t res;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  __asm__(

      "vmuleud %0,%1,%2;\n"

      : "=v" (res)

      : "v" (a), "v" (b)

      : );

#elif defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))

  const vui64_t zero = { 0, 0 };

  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);

  __asm__(

      "vmsumudm %0,%1,%2,%3;\n"

      : "=v" (res)

      : "v" (a), "v" (b_eud), "v" (zero)

      : );

#elif defined (_ARCH_PWR8)

  const vui64_t zero = { 0, 0 };

  vui64_t p0, p1, pp10, pp01;

  vui32_t m0, m1;


// Need the endian invariant merge word high here

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

// Nullify the little endian transform

  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);

#else

  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);

#endif

  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 0);


  // Need the endian invariant multiply even/odd word here

  p1 = vec_vmulouw (m1, m0);

  p0 = vec_vmuleuw (m1, m0);

  /* res[1] = p1[1];  res[0] = p0[0];  */

  res = vec_pasted (p0, p1);

  /*

   pp10[1] = p1[0]; pp10[0] = 0;

   pp01[1] = p0[1]; pp01[0] = 0;

   */

  // Need the endian invariant merge algebraic high/low here

  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);

  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);

  /* pp01 = pp01 + pp10.  */

  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);


  /* res = res + (pp01 << 32)  */

  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);

  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);

#else

  const vui32_t zero = {0,0,0,0};

  vui32_t p0, p1;

  vui32_t resw;

  vui16_t m0, m1, mm;


  m0 = (vui16_t) vec_mergeh (a, (vui64_t) zero);

  mm = (vui16_t) vec_mergeh (b, (vui64_t) zero);


  m1 = vec_splat (mm, 3);


  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  resw = vec_sld (zero, p1, 14);

  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 2);

  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 1);

  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 0);

  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  res = (vui64_t)resw;

#endif

  return ((vui128_t) res);

}


static inline vui128_t

vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)

{

  const vui64_t zero = { 0, 0 };

#ifdef _ARCH_PWR9

  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);

  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);

  return vec_msumudm(a, b_eud, (vui128_t) c_eud);

#else

  vui128_t res;

  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);

  res = vec_vmuleud (a, b);

  return vec_adduqm (res, (vui128_t) c_eud);

#endif

}


static inline vui128_t

vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)

{

  const vui64_t zero = { 0, 0 };

#ifdef _ARCH_PWR9

  vui128_t cd_sum;

  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);

  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);

  vui64_t d_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) d);

  cd_sum = vec_adduqm ((vui128_t) c_eud, (vui128_t) d_eud);

  return vec_msumudm(a, b_eud, (vui128_t) cd_sum);

#else

  vui128_t res, cd_sum;

  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);

  vui64_t d_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) d);

  cd_sum = vec_adduqm ((vui128_t) c_eud, (vui128_t) d_eud);

  res = vec_vmuleud (a, b);

  return vec_adduqm (res, (vui128_t) cd_sum);

#endif

}


static inline vui128_t

vec_vmuloud (vui64_t a, vui64_t b)

{

  vui64_t res;


#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

  __asm__(

      "vmuloud %0,%1,%2;\n"

      : "=v" (res)

      : "v" (a), "v" (b)

      : );

#elif defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))

  const vui64_t zero = { 0, 0 };

  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t)b);

  __asm__(

      "vmsumudm %0,%1,%2,%3;\n"

      : "=v" (res)

      : "v" (a), "v" (b_oud), "v" (zero)

      : );

#elif defined (_ARCH_PWR8)

  const vui64_t zero = { 0, 0 };

  vui64_t p0, p1, pp10, pp01;

  vui32_t m0, m1;


  // Need the endian invariant merge word low here

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

  // Nullify the little endian transform

  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);

#else

  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);

#endif

  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 1);


  // Need the endian invariant multiply even/odd word here

  p0 = vec_vmuleuw (m1, m0);

  p1 = vec_vmulouw (m1, m0);


  /* res[1] = p1[1];  res[0] = p0[0];  */

  res = vec_pasted (p0, p1);

  /*

   pp10[0] = p1[0]; pp10[1] = 0;

   pp01[0] = p0[1]; pp01[1] = 0;

   */

  // Need the endian invariant merge algebraic high/low here

  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);

  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);


  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);


  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);


  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);

#else

// POWER7 and earlier are big Endian only

  const vui32_t zero = {0,0,0,0};

  vui32_t p0, p1;

  vui32_t resw;

  vui16_t m0, m1, mm;


  m0 = (vui16_t) vec_mergel (a, (vui64_t) zero);

  mm = (vui16_t) vec_mergel (b, (vui64_t) zero);


  m1 = vec_splat (mm, 3);


  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  resw = vec_sld (zero, p1, 14);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 2);


  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);

  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);


    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 1);


  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);


    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  m1 = vec_splat (mm, 0);


  p0 = vec_vmuleuh (m0, m1);

  p1 = vec_vmulouh (m0, m1);


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p1);

    resw = vec_vadduwm (resw, p1);


    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

    resw = vec_sld (c, resw, 14);

  }


  {

    vui32_t c;

    c    = vec_vaddcuw (resw, p0);

    resw = vec_vadduwm (resw, p0);

    c    = vec_sld (c, c, 4);

    resw = vec_vadduwm (resw, c);

  }


  res = (vui64_t)resw;

#endif

  return ((vui128_t) res);

}


static inline vui128_t

vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)

{

  const vui64_t zero = { 0, 0 };

#ifdef _ARCH_PWR9

  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);

  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);

  return vec_msumudm(a, b_oud, (vui128_t) c_oud);

#else

  vui128_t res;

  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);

  res = vec_vmuloud (a, b);

  return vec_adduqm (res, (vui128_t) c_oud);

#endif

}


static inline vui128_t

vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)

{

  const vui64_t zero = { 0, 0 };

#ifdef _ARCH_PWR9

  vui128_t cd_sum;

  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);

  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);

  vui64_t d_oud = vec_mrgald ((vui128_t) zero, (vui128_t) d);

  cd_sum = vec_adduqm ((vui128_t) c_oud, (vui128_t) d_oud);

  return vec_msumudm(a, b_oud, (vui128_t) cd_sum);

#else

  vui128_t res, cd_sum;

  vui64_t c_oud = vec_mrgald ((vui128_t) zero, (vui128_t) c);

  vui64_t d_oud = vec_mrgald ((vui128_t) zero, (vui128_t) d);

  cd_sum = vec_adduqm ((vui128_t) c_oud, (vui128_t) d_oud);

  res = vec_vmuloud (a, b);

  return vec_adduqm (res, (vui128_t) cd_sum);

#endif

}


static inline vui128_t

vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c)

{

#ifdef _ARCH_PWR9

  const vui64_t zero = { 0, 0 };

  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);

  return vec_msumudm(a, b_eud, c);

#else

  vui128_t res;

  res = vec_vmuleud (a, b);

  return vec_adduqm (res, c);

#endif

}


static inline vui128_t

vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c)

{

#ifdef _ARCH_PWR9

  const vui64_t zero = { 0, 0 };

  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);

  return vec_msumudm(a, b_oud, (vui128_t) c);

#else

  vui128_t res;

  res = vec_vmuloud (a, b);

  return vec_adduqm (res, c);

#endif

}


static inline vui128_t

vec_vsldbi (vui128_t vra, vui128_t vrb, const unsigned int shb)

{

  vui128_t result;


  if (__builtin_constant_p (shb) && (shb < 8))

    {

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

      __asm__(

          "vsldbi %0,%1,%2,%3;\n"

          : "=v" (result)

          : "v" (vra), "v" (vrb), "K" (shb)

          : );

#else

      /* For Power7/8/9 the quadword bit shift left/right instructions

       * only handle 128-bits.

       * So shift vra and vrb separately then combine those into

       * a single 128-bit result.

       */

      if (shb > 0)

        {

          const vui8_t vshl = vec_splat_u8 (shb);

          const vui8_t vshr = vec_splat_u8 (8 - shb);

          const vui8_t zero = vec_splat_u8 (0);

          vui8_t lowbits, highbits;


          /* Shift left double (vra || 'zero') by 15 octet  to isolate

           * the high order byte of vrb in to the low 8-bits. Then right

           * shift this (8-shb) bits. This provides (128-shb) bits of

           * leading '0's. */

          lowbits = vec_sld (zero, (vui8_t) vrb, 1);

          lowbits = vec_vsrb (lowbits, vshr);

          /* Left shift the quadword vra shifting in shb '0' bits.  */

          highbits = vec_sll ((vui8_t) vra, vshl);

          /* Combine left shifted bits from vra, vrb.  */

          result = (vui128_t) vec_or (highbits, lowbits);

        }

      else

        result = vra;

#endif

    }

  else

    {

      result = vec_sldqi (vra, vrb, (shb & 7));

    }


  return ((vui128_t) result);

}


static inline vui128_t

vec_vsrdbi (vui128_t vra, vui128_t vrb, const unsigned int shb)

{

  vui128_t result;


  if (__builtin_constant_p (shb) && (shb < 8))

    {

#if defined (_ARCH_PWR10)  && (__GNUC__ >= 10)

      __asm__(

          "vsrdbi %0,%1,%2,%3;\n"

          : "=v" (result)

          : "v" (vra), "v" (vrb), "K" (shb)

          : );

#else

      /* For Power7/8/9 the quadword bit shift left/right instructions

       * only handle 128-bits.

       * So shift vra and vrb separately then combine those into

       * a single 128-bit result.

       */

      if (shb > 0)

        {

          const vui8_t vshl = vec_splat_u8 (8 - shb);

          const vui8_t vshr = vec_splat_u8 (shb);

          const vui8_t zero = vec_splat_u8 (0);

          vui8_t lowbits, highbits;


          /* Shift left double (vra || 'zero') by 15 octet to isolate

           * the low order byte of vra in to the high 8-bits. Then left

           * shift this (8-shb) bits. This provides (128-shb) bits of

           * trailing '0's. */

          highbits = vec_sld ((vui8_t) vra, zero, 15);

          highbits = vec_vslb (highbits, vshl);

          /* right shift the quadword vrb shifting in shb '0' bits.  */

          lowbits = vec_srl ((vui8_t) vrb, vshr);

          /* Combine right shifted bits from vra, vrb.  */

          result = (vui128_t) vec_or (highbits, lowbits);

        }

      else

        result = vrb;

#endif

    }

  else

    {

#if defined (__clang__) && (__clang_major__ < 6)

      // A workaround for a constant propagation bug in clang-5

      if (shb == 0)

        result = vrb;

      else

#endif

      result = vec_sldqi (vra, vrb, (128 - (shb & 7)));

    }


  return ((vui128_t) result);

}

#endif /* VEC_INT128_PPC_H_ */