POWER Vector Library Manual  1.0.4
vec_int32_ppc.h
1 /*
2  Copyright (c) [2018] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_int32_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Mar 29, 2018
21  */
22 
23 #ifndef VEC_INT32_PPC_H_
24 #define VEC_INT32_PPC_H_
25 
26 #include <pveclib/vec_int16_ppc.h>
27 
394 #ifdef _ARCH_PWR8
396 /*
397  * Vector Shift Left Doubleword was added to PowerISA 2.07 (PWR8).
398  * Operations vec_vsld and vec_sldi are defined in vec_int64_ppc.h,
399  * but using those here would create a circular dependency.
400  * So we need the equivalent of the altivec.h specific vec_vsld.
401  * Currently GCC defines both vec_vsld and vec_sl for type long long.
402  * But older GCC versions may not and are more likely to support only
403  * vec_vsld. Clang seems to only support the generic vec_sl for the
404  * long long type and does not define the macro vec_vsld.
405  *
406  * The following allows vec_int32_ppc.h to use __pvec_vsld as a
407  * workaround for clang and possible future versions of GCC that drop
408  * support for the altivec specific built-ins.
409  */
410 #ifdef vec_vsld
411 #define __pvec_vsld vec_vsld
412 #else
413 #define __pvec_vsld vec_sl
414 #endif
415 #endif
416 
417 static inline vui64_t vec_muleuw (vui32_t a, vui32_t b);
418 static inline vui64_t vec_mulouw (vui32_t a, vui32_t b);
419 #ifndef vec_popcntw
420 static inline vui32_t vec_popcntw (vui32_t vra);
421 #else
422 /* Work around for GCC PR85830. */
423 #undef vec_popcntw
424 #define vec_popcntw __builtin_vec_vpopcntw
425 #endif
426 static inline vi32_t vec_srawi (vi32_t vra, const unsigned int shb);
427 static inline vui64_t
428 vec_vlxsiwzx (const signed long long ra, const unsigned int *rb);
429 static inline vi64_t
430 vec_vlxsiwax (const signed long long ra, const signed int *rb);
431 static inline vui64_t vec_vmuleuw (vui32_t a, vui32_t b);
432 static inline vui64_t vec_vmulouw (vui32_t a, vui32_t b);
433 static inline void
434 vec_vsstwso (vui64_t xs, unsigned int *array,
435  const long long offset0, const long long offset1);
436 static inline void
437 vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb);
439 
455 static inline vui32_t
456 vec_absduw (vui32_t vra, vui32_t vrb)
457 {
458  vui32_t result;
459 #ifdef _ARCH_PWR9
460 #ifdef vec_absdw
461  result = vec_absdw (vra, vrb);
462 #else
463  __asm__(
464  "vabsduw %0,%1,%2;"
465  : "=v" (result)
466  : "v" (vra), "v" (vrb)
467  : );
468 #endif
469 #else
470  vui32_t vmin, vmax;
471 
472  vmin = vec_min (vra, vrb);
473  vmax = vec_max (vra, vrb);
474  result = vec_sub (vmax, vmin);
475 #endif
476  return (result);
477 }
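A minimal usage sketch (not part of the header source above); the element values are illustrative:

#include <pveclib/vec_int32_ppc.h>

vui32_t
example_absduw (void)
{
  // Element-wise |va - vb| for unsigned words: yields { 2, 100, 0, 1000 }.
  vui32_t va = { 10, 200, 30, 4000 };
  vui32_t vb = { 12, 100, 30, 5000 };
  return vec_absduw (va, vb);
}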
478 
502 static inline vui32_t
503 vec_clzw (vui32_t vra)
504 {
505  vui32_t r;
506 #ifdef _ARCH_PWR8
507 #if defined (vec_vclzw)
508  r = vec_vclzw (vra);
509 #elif defined (__clang__)
510  r = vec_cntlz (vra);
511 #else
512  __asm__(
513  "vclzw %0,%1;"
514  : "=v" (r)
515  : "v" (vra)
516  : );
517 #endif
518 #else
519 //#warning Implementation pre POWER8
520  vui32_t n, nt, y, x, s, m;
521  vui32_t z= {0,0,0,0};
522  vui32_t one = {1,1,1,1};
523 
524  /* n = 32 s = 16 */
525  s = vec_splat_u32(8);
526  s = vec_add (s, s);
527  n = vec_add (s, s);
528 
529  x = vra;
530  /* y=x>>16 if (y!=0) (n=n-16 x=y) */
531  y = vec_sr(x, s);
532  nt = vec_sub(n,s);
533  m = (vui32_t)vec_cmpgt(y, z);
534  s = vec_sr(s,one);
535  x = vec_sel (x, y, m);
536  n = vec_sel (n, nt, m);
537 
538  /* y=x>>8 if (y!=0) (n=n-8 x=y) */
539  y = vec_sr(x, s);
540  nt = vec_sub(n,s);
541  m = (vui32_t)vec_cmpgt(y, z);
542  s = vec_sr(s,one);
543  x = vec_sel (x, y, m);
544  n = vec_sel (n, nt, m);
545 
546  /* y=x>>4 if (y!=0) (n=n-4 x=y) */
547  y = vec_sr(x, s);
548  nt = vec_sub(n,s);
549  m = (vui32_t)vec_cmpgt(y, z);
550  s = vec_sr(s,one);
551  x = vec_sel (x, y, m);
552  n = vec_sel (n, nt, m);
553 
554  /* y=x>>2 if (y!=0) (n=n-2 x=y) */
555  y = vec_sr(x, s);
556  nt = vec_sub(n,s);
557  m = (vui32_t)vec_cmpgt(y, z);
558  s = vec_sr(s,one);
559  x = vec_sel (x, y, m);
560  n = vec_sel (n, nt, m);
561 
562  /* y=x>>1 if (y!=0) return (n=n-2) */
563  y = vec_sr(x, s);
564  nt = vec_sub(n,s);
565  nt = vec_sub(nt,s);
566  m = (vui32_t)vec_cmpgt(y, z);
567  n = vec_sel (n, nt, m);
568 
569  /* else return (n-x) */
570  nt = vec_sub (n, x);
571  n = vec_sel (nt, n, m);
572  r = n;
573 #endif
574  return ((vui32_t) r);
575 }
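The pre-POWER8 path above is a branchless binary search for the leading-zero count. A scalar sketch of the same steps (illustrative, not from the header) may make the shift/select sequence easier to follow:

static inline unsigned int
example_clz32 (unsigned int x)
{
  unsigned int n = 32, s, y;
  // If the upper s bits are non-zero, keep them and credit s positions.
  for (s = 16; s != 0; s >>= 1)
    {
      y = x >> s;
      if (y != 0)
        {
          n = n - s;
          x = y;
        }
    }
  // x is now 0 or 1, so n - x is the leading-zero count (32 for x == 0).
  return n - x;
}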
576 
600 static inline vui32_t
601 vec_ctzw (vui32_t vra)
602 {
603  vui32_t r;
604 #ifdef _ARCH_PWR9
605 #if defined (vec_cnttz) || defined (__clang__)
606  r = vec_cnttz (vra);
607 #else
608  __asm__(
609  "vctzw %0,%1;"
610  : "=v" (r)
611  : "v" (vra)
612  : );
613 #endif
614 #else
615 // For _ARCH_PWR8 and earlier. Generate 1's for the trailing zeros
616 // and 0's otherwise. Then count (popcnt) the 1's. _ARCH_PWR8 uses
617 // the hardware vpopcntw instruction. _ARCH_PWR7 and earlier use the
618 // PVECLIB vec_popcntw implementation which runs ~20-28 instructions.
619  const vui32_t ones = { 1, 1, 1, 1 };
620  vui32_t tzmask;
621  // tzmask = (~vra & (vra - 1))
622  tzmask = vec_andc (vec_sub (vra, ones), vra);
623  // return = vec_popcntw (~vra & (vra - 1))
624  r = vec_popcntw (tzmask);
625 #endif
626  return ((vui32_t) r);
627 }
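For example, with vra = 0x00000018 the fallback computes vra - 1 = 0x00000017 and tzmask = ~vra & (vra - 1) = 0x00000007, whose population count is the 3 trailing zeros; for vra = 0 the mask is all ones and the result is 32.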
628 
652 static inline vui32_t
653 vec_mrgahw (vui64_t vra, vui64_t vrb)
654 {
655  vui32_t res;
656 #ifdef _ARCH_PWR8
657 #ifdef vec_vmrgew
658 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
659  res = vec_vmrgow ((vui32_t)vrb, (vui32_t)vra);
660 #else
661  res = vec_vmrgew ((vui32_t)vra, (vui32_t)vrb);
662 #endif
663 #else
664  __asm__(
665  "vmrgew %0,%1,%2;\n"
666  : "=v" (res)
667  : "v" (vra),
668  "v" (vrb)
669  : );
670 #endif
671 #else
672  const vui32_t vconstp =
673  CONST_VINT32_W(0x00010203, 0x10111213, 0x08090a0b, 0x18191a1b);
674  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
675 #endif
676  return (res);
677 }
678 
702 static inline vui32_t
703 vec_mrgalw (vui64_t vra, vui64_t vrb)
704 {
705  vui32_t res;
706 #ifdef _ARCH_PWR8
707 #ifdef vec_vmrgow
708 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
709  res = vec_vmrgew ((vui32_t)vrb, (vui32_t)vra);
710 #else
711  res = vec_vmrgow ((vui32_t)vra, (vui32_t)vrb);
712 #endif
713 #else
714  __asm__(
715  "vmrgow %0,%1,%2;\n"
716  : "=v" (res)
717  : "v" (vra),
718  "v" (vrb)
719  : );
720 #endif
721 #else
722  const vui32_t vconstp =
723  CONST_VINT32_W(0x04050607, 0x14151617, 0x0c0d0e0f, 0x1c1d1e1f);
724  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
725 #endif
726  return (res);
727 }
728 
752 static inline vui32_t
753 vec_mrgew (vui32_t vra, vui32_t vrb)
754 {
755  vui32_t res;
756 #ifdef _ARCH_PWR8
757 #ifdef vec_vmrgew
758  res = vec_vmrgew (vra, vrb);
759 #else
760  __asm__(
761 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
762  "vmrgow %0,%2,%1;\n"
763 #else
764  "vmrgew %0,%1,%2;\n"
765 #endif
766  : "=v" (res)
767  : "v" (vra),
768  "v" (vrb)
769  : );
770 #endif
771 #else
772  const vui32_t vconstp =
773  CONST_VINT32_W(0x00010203, 0x10111213, 0x08090a0b, 0x18191a1b);
774  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
775 #endif
776  return (res);
777 }
778 
802 static inline vui32_t
803 vec_mrgow (vui32_t vra, vui32_t vrb)
804 {
805  vui32_t res;
806 #ifdef _ARCH_PWR8
807 #ifdef vec_vmrgew
808  res = vec_vmrgow (vra, vrb);
809 #else
810  __asm__(
811 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
812  "vmrgew %0,%2,%1;\n"
813 #else
814  "vmrgow %0,%1,%2;\n"
815 #endif
816  : "=v" (res)
817  : "v" (vra),
818  "v" (vrb)
819  : );
820 #endif
821 #else
822  const vui32_t vconstp =
823  CONST_VINT32_W(0x04050607, 0x14151617, 0x0c0d0e0f, 0x1c1d1e1f);
824  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
825 #endif
826  return (res);
827 }
828 
853 static inline vi64_t
854 vec_mulesw (vi32_t a, vi32_t b)
855 {
856  vi64_t res;
857 #ifdef _ARCH_PWR8
858  // The vector vmulosw/vmulesw instructions were introduced in PWR8
859 #if defined __GNUC__ && (__GNUC__ > 7)
860  res = vec_mule (a, b);
861 #else
862  __asm__(
863 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
864  "vmulosw %0,%1,%2;\n"
865 #else
866  "vmulesw %0,%1,%2;\n"
867 #endif
868  : "=v" (res)
869  : "v" (a),
870  "v" (b)
871  : );
872 #endif
873 #else
874  // must be PWR7 or older
875  vui32_t uia, uib;
876  vui32_t amask, bmask, t1, t2, r;
877  vui64_t ui_prod;
878  const vui32_t zero= { 0,0,0,0};
879 
880  uia = (vui32_t) a;
881  uib = (vui32_t) b;
882  // Generate 32-bit masks from the sign of each input word.
883  amask = (vui32_t) vec_srawi (a, 31);
884  bmask = (vui32_t) vec_srawi (b, 31);
885  // Extend the even masks to the right with zeros to form two 64-bit
886  // masks. We need the trailing zeros as the low 32-bits of the
887  // product are correct as-is and should not change.
888  amask = vec_mrgew (amask, zero);
889  bmask = vec_mrgew (bmask, zero);
890  // Compute the doubleword even unsigned word product
891  ui_prod = vec_muleuw (uia, uib);
892 
893  // Generate t1 = amask & b and t2 = bmask & a
894  t1 = vec_and (amask, uib);
895  t2 = vec_and (bmask, uia);
896  // Apply the correction res = ui_prod - t1 - t2
897  r = vec_sub ((vui32_t) ui_prod, t1);
898  res = (vi64_t) vec_sub (r, t2);
899 #endif
900  return (res);
901 }
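The sign correction in the pre-POWER8 path follows from writing each signed word as its unsigned value minus 2^32 times its sign bit: (ua - 2^32*sa) * (ub - 2^32*sb) = ua*ub - 2^32*(sa*ub + sb*ua) + 2^64*sa*sb, and the last term vanishes modulo 2^64. The masked terms t1 and t2, placed in the even (high) word of each doubleword, are exactly the 2^32-scaled corrections subtracted from the unsigned product. The same identity drives vec_mulosw below.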
902 
927 static inline vi64_t
928 vec_mulosw (vi32_t a, vi32_t b)
929 {
930  vi64_t res;
931 #ifdef _ARCH_PWR8
932  // The vector vmulosw/vmulesw instructions were introduced in PWR8
933 #if defined __GNUC__ && (__GNUC__ > 7) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
934  res = vec_mulo (a, b);
935 #else
936  __asm__(
937 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
938  "vmulesw %0,%1,%2;\n"
939 #else
940  "vmulosw %0,%1,%2;\n"
941 #endif
942  : "=v" (res)
943  : "v" (a),
944  "v" (b)
945  : );
946 #endif
947 #else
948  // must be PWR7 or older
949  vui32_t uia, uib;
950  vui32_t amask, bmask, t1, t2, r;
951  vui64_t ui_prod;
952  const vui32_t zero= { 0,0,0,0};
953 
954  // duplicate odd words to even
955  uia = (vui32_t) a;
956  uib = (vui32_t) b;
957  uia = vec_mrgow (uia, uia);
958  uib = vec_mrgow (uib, uib);
959  // Generate 32-bit masks from the sign of each input word.
960  amask = (vui32_t) vec_srawi ((vi32_t) uia, 31);
961  bmask = (vui32_t) vec_srawi ((vi32_t) uib, 31);
962  // Shift the odd masks to the left 32 and extend to the right with
963  // zeros to form two 64-bit masks. We need the trailing zeros as the
964  // low 32-bits of the product are correct as-is.
965  amask = vec_mrgow (amask, zero);
966  bmask = vec_mrgow (bmask, zero);
967  // Compute the doubleword odd unsigned word product
968  ui_prod = vec_mulouw (uia, uib);
969 
970  // Generate t1 = amask & b and t2 = bmask & a
971  t1 = vec_and (amask, uib);
972  t2 = vec_and (bmask, uia);
973  // Apply the correction res = ui_prod - t1 - t2
974  r = vec_sub ((vui32_t) ui_prod, t1);
975  res = (vi64_t) vec_sub (r, t2);
976 #endif
977  return (res);
978 }
979 
1006 static inline vui64_t
1007 vec_muleuw (vui32_t a, vui32_t b)
1008 {
1009 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1010  return vec_vmulouw (a, b);
1011 #else
1012  return vec_vmuleuw (a, b);
1013 #endif
1014 }
1015 
1042 static inline vui64_t
1043 vec_mulouw (vui32_t a, vui32_t b)
1044 {
1045 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1046  return vec_vmuleuw (a, b);
1047 #else
1048  return vec_vmulouw (a, b);
1049 #endif
1050 }
1051 
1068 static inline vi32_t
1069 vec_mulhsw (vi32_t vra, vi32_t vrb)
1070 {
1071 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1072  return (vi32_t) vec_mrgahw ((vui64_t) vec_mulosw (vra, vrb),
1073  (vui64_t) vec_mulesw (vra, vrb));
1074 #else
1075  return (vi32_t) vec_mrgahw ((vui64_t) vec_mulesw (vra, vrb),
1076  (vui64_t) vec_mulosw (vra, vrb));
1077 #endif
1078 }
1079 
1102 static inline vui32_t
1103 vec_mulhuw (vui32_t vra, vui32_t vrb)
1104 {
1105 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1106  return vec_mrgahw (vec_mulouw (vra, vrb), vec_muleuw (vra, vrb));
1107 #else
1108  return vec_mrgahw (vec_muleuw (vra, vrb), vec_mulouw (vra, vrb));
1109 #endif
1110 }
1111 
1131 static inline vui32_t
1132 vec_muluwm (vui32_t a, vui32_t b)
1133 {
1134 #if defined __GNUC__ && (__GNUC__ > 7)
1135  return vec_mul (a, b);
1136 #else
1137  vui32_t r;
1138 #ifdef _ARCH_PWR8
1139  __asm__(
1140  "vmuluwm %0,%1,%2;\n"
1141  : "=v" (r)
1142  : "v" (a),
1143  "v" (b)
1144  : );
1145 #else
1146  vui32_t s16 = (vui32_t)vec_vspltisw (-16);
1147  vui32_t z = (vui32_t)vec_vspltisw (0);
1148  vui32_t t4;
1149  vui32_t t2, t3;
1150  vui16_t t1;
1151 
1152  t1 = (vui16_t)vec_vrlw (b, s16);
1153  t2 = vec_vmulouh ((vui16_t)a, (vui16_t)b);
1154  t3 = vec_vmsumuhm ((vui16_t)a, t1, z);
1155  t4 = vec_vslw (t3, s16);
1156  r = (vui32_t)vec_vadduwm (t4, t2);
1157 #endif
1158  return (r);
1159 #endif
1160 }
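In the pre-POWER8 path above, the rotate by 16 swaps the halfwords of b so that vec_vmsumuhm produces (ahi*blo + alo*bhi) for each word in a single instruction; shifting that sum left by 16 and adding the low product alo*blo from vec_vmulouh gives a*b modulo 2^32, since the 2^32*ahi*bhi term wraps to zero.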
1161 
1182 #ifndef vec_popcntw
1183 static inline vui32_t
1184 vec_popcntw (vui32_t vra)
1185 {
1186  vui32_t r;
1187 #ifdef _ARCH_PWR8
1188 #if defined (vec_vpopcntw)
1189  r = vec_vpopcntw (vra);
1190 #elif defined (__clang__)
1191  r = vec_popcnt (vra);
1192 #else
1193  __asm__(
1194  "vpopcntw %0,%1;"
1195  : "=v" (r)
1196  : "v" (vra)
1197  : );
1198 #endif
1199 #else
1200 //#warning Implementation pre POWER8
1201  vui32_t z= { 0,0,0,0};
1202  vui8_t x;
1203  x = vec_popcntb ((vui8_t)vra);
1204  r = vec_vsum4ubs (x, z);
1205 #endif
1206  return (r);
1207 }
1208 #else
1209 /* Work around for GCC PR85830. */
1210 #undef vec_popcntw
1211 #define vec_popcntw __builtin_vec_vpopcntw
1212 #endif
1213 
1228 static inline vui32_t
1229 vec_revbw (vui32_t vra)
1230 {
1231  vui32_t result;
1232 
1233 #ifdef _ARCH_PWR9
1234 #if defined (vec_revb) || defined (__clang__)
1235  result = vec_revb (vra);
1236 #else
1237  __asm__(
1238  "xxbrw %x0,%x1;"
1239  : "=wa" (result)
1240  : "wa" (vra)
1241  : );
1242 #endif
1243 #else
1244  const vui64_t vconstp =
1245 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1246  CONST_VINT64_DW(0x0302010007060504UL, 0x0B0A09080F0E0D0CUL);
1247 #else
1248  CONST_VINT64_DW(0x0C0D0E0F08090A0BUL, 0x0405060700010203UL);
1249 #endif
1250  result = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
1251 #endif
1252 
1253  return (result);
1254 }
1255 
1272 static inline vb32_t
1273 vec_setb_sw (vi32_t vra)
1274 {
1275  vb32_t result;
1276 
1277 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
1278  __asm__(
1279  "vexpandwm %0,%1"
1280  : "=v" (result)
1281  : "v" (vra)
1282  : );
1283 #else
1284  // Compare signed word less than zero
1285  const vi32_t zero = {0, 0, 0, 0};
1286  result = vec_cmplt (vra, zero);
1287 #endif
1288  return result;
1289 }
1290 
1308 static inline vui32_t
1309 vec_slwi (vui32_t vra, const unsigned int shb)
1310 {
1311  vui32_t lshift;
1312  vui32_t result;
1313 
1314  if (shb < 32)
1315  {
1316  /* Load the shift const in a vector. The element shifts require
1317  a shift amount for each element. For the immediate form the
1318  shift constant is splatted to all elements of the
1319  shift control. */
1320  if (__builtin_constant_p (shb) && (shb < 16))
1321  lshift = (vui32_t) vec_splat_s32(shb);
1322  else
1323  lshift = vec_splats ((unsigned int) shb);
1324 
1325  /* Vector Shift Left Words based on the lower 5-bits of the
1326  corresponding element of lshift. */
1327  result = vec_vslw (vra, lshift);
1328  }
1329  else
1330  { /* shifts greater than 31 bits return zeros. */
1331  result = vec_xor ((vui32_t) vra, (vui32_t) vra);
1332  }
1333 
1334  return (vui32_t) result;
1335 }
1336 
1355 static inline vi32_t
1356 vec_srawi (vi32_t vra, const unsigned int shb)
1357 {
1358  vui32_t lshift;
1359  vi32_t result;
1360 
1361  if (shb < 32)
1362  {
1363  /* Load the shift const in a vector. The element shifts require
1364  a shift amount for each element. For the immediate form the
1365  shift constant is splatted to all elements of the
1366  shift control. */
1367  if (__builtin_constant_p (shb) && (shb < 16))
1368  lshift = (vui32_t) vec_splat_s32(shb);
1369  else
1370  lshift = vec_splats ((unsigned int) shb);
1371 
1372  /* Vector Shift Right Algebraic Words based on the lower 5-bits
1373  of corresponding element of lshift. */
1374  result = vec_vsraw (vra, lshift);
1375  }
1376  else
1377  { /* shifts greater than 31 bits return the sign bit propagated to
1378  all bits. This is equivalent to a shift Right Algebraic of
1379  31 bits. */
1380  lshift = (vui32_t) vec_splats(31);
1381  result = vec_vsraw (vra, lshift);
1382  }
1383 
1384  return (vi32_t) result;
1385 }
1386 
1404 static inline vui32_t
1405 vec_srwi (vui32_t vra, const unsigned int shb)
1406 {
1407  vui32_t lshift;
1408  vui32_t result;
1409 
1410  if (shb < 32)
1411  {
1412  /* Load the shift const in a vector. The element shifts require
1413  a shift amount for each element. For the immediate form the
1414  shift constant is splatted to all elements of the
1415  shift control. */
1416  if (__builtin_constant_p (shb) && (shb < 16))
1417  lshift = (vui32_t) vec_splat_s32(shb);
1418  else
1419  lshift = vec_splats ((unsigned int) shb);
1420 
1421  /* Vector Shift Right Words based on the lower 5-bits of the
1422  corresponding element of lshift. */
1423  result = vec_vsrw (vra, lshift);
1424  }
1425  else
1426  { /* shifts greater than 31 bits return zeros. */
1427  result = vec_xor ((vui32_t) vra, (vui32_t) vra);
1428  }
1429  return (vui32_t) result;
1430 }
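A brief usage sketch for the three immediate shift forms above (illustrative, not part of the header):

#include <pveclib/vec_int32_ppc.h>

void
example_shift_immediates (vui32_t u, vi32_t s)
{
  vui32_t ul = vec_slwi (u, 4);    // each word shifted left 4 bits
  vui32_t ur = vec_srwi (u, 4);    // each word shifted right 4 bits (logical)
  vi32_t  sa = vec_srawi (s, 4);   // each word shifted right 4 bits, sign propagates
  (void) ul; (void) ur; (void) sa;
}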
1431 
1452 static inline vui32_t
1453 vec_vgl4wso (unsigned int *array, const long long offset0,
1454  const long long offset1, const long long offset2,
1455  const long long offset3)
1456 {
1457  vui32_t result;
1458 
1459 #ifdef _ARCH_PWR8
1460  vui64_t re0, re1, re2, re3;
1461  re0 = vec_vlxsiwzx (offset0, array);
1462  re1 = vec_vlxsiwzx (offset1, array);
1463  re2 = vec_vlxsiwzx (offset2, array);
1464  re3 = vec_vlxsiwzx (offset3, array);
1465  /* Need to handle endian as the vec_vlxsiwzx result is always left
1466  * justified in VR, while element [0] may be left or right. */
1467 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1468  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1469  * support doubleword vec_merge. */
1470  re0 = vec_xxpermdi (re0, re2, 3);
1471  re1 = vec_xxpermdi (re1, re3, 3);
1472  result = vec_mergee ((vui32_t) re0, (vui32_t) re1);
1473 #else
1474  re0 = vec_xxpermdi (re0, re2, 0);
1475  re1 = vec_xxpermdi (re1, re3, 0);
1476  result = vec_mergeo ((vui32_t) re0, (vui32_t) re1);
1477 #endif
1478 #else // _ARCH_PWR7
1479  vui32_t xte0, xte1, xte2, xte3;
1480  vui8_t perm0, perm1, perm2, perm3;
1481 
1482  perm0 = vec_lvsl (offset0, array);
1483  xte0 = vec_lde (offset0, array);
1484  xte0 = vec_perm (xte0, xte0, perm0);
1485 
1486  perm1 = vec_lvsl (offset1, array);
1487  xte1 = vec_lde (offset1, array);
1488  xte1 = vec_perm (xte1, xte1, perm1);
1489 
1490  perm2 = vec_lvsl (offset2, array);
1491  xte2 = vec_lde (offset2, array);
1492  xte2 = vec_perm (xte2, xte2, perm2);
1493 
1494  perm3 = vec_lvsl (offset3, array);
1495  xte3 = vec_lde (offset3, array);
1496  xte3 = vec_perm (xte3, xte3, perm3);
1497 
1498  xte0 = vec_mergeh (xte0, xte2);
1499  xte1 = vec_mergeh (xte1, xte3);
1500  result = vec_mergeh (xte0, xte1);
1501 #endif
1502  return result;
1503 }
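Note that the offsets are byte offsets into array; the *wwx / *wwsx variants below convert word indexes to byte offsets internally. A small sketch (hypothetical array, not from the header):

#include <pveclib/vec_int32_ppc.h>

vui32_t
example_gather4 (unsigned int *array)
{
  // Gather array[0], array[2], array[4] and array[6]
  // using byte offsets 0, 8, 16 and 24.
  return vec_vgl4wso (array, 0, 8, 16, 24);
}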
1504 
1521 static inline
1522 vui32_t
1523 vec_vgl4wwo (unsigned int *array, vi32_t vra)
1524 {
1525  vui32_t r;
1526 
1527 #ifdef _ARCH_PWR8
1528 #if 1
1529  vi64_t off01, off23;
1530 
1531  off01 = vec_vupkhsw (vra);
1532  off23 = vec_vupklsw (vra);
1533 
1534  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1535 #else
1536  r = vec_vgl4wso (array, vra[0], vra[1], vra[2], vra[3]);
1537 #endif
1538 #else
1539  // Need to explicitly manage the VR/GPR xfer for PWR7
1540  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1541  signed int off0, off1, off2, off3;
1542 
1543  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
1544  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
1545  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
1546  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
1547 
1548  r = vec_vgl4wso (array, off0, off1, off2, off3);
1549 #endif
1550  return r;
1551 }
1552 
1575 static inline
1576 vui32_t
1577 vec_vgl4wwsx (unsigned int *array, vi32_t vra,
1578  const unsigned char scale)
1579 {
1580  vui32_t r;
1581 
1582 #ifdef _ARCH_PWR8
1583  vi64_t off01, off23;
1584  vi64_t lshift = vec_splats ((long long) (2+ scale));
1585 
1586  off01 = vec_vupkhsw (vra);
1587  off23 = vec_vupklsw (vra);
1588 
1589  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1590  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1591 
1592  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1593 #else
1594  // Need to explicitly manage the VR/GPR xfer for PWR7
1595  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1596  signed long long off0, off1, off2, off3;
1597 
1598  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2+ scale);
1599  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2+ scale);
1600  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2+ scale);
1601  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2+ scale);
1602 
1603  r = vec_vgl4wso (array, off0, off1, off2, off3);
1604 #endif
1605  return r;
1606 }
1607 
1628 static inline
1629 vui32_t
1630 vec_vgl4wwx (unsigned int *array, vi32_t vra)
1631 {
1632  vui32_t r;
1633 
1634 #ifdef _ARCH_PWR8
1635  vi64_t off01, off23;
1636  vi64_t lshift = vec_splats ((long long) (2));
1637 
1638  off01 = vec_vupkhsw (vra);
1639  off23 = vec_vupklsw (vra);
1640 
1641  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1642  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1643 
1644  r = vec_vgl4wso (array, off01[0], off01[1], off23[0], off23[1]);
1645 #else
1646  // Need to explicitly manage the VR/GPR xfer for PWR7
1647  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1648  signed long long off0, off1, off2, off3;
1649 
1650  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
1651  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
1652  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
1653  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
1654 
1655  r = vec_vgl4wso (array, off0, off1, off2, off3);
1656 #endif
1657  return r;
1658 }
1659 
1678 static inline vi64_t
1679 vec_vglswso (signed int *array, const long long offset0,
1680  const long long offset1)
1681 {
1682  vi64_t re0, re1, result;
1683 
1684  re0 = vec_vlxsiwax (offset0, array);
1685  re1 = vec_vlxsiwax (offset1, array);
1686  /* Need to handle endian as the vec_vlxsiwax result is always left
1687  * justified in VR, while element [0] may be left or right. */
1688 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1689  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1690  * support doubleword vec_merge. */
1691  result = vec_xxpermdi (re0, re1, 3);
1692 #else
1693 #ifdef _ARCH_PWR7
1694  result = vec_xxpermdi (re0, re1, 0);
1695 #else
1696  re0 = (vi64_t) vec_sld (re0, re0, 8);
1697  result = (vi64_t) vec_sld (re0, re1, 8);
1698 #endif
1699 #endif
1700  return result;
1701 }
1702 
1719 static inline
1720 vi64_t
1721 vec_vglswdo (signed int *array, vi64_t vra)
1722 {
1723  vi64_t r;
1724 
1725 #ifdef _ARCH_PWR8
1726  r = vec_vglswso (array, vra[0], vra[1]);
1727 #else
1728  // Need to explicitly manage the VR/GPR xfer for PWR7
1729  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1730 
1731  r = vec_vglswso (array, scalar_extract_uint64_from_high_uint128(gprp),
1732  scalar_extract_uint64_from_low_uint128(gprp));
1733 #endif
1734  return r;
1735 }
1736 
1755 static inline
1756 vi64_t
1757 vec_vglswdsx (signed int *array, vi64_t vra,
1758  const unsigned char scale)
1759 {
1760  vi64_t r;
1761 
1762 #ifdef _ARCH_PWR8
1763  vi64_t lshift = vec_splats ((long long) (2 + scale));
1764  vi64_t offset;
1765 
1766  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1767  r = vec_vglswso (array, offset[0], offset[1]);
1768 #else
1769  long long offset0, offset1;
1770  // Need to explicitly manage the VR/GPR xfer for PWR7
1771  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1772  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1773  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1774 
1775  r = vec_vglswso (array, offset0, offset1);
1776 #endif
1777  return r;
1778 }
1779 
1799 static inline
1800 vi64_t
1801 vec_vglswdx (signed int *array, vi64_t vra)
1802 {
1803  vi64_t r;
1804 
1805 #ifdef _ARCH_PWR8
1806  vi64_t lshift = vec_splats ((long long) 2);
1807  vi64_t offset;
1808 
1809  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1810  r = vec_vglswso (array, offset[0], offset[1]);
1811 #else
1812  long long offset0, offset1;
1813  // Need to explicitly manage the VR/GPR xfer for PWR7
1814  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1815  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1816  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1817 
1818  r = vec_vglswso (array, offset0, offset1);
1819 #endif
1820  return r;
1821 }
1822 
1841 static inline vui64_t
1842 vec_vgluwso (unsigned int *array, const long long offset0,
1843  const long long offset1)
1844 {
1845  vui64_t re0, re1, result;
1846 
1847  re0 = vec_vlxsiwzx (offset0, array);
1848  re1 = vec_vlxsiwzx (offset1, array);
1849  /* Need to handle endian as the vec_vlxsiwzx result is always left
1850  * justified in VR, while element [0] may be left or right. */
1851 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1852  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1853  * support doubleword vec_merge. */
1854  result = vec_xxpermdi (re0, re1, 3);
1855 #else
1856 #ifdef _ARCH_PWR7
1857  result = vec_xxpermdi (re0, re1, 0);
1858 #else
1859  re0 = (vui64_t) vec_sld (re0, re0, 8);
1860  result = (vui64_t) vec_sld (re0, re1, 8);
1861 #endif
1862 #endif
1863  return result;
1864 }
1865 
1882 static inline
1883 vui64_t
1884 vec_vgluwdo (unsigned int *array, vi64_t vra)
1885 {
1886  vui64_t r;
1887 
1888 #ifdef _ARCH_PWR8
1889  r = vec_vgluwso (array, vra[0], vra[1]);
1890 #else
1891  // Need to explicitly manage the VR/GPR xfer for PWR7
1892  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1893 
1894  r = vec_vgluwso (array, scalar_extract_uint64_from_high_uint128(gprp),
1895  scalar_extract_uint64_from_low_uint128(gprp));
1896 #endif
1897  return r;
1898 }
1899 
1918 static inline
1919 vui64_t
1920 vec_vgluwdsx (unsigned int *array, vi64_t vra,
1921  const unsigned char scale)
1922 {
1923  vui64_t r;
1924 
1925 #ifdef _ARCH_PWR8
1926  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
1927  vui64_t offset;
1928 
1929  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1930  r = vec_vgluwso (array, offset[0], offset[1]);
1931 #else
1932  long long offset0, offset1;
1933  // Need to explicitly manage the VR/GPR xfer for PWR7
1934  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1935  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1936  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1937 
1938  r = vec_vgluwso (array, offset0, offset1);
1939 #endif
1940  return r;
1941 }
1942 
1959 static inline
1960 vui64_t
1961 vec_vgluwdx (unsigned int *array, vi64_t vra)
1962 {
1963  vui64_t r;
1964 
1965 #ifdef _ARCH_PWR8
1966  vui64_t lshift = vec_splats ((unsigned long long) 2);
1967  vui64_t offset;
1968 
1969  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1970  r = vec_vgluwso (array, offset[0], offset[1]);
1971 #else
1972  long long offset0, offset1;
1973  // Need to explicitly manage the VR/GPR xfer for PWR7
1974  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1975  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1976  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1977 
1978  r = vec_vgluwso (array, offset0, offset1);
1979 #endif
1980  return r;
1981 }
1982 
2017 static inline vi64_t
2018 vec_vlxsiwax (const signed long long ra, const signed int *rb)
2019 {
2020  vi64_t xt;
2021 
2022 #if (defined(__clang__) && __clang_major__ < 8)
2023  __VEC_U_128 t;
2024 
2025  signed int *p = (signed int *)((char *)rb + ra);
2026  t.ulong.upper = *p;
2027  xt = t.vx2;
2028 #elif _ARCH_PWR8
2029  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2030  {
2031  if (ra == 0)
2032  {
2033  __asm__(
2034  "lxsiwax %x0,%y1;"
2035  : "=wa" (xt)
2036  : "Z" (*rb)
2037  : );
2038  } else {
2039  unsigned long long rt;
2040  __asm__(
2041  "li %0,%1;"
2042  : "=r" (rt)
2043  : "I" (ra)
2044  : );
2045  __asm__(
2046  "lxsiwax %x0,%y1;"
2047  : "=wa" (xt)
2048  : "Z" (*(signed int *)((char *)rb+rt))
2049  : );
2050  }
2051  } else {
2052  __asm__(
2053  "lxsiwax %x0,%y1;"
2054  : "=wa" (xt)
2055  : "Z" (*(signed int *)((char *)rb+ra))
2056  : );
2057  }
2058 #else // _ARCH_PWR7
2059  vui32_t const shb = { 31, 0, 0 ,0 };
2060  vi32_t xte;
2061  vui8_t perm;
2062 
2063  perm = vec_lvsl (ra, rb);
2064  xte = vec_lde (ra, rb);
2065  perm = (vui8_t) vec_mergeh ((vui32_t) perm, (vui32_t) perm);
2066  xte = vec_perm (xte, xte, perm);
2067  xt = (vi64_t) vec_sra (xte, shb);
2068 #endif
2069  return xt;
2070 }
2071 
2108 static inline vui64_t
2109 vec_vlxsiwzx (const signed long long ra, const unsigned int *rb)
2110 {
2111  vui64_t xt;
2112 
2113 #if (defined(__clang__) && __clang_major__ < 8)
2114  __VEC_U_128 t;
2115 
2116  unsigned int *p = (unsigned int *)((char *)rb + ra);
2117  t.ulong.upper = *p;
2118  xt = t.vx2;
2119 #elif _ARCH_PWR8
2120  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2121  {
2122  if (ra == 0)
2123  {
2124  __asm__(
2125  "lxsiwzx %x0,%y1;"
2126  : "=wa" (xt)
2127  : "Z" (*rb)
2128  : );
2129  } else {
2130  unsigned long long rt;
2131  __asm__(
2132  "li %0,%1;"
2133  : "=r" (rt)
2134  : "I" (ra)
2135  : );
2136  __asm__(
2137  "lxsiwzx %x0,%y1;"
2138  : "=wa" (xt)
2139  : "Z" (*(signed int *)((char *)rb+rt))
2140  : );
2141  }
2142  } else {
2143  __asm__(
2144  "lxsiwzx %x0,%y1;"
2145  : "=wa" (xt)
2146  : "Z" (*(signed int *)((char *)rb+ra))
2147  : );
2148  }
2149 #else // _ARCH_PWR7
2150  const vui32_t zero = {0,0,0,0};
2151  vui32_t xte;
2152  vui8_t perm;
2153 
2154  perm = vec_lvsl (ra, rb);
2155  xte = vec_lde (ra, rb);
2156  xte = vec_perm (xte, xte, perm);
2157  xt = (vui64_t) vec_sld (zero, xte, 12);
2158 #endif
2159  return xt;
2160 }
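A short sketch of the two indexed scalar-word loads above (byte offsets again, illustrative and not from the header); vec_vlxsiwax sign-extends and vec_vlxsiwzx zero-extends the loaded word to 64 bits:

#include <pveclib/vec_int32_ppc.h>

void
example_word_loads (signed int *sp, unsigned int *up)
{
  // Byte offset 8 loads sp[2] / up[2] into the left doubleword of the VR.
  vi64_t  sx = vec_vlxsiwax (8, sp);
  vui64_t zx = vec_vlxsiwzx (8, up);
  (void) sx; (void) zx;
}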
2161 
2168 static inline vui64_t
2170 
2177 static inline vui64_t
2179 
2186 static inline vui64_t
2188 
2195 static inline vui64_t
2197 
2204 static inline vui64_t
2206 
2236 static inline vui64_t
2237 vec_vmuleuw (vui32_t vra, vui32_t vrb)
2238 {
2239  vui64_t res;
2240 #ifdef _ARCH_PWR8
2241 #if defined __GNUC__ && (__GNUC__ > 7)
2242 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2243  res = vec_mulo (vra, vrb);
2244 #else
2245  res = vec_mule (vra, vrb);
2246 #endif
2247 #else
2248  __asm__(
2249  "vmuleuw %0,%1,%2;\n"
2250  : "=v" (res)
2251  : "v" (vra),
2252  "v" (vrb)
2253  : );
2254 #endif
2255 #else
2256  const vui32_t zero = {0,0,0,0};
2257  const vui32_t ones = {-1,-1,-1,-1};
2258  vui32_t wmask01;
2259  vui32_t p0, p1, pp10, pp01, resw;
2260  vui16_t m0, m1, mt, mth, mtl;
2261 
2262  /* generate {0,-1,0,-1} mask. */
2263  wmask01 = vec_vmrghw (zero, ones);
2264 
2265  mt = (vui16_t)vrb;
2266  mtl = vec_mergeh (mt, mt);
2267  mth = vec_mergel (mt, mt);
2268 
2269 #ifdef _ARCH_PWR7
2270  m0 = (vui16_t)vec_xxpermdi ((vui64_t)mtl, (vui64_t)mth, 0);
2271 #else
2272  {
2273  vui32_t temp;
2274  temp = vec_sld ((vui32_t) mtl, (vui32_t) mth, 8);
2275  m0 = (vui16_t) vec_sld (temp, (vui32_t) mth, 8);
2276  }
2277 #endif
2278 
2279  resw = vec_sld (vra, vra, 12);
2280  resw = vec_sel (vra, resw, wmask01);
2281  m1 = (vui16_t)resw;
2282 
2283  p0 = vec_vmuleuh (m1, m0);
2284  p1 = vec_vmulouh (m1, m0);
2285  resw = vec_sel (p0, p1, wmask01);
2286  res = (vui64_t)resw;
2287 
2288  pp10 = vec_sld (p1, p1, 12);
2289  pp01 = p0;
2290  /* pp01 = vec_addudm (pp01, pp10). */
2291  {
2292  vui32_t c;
2293  vui32_t xmask;
2294  xmask = vec_sld (wmask01, wmask01, 2);
2295  c = vec_vaddcuw (pp01, pp10);
2296  pp01 = vec_vadduwm (pp01, pp10);
2297  c = vec_sld (c, c, 6);
2298  pp01 = vec_sld (pp01, pp01, 2);
2299  pp01 = vec_sel (c, pp01, xmask);
2300  }
2301  /* res = vec_addudm (pp01, res). */
2302  {
2303  vui32_t c, r;
2304  c = vec_vaddcuw (pp01, (vui32_t)res);
2305  r = vec_vadduwm (pp01, (vui32_t)res);
2306  c = vec_sld (c, zero, 4);
2307  res = (vui64_t)vec_vadduwm (r, c);
2308  }
2309 #endif
2310  return (res);
2311 }
2312 
2339 static inline vui64_t
2340 vec_vmulouw (vui32_t vra, vui32_t vrb)
2341 {
2342  vui64_t res;
2343 #ifdef _ARCH_PWR8
2344 #if defined __GNUC__ && (__GNUC__ > 7)
2345  /* Not supported in GCC yet. ETA GCC-8. */
2346 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2347  res = vec_mule (vra, vrb);
2348 #else
2349  res = vec_mulo (vra, vrb);
2350 #endif
2351 #else
2352  __asm__(
2353  "vmulouw %0,%1,%2;\n"
2354  : "=v" (res)
2355  : "v" (vra),
2356  "v" (vrb)
2357  : );
2358 #endif
2359 #else
2360  const vui32_t zero = {0,0,0,0};
2361  const vui32_t ones = {-1,-1,-1,-1};
2362  vui32_t wmask01;
2363  vui32_t p0, p1, pp10, pp01, resw;
2364  vui16_t m0, m1, mt, mth, mtl;
2365  /* generate {0,-1,0,-1} mask. */
2366  wmask01 = vec_vmrghw (zero, ones);
2367 
2368  mt = (vui16_t)vrb;
2369  mtl = vec_mergel (mt, mt);
2370  mth = vec_mergeh (mt, mt);
2371 #ifdef _ARCH_PWR7
2372  m0 = (vui16_t)vec_xxpermdi ((vui64_t)mth, (vui64_t)mtl, 3);
2373 #else
2374  {
2375  vui32_t temp;
2376  temp = vec_sld ((vui32_t) mtl, (vui32_t) mtl, 8);
2377  m0 = (vui16_t) vec_sld ((vui32_t) mth, temp, 8);
2378  }
2379 #endif
2380 
2381  resw = vec_sld (vra, vra, 4);
2382  m1 = (vui16_t)vec_sel (resw, vra, wmask01);
2383 
2384  p0 = vec_vmuleuh (m1, m0);
2385  p1 = vec_vmulouh (m1, m0);
2386 
2387  resw = vec_sel (p0, p1, wmask01);
2388  res = (vui64_t)resw;
2389 
2390  pp10 = vec_sld (p1, p1, 12);
2391  pp01 = p0;
2392  /* pp01 = vec_addudm (pp01, pp10). */
2393  {
2394  vui32_t c;
2395  vui32_t xmask;
2396  xmask = vec_sld (wmask01, wmask01, 2);
2397  c = vec_vaddcuw (pp01, pp10);
2398  pp01 = vec_vadduwm (pp01, pp10);
2399  c = vec_sld (c, c, 6);
2400  pp01 = vec_sld (pp01, pp01, 2);
2401  pp01 = vec_sel (c, pp01, xmask);
2402  }
2403  /* res = vec_addudm (pp01, res). */
2404  {
2405  vui32_t c, r;
2406  c = vec_vaddcuw (pp01, (vui32_t)res);
2407  r = vec_vadduwm (pp01, (vui32_t)res);
2408  c = vec_sld (c, zero, 4);
2409  res = (vui64_t)vec_vadduwm (r, c);
2410  }
2411 #endif
2412  return (res);
2413 }
2414 
2432 static inline void
2433 vec_vsst4wso (vui32_t xs, unsigned int *array,
2434  const long long offset0, const long long offset1,
2435  const long long offset2, const long long offset3)
2436 {
2437  vui32_t xs0, xs1, xs2, xs3;
2438 
2439  xs0 = vec_splat (xs, 0);
2440  xs1 = vec_splat (xs, 1);
2441  xs2 = vec_splat (xs, 2);
2442  xs3 = vec_splat (xs, 3);
2443  vec_ste (xs0, offset0, array);
2444  vec_ste (xs1, offset1, array);
2445  vec_ste (xs2, offset2, array);
2446  vec_ste (xs3, offset3, array);
2447 }
2448 
2466 static inline void
2467 vec_vsst4wwo (vui32_t xs, unsigned int *array,
2468  vi32_t vra)
2469 {
2470 #ifdef _ARCH_PWR8
2471  vi64_t off01, off23;
2472 
2473  off01 = vec_vupkhsw (vra);
2474  off23 = vec_vupklsw (vra);
2475 
2476  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2477 #else
2478  // Need to explicitly manage the VR/GPR xfer for PWR7
2479  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2480  signed int off0, off1, off2, off3;
2481 
2482  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
2483  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
2484  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
2485  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
2486 
2487  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2488 #endif
2489 }
2490 
2512 static inline void
2513 vec_vsst4wwsx (vui32_t xs, unsigned int *array,
2514  vi32_t vra, const unsigned char scale)
2515 {
2516 #ifdef _ARCH_PWR8
2517  vi64_t off01, off23;
2518  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
2519 
2520  off01 = vec_vupkhsw (vra);
2521  off23 = vec_vupklsw (vra);
2522 
2523  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
2524  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
2525 
2526  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2527 #else
2528  // Need to explicitly manage the VR/GPR xfer for PWR7
2529  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2530  signed int off0, off1, off2, off3;
2531 
2532  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2 + scale);
2533  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2 + scale);
2534  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2 + scale);
2535  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2 + scale);
2536 
2537  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2538 #endif
2539 }
2540 
2559 static inline void
2560 vec_vsst4wwx (vui32_t xs, unsigned int *array,
2561  vi32_t vra)
2562 {
2563 #ifdef _ARCH_PWR8
2564  vi64_t off01, off23;
2565  vui64_t lshift = vec_splats ((unsigned long long) 2);
2566 
2567  off01 = vec_vupkhsw (vra);
2568  off23 = vec_vupklsw (vra);
2569 
2570  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
2571  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
2572 
2573  vec_vsst4wso (xs, array, off01[0], off01[1], off23[0], off23[1]);
2574 #else
2575  // Need to explicitly manage the VR/GPR xfer for PWR7
2576  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2577  signed int off0, off1, off2, off3;
2578 
2579  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
2580  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
2581  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
2582  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
2583 
2584  vec_vsst4wso (xs, array, off0, off1, off2, off3);
2585 #endif
2586 }
2587 
2603 static inline void
2604 vec_vsstwdo (vui64_t xs, unsigned int *array, vi64_t vra)
2605 {
2606 #ifdef _ARCH_PWR8
2607  vec_vsstwso (xs, array, vra[0], vra[1]);
2608 #else
2609  // Need to explicitly manage the VR/GPR xfer for PWR7
2610  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2611 
2612  vec_vsstwso (xs, array,
2613  scalar_extract_uint64_from_high_uint128(gprp),
2614  scalar_extract_uint64_from_low_uint128(gprp));
2615 #endif
2616 }
2617 
2635 static inline void
2636 vec_vsstwdsx (vui64_t xs, unsigned int *array, vi64_t vra,
2637  const unsigned char scale)
2638 {
2639 #ifdef _ARCH_PWR8
2640  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
2641  vui64_t offset;
2642 
2643  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
2644  vec_vsstwso (xs, array, offset[0], offset[1]);
2645 #else
2646  long long offset0, offset1;
2647  // Need to explicitly manage the VR/GPR xfer for PWR7
2648  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2649  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
2650  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
2651 
2652  vec_vsstwso (xs, array, offset0, offset1);
2653 #endif
2654 }
2655 
2671 static inline void
2672 vec_vsstwdx (vui64_t xs, unsigned int *array, vi64_t vra)
2673 {
2674 #ifdef _ARCH_PWR8
2675  vui64_t lshift = vec_splats ((unsigned long long) 2);
2676  vui64_t offset;
2677 
2678  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
2679  vec_vsstwso (xs, array, offset[0], offset[1]);
2680 #else
2681  long long offset0, offset1;
2682  // Need to explicitly manage the VR/GPR xfer for PWR7
2683  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
2684  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
2685  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
2686 
2687  vec_vsstwso (xs, array, offset0, offset1);
2688 #endif
2689 }
2690 
2707 static inline void
2708 vec_vsstwso (vui64_t xs, unsigned int *array,
2709  const long long offset0, const long long offset1)
2710 {
2711  vui32_t xs0, xs1;
2712 
2713  xs0 = (vui32_t) xs;
2714  // xs1 = vec_xxswapd (xs);
2715 #ifdef _ARCH_PWR7
2716  xs1 = (vui32_t) vec_xxpermdi (xs, xs, 2);
2717 #else
2718  xs1 = vec_sld (xs0, xs0, 8);
2719 #endif
2720  /* Need to handle endian as vec_vstxsiwx always stores the right word
2721  * from the left doubleword of the VSR, while word element [1] may be
2722  * in the left or right doubleword. */
2723 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
2724  vec_vstxsiwx (xs0, offset1, array);
2725  vec_vstxsiwx (xs1, offset0, array);
2726 #else
2727  vec_vstxsiwx (xs0, offset0, array);
2728  vec_vstxsiwx (xs1, offset1, array);
2729 #endif
2730 }
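A small scatter sketch (illustrative, not from the header): the low word of each doubleword element of xs is stored at the given byte offsets.

#include <pveclib/vec_int32_ppc.h>

void
example_scatter_words (vui64_t xs, unsigned int *array)
{
  // Store the (truncated) doubleword elements of xs to
  // array[1] and array[5], i.e. byte offsets 4 and 20.
  vec_vsstwso (xs, array, 4, 20);
}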
2731 
2759 static inline void
2760 vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb)
2761 {
2762 #if (defined(__clang__) && __clang_major__ < 8)
2763  __VEC_U_128 t;
2764  unsigned int *p = (unsigned int *)((char *)rb + ra);
2765  t.vx4 = xs;
2766  *p = t.ulong.upper;
2767 #elif _ARCH_PWR8
2768  if (__builtin_constant_p (ra) && (ra <= 32760) && (ra >= -32768))
2769  {
2770  if (ra == 0)
2771  {
2772  __asm__(
2773  "stxsiwx %x1,%y0;"
2774  : "=Z" (*rb)
2775  : "wa" (xs)
2776  : );
2777  } else {
2778  unsigned long long rt;
2779  __asm__(
2780  "li %0,%1;"
2781  : "=r" (rt)
2782  : "I" (ra)
2783  : );
2784  __asm__(
2785  "stxsiwx %x1,%y0;"
2786  : "=Z" (*(unsigned int *)((char *)rb+rt))
2787  : "wa" (xs)
2788  : );
2789  }
2790  } else {
2791  __asm__(
2792  "stxsiwx %x1,%y0;"
2793  : "=Z" (*(unsigned int *)((char *)rb+ra))
2794  : "wa" (xs)
2795  : );
2796  }
2797 #else //_ARCH_PWR8
2798  // Splat word element 1 to all elements
2799  vui32_t xss = vec_splat (xs, 1);
2800  // store a word element at the EA (ra+rb)
2801  vec_ste (xss, ra, rb);
2802 #endif
2803 }
2804 
2847 static inline vi32_t
2848 vec_vsum2sw (vi32_t vra, vi32_t vrb)
2849 {
2850  vi32_t res;
2851 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
2852  res = vec_sum2s (vra, vrb);
2853 #else
2854  __asm__(
2855  "vsum2sws %0,%1,%2;\n"
2856  : "=v" (res)
2857  : "v" (vra),
2858  "v" (vrb)
2859  : );
2860 #endif
2861  return ((vi32_t) res);
2862 }
2863 
2906 static inline vi32_t
2907 vec_vsumsw (vi32_t vra, vi32_t vrb)
2908 {
2909  vi32_t res;
2910 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
2911  res = vec_sums (vra, vrb);
2912 #else
2913  __asm__(
2914  "vsumsws %0,%1,%2;\n"
2915  : "=v" (res)
2916  : "v" (vra),
2917  "v" (vrb)
2918  : );
2919 #endif
2920  return ((vi32_t) res);
2921 }
2922 
2945 #ifndef vec_vupkhsw
2946 // May be defined as inline function for clang
2947 // But only for _ARCH_PWR8 or higher.
2948 #if !defined(__clang__) || !defined(_ARCH_PWR8)
2949 static inline vi64_t
2950 vec_vupkhsw (vi32_t vra)
2951 {
2952  vi64_t r;
2953 #ifdef _ARCH_PWR8
2954  __asm__(
2955 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2956  "vupklsw %0,%1;\n"
2957 #else
2958  "vupkhsw %0,%1;\n"
2959 #endif
2960  : "=v" (r)
2961  : "v" (vra)
2962  : );
2963 #else
2964  vui32_t const shb = { 31, 0, 31 ,0 };
2965  vi32_t xra;
2966 
2967  xra = vec_mergeh (vra, vra);
2968  r = (vi64_t) vec_sra (xra, shb);
2969 #endif
2970  return (r);
2971 }
2972 #endif
2973 #endif
2974 
2994 static inline vui64_t
2995 vec_vupkhuw (vui32_t vra)
2996 {
2997  vui32_t const zero = { 0, 0, 0 ,0 };
2998 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2999  return (vui64_t) vec_mergeh (vra, zero);
3000 #else
3001  return (vui64_t) vec_mergeh (zero, vra);
3002 #endif
3003 }
3004 
3023 #ifndef vec_vupklsw
3024 // May be defined as inline function for clang
3025 // But only for _ARCH_PWR8 or higher.
3026 #if !defined(__clang__) || !defined(_ARCH_PWR8)
3027 static inline vi64_t
3028 vec_vupklsw (vi32_t vra)
3029 {
3030  vi64_t r;
3031 #ifdef _ARCH_PWR8
3032  __asm__(
3033 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
3034  "vupkhsw %0,%1;\n"
3035 #else
3036  "vupklsw %0,%1;\n"
3037 #endif
3038  : "=v" (r)
3039  : "v" (vra)
3040  : );
3041 #else
3042  vui32_t const shb = { 31, 0, 31 ,0 };
3043  vi32_t xra;
3044 
3045  xra = vec_mergel (vra, vra);
3046  r = (vi64_t) vec_sra (xra, shb);
3047 #endif
3048  return (r);
3049 }
3050 #endif
3051 #endif
3052 
3072 static inline vui64_t
3073 vec_vupkluw (vui32_t vra)
3074 {
3075  vui32_t const zero = { 0, 0, 0 ,0 };
3076 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
3077  return (vui64_t) vec_mergel (vra, zero);
3078 #else
3079  return (vui64_t) vec_mergel (zero, vra);
3080 #endif
3081 }
3082 
3083 #endif /* VEC_INT32_PPC_H_ */