POWER Vector Library Manual  1.0.4
vec_int16_ppc.h
/*
 Copyright (c) [2018] IBM Corporation.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

 vec_int16_ppc.h

 Contributors:
 IBM Corporation, Steven Munroe
 Created on: Apr 06, 2018
 */

#ifndef VEC_INT16_PPC_H_
#define VEC_INT16_PPC_H_

#include <pveclib/vec_char_ppc.h>
#ifndef vec_popcnth
static inline vui16_t vec_popcnth (vui16_t vra);
#else
/* Workaround for GCC PR85830. */
#undef vec_popcnth
#define vec_popcnth __builtin_vec_vpopcnth
#endif
static inline vui16_t vec_vmrgeh (vui16_t vra, vui16_t vrb);
static inline vui16_t vec_vmrgoh (vui16_t vra, vui16_t vrb);
static inline vui16_t
vec_absduh (vui16_t vra, vui16_t vrb)
{
  vui16_t result;
#ifdef _ARCH_PWR9
#ifdef vec_absdh
  result = vec_absdh (vra, vrb);
#else
  __asm__(
      "vabsduh %0,%1,%2;"
      : "=v" (result)
      : "v" (vra), "v" (vrb)
      : );
#endif
#else
  vui16_t vmin, vmax;

  vmin = vec_min (vra, vrb);
  vmax = vec_max (vra, vrb);
  result = vec_sub (vmax, vmin);
#endif
  return (result);
}
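/* Example usage (an illustrative sketch added for this listing; the
   helper name example_absduh is hypothetical, not a PVECLIB API).
   Element-wise: |10-3|=7, |2-9|=7, |0xFFFF-0|=0xFFFF, |0-1|=1. The
   min/max/sub fallback cannot overflow for any unsigned inputs. */
static inline vui16_t
example_absduh (void)
{
  vui16_t a = { 10, 2, 0xFFFF, 0, 0, 0, 0, 0 };
  vui16_t b = {  3, 9, 0,      1, 0, 0, 0, 0 };
  // Expected result: { 7, 7, 0xFFFF, 1, 0, 0, 0, 0 }
  return vec_absduh (a, b);
}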
static inline vui16_t
vec_clzh (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR8
#if defined (vec_vclzh)
  r = vec_vclzh (vra);
#elif defined (__clang__)
  r = vec_cntlz (vra);
#else
  __asm__(
      "vclzh %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
//#warning Implementation pre power8
  vui16_t n, nt, y, x, s, m;
  vui16_t z = { 0,0,0,0, 0,0,0,0 };
  vui16_t one = { 1,1,1,1, 1,1,1,1 };

  /* n = 16, s = 8 */
  s = vec_splat_u16 (8);
  n = vec_add (s, s);
  x = vra;

  /* y = x >> 8; if (y != 0) { n = n - 8; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 4; if (y != 0) { n = n - 4; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 2; if (y != 0) { n = n - 2; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 1; if (y != 0) return (n - 2) */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  nt = vec_sub (nt, s);
  m = (vui16_t) vec_cmpgt (y, z);
  n = vec_sel (n, nt, m);

  /* else return (n - x) */
  nt = vec_sub (n, x);
  r = vec_sel (nt, n, m);
#endif

  return (r);
}
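/* Worked example (editor's sketch; example_clzh is hypothetical).
   For x = 0x00FF the pre-POWER8 binary search proceeds:
     x >> 8 == 0      -> keep n = 16, x = 0x00FF
     x >> 4 == 0x000F -> n = 12, x = 0x000F
     x >> 2 == 0x0003 -> n = 10, x = 0x0003
     x >> 1 == 0x0001 -> return n - 2 = 8
   which matches clz16 (0x00FF) = 8. */
static inline vui16_t
example_clzh (void)
{
  vui16_t x = { 1, 0x8000, 0, 0x00FF, 0x0100, 2, 0x7FFF, 0xFFFF };
  // Expected result: { 15, 0, 16, 8, 7, 14, 1, 0 }
  return vec_clzh (x);
}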
static inline vui16_t
vec_ctzh (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR9
#if defined (vec_cnttz) || defined (__clang__)
  r = vec_cnttz (vra);
#else
  __asm__(
      "vctzh %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
// For _ARCH_PWR8 and earlier. Generate 1's for the trailing zeros
// and 0's otherwise. Then count (popcnt) the 1's. _ARCH_PWR8 uses
// the hardware vpopcnth instruction. _ARCH_PWR7 and earlier use the
// PVECLIB vec_popcnth implementation which runs ~20-26 instructions.
  const vui16_t ones = vec_splat_u16 (1);
  vui16_t tzmask;
  // tzmask = (!vra & (vra - 1))
  tzmask = vec_andc (vec_sub (vra, ones), vra);
  // return = vec_popcnt (!vra & (vra - 1))
  r = vec_popcnth (tzmask);
#endif
  return ((vui16_t) r);
}
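/* Worked example (editor's sketch; example_ctzh is hypothetical).
   For x = 0x0050: x - 1 = 0x004F, ~x & (x - 1) = 0x000F, and
   popcount (0x000F) = 4 = ctz16 (0x0050). For x = 0 the mask is
   0xFFFF & 0xFFFF = 0xFFFF, giving the expected count of 16. */
static inline vui16_t
example_ctzh (void)
{
  vui16_t x = { 0x8000, 1, 0, 0x0050, 2, 0xFFFF, 0x0100, 4 };
  // Expected result: { 15, 0, 16, 4, 1, 0, 8, 2 }
  return vec_ctzh (x);
}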
static inline vui16_t
vec_mrgahh (vui32_t vra, vui32_t vrb)
{
  return vec_vmrgeh ((vui16_t) vra, (vui16_t) vrb);
}

static inline vui16_t
vec_mrgalh (vui32_t vra, vui32_t vrb)
{
  return vec_vmrgoh ((vui16_t) vra, (vui16_t) vrb);
}
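/* Illustrative usage (editor's sketch; example_mrgahh is hypothetical).
   The "Algebraic" merges select the numerically high (or low) halfword
   of each 32-bit word, independent of endian. Passing the same operand
   twice makes the high/low extraction easy to see: */
static inline vui16_t
example_mrgahh (void)
{
  vui32_t a = { 0x11112222, 0x33334444, 0x55556666, 0x77778888 };
  // vec_mrgahh (a, a): { 0x1111, 0x1111, 0x3333, 0x3333,
  //                      0x5555, 0x5555, 0x7777, 0x7777 }
  // vec_mrgalh (a, a): { 0x2222, 0x2222, 0x4444, 0x4444,
  //                      0x6666, 0x6666, 0x8888, 0x8888 }
  // With distinct operands the halfwords of vra and vrb interleave.
  return vec_mrgahh (a, a);
}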
static inline vui16_t
vec_mrgeh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_vmrgoh (vrb, vra);
#else
  return vec_vmrgeh (vra, vrb);
#endif
}

static inline vui16_t
vec_mrgoh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_vmrgeh (vrb, vra);
#else
  return vec_vmrgoh (vra, vrb);
#endif
}
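/* Illustrative usage (editor's sketch; example_mrgeh is hypothetical).
   With elements numbered in array-index order, the endian-corrected
   merges interleave the even (or odd) indexed halfwords of vra and
   vrb, giving the same array-order result on either endian: */
static inline vui16_t
example_mrgeh (void)
{
  vui16_t a = { 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7 };
  vui16_t b = { 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7 };
  // vec_mrgeh (a, b): { 0xA0, 0xB0, 0xA2, 0xB2, 0xA4, 0xB4, 0xA6, 0xB6 }
  // vec_mrgoh (a, b): { 0xA1, 0xB1, 0xA3, 0xB3, 0xA5, 0xB5, 0xA7, 0xB7 }
  return vec_mrgeh (a, b);
}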
static inline vi16_t
vec_mulhsh (vi16_t vra, vi16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (vi16_t) vec_mrgahh ((vui32_t) vec_mulo (vra, vrb),
                              (vui32_t) vec_mule (vra, vrb));
#else
  return (vi16_t) vec_mrgahh ((vui32_t) vec_mule (vra, vrb),
                              (vui32_t) vec_mulo (vra, vrb));
#endif
}

static inline vui16_t
vec_mulhuh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgahh (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgahh (vec_mule (vra, vrb), vec_mulo (vra, vrb));
#endif
}

static inline vui16_t
vec_muluhm (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgalh (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgalh (vec_mule (vra, vrb), vec_mulo (vra, vrb));
#endif
}
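/* Worked example (editor's sketch; example_mulhuh is hypothetical).
   Each 16x16 product is 32 bits; mulhuh keeps the high 16, muluhm
   the low 16 (modulo 2**16). Full products below: 0x10000,
   90000 (0x15F90), 0xFFFE0001, and 63. */
static inline vui16_t
example_mulhuh (void)
{
  vui16_t a = { 0x8000, 300, 0xFFFF, 7, 0, 0, 0, 0 };
  vui16_t b = { 2,      300, 0xFFFF, 9, 0, 0, 0, 0 };
  // vec_mulhuh (a, b): { 1, 1, 0xFFFE, 0, 0, 0, 0, 0 }
  // vec_muluhm (a, b): { 0, 0x5F90, 0x0001, 63, 0, 0, 0, 0 }
  // Signed variant: vec_mulhsh of -2 and 16384 gives -1
  // (high 16 bits of -32768 = 0xFFFF8000).
  return vec_mulhuh (a, b);
}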
#ifndef vec_popcnth
static inline vui16_t
vec_popcnth (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR8
#if defined (vec_vpopcnth)
  r = vec_vpopcnth (vra);
#elif defined (__clang__)
  r = vec_popcnt (vra);
#else
  __asm__(
      "vpopcnth %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
  //#warning Implementation pre power8
  __vector unsigned short n, x1, x2, x, s;
  __vector unsigned short ones = { 1,1,1,1, 1,1,1,1 };
  __vector unsigned short fives =
      { 0x5555,0x5555,0x5555,0x5555, 0x5555,0x5555,0x5555,0x5555 };
  __vector unsigned short threes =
      { 0x3333,0x3333,0x3333,0x3333, 0x3333,0x3333,0x3333,0x3333 };
  __vector unsigned short fs =
      { 0x0f0f,0x0f0f,0x0f0f,0x0f0f, 0x0f0f,0x0f0f,0x0f0f,0x0f0f };
  /* s = 1, doubling after each step (1, 2, 4, 8). */
  s = ones;
  x = vra;

  /* x = x - ((x >> 1) & 0x5555) */
  x2 = vec_and (vec_sr (x, s), fives);
  n = vec_sub (x, x2);
  s = vec_add (s, s);

  /* x = (x & 0x3333) + ((x & 0xcccc) >> 2) */
  x1 = vec_and (n, threes);
  x2 = vec_andc (n, threes);
  n = vec_add (x1, vec_sr (x2, s));
  s = vec_add (s, s);

  /* x = (x + (x >> 4)) & 0x0f0f */
  x1 = vec_add (n, vec_sr (n, s));
  n = vec_and (x1, fs);
  s = vec_add (s, s);

  /* This avoids the extra load const. */
  /* x = (x + (x << 8)) >> 8 */
  x1 = vec_add (n, vec_sl (n, s));
  r = vec_sr (x1, s);
#endif
  return (r);
}
#else
/* Workaround for GCC PR85830. */
#undef vec_popcnth
#define vec_popcnth __builtin_vec_vpopcnth
#endif
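/* Worked example (editor's sketch; example_popcnth is hypothetical).
   Tracing the pre-POWER8 SWAR path for one element, x = 0xB3C5
   (1011 0011 1100 0101, nine 1-bits):
     x - ((x >> 1) & 0x5555)            = 0x6285  (2-bit sums)
     (x & 0x3333) + ((x & 0xcccc) >> 2) = 0x3222  (4-bit sums)
     (x + (x >> 4)) & 0x0f0f            = 0x0504  (byte sums)
     (x + (x << 8)) >> 8                = 0x0009  (total) */
static inline vui16_t
example_popcnth (void)
{
  vui16_t x = { 0xB3C5, 0, 1, 0xFFFF, 0x8001, 0x5555, 0x0F00, 3 };
  // Expected result: { 9, 0, 1, 16, 2, 8, 4, 2 }
  return vec_popcnth (x);
}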
static inline vui16_t
vec_revbh (vui16_t vra)
{
  vui16_t result;

#ifdef _ARCH_PWR9
#if defined (vec_revb) || defined (__clang__)
  result = vec_revb (vra);
#else
  __asm__(
      "xxbrh %x0,%x1;"
      : "=wa" (result)
      : "wa" (vra)
      : );
#endif
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const vui64_t vconstp =
      CONST_VINT64_DW (0x0100030205040706UL, 0x09080B0A0D0C0F0EUL);
#else
  const vui64_t vconstp =
      CONST_VINT64_DW (0x0E0F0C0D0A0B0809UL, 0x0607040502030001UL);
#endif
  result = (vui16_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
#endif

  return (result);
}
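/* Illustrative usage (editor's sketch; example_revbh is hypothetical).
   Each halfword has its two bytes swapped, e.g. 0x1234 -> 0x3412. */
static inline vui16_t
example_revbh (void)
{
  vui16_t x = { 0x1234, 0xABCD, 0x00FF, 0x8000, 0, 1, 2, 3 };
  // Expected: { 0x3412, 0xCDAB, 0xFF00, 0x0080, 0, 0x0100, 0x0200, 0x0300 }
  return vec_revbh (x);
}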
static inline vb16_t
vec_setb_sh (vi16_t vra)
{
  vb16_t result;

#if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
  __asm__(
      "vexpandhm %0,%1"
      : "=v" (result)
      : "v" (vra)
      : );
#else
  const vui16_t rshift = vec_splat_u16 (15);
  // Vector Shift Right Algebraic Halfwords 15-bits.
  result = (vb16_t) vec_sra (vra, rshift);
#endif
  return result;
}
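/* Illustrative usage (editor's sketch; example_setb_sh is hypothetical).
   Arithmetic shift right by 15 replicates the sign bit across the
   element, producing an all-ones bool where the element is negative. */
static inline vb16_t
example_setb_sh (void)
{
  vi16_t x = { -1, 0, 32767, -32768, 1, -2, 0, 100 };
  // Expected: { 0xFFFF, 0, 0, 0xFFFF, 0, 0xFFFF, 0, 0 }
  return vec_setb_sh (x);
}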
static inline vui16_t
vec_slhi (vui16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vui16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Left Halfwords based on the lower 4-bits of
         the corresponding element of lshift. */
      result = vec_vslh (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return zeros. */
      result = vec_xor ((vui16_t) vra, (vui16_t) vra);
    }

  return (vui16_t) result;
}

static inline vui16_t
vec_srhi (vui16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vui16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Right Halfwords based on the lower 4-bits of
         the corresponding element of lshift. */
      result = vec_vsrh (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return zeros. */
      result = vec_xor ((vui16_t) vra, (vui16_t) vra);
    }
  return (vui16_t) result;
}
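/* Illustrative usage (editor's sketch; example_slhi is hypothetical).
   Shift counts are taken modulo nothing here: counts of 16 or more
   return zeros, as the guard above implements. */
static inline vui16_t
example_slhi (void)
{
  vui16_t x = { 0x00FF, 0x8001, 1, 0xFFFF, 0, 0, 0, 0 };
  // vec_slhi (x, 4): { 0x0FF0, 0x0010, 0x0010, 0xFFF0, 0, 0, 0, 0 }
  // vec_srhi (x, 4): { 0x000F, 0x0800, 0x0000, 0x0FFF, 0, 0, 0, 0 }
  // vec_slhi (x, 16): all zeros.
  return vec_slhi (x, 4);
}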
static inline vi16_t
vec_srahi (vi16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vi16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Right Algebraic Halfwords based on the lower
         4-bits of the corresponding element of lshift. */
      result = vec_vsrah (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return the sign bit propagated
         to all bits. This is equivalent to a shift Right Algebraic
         of 15 bits. */
      lshift = (vui16_t) vec_splat_s16 (15);
      result = vec_vsrah (vra, lshift);
    }

  return (vi16_t) result;
}
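/* Illustrative usage (editor's sketch; example_srahi is hypothetical).
   The algebraic shift rounds toward negative infinity, e.g.
   -4081 (0xF00F) >> 4 = -256 (0xFF00). */
static inline vi16_t
example_srahi (void)
{
  vi16_t x = { -32768, -16, 16, -1, 0x7FFF, 0, -4081, 256 };
  // vec_srahi (x, 4): { -2048, -1, 1, -1, 0x07FF, 0, -256, 16 }
  // For shb > 15 the sign fills the element, as if shb were 15.
  return vec_srahi (x, 4);
}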
static inline vui32_t
vec_vmaddeuh (vui16_t a, vui16_t b, vui16_t c)
{
  const vui16_t zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
  vui32_t res;
  vui16_t c_euh = vec_mrgahh ((vui32_t) zero, (vui32_t) c);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  res = vec_vmulouh (a, b);
#else
  res = vec_vmuleuh (a, b);
#endif
  return vec_vadduwm (res, (vui32_t) c_euh);
}

static inline vui32_t
vec_vmaddouh (vui16_t a, vui16_t b, vui16_t c)
{
  const vui16_t zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
  vui32_t res;
  vui16_t c_ouh = vec_mrgalh ((vui32_t) zero, (vui32_t) c);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  res = vec_vmuleuh (a, b);
#else
  res = vec_vmulouh (a, b);
#endif
  return vec_vadduwm (res, (vui32_t) c_ouh);
}
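/* Worked example (editor's sketch; example_vmaddeuh is hypothetical).
   Indices below are array order; c is chosen with both halfwords of
   each word equal, which sidesteps endian questions about which c
   halfword each variant consumes. Note the multiply-add cannot
   overflow 32 bits: 0xFFFF*0xFFFF + 0xFFFF = 0xFFFF0000. */
static inline vui32_t
example_vmaddeuh (void)
{
  vui16_t a = { 2, 3, 100, 200, 0xFFFF, 1, 0, 7 };
  vui16_t b = { 10, 20, 1000, 2000, 0xFFFF, 1, 9, 8 };
  vui16_t c = { 5, 5, 11, 11, 0xFFFF, 0xFFFF, 3, 3 };
  // vec_vmaddeuh (a, b, c): { 2*10+5, 100*1000+11, 0xFFFF*0xFFFF+0xFFFF, 0*9+3 }
  //                       = { 25, 100011, 0xFFFF0000, 3 }
  // vec_vmaddouh (a, b, c): { 3*20+5, 200*2000+11, 1*1+0xFFFF, 7*8+3 }
  //                       = { 65, 400011, 0x10000, 59 }
  return vec_vmaddeuh (a, b, c);
}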
static inline vui16_t
vec_vmrgeh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  const vui16_t permute =
      { 0x0302,0x1312, 0x0706,0x1716, 0x0B0A,0x1B1A, 0x0F0E,0x1F1E };
  return vec_perm (vrb, vra, (vui8_t) permute);
#else
  const vui16_t permute =
      { 0x0001,0x1011, 0x0405,0x1415, 0x0809,0x1819, 0x0C0D,0x1C1D };
  return vec_perm (vra, vrb, (vui8_t) permute);
#endif
}

static inline vui16_t
vec_vmrgoh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  const vui16_t permute =
      { 0x0100,0x1110, 0x0504,0x1514, 0x0908,0x1918, 0x0D0C,0x1D1C };
  return vec_perm (vrb, vra, (vui8_t) permute);
#else
  const vui16_t permute =
      { 0x0203,0x1213, 0x0607,0x1617, 0x0A0B,0x1A1B, 0x0E0F,0x1E1F };
  return vec_perm (vra, vrb, (vui8_t) permute);
#endif
}
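/* Editor's note (sketch): vec_vmrgeh and vec_vmrgoh carry PowerISA
   (big-endian) element-numbering semantics, emulating a vmrgew-style
   even/odd halfword merge via vec_perm for POWER7/POWER8 targets.
   In BE element order:
     vec_vmrgeh (a, b) -> { a0, b0, a2, b2, a4, b4, a6, b6 }
     vec_vmrgoh (a, b) -> { a1, b1, a3, b3, a5, b5, a7, b7 }
   Most code should use the endian-corrected vec_mrgeh / vec_mrgoh
   defined earlier in this header. */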
#endif /* VEC_INT16_PPC_H_ */