#ifndef VEC_CHAR_PPC_H_
#define VEC_CHAR_PPC_H_
#define vec_popcntb __builtin_vec_vpopcntb
      : "v" (vra),
        "v" (vrb)
 
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  result = vec_sub (vmax, vmin);
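  // The max/min form gives |a - b| without unsigned wrap; for example,
  // bytes 10 and 20 give 10, bytes 250 and 5 give 245.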
 
#if defined (vec_vclzb)
#elif defined (__clang__)
  __vector unsigned char n, nt, y, x, s, m;
  __vector unsigned char z = { 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
  __vector unsigned char one = { 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 };
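  // Bit-wise binary search for the leading one bit: y holds x shifted right;
  // where y is nonzero, select the shifted value into x and the reduced
  // count nt into n.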
 
  m = (__vector unsigned char) vec_cmpgt (y, z);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  m = (__vector unsigned char) vec_cmpgt (y, z);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  m = (__vector unsigned char) vec_cmpgt (y, z);
  n = vec_sel (n, nt, m);

  n = vec_sel (nt, n, m);
 
#if defined (vec_cnttz) || defined (__clang__)
  const vui8_t ones = vec_splat_u8 (1);
  // Isolate the trailing zero bits: tzmask = (vra - 1) & ~vra.
  tzmask = vec_andc (vec_sub (vra, ones), vra);
 
  const vui8_t ones = vec_splat_u8 (1);
  const vui8_t c8s = vec_splat_u8 (8);
  // For 8-bit elements, ctz (x) = 8 - clz ((x - 1) & ~x).
  term = vec_andc (vec_sub (vra, ones), vra);
  return vec_sub (c8s, vec_clzb (term));
 
    { 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
        0x40, 0x40, 0x40, 0x40 };

    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a,
        0x5a, 0x5a, 0x5a, 0x5a };

    { 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
        0x60, 0x60, 0x60, 0x60 };

    { 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a,
        0x7a, 0x7a, 0x7a, 0x7a };

    { 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
        0x2f, 0x2f, 0x2f, 0x2f };
 
    { 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
        0x39, 0x39, 0x39, 0x39 };
 
  vui8_t cmp1, cmp2, cmp3, cmp4, cmp5, cmp6, cmask1, cmask2, cmask3;
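  // Each class mask is (ch > FIRST) & ~(ch > LAST), i.e. FIRST < ch <= LAST.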
 
  cmp1 = (vui8_t) vec_cmpgt (vec_str, LC_FIRST);
  cmp2 = (vui8_t) vec_cmpgt (vec_str, LC_LAST);

  cmp3 = (vui8_t) vec_cmpgt (vec_str, UC_FIRST);
  cmp4 = (vui8_t) vec_cmpgt (vec_str, UC_LAST);

  cmp5 = (vui8_t) vec_cmpgt (vec_str, DG_FIRST);
  cmp6 = (vui8_t) vec_cmpgt (vec_str, DG_LAST);

  cmask1 = vec_andc (cmp1, cmp2);
  cmask2 = vec_andc (cmp3, cmp4);
  cmask3 = vec_andc (cmp5, cmp6);

  result = vec_or (vec_or (cmask1, cmask2), cmask3);
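  // result now holds 0xFF in byte lanes that are alphabetic or decimal digits,
  // 0x00 elsewhere.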
 
    { 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
        0x40, 0x40, 0x40, 0x40 };

    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a,
        0x5a, 0x5a, 0x5a, 0x5a };

    { 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
        0x60, 0x60, 0x60, 0x60 };

    { 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a,
        0x7a, 0x7a, 0x7a, 0x7a };
 
  vui8_t cmp1, cmp2, cmp3, cmp4, cmask1, cmask2;
 
  cmp1 = (vui8_t) vec_cmpgt (vec_str, LC_FIRST);
  cmp2 = (vui8_t) vec_cmpgt (vec_str, LC_LAST);

  cmp3 = (vui8_t) vec_cmpgt (vec_str, UC_FIRST);
  cmp4 = (vui8_t) vec_cmpgt (vec_str, UC_LAST);

  cmask1 = vec_andc (cmp1, cmp2);
  cmask2 = vec_andc (cmp3, cmp4);

  result = vec_or (cmask1, cmask2);
 
    { 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
        0x2f, 0x2f, 0x2f, 0x2f };
 
    { 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
        0x39, 0x39, 0x39, 0x39 };
 
  cmp1 = (vui8_t) vec_cmpgt (vec_str, DG_FIRST);
  cmp2 = (vui8_t) vec_cmpgt (vec_str, DG_LAST);

  result = vec_andc (cmp1, cmp2);
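  // result marks digit bytes ('0' .. '9', 0x30 .. 0x39) with 0xFF.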
 
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // High byte of each 8 x 8 product: merge the high bytes of the even/odd
  // halfword products; the operand order depends on endianness.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgahb (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgahb (vec_mule (vra, vrb), vec_mulo (vra, vrb));
 
  // Low (modulo) byte of each 8 x 8 product: use vec_mul where available,
  // else merge the low bytes of the even/odd halfword products.
  return vec_mul (vra, vrb);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgalb (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgalb (vec_mule (vra, vrb), vec_mulo (vra, vrb));
 
#if defined (vec_vpopcntb)
  r = vec_vpopcntb (vra);
#elif defined (__clang__)
  r = vec_popcnt (vra);
 
  vui8_t ones = { 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 };

      { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
        0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };

      { 0x33,0x33,0x33,0x33, 0x33,0x33,0x33,0x33,
        0x33,0x33,0x33,0x33, 0x33,0x33,0x33,0x33 };

      { 0x0f,0x0f,0x0f,0x0f, 0x0f,0x0f,0x0f,0x0f,
        0x0f,0x0f,0x0f,0x0f, 0x0f,0x0f,0x0f,0x0f };
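  // SWAR population count: fold adjacent bit pairs with the 0x55 mask, 2-bit
  // fields with the 0x33 mask, then nibbles with the 0x0f mask.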
 
  x2 = vec_and (vec_sr (x, s), fives);

  x1 = vec_and (n, threes);
  x2 = vec_andc (n, threes);
  n = vec_add (x1, vec_sr (x2, s));

  x1 = vec_add (n, vec_sr (n, s));
  n  = vec_and (x1, fs);
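  // For example, an input byte of 0xf3 (1111 0011) yields a count of 6.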
 
#define vec_popcntb __builtin_vec_vpopcntb
#if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
  // Arithmetic shift right by 7 replicates each byte's sign bit across the byte.
  const vui8_t rshift = vec_splat_u8 (7);
  result = (vb8_t) vec_sra (vra, rshift);
 
      // A compile-time constant shift count can use the immediate splat form.
      if (__builtin_constant_p (shb))
        lshift = (vui8_t) vec_splat_s8 (shb);
      else
        lshift = vec_splats ((unsigned char) shb);

      result = vec_vslb (vra, lshift);
 
      if (__builtin_constant_p (shb))
        lshift = (vui8_t) vec_splat_s8 (shb);
      else
        lshift = vec_splats ((unsigned char) shb);

      result = vec_vsrab (vra, lshift);
 
      // Shift counts of 8 or more reduce to an arithmetic shift by 7, which
      // leaves each element as 0 or -1 (the sign propagated across the byte).
      lshift = (vui8_t) vec_splat_s8 (7);
      result = vec_vsrab (vra, lshift);

  return (vi8_t) result;
 
      if (__builtin_constant_p (shb))
        lshift = (vui8_t) vec_splat_s8 (shb);
      else
        lshift = vec_splats ((unsigned char) shb);

      result = vec_vsrb (vra, lshift);
 
        vui8_t result, vt1, vt2, vt3;
        const vui8_t vzero = { 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
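        // Left double shift across two vectors: vrw is shifted left by the
        // count in vrb, vrx is shifted right by the negated count, and the
        // two halves are ORed together.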
 
        vt1 = vec_slo (vrw, vrb);
        vt3 = vec_sub (vzero, vrb);
        vt2 = vec_sro (vrx, vt3);
        result = vec_or (vt1, vt2);
 
    { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
        0x20, 0x20, 0x20, 0x20 };

    { 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
        0x60, 0x60, 0x60, 0x60 };

    { 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a, 0x7a,
        0x7a, 0x7a, 0x7a, 0x7a };
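  // Clear the 0x20 case bit, but only in bytes that hold lower-case letters
  // (0x61 .. 0x7a); all other bytes pass through unchanged.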
 
  cmp1 = (vui8_t) vec_cmpgt (vec_str, LC_FIRST);
  cmp2 = (vui8_t) vec_cmpgt (vec_str, LC_LAST);

  cmask = vec_andc (cmp1, cmp2);
  cmask = vec_and (cmask, UC_MASK);

  result = vec_andc (vec_str, cmask);
 
    { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
        0x20, 0x20, 0x20, 0x20 };

    { 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
        0x40, 0x40, 0x40, 0x40 };

    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a,
        0x5a, 0x5a, 0x5a, 0x5a };

  vui8_t cmp1, cmp2, cmask;
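  // Set the 0x20 case bit, but only in bytes that hold upper-case letters
  // (0x41 .. 0x5a); all other bytes pass through unchanged.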
 
  cmp1 = (vui8_t) vec_cmpgt (vec_str, UC_FIRST);
  cmp2 = (vui8_t) vec_cmpgt (vec_str, UC_LAST);

  cmask = vec_andc (cmp1, cmp2);
  cmask = vec_and (cmask, UC_MASK);

  result = vec_or (vec_str, cmask);
 
  // The permute control selects the even-numbered byte elements of the two
  // source vectors, interleaved; the indices and operand order are swapped
  // for little-endian so the result matches big-endian element numbering.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    { 0x01, 0x11, 0x03, 0x13, 0x05, 0x15, 0x07, 0x17, 0x09, 0x19, 0x0B, 0x1B,
        0x0D, 0x1D, 0x0F, 0x1F };

  return vec_perm (vrb, vra, (vui8_t) permute);

    { 0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E };

  return vec_perm (vra, vrb, (vui8_t) permute);
 
  // Same pattern, selecting the odd-numbered byte elements instead.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      { 0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E };

  return vec_perm (vrb, vra, (vui8_t) permute);

      { 0x01, 0x11, 0x03, 0x13, 0x05, 0x15, 0x07, 0x17, 0x09, 0x19, 0x0B, 0x1B,
        0x0D, 0x1D, 0x0F, 0x1F };

  return vec_perm (vra, vrb, (vui8_t) permute);