POWER Vector Library Manual  1.0.4
vec_bcd_ppc.h
Go to the documentation of this file.
1 /*
2  Copyright (c) [2017] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_bcd_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Aug 12, 2015
21  */
22 
23 #ifndef VEC_BCD_PPC_H_
24 #define VEC_BCD_PPC_H_
25 #ifdef PVECLIB_DISABLE_DFP
26 // clang does not support Decimal Floating Point at this time.
27 // Pveclib uses decimal FP quadword instructions to fill in functional
28 // gaps in the vector BCD operations before POWER9.
29 #ifndef NDEBUG
30 #warning Support for BCD is disabled until _Decimal128 support is added.
31 #endif
32 #else
33 #include <pveclib/vec_common_ppc.h>
34 #include <pveclib/vec_char_ppc.h>
35 #include <pveclib/vec_int128_ppc.h>
36 
1565 #define vBCD_t vui32_t
1566 
1567 #define vbBCD_t vb32_t
1568 
1570 #define _BCD_CONST_PLUS_NINES ((vBCD_t) CONST_VINT128_DW128(0x9999999999999999, 0x999999999999999c))
1571 
1572 #define _BCD_CONST_PLUS_ONE ((vBCD_t) CONST_VINT128_DW128(0, 0x1c))
1573 
1574 #define _BCD_CONST_MINUS_ONE ((vBCD_t) CONST_VINT128_DW128(0, 0x1d))
1575 
1576 #define _BCD_CONST_ZERO ((vBCD_t) CONST_VINT128_DW128(0, 0x0c))
1577 
1578 #define _BCD_CONST_SIGN_MASK ((vBCD_t) CONST_VINT128_DW128(0, 0xf))
1579 
1581 static inline vBCD_t vec_bcdcpsgn (vBCD_t vra, vBCD_t vrb);
1582 static inline vBCD_t vec_bcdcfuq (vui128_t vra);
1583 static inline vui128_t vec_bcdctuq (vBCD_t vra);
1584 static inline vBCD_t vec_bcdsrqi (vBCD_t vra, const unsigned int _N);
1585 static inline vBCD_t vec_bcdsub (vBCD_t a, vBCD_t b);
1586 static inline vBCD_t vec_bcdus (vBCD_t vra, vi8_t vrb);
1587 static inline vf64_t vec_pack_Decimal128 (_Decimal128 lval);
1588 static inline _Decimal128 vec_quantize0_Decimal128 (_Decimal128 val);
1589 static inline vui8_t vec_rdxcf100b (vui8_t vra);
1590 static inline vui8_t vec_rdxcf10kh (vui16_t vra);
1591 static inline vui16_t vec_rdxcf100mw (vui32_t vra);
1592 static inline vui32_t vec_rdxcf10E16d (vui64_t vra);
1593 static inline vui64_t vec_rdxcf10e32q (vui128_t vra);
1594 static inline vui8_t vec_rdxcfzt100b (vui8_t zone00, vui8_t zone16);
1595 static inline vui8_t vec_rdxct100b (vui8_t vra);
1596 static inline vui16_t vec_rdxct10kh (vui8_t vra);
1597 static inline vui32_t vec_rdxct100mw (vui16_t vra);
1598 static inline vui64_t vec_rdxct10E16d (vui32_t vra);
1599 static inline vui128_t vec_rdxct10e32q (vui64_t vra);
1600 static inline vb128_t vec_setbool_bcdsq (vBCD_t vra);
1601 static inline int vec_signbit_bcdsq (vBCD_t vra);
1602 static inline _Decimal128 vec_unpack_Decimal128 (vf64_t lval);
1604 
1621 static inline vui64_t
1623 {
1624 #ifdef _ARCH_PWR6
1625  vui64_t t;
1626  _Decimal128 x, y, z;
1627  // unpack the vector into a FPRp
1628  z = vec_unpack_Decimal128 ((vf64_t) val);
1629  // Convert 2 long int values into 2 _Decimal64 values
1630  // Then convert each _Decimal64 value into 16-digit BCD
1631  __asm__(
1632  "denbcd 0,%1,%2;\n"
1633  "denbcd 0,%L1,%L2;\n"
1634  "dctfix %0,%1;\n"
1635  "dctfix %L0,%L1;\n"
1636  : "=d" (x),
1637  "=&d" (y)
1638  : "d" (z)
1639  : );
1640  // Pack the FPRp back into a vector
1641  t = (vui64_t) vec_pack_Decimal128 (x);
1642  return (t);
1643 #else
1644  // todo no solution before P6
1645 #endif
1646 }
1647 
1662 static inline _Decimal128
1664 {
1665 #ifdef _ARCH_PWR7
1666  _Decimal128 t;
1667 #if (__GNUC__ < 5)
1668  __asm__(
1669  "xxpermdi %0,%x1,%x1,0b00;\n"
1670  "\txxpermdi %L0,%x1,%x1,0b10;\n"
1671  "\tdenbcdq 1,%0,%0;\n"
1672  : "=&d" (t)
1673  : "v" (val)
1674  : );
1675 #else
1676  t = vec_unpack_Decimal128 ((vf64_t) val);
1677  t = __builtin_denbcdq (1, t);
1678 #endif
1679  return (t);
1680 #else
1681  // needs work for P6 without xxpermdi
1682  __VEC_U_128 t, x;
1683  x.vx4 = val;
1684  t.dpd128 = __builtin_denbcdq (1, x.dpd128);
1685  return (t.dpd128);
1686 #endif
1687 }
1688 
1705 static inline vBCD_t
1707 {
1708 #ifdef _ARCH_PWR6
1709  vBCD_t t;
1710  _Decimal128 x, y, z;
1711  // unpack the vector into a FPRp
1712  z = vec_unpack_Decimal128 ((vf64_t) val);
1713  // Convert 2 long int values into 2 _Decimal64 values
1714  // Then convert each _Decimal64 value into 16-digit BCD
1715  __asm__(
1716  "dcffix %1,%2;\n"
1717  "dcffix %L1,%L2;\n"
1718  "ddedpd 0,%0,%1;\n"
1719  "ddedpd 0,%L0,%L1;\n"
1720  : "=d" (x),
1721  "=&d" (y)
1722  : "d" (z)
1723  : );
1724  // Pack the FPRp back into a vector
1725  t = (vBCD_t) vec_pack_Decimal128 (x);
1726  return (t);
1727 #else
1728  // todo no solution before P6
1729 #endif
1730 }
1731 
/** \brief Convert a _Decimal128 value to a signed 31-digit BCD quadword.
 *
 * Uses the DFP Decode-DPD-To-BCD (ddedpdq) operation with sign mode 2
 * to produce 31 BCD digits plus a trailing sign nibble, then moves the
 * FPR-pair result back into a single vector register.
 *
 * @param val a _Decimal128 value (assumed integral/quantized by caller).
 * @return 128-bit vector of 31 BCD digits with low-nibble sign code.
 */
static inline vBCD_t
vec_DFP2BCD (_Decimal128 val)
{
#ifdef _ARCH_PWR7
 vBCD_t t;
 _Decimal128 x;
#if (__GNUC__ < 5)
 // GCC < 5 lacks __builtin_ddedpdq; use inline asm:
 // ddedpdq mode 2 = decode to signed BCD, then xxpermdi packs the
 // even/odd FPR pair into one VSR.
 __asm__(
 "ddedpdq 2,%1,%2;\n"
 "\txxpermdi %x0,%1,%L1,0b00;\n"
 : "=v" (t),
 "=&d" (x)
 : "d" (val)
 : );
#else
 x = __builtin_ddedpdq (2, val);
 // Pack the FPR pair back into a single vector register.
 t = (vBCD_t) vec_pack_Decimal128 (x);
#endif
 return (t);
#else
 // needs work for P6 without xxpermdi
 // NOTE(review): this fallback uses ddedpdq sign mode 1 while the
 // PWR7 path uses mode 2 — confirm the intended sign encoding.
 // NOTE(review): 'x' is declared but never used here.
 __VEC_U_128 t, x;
 t.dpd128 = __builtin_ddedpdq (1, val);
 return (t.vx4);
#endif
}
1773 
1788 static inline vBCD_t
1790 {
1791  vBCD_t t;
1792 #ifdef _ARCH_PWR8
1793 #if (__GNUC__ < 7)
1794  __asm__(
1795  "bcdadd. %0,%1,%2,0;\n"
1796  : "=v" (t)
1797  : "v" (a),
1798  "v" (b)
1799  : "cr6" );
1800 #else
1801  t = (vBCD_t) __builtin_bcdadd ((vi128_t) a, (vi128_t) b, 0);
1802 #endif
1803 #else
1804  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
1805  _Decimal128 d_t;
1806  d_t = vec_BCD2DFP (a) + vec_BCD2DFP (b);
1807  t = vec_DFP2BCD(d_t);
1808  // fix up spurious negative zeros
1809  if (vec_all_eq((vui32_t) t, mz))
1810  t = _BCD_CONST_ZERO;
1811 #endif
1812  return (t);
1813 }
1814 
1835 static inline vBCD_t
1837 {
1838  vBCD_t t;
1839 #if defined ( _ARCH_PWR8) && (__GNUC__ > 6)
1840 #ifdef _ARCH_PWR9
1841  // Generate BCD zero from (a - a), which is 3 cycles on PWR9
1842  t = vec_bcdsub (a, a);
1843 #else // Else load a BCD const 0.
1844  t = _BCD_CONST_ZERO;
1845 #endif
1846  if (__builtin_expect (__builtin_bcdadd_ov ((vi128_t) a, (vi128_t) b, 0), 0))
1847  {
1848  vBCD_t a_b;
1849  a_b = vec_bcdadd (a, b);
1850  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b);
1851  }
1852 #else
1853  _Decimal128 d_a, d_b, d_s, d_t;
1854  const vui32_t mz = CONST_VINT128_W(0, 0, 0, 0x0000000d);
1855  d_a = vec_BCD2DFP (a);
1856  d_b = vec_BCD2DFP (b);
1857  d_s = d_a + d_b;
1858  // Shift right 31 digits, leaving the carry.
1859  d_t = __builtin_dscriq (d_s, 31);
1860  t = vec_DFP2BCD (d_t);
1861  // fix up spurious negative zeros
1862  if (vec_all_eq ((vui32_t) t, mz))
1863  t = _BCD_CONST_ZERO;
1864 #endif
1865  return (t);
1866 }
1867 
1891 static inline vBCD_t
1893 {
1894  vBCD_t t;
1895 #ifdef _ARCH_PWR8
1896  vBCD_t a_b, a_b_c;
1897 
1898  a_b = vec_bcdadd (a, b);
1899  if (__builtin_expect (__builtin_bcdadd_ov ((vi128_t) a, (vi128_t) b, 0), 0))
1900  {
1901  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b);
1902  }
1903  else // (a + b) did not overflow, what about (a + b + c)
1904  {
1905  a_b_c = (vBCD_t) vec_bcdadd (a_b, c);
1906  if (__builtin_bcdadd_ov ((vi128_t) a_b, (vi128_t) c, 0))
1907  {
1908  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b_c);
1909  }
1910  else
1911  {
1912 #ifdef _ARCH_PWR9
1913  // Generate BCD zero from (a - a), which is 3 cycles on PWR9
1914  t = vec_bcdsub (a, a);
1915 #else // Else load a BCD const 0.
1916  t = _BCD_CONST_ZERO;
1917 #endif
1918  }
1919  }
1920 #else
1921  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
1922  _Decimal128 d_a, d_b, d_c, d_s, d_t;
1923  d_a = vec_BCD2DFP (a);
1924  d_b = vec_BCD2DFP (b);
1925  d_c = vec_BCD2DFP (c);
1926  d_s = d_a + d_b + d_c;
1927  // Shift right 31 digits, leaving the carry.
1928  d_t = __builtin_dscriq (d_s, 31);
1929  t = vec_DFP2BCD (d_t);
1930  // fix up spurious negative zeros
1931  if (vec_all_eq ((vui32_t) t, mz))
1932  t = _BCD_CONST_ZERO;
1933 #endif
1934  return (t);
1935 }
1936 
1954 static inline vBCD_t
1956 {
1957  vBCD_t t;
1958 #ifdef _ARCH_PWR8
1959  t = vec_bcdadd (vec_bcdadd (a, b), c);
1960 #else
1961  _Decimal128 d_t;
1962  d_t = vec_BCD2DFP (a) + vec_BCD2DFP (b) + vec_BCD2DFP (c);
1963  t = vec_DFP2BCD(d_t);
1964 #endif
1965  return (t);
1966 }
1967 
1991 static inline vBCD_t
1993 {
1994  vBCD_t result;
1995 #ifdef _ARCH_PWR9
1996  __asm__(
1997  "bcdcfsq. %0,%1,0;\n"
1998  : "=v" (result)
1999  : "v" (vrb)
2000  : "cr6" );
2001 #else
2002  const vui128_t zero = (vui128_t) vec_splats ((int) 0);
2003  vBCD_t ubcd, bcdsign;
2004  vui128_t uvrb;
2005  vb128_t negbool;
2006 
2007  bcdsign = _BCD_CONST_PLUS_ONE;
2008  negbool = vec_setb_sq (vrb);
2009 
2010  uvrb = (vui128_t) vec_sel ((vui32_t) vrb,
2011  (vui32_t) vec_subuqm (zero, (vui128_t) vrb),
2012  (vb32_t) negbool);
2013  bcdsign = (vBCD_t) vec_sel ((vui32_t) bcdsign, (vui32_t) _BCD_CONST_MINUS_ONE,
2014  (vb32_t) negbool);
2015 
2016  ubcd = vec_bcdcfuq (uvrb);
2017 
2018  result = (vBCD_t) vec_slqi ((vui128_t) ubcd, 4);
2019  result = vec_bcdcpsgn (result, bcdsign);
2020 #endif
2021  return (vBCD_t) result;
2022 }
2023 
2045 static inline vBCD_t
2047 {
2048 #ifdef _ARCH_PWR7
2049  return vec_BIN2BCD (vrb);
2050 #else
2051  vui8_t d100;
2052  vui16_t d10k;
2053  vui32_t d100m;
2054  d100m = vec_rdxcf10E16d (vrb);
2055  d10k = vec_rdxcf100mw (d100m);
2056  d100 = vec_rdxcf10kh (d10k);
2057  return (vBCD_t) vec_rdxcf100b (d100);
2058 #endif
2059 }
2060 
2082 static inline vBCD_t
2084 {
2085  vui64_t d10e;
2086  d10e =vec_rdxcf10e32q (vra);
2087 #ifdef _ARCH_PWR7
2088  return (vBCD_t) vec_BIN2BCD (d10e);
2089 #else
2090  vui8_t d100;
2091  vui16_t d10k;
2092  vui32_t d100m;
2093  d100m = vec_rdxcf10E16d (d10e);
2094  d10k = vec_rdxcf100mw (d100m);
2095  d100 = vec_rdxcf10kh (d10k);
2096  return (vBCD_t) vec_rdxcf100b (d10e);
2097 #endif
2098 }
2099 
2125 static inline vBCD_t
2127 {
2128  vBCD_t vrt;
2129 #ifdef _ARCH_PWR9
2130  __asm__(
2131  "bcdcfz. %0,%1,0;\n"
2132  : "=v" (vrt)
2133  : "v" (vrb)
2134  : "cr6" );
2135 #else
2136  const vui8_t dmask = vec_splat_u8(15);
2137  const vui8_t dx0 = vec_splat_u8(0);
2138  vui8_t znd_s;
2139  vui8_t znd_d, znd_t;
2140  vui8_t bcd, bcd_h, bcd_l;
2141  // Isolate the BCD digit from each zoned character.
2142  znd_d = vec_and (vrb, dmask);
2143  znd_t = (vui8_t) vec_srqi ((vui128_t) znd_d, 4);
2144  // Isolate the bit (1) that matters in the Zoned sign code.
2145  znd_s = vec_slbi (vrb, 1);
2146  znd_s = vec_srbi (znd_s, 7);
2147  // Convert to BCD preferred sign code 0xC or 0xD
2148  znd_s = vec_or (znd_s, (vui8_t) _BCD_CONST_ZERO);
2149  // Pack the even/odd zone digits into a single vector.
2150 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2151  bcd = vec_pack ((vui16_t) znd_d, (vui16_t) znd_t);
2152 #else
2153  bcd = vec_pack ((vui16_t) znd_t, (vui16_t) znd_d);
2154 #endif
2155  // Swap even/odd DWs to low half and OR to get unsigned 16-digit BCD.
2156  bcd_l = (vui8_t) vec_mrgald ((vui128_t) dx0, (vui128_t) bcd);
2157  bcd_h = (vui8_t) vec_mrgahd ((vui128_t) dx0, (vui128_t) bcd);
2158  bcd = vec_or (bcd_h, bcd_l);
2159  // Shift left to make room for sign code
2160  vrt = (vBCD_t) vec_slqi ((vui128_t) bcd, 4);
2161  // Merge sign code from zone code.
2162  vrt = vec_bcdcpsgn (vrt, (vBCD_t) znd_s);
2163 #endif
2164  return (vrt);
2165 }
2166 
2184 static inline vbBCD_t
2186 {
2187  vbBCD_t result = (vbBCD_t) vec_splat_s32 (0);
2188 #ifdef _ARCH_PWR8
2189  if (__builtin_bcdsub_eq ((vi128_t) vra, (vi128_t) vrb, 0))
2190  result = (vbBCD_t) vec_splat_s32 (-1);
2191 #else
2192  _Decimal128 d_a, d_b;
2193  d_a = vec_BCD2DFP (vra);
2194  d_b = vec_BCD2DFP (vrb);
2195  if (d_a == d_b)
2196  result = (vbBCD_t) vec_splat_s32 (-1);
2197 #endif
2198  return result;
2199 }
2200 
2218 static inline vbBCD_t
2220 {
2221  vbBCD_t result = (vbBCD_t) vec_splat_s32 (-1);
2222 #ifdef _ARCH_PWR8
2223  if (__builtin_bcdsub_lt ((vi128_t) vra, (vi128_t) vrb, 0))
2224  result = (vbBCD_t) vec_splat_s32 (0);
2225 #else
2226  _Decimal128 d_a, d_b;
2227  d_a = vec_BCD2DFP (vra);
2228  d_b = vec_BCD2DFP (vrb);
2229  if (d_a < d_b)
2230  result = (vbBCD_t) vec_splat_s32 (0);
2231 #endif
2232  return result;
2233 }
2234 
2252 static inline vbBCD_t
2254 {
2255  vbBCD_t result = (vbBCD_t) vec_splat_s32 (0);
2256 #ifdef _ARCH_PWR8
2257  if (__builtin_bcdsub_gt ((vi128_t) vra, (vi128_t) vrb, 0))
2258  result = (vbBCD_t) vec_splat_s32 (-1);
2259 #else
2260  _Decimal128 d_a, d_b;
2261  d_a = vec_BCD2DFP (vra);
2262  d_b = vec_BCD2DFP (vrb);
2263  if (d_a > d_b)
2264  result = (vbBCD_t) vec_splat_s32 (-1);
2265 #endif
2266  return result;
2267 }
2268 
2286 static inline vbBCD_t
2288 {
2289  vbBCD_t result = (vbBCD_t) vec_splat_s32 (-1);
2290 #ifdef _ARCH_PWR8
2291  if (__builtin_bcdsub_gt ((vi128_t) vra, (vi128_t) vrb, 0))
2292  result = (vbBCD_t) vec_splat_s32 (0);
2293 #else
2294  _Decimal128 d_a, d_b;
2295  d_a = vec_BCD2DFP (vra);
2296  d_b = vec_BCD2DFP (vrb);
2297  if (d_a > d_b)
2298  result = (vbBCD_t) vec_splat_s32 (0);
2299 #endif
2300  return result;
2301 }
2302 
2320 static inline vbBCD_t
2322 {
2323  vbBCD_t result = (vbBCD_t) vec_splat_s32 (0);
2324 #ifdef _ARCH_PWR8
2325  if (__builtin_bcdsub_lt ((vi128_t) vra, (vi128_t) vrb, 0))
2326  result = (vbBCD_t) vec_splat_s32 (-1);
2327 #else
2328  _Decimal128 d_a, d_b;
2329  d_a = vec_BCD2DFP (vra);
2330  d_b = vec_BCD2DFP (vrb);
2331  if (d_a < d_b)
2332  result = (vbBCD_t) vec_splat_s32 (-1);
2333 #endif
2334  return result;
2335 }
2336 
2354 static inline vbBCD_t
2356 {
2357 #ifdef _ARCH_PWR8
2358  vbBCD_t result = (vbBCD_t) vec_splat_s32 (-1);
2359  if (__builtin_bcdsub_eq ((vi128_t) vra, (vi128_t) vrb, 0))
2360  result = (vbBCD_t) vec_splat_s32 (0);
2361 
2362  return result;
2363 #else
2364  /* vec_cmpneuq works for both signed and unsigned compares. */
2365  return (vbBCD_t) vec_cmpneuq ((vui128_t) vra, (vui128_t) vrb);
2366 #endif
2367 }
2368 
2386 static inline int
2388 {
2389 #ifdef _ARCH_PWR8
2390  return __builtin_bcdsub_eq ((vi128_t) vra, (vi128_t) vrb, 0);
2391 #else
2392  _Decimal128 d_a, d_b;
2393  d_a = vec_BCD2DFP (vra);
2394  d_b = vec_BCD2DFP (vrb);
2395  return (d_a == d_b);
2396 #endif
2397 }
2398 
2416 static inline int
2418 {
2419 #ifdef _ARCH_PWR8
2420  return !__builtin_bcdsub_lt ((vi128_t) vra, (vi128_t) vrb, 0);
2421 #else
2422  _Decimal128 d_a, d_b;
2423  d_a = vec_BCD2DFP (vra);
2424  d_b = vec_BCD2DFP (vrb);
2425  return (d_a >= d_b);
2426 #endif
2427 }
2428 
2446 static inline int
2448 {
2449 #ifdef _ARCH_PWR8
2450  return __builtin_bcdsub_gt ((vi128_t) vra, (vi128_t) vrb, 0);
2451 #else
2452  _Decimal128 d_a, d_b;
2453  d_a = vec_BCD2DFP (vra);
2454  d_b = vec_BCD2DFP (vrb);
2455  return (d_a > d_b);
2456 #endif
2457 }
2458 
2476 static inline int
2478 {
2479 #ifdef _ARCH_PWR8
2480  return !__builtin_bcdsub_gt ((vi128_t) vra, (vi128_t) vrb, 0);
2481 #else
2482  _Decimal128 d_a, d_b;
2483  d_a = vec_BCD2DFP (vra);
2484  d_b = vec_BCD2DFP (vrb);
2485  return (d_a <= d_b);
2486 #endif
2487 }
2488 
2506 static inline int
2508 {
2509 #ifdef _ARCH_PWR8
2510  return __builtin_bcdsub_lt ((vi128_t) vra, (vi128_t) vrb, 0);
2511 #else
2512  _Decimal128 d_a, d_b;
2513  d_a = vec_BCD2DFP (vra);
2514  d_b = vec_BCD2DFP (vrb);
2515  return (d_a < d_b);
2516 #endif
2517 }
2518 
2536 static inline int
2538 {
2539 #ifdef _ARCH_PWR8
2540  return !__builtin_bcdsub_eq ((vi128_t) vra, (vi128_t) vrb, 0);
2541 #else
2542  return vec_cmpuq_all_ne ((vui128_t) vra, (vui128_t) vrb);
2543 #endif
2544 }
2545 
2562 static inline vBCD_t
2564 {
2565  vBCD_t vrt;
2566 #ifdef _ARCH_PWR9
2567  __asm__(
2568  "bcdcpsgn. %0,%1,%2;\n"
2569  : "=v" (vrt)
2570  : "v" (vra),
2571  "v" (vrb)
2572  : "cr6" );
2573 #else
2574  const vui32_t sign_mask = (vui32_t) _BCD_CONST_SIGN_MASK;
2575  vrt = (vBCD_t) vec_sel ((vui32_t) vra, (vui32_t) vrb, sign_mask);
2576 #endif
2577  return (vrt);
2578 }
2579 
2596 static inline vi128_t
2598 {
2599  vui128_t result;
2600 #ifdef _ARCH_PWR9
2601  __asm__(
2602  "bcdctsq. %0,%1;\n"
2603  : "=v" (result)
2604  : "v" (vra)
2605  : "cr6" );
2606 #else
2607  const vui128_t zero = (vui128_t) vec_splats ((int) 0);
2608  vBCD_t ubcd;
2609 
2610  ubcd = (vBCD_t) vec_srqi ((vui128_t)vra, 4);
2611  result = vec_bcdctuq (ubcd);
2612  if (vec_signbit_bcdsq (vra))
2613  result = vec_subuqm (zero, result);
2614 #endif
2615  return (vi128_t) result;
2616 }
2617 
2635 static inline vui8_t
2637 {
2638  return vec_rdxct100b ((vui8_t) vra);
2639 }
2640 
2658 static inline vui16_t
2660 {
2661  vui8_t d100;
2662  d100 = vec_rdxct100b ((vui8_t) vra);
2663  return vec_rdxct10kh (d100);
2664 }
2665 
2683 static inline vui32_t
2685 {
2686  vui8_t d100;
2687  vui16_t d10k;
2688  d100 = vec_rdxct100b ((vui8_t) vra);
2689  d10k = vec_rdxct10kh (d100);
2690  return vec_rdxct100mw (d10k);
2691 }
2692 
2710 static inline vui64_t
2712 {
2713 #ifdef _ARCH_PWR7
2714  return vec_BCD2BIN (vra);
2715 #else
2716  vui8_t d100;
2717  vui16_t d10k;
2718  vui32_t d100m;
2719  d100 = vec_rdxct100b ((vui8_t) vra);
2720  d10k = vec_rdxct10kh (d100);
2721  d100m = vec_rdxct100mw (d10k);
2722  return vec_rdxct10E16d (d100m);
2723 #endif
2724 }
2725 
2744 static inline vui128_t
2746 {
2747  vui128_t vrt;
2748 #ifdef _ARCH_PWR9
2749  const vui32_t bcd_one = (vui32_t) _BCD_CONST_PLUS_ONE;
2750  const vui32_t sign_mask = (vui32_t) _BCD_CONST_SIGN_MASK;
2751  vui128_t vrd;
2752  vBCD_t sbcd;
2753  // Need to convert BCD unsigned to signed for bcdctsq
2754  // But can't use bcdcpsgn as the unit digit is not a sign code
2755  // So use vec_and/sel to extract unit digit and insert sign
2756  vrd = (vui128_t) vec_and ((vui32_t) vra, sign_mask);
2757  sbcd = (vBCD_t) vec_sel ((vui32_t) vra, bcd_one, sign_mask);
2758  // Convert top 31 digits to binary
2759  vrt = (vui128_t) vec_bcdctsq (sbcd);
2760  // Then X 10 plus the unit digit to complete 32-digit convert
2761  vrt = vec_mul10euq (vrt, vrd);
2762 #else
2763  vui64_t d10e;
2764 #ifdef _ARCH_PWR7
2765  d10e = vec_BCD2BIN (vra);
2766 #else
2767  vui8_t d100;
2768  vui16_t d10k;
2769  vui32_t d100m;
2770  d100 = vec_rdxct100b ((vui8_t) vra);
2771  d10k = vec_rdxct10kh (d100);
2772  d100m = vec_rdxct100mw (d10k);
2773  d10e = vec_rdxct10E16d (d100m);
2774 #endif
2775  vrt = vec_rdxct10e32q (d10e);
2776 #endif
2777  return vrt;
2778 }
2779 
2805 static inline vui8_t
2807 {
2808  vui8_t vrt;
2809 #ifdef _ARCH_PWR9
2810  __asm__(
2811  "bcdctz. %0,%1,0;\n"
2812  : "=v" (vrt)
2813  : "v" (vrb)
2814  : "cr6" );
2815 #else
2816  const vui8_t dmask = vec_splat_u8(15);
2817  const vui8_t zone_minus = CONST_VINT128_B ( '0', '0', '0', '0',
2818  '0', '0', '0', '0',
2819  '0', '0', '0', '0',
2820  '0', '0', '0', 0x70 );
2821 // const vui32_t minus_sign = (vui32_t) CONST_VINT128_W(0x0b, 0x0d, 0x0b, 0x0d);
2822  const vui32_t plus_sign = (vui32_t) CONST_VINT128_W(0x0a, 0x0c, 0x0e, 0x0f);
2823  vui32_t sign_splat;
2824  const vui32_t bcd_sign_mask = vec_splat_u32(15);
2825  vui8_t znd_s, znd_d, znd_t;
2826  vui8_t bcd_s, bcd_u;
2827  vui8_t zone_code;
2828  // Isolate the BCD Sign code
2829  bcd_s = vec_and ((vui8_t) vrb, (vui8_t) bcd_sign_mask);
2830  // Replicate the byte containing the sign to words
2831  sign_splat = vec_splat ((vui32_t) bcd_s, VEC_W_L);
2832  // Isolate the low 16 digits as unsigned BCD
2833  bcd_u = (vui8_t) vec_srqi ((vui128_t) vrb, 4);
2834  // Isolate the even/odd nibbles and merge low bytes for zoned
2835  znd_d = vec_and (bcd_u, dmask);
2836  znd_t = vec_srbi (bcd_u, 4);
2837 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2838  znd_s = vec_mergeh (znd_d, znd_t);
2839 #else
2840  znd_s = vec_mergel (znd_t, znd_d);
2841 #endif
2842  // Initialize the zone_code with negative zone mask.
2843  zone_code = zone_minus;
2844  // SIMD compare for match to any positive sign code
2845  if (vec_any_eq(sign_splat, plus_sign))
2846  // Convert to positive zone mask.
2847  zone_code = (vui8_t) vec_xxspltd ((vui64_t) zone_code, 0);
2848 
2849  // Merge the zone nibbles with the digit nibble to
2850  vrt = vec_or (znd_s, zone_code);
2851 #endif
2852  return (vrt);
2853 }
2854 
2869 static inline vBCD_t
2871 {
2872  vBCD_t t;
2873  _Decimal128 d_t, d_a, d_b;
2874  d_a = vec_BCD2DFP (a);
2875  d_b = vec_BCD2DFP (b);
2876  d_t = vec_quantize0_Decimal128 (d_a / d_b);
2877  t = vec_DFP2BCD (d_t);
2878  return (t);
2879 }
2880 
2899 static inline vBCD_t
2901 {
2902  vBCD_t t;
2903  _Decimal128 d_t, d_a, d_b;
2904  d_a = vec_BCD2DFP (a);
2905  d_b = vec_BCD2DFP (b);
2906  // Look into using DFP Insert Biased Exponent here.
2907  d_a = d_a * 10E31DL;
2908  d_t = vec_quantize0_Decimal128 (d_a / d_b);
2909  t = vec_DFP2BCD (d_t);
2910  return (t);
2911 }
2912 
2948 static inline vBCD_t
2950 {
2951 #ifndef _ARCH_PWR9
2952  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
2953 #endif
2954  const vBCD_t dword_mask = (vBCD_t) CONST_VINT128_DW(15, -1);
2955  vBCD_t t, low_a, low_b, high_a, high_b;
2956  _Decimal128 d_p, d_t, d_a, d_b;
2957 
2958  low_a = vec_and (a, dword_mask);
2959  low_b = vec_and (b, dword_mask);
2960  d_a = vec_BCD2DFP (low_a);
2961  d_b = vec_BCD2DFP (low_b);
2962  d_p = d_a * d_b;
2963  if (__builtin_expect ((vec_cmpuq_all_eq ((vui128_t) low_a, (vui128_t) a)
2964  && vec_cmpuq_all_eq ((vui128_t) low_b, (vui128_t) b)), 1))
2965  {
2966  d_t = d_p;
2967  }
2968  else
2969  {
2970  _Decimal128 d_ah, d_bh, d_hl, d_lh, d_h;
2971 
2972  high_a = vec_bcdsrqi (a, 16);
2973  high_b = vec_bcdsrqi (b, 16);
2974 
2975  d_ah = vec_BCD2DFP (high_a);
2976  d_bh = vec_BCD2DFP (high_b);
2977 
2978  d_hl = d_ah * d_b;
2979  d_lh = d_a * d_bh;
2980 
2981  d_h = d_hl + d_lh;
2982  d_h = __builtin_dscliq (d_h, 17);
2983  d_h = __builtin_dscriq (d_h, 1);
2984 
2985  d_t = d_p + d_h;
2986  }
2987  t = vec_DFP2BCD (d_t);
2988  // fix up spurious negative zeros
2989 #ifdef _ARCH_PWR9
2990  t = vec_bcdadd (t, _BCD_CONST_ZERO);
2991 #else
2992  if (vec_all_eq((vui32_t) t, mz))
2993  t = _BCD_CONST_ZERO;
2994 #endif
2995  return (t);
2996 }
2997 
3032 static inline vBCD_t
3034 {
3035  const vBCD_t dword_mask = (vBCD_t) CONST_VINT128_DW(15, -1);
3036 #ifndef _ARCH_PWR9
3037  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
3038 #endif
3039  vBCD_t t, low_a, low_b, high_a, high_b;
3040  _Decimal128 d_p, d_t, d_al, d_bl;
3041 
3042  low_a = vec_and (a, dword_mask);
3043  low_b = vec_and (b, dword_mask);
3044  d_al = vec_BCD2DFP (low_a);
3045  d_bl = vec_BCD2DFP (low_b);
3046  d_p = d_al * d_bl;
3047  if (__builtin_expect ((vec_cmpuq_all_eq ((vui128_t) low_a, (vui128_t) a)
3048  && vec_cmpuq_all_eq ((vui128_t) low_b, (vui128_t) b)), 1))
3049  {
3050  d_t = __builtin_dscriq (d_p, 31);
3051  }
3052  else
3053  {
3054  _Decimal128 d_ah, d_bh, d_hl, d_lh, d_h, d_ll, d_m;
3055 
3056  high_a = vec_bcdsrqi (a, 16);
3057  high_b = vec_bcdsrqi (b, 16);
3058  d_ah = vec_BCD2DFP (high_a);
3059  d_bh = vec_BCD2DFP (high_b);
3060 
3061  d_hl = d_ah * d_bl;
3062  d_lh = d_al * d_bh;
3063  d_ll = __builtin_dscriq (d_p, 16);
3064 
3065  d_m = d_hl + d_lh + d_ll;
3066  d_m = __builtin_dscriq (d_m, 15);
3067 
3068  d_h = d_ah * d_bh;
3069  d_h = __builtin_dscliq (d_h, 1);
3070  d_t = d_m + d_h;
3071  }
3072  t = vec_DFP2BCD (d_t);
3073  // fix up spurious negative zeros
3074 #ifdef _ARCH_PWR9
3075  t = vec_bcdadd (t, _BCD_CONST_ZERO);
3076 #else
3077  if (vec_all_eq((vui32_t) t, mz))
3078  t = _BCD_CONST_ZERO;
3079 #endif
3080  return (t);
3081 }
3082 
3096 static inline vBCD_t
3098 {
3099  vBCD_t vrt;
3100 #ifdef _ARCH_PWR9
3101  __asm__(
3102  "bcds. %0,%1,%2,0;\n"
3103  : "=v" (vrt)
3104  : "v" (vrb),
3105  "v" (vra)
3106  : "cr6" );
3107 #else
3108  const vi8_t zero = vec_splat_s8(0);
3109  vi8_t shd = vec_splat (vrb, VEC_BYTE_L_DWH);
3110  vui128_t t;
3111  // Multiply digit shift by 4 to get bit shift count
3112  shd = vec_add (shd, shd);
3113  shd = vec_add (shd, shd);
3114  // Clear sign nibble before shift.
3115  t = (vui128_t) vec_andc ((vui32_t) vra, (vui32_t) _BCD_CONST_SIGN_MASK);
3116  // Compare shift positive or negative
3117  if (vec_all_ge(shd, zero))
3118  {
3119  // Positive, shift left
3120  t = vec_slq (t, (vui128_t) shd);
3121  }
3122  else
3123  {
3124  // Negative, shift right by absolute value
3125  shd = vec_sub (zero, shd);
3126  t = vec_srq (t, (vui128_t) shd);
3127  }
3128  // restore original sign nibble
3129  vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
3130 #endif
3131  return (vrt);
3132 }
3133 
3157 static inline vBCD_t
3159 {
3160  vBCD_t vrt;
3161 #ifdef _ARCH_PWR9
3162  __asm__(
3163  "bcdsetsgn. %0,%1,0;\n"
3164  : "=v" (vrt)
3165  : "v" (vrb)
3166  : "cr6" );
3167 #else
3168  const vui32_t match_mask = vec_splat_u32(15);
3169  // The preferred sign is in the correct position for vec_bcdcpsgn
3170  const vui32_t minus_sign = (vui32_t) CONST_VINT128_W(0x0b, 0x0d, 0x0b, 0x0d);
3171  const vui32_t plus_sign = (vui32_t) CONST_VINT128_W(0x0a, 0x0f, 0x0e, 0x0c);
3172  vui32_t sign_splat;
3173  vui32_t sign_code;
3174 
3175  // Replicate the byte containing the sign to words
3176  sign_splat = vec_splat ((vui32_t) vrb, VEC_W_L);
3177  // Apply the code match mask
3178  sign_code = vec_and (sign_splat, match_mask);
3179  // SIMD compare for match to any positive sign code
3180  if (vec_any_eq (sign_code, plus_sign))
3181  vrt = vec_bcdcpsgn (vrb, (vBCD_t) plus_sign);
3182  else
3183  {
3184  // SIMD compare for match to any negative sign code
3185  if (vec_any_eq (sign_code, minus_sign))
3186  vrt = vec_bcdcpsgn (vrb, (vBCD_t) minus_sign);
3187  else
3188  vrt = vrb;
3189  }
3190 #endif
3191  return (vrt);
3192 }
3193 
3207 static inline vBCD_t
3208 vec_bcdslqi (vBCD_t vra, const unsigned int _N)
3209 {
3210  vBCD_t vrt;
3211 #ifdef _ARCH_PWR9
3212  vi8_t shd = vec_splats ((const signed char) (_N));
3213  vrt = vec_bcds (vra, shd);
3214 #else
3215  vui128_t t;
3216 
3217  t = (vui128_t) vec_andc ((vui32_t) vra, (vui32_t) _BCD_CONST_SIGN_MASK);
3218  t = vec_slqi (t, (_N*4));
3219  vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
3220 #endif
3221  return (vrt);
3222 }
3223 
3237 static inline vBCD_t
3238 vec_bcdsluqi (vBCD_t vra, const unsigned int _N)
3239 {
3240 #ifdef _ARCH_PWR9
3241  vi8_t shd = vec_splats ((const signed char) (_N));
3242  return vec_bcdus (vra, shd);
3243 #else
3244  return (vBCD_t) vec_slqi ((vui128_t) vra, (_N*4));
3245 #endif
3246 }
3247 
3264 static inline vBCD_t
3266 {
3267  vBCD_t vrt;
3268 #ifdef _ARCH_PWR9
3269  __asm__(
3270  "bcdsr. %0,%1,%2,0;\n"
3271  : "=v" (vrt)
3272  : "v" (vrb),
3273  "v" (vra)
3274  : "cr6" );
3275 #else
3276  const vi8_t zero = vec_splat_s8(0);
3277  vi8_t shd = vec_splat (vrb, VEC_BYTE_L_DWH);
3278  vui128_t t;
3279  vui32_t r_d;
3280  // Multiply digit shift by 4 to get bit shift count
3281  shd = vec_add (shd, shd);
3282  shd = vec_add (shd, shd);
3283  // Clear sign nibble before shift.
3284  t = (vui128_t) vec_andc ((vui32_t) vra, (vui32_t) _BCD_CONST_SIGN_MASK);
3285  // Compare shift positive or negative
3286  if (vec_all_ge(shd, zero))
3287  {
3288  // Positive, shift left
3289  t = vec_slq (t, (vui128_t) shd);
3290  // restore original sign nibble
3291  vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
3292  }
3293  else
3294  {
3295  const vui32_t rnd6 = CONST_VINT128_W (0, 0, 0, (5+6));
3296  vBCD_t rnd_d;
3297  // Negative, shift right by absolute value
3298  shd = vec_sub (zero, shd);
3299  t = vec_srq (t, (vui128_t) shd);
3300  // extract the last digit shifted out for rounding.
3301  r_d = (vui32_t) vec_and ((vui32_t) t, (vui32_t) _BCD_CONST_SIGN_MASK);
3302  // Add decimal 6's +5 to generate rounding digit
3303  r_d = vec_add (r_d, rnd6);
3304  // Set the sign from original value
3305  rnd_d = vec_bcdcpsgn (r_d, vra);
3306  // restore original sign nibble
3307  vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
3308  // round the last digit
3309  vrt = vec_bcdadd (vrt, rnd_d);
3310 #ifdef _ARCH_PWR7
3311  // Special fixup for P7 via DFP. But in case of shift right
3312  // resulting in 0, the bcdadd above will return the preferred
3313  // +0, while bcdsr should not change the sign.
3314  vrt = vec_bcdcpsgn (vrt, vra);
3315 #endif
3316  }
3317 #endif
3318  return (vrt);
3319 }
3320 
3334 static inline vBCD_t
3335 vec_bcdsrqi (vBCD_t vra, const unsigned int _N)
3336 {
3337  vBCD_t vrt;
3338 #ifdef _ARCH_PWR9
3339  vi8_t shd = vec_splats ((const signed char) (-_N));
3340  vrt = vec_bcds (vra, shd);
3341 #else
3342  vui128_t t;
3343 
3344  t = vec_srqi ((vui128_t) vra, (_N*4));
3345  vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
3346 #endif
3347  return (vrt);
3348 }
3349 
/** \brief Decimal shift right, with round-half-up, of a signed BCD
 * value by the constant _N digits.
 *
 * The _N low-order digits are shifted out; the result's last digit is
 * rounded up when the last digit shifted out is 5 or greater.
 *
 * @param vra 128-bit vector of signed BCD digits.
 * @param _N constant number of digits to shift right.
 * @return shifted and rounded BCD value carrying the original sign.
 */
static inline vBCD_t
vec_bcdsrrqi (vBCD_t vra, const unsigned int _N)
{
 vBCD_t vrt;
#ifdef _ARCH_PWR9
 // POWER9 bcdsr rounds; a negative digit count shifts right.
 vi8_t shd = vec_splats ((const signed char) (-_N));
 vrt = vec_bcdsr (vra, shd);
#else
 vui128_t t;
 vui32_t r_d;
 // Compare shift positive or negative
 if (_N < 32)
 {
 const vui32_t rnd6 = CONST_VINT128_W(0, 0, 0, (5 + 6));
 vBCD_t rnd_d;
 // Clear sign nibble before shift.
 t = (vui128_t) vec_andc ((vui32_t) vra, (vui32_t) _BCD_CONST_SIGN_MASK);
 t = vec_srqi (t, (_N * 4));
 // extract the last digit shifted out for rounding.
 // (It now sits in the unit nibble; bcdcpsgn overwrites it below.)
 r_d = (vui32_t) vec_and ((vui32_t) t, (vui32_t) _BCD_CONST_SIGN_MASK);
 // Add decimal 6's +5 to generate rounding digit
 // Digit+11 carries into the tens nibble exactly when digit >= 5,
 // so rnd_d becomes +/-1 (round up) or +/-0 after the sign insert.
 r_d = vec_add (r_d, rnd6);
 // Set the sign from original value
 rnd_d = vec_bcdcpsgn (r_d, vra);
 // restore original sign nibble
 vrt = vec_bcdcpsgn ((vBCD_t) t, vra);
 // round the last digit
 vrt = vec_bcdadd (vrt, rnd_d);
#ifdef _ARCH_PWR7
 // Special fixup for P7 via DFP. But in case of shift right
 // resulting in 0, the bcdadd above will return the preferred
 // +0, while bcdsr should not change the sign.
 vrt = vec_bcdcpsgn (vrt, vra);
#endif
 }
 else
 {
 // Shift count covers all 31 digits; value returned unchanged.
 vrt = vra;
 }
#endif
 return (vrt);
}
3405 
3419 static inline vBCD_t
3420 vec_bcdsruqi (vBCD_t vra, const unsigned int _N)
3421 {
3422 #ifdef _ARCH_PWR9
3423  vi8_t shd = vec_splats ((const signed char) (-_N));
3424  return vec_bcdus (vra, shd);
3425 #else
3426  return (vBCD_t) vec_srqi ((vui128_t) vra, (_N*4));
3427 #endif
3428 }
3429 
3444 static inline vBCD_t
3446 {
3447  vBCD_t t;
3448 #ifdef _ARCH_PWR8
3449 #if (__GNUC__ < 7)
3450  __asm__(
3451  "bcdsub. %0,%1,%2,0;\n"
3452  : "=v" (t)
3453  : "v" (a),
3454  "v" (b)
3455  : "cr6" );
3456 #else
3457  t = (vBCD_t) __builtin_bcdsub ((vi128_t) a, (vi128_t) b, 0);
3458 #endif
3459 #else
3460  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
3461  _Decimal128 d_t, d_a, d_b;
3462  d_a = vec_BCD2DFP (a);
3463  d_b = vec_BCD2DFP (b);
3464  d_t = d_a - d_b;
3465  t = vec_DFP2BCD(d_t);
3466  // fix up spurious negative zeros
3467  if (vec_all_eq((vui32_t) t, mz))
3468  t = _BCD_CONST_ZERO;
3469 #endif
3470  return (t);
3471 }
3472 
3493 static inline vBCD_t
3495 {
3496  vBCD_t t;
3497 #if defined (_ARCH_PWR8) && (__GNUC__ > 6)
3498  vBCD_t a_b;
3499 #ifdef _ARCH_PWR9
3500  // Generate BCD zero from (a - a), which is 3 cycles on PWR9
3501  t = vec_bcdsub (a, a);
3502 #else // Else load a BCD const 0.
3503  t = _BCD_CONST_ZERO;
3504 #endif
3505  if (__builtin_expect (__builtin_bcdsub_ov ((vi128_t) a, (vi128_t) b, 0), 0))
3506  {
3507  a_b = vec_bcdsub (a, b);
3508  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b);
3509  }
3510 #else
3511  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
3512  _Decimal128 d_a, d_b, d_s, d_t;
3513  d_a = vec_BCD2DFP (a);
3514  d_b = vec_BCD2DFP (b);
3515  d_s = d_a - d_b;
3516  // Shift right 31 digits, leaving the carry.
3517  d_t = __builtin_dscriq (d_s, 31);
3518  t = vec_DFP2BCD(d_t);
3519  // fix up spurious negative zeros
3520  if (vec_all_eq ((vui32_t) t, mz))
3521  t = _BCD_CONST_ZERO;
3522 #endif
3523  return (t);
3524 }
3525 
3549 static inline vBCD_t
3551 {
3552  vBCD_t t;
3553 #ifdef _ARCH_PWR8
3554  vBCD_t a_b, a_b_c;
3555 
3556  a_b = vec_bcdsub (a, b);
3557  if (__builtin_expect (__builtin_bcdsub_ov ((vi128_t) a, (vi128_t) b, 0), 0))
3558  {
3559  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b);
3560  }
3561  else // (a - b) did not overflow, what about (a - b + c)
3562  {
3563  a_b_c = vec_bcdadd (a_b, c);
3564  if (__builtin_bcdadd_ov ((vi128_t) a_b, (vi128_t) c, 0))
3565  {
3566  t = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, a_b_c);
3567  }
3568  else
3569  {
3570 #ifdef _ARCH_PWR9
3571  // Generate BCD zero from (a - a), which is 3 cycles on PWR9
3572  t = vec_bcdsub (a, a);
3573 #else // Else load a BCD const 0.
3574  t = _BCD_CONST_ZERO;
3575 #endif
3576  }
3577  }
3578 #else
3579  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
3580  _Decimal128 d_a, d_b, d_c, d_s, d_t;
3581  d_a = vec_BCD2DFP (a);
3582  d_b = vec_BCD2DFP (b);
3583  d_c = vec_BCD2DFP (c);
3584  d_s = d_a - d_b + d_c;
3585  // Shift right 31 digits, leaving the carry.
3586  d_t = __builtin_dscriq (d_s, 31);
3587  t = vec_DFP2BCD (d_t);
3588  // fix up spurious negative zeros
3589  if (vec_all_eq ((vui32_t) t, mz))
3590  t = _BCD_CONST_ZERO;
3591 #endif
3592  return (t);
3593 }
3594 
3612 static inline vBCD_t
3614 {
3615  vBCD_t t;
3616 #ifdef _ARCH_PWR8
3617  t = vec_bcdadd (vec_bcdsub (a, b), c);
3618 #else
3619  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
3620  _Decimal128 d_t;
3621  d_t = vec_BCD2DFP (a) - vec_BCD2DFP (b) + vec_BCD2DFP (c);
3622  t = vec_DFP2BCD(d_t);
3623  // fix up spurious negative zeros
3624  if (vec_all_eq ((vui32_t) t, mz))
3625  t = _BCD_CONST_ZERO;
3626 #endif
3627  return (t);
3628 }
3629 
3645 static inline vBCD_t
3647 {
3648  vBCD_t vrt;
3649 #ifdef _ARCH_PWR9
3650  __asm__(
3651  "bcdtrunc. %0,%1,%2,0;\n"
3652  : "=v" (vrt)
3653  : "v" (vrb),
3654  "v" (vra)
3655  : "cr6" );
3656 #else
3657  const vui16_t c124 = vec_splats ((unsigned short) 124);
3658  const vui16_t c4 = vec_splats ((unsigned short) 4);
3659  vui16_t shd = vec_splat (vrb, VEC_HW_L_DWH);
3660  vui128_t t;
3661  // Multiply digit shift by 4 to get bit shift count
3662  shd = vec_add (shd, shd);
3663  shd = vec_add (shd, shd);
3664  vui16_t one_s;
3665  // compensate for the sign nibble
3666  shd = vec_add (shd, c4);
3667  // generation all ones if in range, zeros if greater than
3668  one_s = (vui16_t) vec_cmple (shd, c124);
3669  // Generate a mask for the digits we will clear
3670  t = vec_slq ((vui128_t) one_s, (vui128_t) shd);
3671  // Clear the digits we are truncating
3672  vrt = (vBCD_t) vec_andc ((vui32_t)vra, (vui32_t) t);
3673 #endif
3674  return (vrt);
3675 }
3676 
3692 static inline vBCD_t
3693 vec_bcdtruncqi (vBCD_t vra, const unsigned short _N)
3694 {
3695  vBCD_t vrt;
3696 #ifdef _ARCH_PWR9
3697  vui16_t shd = vec_splats ((const unsigned short) (_N));
3698  vrt = vec_bcdtrunc (vra, shd);
3699 #else
3700  vui128_t t;
3701  const vui16_t ones = vec_splat_u16(-1);
3702  // Compare shift < 32 (128-bits)
3703  if (_N < 31)
3704  {
3705  // Generate a mask for the digits we will keep
3706  t = vec_srqi ((vui128_t) ones, ((31 -_N) * 4));
3707  // Clear the digits we are truncating
3708  vrt = (vBCD_t) vec_and ((vui32_t) t, (vui32_t)vra);
3709  }
3710  else
3711  vrt = vra;
3712 #endif
3713  return (vrt);
3714 }
3715 
3729 static inline vBCD_t
3731 {
3732  vBCD_t vrt;
3733 #ifdef _ARCH_PWR9
3734  __asm__(
3735  "bcdus. %0,%1,%2;\n"
3736  : "=v" (vrt)
3737  : "v" (vrb),
3738  "v" (vra)
3739  : "cr6" );
3740 #else
3741  const vi8_t zero = vec_splat_s8(0);
3742  vi8_t shd = vec_splat (vrb, VEC_BYTE_L_DWH);
3743  vui128_t t;
3744  // Multiply digit shift by 4 to get bit shift count
3745  shd = vec_add (shd, shd);
3746  shd = vec_add (shd, shd);
3747  t = (vui128_t) vra;
3748  // Compare shift positive or negative
3749  if (vec_all_ge(shd, zero))
3750  {
3751  // Positive, shift left
3752  t = vec_slq (t, (vui128_t) shd);
3753  }
3754  else
3755  {
3756  // Negative, shift right by absolute value
3757  shd = vec_sub (zero, shd);
3758  t = vec_srq (t, (vui128_t) shd);
3759  }
3760  vrt = (vBCD_t) t;
3761 #endif
3762  return (vrt);
3763 }
3764 
3780 static inline vBCD_t
3782 {
3783  vBCD_t vrt;
3784 #ifdef _ARCH_PWR9
3785  __asm__(
3786  "bcdutrunc. %0,%1,%2;\n"
3787  : "=v" (vrt)
3788  : "v" (vrb),
3789  "v" (vra)
3790  : "cr6" );
3791 #else
3792  const vui16_t c128 = vec_splats ((unsigned short) 128);
3793  vui16_t shd = vec_splat (vrb, VEC_HW_L_DWH);
3794  vui16_t one_s;
3795  vui128_t t;
3796  // Multiply digit shift by 4 to get bit shift count
3797  shd = vec_add (shd, shd);
3798  shd = vec_add (shd, shd);
3799  // generation all ones if in range, zeros if greater than
3800  one_s = (vui16_t) vec_cmplt (shd, c128);
3801  // Generate a mask for the digits we will clear
3802  t = vec_slq ((vui128_t) one_s, (vui128_t) shd);
3803  // Clear the digits we are truncating
3804  vrt = (vBCD_t) vec_andc ((vui32_t)vra, (vui32_t) t);
3805 #endif
3806  return (vrt);
3807 }
3808 
3824 static inline vBCD_t
3825 vec_bcdutruncqi (vBCD_t vra, const unsigned short _N)
3826 {
3827  vBCD_t vrt;
3828 #ifdef _ARCH_PWR9
3829  vui16_t shd = vec_splats ((const unsigned short) (_N));
3830  vrt = vec_bcdutrunc (vra, shd);
3831 #else
3832  vui128_t t;
3833  const vui16_t ones = vec_splat_u16(-1);
3834  // Compare shift < 32 (128-bits)
3835  if (_N < 32)
3836  {
3837  // Generate a mask for the digits we will keep
3838  t = vec_srqi ((vui128_t) ones, ((32 -_N) * 4));
3839  // Clear the digits we are truncating
3840  vrt = (vBCD_t) vec_and ((vui32_t) t, (vui32_t)vra);
3841  }
3842  else
3843  vrt = vra;
3844 #endif
3845  return (vrt);
3846 }
3847 
// Combined Decimal Add & write Carry Signed Quadword.
// Returns t = (a + b) modulo 31 digits and stores the carry digit
// through *cout (per this file's index: vec_cbcdaddcsq (vBCD_t *cout,
// vBCD_t a, vBCD_t b)).
// NOTE(review): extraction dropped the signature line AND the local
// declarations of `nines` and `tensc` used below -- both must be
// restored from upstream pveclib before this compiles.
static inline vBCD_t
{
  vBCD_t t, c;
#ifdef _ARCH_PWR8
  vBCD_t sum_ab, sign_ab;

  sum_ab = vec_bcdadd (a, b);
  if (__builtin_expect (__builtin_bcdadd_ov ((vi128_t) a, (vi128_t) b, 0), 0))
    {
      // Overflow: carry is +/-1 carrying the truncated sum's sign.
      c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_ab);
    }
  else // (a + b) did not overflow, but did it borrow?
    {
      c = _BCD_CONST_ZERO;
      // A sign flip (sum sign != a's sign) on a nonzero sum means borrow.
      sign_ab = vec_bcdcpsgn (sum_ab, a);
      if (!vec_all_eq(sign_ab, sum_ab) && !vec_all_eq(_BCD_CONST_ZERO, sum_ab))
        {
          // NOTE(review): declarations of `nines`/`tensc` were lost here.
          c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_ab);
          sum_ab = vec_bcdaddesqm (nines, sum_ab, tensc);
        }
    }
  t = sum_ab;
#else
  vBCD_t sign_ab;
  _Decimal128 d_a, d_b, d_s, d_t;
  const vui32_t mz = CONST_VINT128_W(0, 0, 0, 0x0000000d);
  d_a = vec_BCD2DFP (a);
  d_b = vec_BCD2DFP (b);
  d_s = d_a + d_b;
  t = vec_DFP2BCD (d_s);
  // Shift right 31 digits, leaving the carry.
  d_t = __builtin_dscriq (d_s, 31);
  c = vec_DFP2BCD (d_t);
  // fix up spurious negative zeros
  if (vec_all_eq((vui32_t ) c, mz))
    c = _BCD_CONST_ZERO;
  // (a + b) did not overflow, but did it borrow?
  sign_ab = vec_bcdcpsgn (t, a);
  if (!vec_all_eq(sign_ab, t) && !vec_all_eq(_BCD_CONST_ZERO, t))
    {
      // NOTE(review): declarations of `nines`/`tensc` were lost here.
      t = vec_bcdaddesqm (nines, t, tensc);
    }
#endif
  *cout = c;
  return (t);
}
3920 
// Combined Decimal Add Extended & write Carry Signed Quadword.
// Returns t = (a + b + cin) modulo 31 digits and stores the carry digit
// through *cout; cin is a carry-in from a lower-order quadword.
// NOTE(review): extraction dropped the signature line (presumably
// vec_cbcdaddecsq (vBCD_t *cout, vBCD_t a, vBCD_t b, vBCD_t cin) --
// confirm upstream) AND the local declarations of `nines`/`tensc`
// used below.
static inline vBCD_t
{
  vBCD_t t, c;
#ifdef _ARCH_PWR8
  vBCD_t sum_ab, sum_abc, sign_abc;

  sum_ab = vec_bcdadd (a, b);

  if (__builtin_expect (__builtin_bcdadd_ov ((vi128_t) a, (vi128_t) b, 0), 0))
    {
      sum_abc = vec_bcdadd (sum_ab, cin);
      c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_abc);
    }
  else // (a + b) did not overflow, but did (a + b + c) overflow?
    {
      sum_abc = vec_bcdadd (sum_ab, cin);
      if (__builtin_expect (__builtin_bcdadd_ov ((vi128_t) sum_ab, (vi128_t) cin, 0), 0))
        {
          c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_abc);
        }
      else // (a + b + c) did not overflow, but did it borrow?
        {
          c = _BCD_CONST_ZERO;
          // Sign flip on a nonzero sum indicates a borrow.
          sign_abc = vec_bcdcpsgn (sum_abc, a);
          if (!vec_all_eq(sign_abc, sum_abc) && !vec_all_eq(_BCD_CONST_ZERO, sum_abc))
            {
              // NOTE(review): declarations of `nines`/`tensc` lost here.
              c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_abc);
              sum_abc = vec_bcdaddesqm (nines, sum_abc, tensc);
            }
        }
    }
  t = sum_abc;
#else
  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
  vBCD_t sign_abc;
  _Decimal128 d_a, d_b, d_c, d_s, d_t;
  d_a = vec_BCD2DFP (a);
  d_b = vec_BCD2DFP (b);
  d_c = vec_BCD2DFP (cin);
  d_s = d_a + d_b + d_c;
  t = vec_DFP2BCD (d_s);
  // Shift right 31 digits, leaving the carry.
  d_t = __builtin_dscriq (d_s, 31);
  c = vec_DFP2BCD (d_t);
  // fix up spurious negative zeros
  if (vec_all_eq((vui32_t) c, mz))
    c = _BCD_CONST_ZERO;
  // (a + b + c) did not overflow, but did it borrow?
  sign_abc = vec_bcdcpsgn (t, a);
  if (!vec_all_eq(sign_abc, t) && !vec_all_eq(_BCD_CONST_ZERO, t))
    {
      // NOTE(review): declarations of `nines`/`tensc` lost here.
      t = vec_bcdaddesqm (nines, t, tensc);
    }
#endif
  *cout = c;
  return (t);
}
4006 
// Combined BCD 31x31-digit multiply: returns the low-order 31 digits of
// a * b and stores the high-order 31 digits through *p_high.
// NOTE(review): extraction lost the signature line; the parameter names
// suggest vec_cbcdmul (vBCD_t *p_high, vBCD_t a, vBCD_t b) -- confirm
// against upstream pveclib.
static inline vBCD_t
{
  const vBCD_t dword_mask = (vBCD_t) CONST_VINT128_DW(15, -1);
#ifndef _ARCH_PWR9
  const vui32_t mz = CONST_VINT128_W (0, 0, 0, 0x0000000d);
#endif
  vBCD_t t, ph, low_a, low_b, high_a, high_b;
  _Decimal128 d_p, d_t, d_al, d_bl;

  // Split each operand at 16 digits; the low halves convert exactly
  // to _Decimal128 for the partial products.
  low_a = vec_and (a, dword_mask);
  low_b = vec_and (b, dword_mask);
  d_al = vec_BCD2DFP (low_a);
  d_bl = vec_BCD2DFP (low_b);
  d_p = d_al * d_bl;
  // Fast path: both operands fit in 16 digits -> one DFP multiply.
  if (__builtin_expect ((vec_cmpuq_all_eq ((vui128_t) low_a, (vui128_t) a)
      && vec_cmpuq_all_eq ((vui128_t) low_b, (vui128_t) b)), 1))
    {
      d_t = __builtin_dscriq (d_p, 31);
      ph = vec_DFP2BCD (d_t);
      d_t = d_p;
    }
  else
    {
      // Full multiply via four 16-digit partial products, aligned with
      // DFP digit shifts (dscliq/dscriq) before summation.
      _Decimal128 d_ah, d_bh, d_hl, d_lh, d_h, d_hh, d_ll, d_m, d_mp;
      high_a = vec_bcdsrqi (a, 16);
      high_b = vec_bcdsrqi (b, 16);
      d_ah = vec_BCD2DFP (high_a);
      d_bh = vec_BCD2DFP (high_b);
      d_hl = d_ah * d_bl;
      d_ll = __builtin_dscriq (d_p, 16);

      d_lh = d_al * d_bh;

      d_mp = d_hl + d_lh;
      d_m = d_mp + d_ll;
      d_m = __builtin_dscriq (d_m, 15);

      d_hh = d_ah * d_bh;
      d_hh = __builtin_dscliq (d_hh, 1);

      d_t = d_m + d_hh;
      ph = vec_DFP2BCD (d_t);

      d_h = __builtin_dscliq (d_mp, 17);
      d_h = __builtin_dscriq (d_h, 1);

      d_t = d_p + d_h;
    }
  t = vec_DFP2BCD (d_t);
  // fix up spurious negative zeros
#ifdef _ARCH_PWR9
  ph = vec_bcdadd (ph, _BCD_CONST_ZERO);
#else
  if (vec_all_eq((vui32_t) ph, mz))
    ph = _BCD_CONST_ZERO;
#endif
  *p_high = ph;

  // fix up spurious negative zeros
#ifdef _ARCH_PWR9
  t = vec_bcdadd (t, _BCD_CONST_ZERO);
#else
  if (vec_all_eq((vui32_t) t, mz))
    t = _BCD_CONST_ZERO;
#endif
  return (t);
}
4112 
// Combined Decimal Subtract & write Carry Signed Quadword.
// Returns t = (a - b) modulo 31 digits and stores the carry digit
// through *cout.
// NOTE(review): extraction dropped the signature line (presumably
// vec_cbcdsubcsq (vBCD_t *cout, vBCD_t a, vBCD_t b) -- confirm
// upstream) AND the local declarations of `nines`/`tensc` used below.
static inline vBCD_t
{
  vBCD_t t, c;
#ifdef _ARCH_PWR8
  vBCD_t sum_ab, sign_ab;

  sum_ab = vec_bcdsub (a, b);
  if (__builtin_expect (__builtin_bcdsub_ov ((vi128_t) a, (vi128_t) b, 0), 0))
    {
      // Overflow: carry is +/-1 carrying the truncated difference's sign.
      c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_ab);
    }
  else // (a + b) did not overflow, but did it borrow?
    {
      c = _BCD_CONST_ZERO;
      // Sign flip on a nonzero difference indicates a borrow.
      sign_ab = vec_bcdcpsgn (sum_ab, a);
      if (!vec_all_eq(sign_ab, sum_ab) && !vec_all_eq(_BCD_CONST_ZERO, sum_ab))
        {
          // NOTE(review): declarations of `nines`/`tensc` were lost here.
          c = vec_bcdcpsgn (_BCD_CONST_PLUS_ONE, sum_ab);
          sum_ab = vec_bcdaddesqm (nines, sum_ab, tensc);
        }
    }
  t = sum_ab;
#else
  vBCD_t sign_ab;
  _Decimal128 d_a, d_b, d_s, d_t;
  const vui32_t mz = CONST_VINT128_W(0, 0, 0, 0x0000000d);
  d_a = vec_BCD2DFP (a);
  d_b = vec_BCD2DFP (b);
  d_s = d_a - d_b;
  t = vec_DFP2BCD (d_s);
  // Shift right 31 digits, leaving the carry.
  d_t = __builtin_dscriq (d_s, 31);
  c = vec_DFP2BCD (d_t);
  // fix up spurious negative zeros
  if (vec_all_eq((vui32_t ) c, mz))
    c = _BCD_CONST_ZERO;
  // (a + b) did not overflow, but did it borrow?
  sign_ab = vec_bcdcpsgn (t, a);
  if (!vec_all_eq(sign_ab, t) && !vec_all_eq(_BCD_CONST_ZERO, t))
    {
      // NOTE(review): declarations of `nines`/`tensc` were lost here.
      t = vec_bcdaddesqm (nines, t, tensc);
    }
#endif
  *cout = c;
  return (t);
}
4186 
// Pack a FPR pair holding a _Decimal128 into a doubleword vector
// (vector double). This is a register-file transfer only; the DPD bits
// are not converted.
static inline vf64_t
vec_pack_Decimal128 (_Decimal128 lval)
{
#ifdef _ARCH_PWR7
  vf64_t t;
  // xxpermdi merges the high (%1) and low (%L1) halves of the FPR pair
  // into a single VSR.
  __asm__(
      "\txxpermdi %x0,%1,%L1,0b00;\n"
      : "=v" (t)
      : "d" (lval)
      : );
  return (t);
#else
  //needs to work for P6 without xxpermdi
  __VEC_U_128 t;
  t.dpd128 = lval;
  return (t.vf2);
#endif
}
4216 
// Quantize (round) a _Decimal128 value to exponent 0 (an integral
// decimal value), truncating any fraction digits.
static inline _Decimal128
vec_quantize0_Decimal128 (_Decimal128 val)
{
#ifdef _ARCH_PWR7
  _Decimal128 t;
  // dquaiq: DFP Quantize Immediate, target exponent 0, RMC 0b01
  // (round toward zero).
  __asm__(
      "dquaiq 0,%0,%1,0b01;\n"
      : "=d" (t)
      : "d" (val)
      : );
  return (t);
#else
  // Fallback: library quantize against the literal 0DL (exponent 0).
  return (quantized128(val, 0DL));
#endif
}
4249 
4274 static inline vui8_t
4276 {
4277  vui8_t result;
4278  vui8_t x6, high_digit;
4279  /* Compute the high digit correction factor. For binary 100s to BCD
4280  * this is the radix 100 value divided by 10 times by the radix
4281  * difference in binary. For this stage we use 0x10 - 10 = 6. */
4282  high_digit = vra / 10;
4283 #if (__GNUC__ > 6)
4284  // Allow the compiler to do strength reduction for const 6 multiplier
4285  x6 = high_digit * 6;
4286  result = vra + x6;
4287 #else
4288  {
4289  vui8_t c6;
4290  c6 = vec_splats ((unsigned char) 0x06);
4291  x6 = vec_mulubm (high_digit, c6);
4292  /* Add the high digit correction bytes to the original
4293  * radix 100 bytes in binary. */
4294  result = vec_add (vra, x6);
4295  }
4296 #endif
4297  return result;
4298 }
4299 
4325 static inline vui8_t
4327 {
4328  vui8_t result;
4329  vui16_t x156, c156, high_digit;
4330  /* Compute the high digit correction factor. For binary 10**4 to 100s
4331  * this is the radix 10000 value divided by 100 times by the radix
4332  * difference in binary. For this stage we use 0x100 - 100 = 156. */
4333  high_digit = vra / 100;
4334  c156 = vec_splats ((unsigned short) 156);
4335 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4336  x156 = vec_vmuleub ((vui8_t) high_digit, (vui8_t) c156);
4337 #else
4338  x156 = vec_vmuloub ((vui8_t) high_digit, (vui8_t) c156);
4339 #endif
4340  /* Add the high digit correction bytes from the original
4341  * radix 10000 hword in binary. */
4342  result = (vui8_t) vec_add (vra, x156);
4343  return result;
4344 }
4345 
4371 static inline vui16_t
4373 {
4374  vui16_t result;
4375  vui32_t high_digit;
4376  /* Compute the high digit correction factor. For binary 10**8 to 10**4
4377  * this is the radix 100000000 value divided by 10000 times by the radix
4378  * difference in binary. For this stage we use 0x10000 - 10000 = 55536. */
4379  const vui32_t c = vec_splats ((unsigned int) 55536);
4380 
4381  high_digit = vra / 10000;
4382 #ifdef _ARCH_PWR8
4383  // 0 in the even hword of const c reduces vmsumuhm to vmulouh
4384  result = (vui16_t) vec_msum ((vui16_t) high_digit, (vui16_t) c, vra);
4385 #else
4386  {
4387  vui32_t x;
4388 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4389  x = vec_vmuleuh ((vui16_t) high_digit, (vui16_t) c);
4390 #else
4391  x = vec_vmulouh ((vui16_t) high_digit, (vui16_t) c);
4392 #endif
4393  /* Add the high digit correction word to the original
4394  * radix 10**8 word in binary. */
4395  result = (vui16_t) vec_add (vra, x);
4396  }
4397 #endif
4398  return result;
4399 }
4400 
4426 static inline vui32_t
4428 {
4429  /* Magic numbers for multiplicative inverse to divide by 10**8
4430  are 12379400392853802749, no corrective add,
4431  and shift right 26 bits. */
4432  const vui64_t mul_invs_ten8 = CONST_VINT128_DW(
4433  12379400392853802749UL, 12379400392853802749UL);
4434  const int shift_ten8 = 26;
4435  vui32_t result;
4436  vui64_t x, c, high_digit;
4437  /* Compute the high digit correction factor. For binary 10**16 to
4438  * 10**8 this is the radix 10000000000000000 value divided by
4439  * 100000000 times by the radix difference in binary. For this
4440  * stage we use 0x100000000 - 100000000 = 4194967296. */
4441 
4442  // high_digit = vra / 100000000;
4443  // Next divide the 16 digits by 10**8.
4444  // This separates the high 8 digits into words.
4445  high_digit = vec_mulhud (vra, mul_invs_ten8);
4446  high_digit = vec_srdi (high_digit, shift_ten8);
4447  c = vec_splats ((unsigned long)4194967296);
4448 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4449  x = vec_muleuw ((vui32_t) high_digit, (vui32_t) c);
4450 #else
4451  x = vec_mulouw ((vui32_t) high_digit, (vui32_t) c);
4452 #endif
4453  /* Add the high digit correction dword to the original
4454  * radix 10**16 dword in binary. */
4455  result = (vui32_t) vec_addudm (vra, x);
4456  return result;
4457 }
4458 
4459 /* Convert radix 10**16 binary dwords to radix 10**8 words */
4460 
4488 static inline vui64_t
4490 {
4491  // Compute the high digit correction factor. For binary 10**32 to
4492  // 10**16, this is (16**16 - 10**16) = 18436744073709551616.
4493  const vui64_t c = CONST_VINT128_DW (0, 18436744073709551616UL);
4494 
4495  /* Magic numbers for multiplicative inverse to divide by 10**16
4496  are 76624777043294442917917351357515459181, no corrective add,
4497  and shift right 51 bits. */
4498  const vui128_t mul_invs_ten16 = (vui128_t) CONST_VINT128_DW(
4499  0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);
4500  const int shift_ten16 = 51;
4501 
4502  vui64_t result;
4503  vui128_t high_digit;
4504 
4505  // high_digit = vra / 10000000000000000;
4506  // Next divide the 32 digits by 10**16.
4507  // This separates the high 16 digits into doublewords.
4508  high_digit = vec_mulhuq (vra, mul_invs_ten16);
4509  high_digit = vec_srqi (high_digit, shift_ten16);
4510 
4511  // multiply high_digit by the radix difference c and add vra
4512 #ifdef _ARCH_PWR9
4513  // 0 in the high dword of const c reduces vmsumudm to vmuloud
4514  result = (vui64_t) vec_msumudm ((vui64_t) high_digit, c, vra);
4515 #else
4516  {
4517  vui128_t x;
4518  x = vec_vmuloud ((vui64_t) high_digit, c);
4519  /* Add the high digit correction qword to the original
4520  * radix 10**32 qword in binary. */
4521  result = (vui64_t) vec_adduqm (vra, x);
4522  }
4523 #endif
4524  return result;
4525 }
4526 
4558 static inline vui8_t
4560 {
4561  const vui8_t dmask = vec_splat_u8 (15);
4562  const vui8_t dx10 = vec_splat_u8 (10);
4563  vui8_t znd00, znd16;
4564  vui8_t ones, tens;
4565  vui16_t ten00, ten16;
4566 
4567  /* Isolate the BCD digit from each zoned character. */
4568  znd00 = vec_and (zone00, dmask);
4569  znd16 = vec_and (zone16, dmask);
4570  /* Pack the odd zone digits into a single vector.
4571  This is the unit digit of each zoned digit pair. */
4572 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4573  ones = vec_pack ((vui16_t) znd16, (vui16_t) znd00);
4574 #else
4575  ones = vec_pack ((vui16_t) znd00, (vui16_t) znd16);
4576 #endif
4577  /* Multiply the even zone digits by 10 before packing
4578  them into a single vector.
4579  This is the tens digit of each zoned digit pair. */
4580 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4581  ten00 = vec_mulo (znd00, dx10);
4582  ten16 = vec_mulo (znd16, dx10);
4583  tens = vec_pack (ten16, ten00);
4584 #else
4585  ten00 = vec_mule (znd00, dx10);
4586  ten16 = vec_mule (znd16, dx10);
4587  tens = vec_pack (ten00, ten16);
4588 #endif
4589  /* sum adjacent tens and unit digit pairs, into a single
4590  * binary value in the range 0-99. */
4591  return vec_add (tens, ones);
4592 }
4593 
4622 static inline vui8_t
4624 {
4625  vui8_t x6, c6, high_digit;
4626  /* Compute the high digit correction factor. For BCD to binary 100s
4627  * this is the isolated high digit multiplied by the radix difference
4628  * in binary. For this stage we use 0x10 - 10 = 6. */
4629  high_digit = vec_srbi (vra, 4);
4630  c6 = vec_splats ((unsigned char) 0x06);
4631 #if (__GNUC__ > 7)
4632  // Allow the compiler to do strength reduction for const 6 multiplier
4633  x6 = vec_mul (high_digit, c6);
4634 #else
4635  x6 = vec_mulubm (high_digit, c6);
4636 #endif
4637  /* Subtract the high digit correction bytes from the original
4638  * BCD bytes in binary. This reduces byte range to 0-99. */
4639  return vec_sub (vra, x6);
4640 }
4641 
4671 static inline vui16_t
4673 {
4674  vui8_t c156;
4675  vui16_t x156;
4676  /* Compute the high digit correction factor. For 100s to binary 10ks
4677  * this is the isolated high digit multiplied by the radix difference
4678  * in binary. For this stage we use 0x100 - 100 = 156. */
4679  c156 = vec_splats ((unsigned char) 156);
4680 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4681  x156 = vec_mulo ((vui8_t) vra, c156);
4682 #else
4683  x156 = vec_mule ((vui8_t) vra, c156);
4684 #endif
4685  /* Subtract the high digit correction halfword from the original
4686  * 100s byte pair in binary. This reduces the range to 0-9999. */
4687  return vec_sub ((vui16_t) vra, x156);
4688 }
4689 
4719 static inline vui32_t
4721 {
4722  vui16_t c55536;
4723  vui32_t x55536;
4724  /* Compute the high digit correction factor. For 10ks to binary 100ms
4725  * this is the isolated high digit multiplied by the radix difference
4726  * in binary. For this stage we use 0x10000 - 10000 = 55536. */
4727  c55536 = vec_splats ((unsigned short) 55536);
4728 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4729  x55536 = vec_mulo ((vui16_t) vra, c55536);
4730 #else
4731  x55536 = vec_mule ((vui16_t) vra, c55536);
4732 #endif
4733  /* Subtract the high digit correction word from the original
4734  * 10ks byte pair in binary. This reduces the range to
4735  * 0-99999999. */
4736  return vec_sub ((vui32_t) vra, x55536);
4737 }
4738 
4768 static inline vui64_t
4770 {
4771  vui32_t c4194967296;
4772  vui64_t x4194967296;
4773  /* Compute the high digit correction factor. For 100ms to binary 10ts
4774  * this is the isolated high digit multiplied by the radix difference
4775  * in binary. For this stage we use 0x100000000 - 100000000 =
4776  * 4194967296. */
4777  c4194967296 = vec_splats ((unsigned int) 4194967296);
4778 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
4779  x4194967296 = vec_mulouw ((vui32_t) vra, c4194967296);
4780 #else
4781  x4194967296 = vec_muleuw ((vui32_t) vra, c4194967296);
4782 #endif
4783  /* Subtract the high digit correction doubleword from the original
4784  * 100m word pair in binary. This reduces the range to
4785  * 0-9999999999999999. */
4786  return vec_subudm ((vui64_t) vra, x4194967296);
4787 }
4788 
4817 static inline vui128_t
4819 {
4820  const vui64_t c18436744073709551616 = CONST_VINT128_DW (18436744073709551616UL, 0);
4821  vui128_t x18436744073709551616;
4822  /* Compute the high digit correction factor for 10ts to binary 10e32s
4823  * This is the isolated high digit multiplied by the radix difference
4824  * in binary. For this stage we use
4825  * 0x10000000000000000 - 10000000000000000 = 18436744073709551616. */
4826 // c18436744073709551616 = vec_splats ((unsigned long) 18436744073709551616UL);
4827 #ifdef _ARCH_PWR9
4828  const vui128_t zero = CONST_VINT128_DW128 (0, 0);
4829  // 0 in the low dword of const c reduces vmsumudm to vmuleud
4830  x18436744073709551616 = (vui128_t) vec_msumudm ((vui64_t) vra, c18436744073709551616, zero);
4831 #else
4832  x18436744073709551616 = vec_vmuleud ((vui64_t) vra, c18436744073709551616);
4833 #endif
4834 
4835  /* Subtract the high digit correction quadword from the original
4836  * 10e doubleword pair in binary. This reduces the range to
4837  * 0-99999999999999999999999999999999. */
4838  return vec_subuqm ((vui128_t) vra, x18436744073709551616);
4839 }
4840 
4856 static inline vb128_t
4858 {
4859  vb128_t result;
4860 #if defined (_ARCH_PWR8) && (__GNUC__ > 6)
4861  // The OV and INV status results overlay CR.bit[59] for bcdadd/sub.
4862  // For valid inputs OV will never be set for both bcdadd/sub.
4863  // So if both bcdadd/bcdsub return OV then must be invalid data.
4864  if (__builtin_bcdadd_ov ((vi128_t) vra, (vi128_t) _BCD_CONST_ZERO, 0)
4865  && __builtin_bcdsub_ov ((vi128_t) vra, (vi128_t) _BCD_CONST_ZERO, 0))
4866  result = (vb128_t) vec_splat_s8 (-1);
4867  else
4868  result = (vb128_t) vec_splat_s8 (0);
4869 #else
4870  const vui16_t sign_mask = vec_splat_u16(15);
4871  // Load all 6 valid sign nibble values into a vector unsigned short
4872  // After splatting the sign, we can compare any of six in one op
4873  const vui16_t sign_vals = CONST_VINT128_H(0x0b, 0x0d, 0x0b, 0x0d, 0x0a, 0x0c,
4874  0x0e, 0x0f);
4875  const vui8_t max_digit = vec_splat_u8(9);
4876  const vui8_t msk_digit = vec_splat_u8(15);
4877  vui16_t sign_splat;
4878  vui16_t sign_code;
4879  vui8_t even, odd;
4880 
4881  // Replicate the halfword containing the sign nibble
4882  sign_splat = vec_splat ((vui16_t) vra, VEC_HW_L);
4883  // Apply the sign nibble mask
4884  sign_code = vec_and (sign_splat, sign_mask);
4885  // SIMD compare for match to any positive/negative sign code
4886  if (vec_any_eq(sign_code, sign_vals))
4887  {
4888  // Split even/odd digits out so there only one digit per byte.
4889  // This insures the binary compare can detect any digits > 9
4890  even = vec_andc ((vui8_t) vra, msk_digit);
4891  odd = vec_and ((vui8_t) vra, msk_digit);
4892  // Align the compare digits with max_digit
4893  even = (vui8_t) vec_srqi ((vui128_t) even, 4);
4894  // And eliminate the sign nibble
4895  odd = (vui8_t) vec_srqi ((vui128_t) odd, 8);
4896  if (vec_any_gt (even, max_digit) || vec_any_gt(odd, max_digit))
4897  result = (vb128_t) vec_splat_s8(-1);
4898  else
4899  result = (vb128_t) vec_splat_s8(0);
4900  }
4901  else
4902  result = (vb128_t) vec_splat_s8(-1);
4903 #endif
4904  return (result);
4905 }
4906 
4927 static inline vb128_t
4929 {
4930  vb128_t result;
4931 #if defined (_ARCH_PWR8) && (__GNUC__ > 6)
4932  if (__builtin_bcdsub_gt ((vi128_t) vra, (vi128_t) _BCD_CONST_MINUS_ONE, 0))
4933  result = (vb128_t) vec_splat_s8 (0);
4934  else
4935  result = (vb128_t) vec_splat_s8 (-1);
4936 #else
4937  const vui32_t sign_mask = vec_splat_u32(15);
4938 // const vui32_t neg_sign = (vui32_t) CONST_VINT128_W(0x0b, 0x0d, 0x0b, 0x0d);
4939  const vui32_t plus_sign = (vui32_t) CONST_VINT128_W(0x0a, 0x0c, 0x0e, 0x0f);
4940  vui32_t sign_splat;
4941  vui32_t sign_code;
4942 
4943  // Replicate the byte containing the sign to words
4944  sign_splat = vec_splat ((vui32_t) vra, VEC_W_L);
4945  // Apply the code mask
4946  sign_code = vec_and (sign_splat, sign_mask);
4947  // SIMD compare for match to any positive sign code
4948  if (vec_any_eq(sign_code, plus_sign))
4949  result = (vb128_t) vec_splat_s8(0);
4950  else
4951  result = (vb128_t) vec_splat_s8(-1);
4952 #endif
4953  return (result);
4954 }
4955 
4975 static inline int
4977 {
4978  int result;
4979 #if defined (_ARCH_PWR8) && (__GNUC__ > 6)
4980  result = __builtin_bcdsub_lt ((vi128_t) vra, (vi128_t) _BCD_CONST_ZERO, 0);
4981 #else
4982  const vui32_t sign_mask = vec_splat_u32(15);
4983  const vui32_t minus_sign = (vui32_t) CONST_VINT128_W(0x0b, 0x0d, 0x0b, 0x0d);
4984  vui32_t sign_splat;
4985  vui32_t sign_code;
4986 
4987  // Replicate the byte containing the sign to words
4988  sign_splat = vec_splat ((vui32_t) vra, VEC_W_L);
4989  // Apply the code mask
4990  sign_code = vec_and (sign_splat, sign_mask);
4991  // SIMD compare for match to any negative sign code
4992  result = vec_any_eq(sign_code, minus_sign);
4993 #endif
4994  return (result);
4995 }
4996 
5008 static inline _Decimal128
5010 {
5011 #ifdef _ARCH_PWR7
5012  _Decimal128 t;
5013  __asm__(
5014  "xxpermdi %0,%x1,%x1,0b00;\n"
5015  "\txxpermdi %L0,%x1,%x1,0b10;\n"
5016  : "=&d" (t)
5017  : "v" (lval)
5018  : );
5019  return (t);
5020 #else
5021  // needs to work for P6 without xxpermdi
5022  __VEC_U_128 t;
5023  t.vf2 = lval;
5024  return (t.dpd128);
5025 #endif
5026 }
5027 
5048 static inline vui128_t
5049 vec_zndctuq (vui8_t zone00, vui8_t zone16)
5050 {
5051  vui8_t d100;
5052  vui16_t d10k;
5053  vui32_t d100m;
5054  vui64_t d10e;
5055  d100 = vec_rdxcfzt100b (zone00, zone16);
5056  d10k = vec_rdxct10kh (d100);
5057  d100m = vec_rdxct100mw (d10k);
5058  d10e = vec_rdxct10E16d (d100m);
5059  return vec_rdxct10e32q (d10e);
5060 }
5061 #endif /* ndef PVECLIB_DISABLE_DFP */
5062 #endif /* VEC_BCD_PPC_H_ */
vbBCD_t
#define vbBCD_t
vector bool from 128-bit signed BCD integer.
Definition: vec_bcd_ppc.h:1567
vec_bcdcmplt
static int vec_bcdcmplt(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for less than.
Definition: vec_bcd_ppc.h:2507
vec_bcdcmpne
static int vec_bcdcmpne(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for not equal.
Definition: vec_bcd_ppc.h:2537
vec_bcdaddesqm
static vBCD_t vec_bcdaddesqm(vBCD_t a, vBCD_t b, vBCD_t c)
Decimal Add Extended Signed Modulo Quadword.
Definition: vec_bcd_ppc.h:1955
vec_rdxcf10E16d
static vui32_t vec_rdxcf10E16d(vui64_t vra)
Vector Decimal Convert radix 10**16 Binary doublewords to pairs of radix 10**8 binary words.
Definition: vec_bcd_ppc.h:4427
vec_bcdctub
static vui8_t vec_bcdctub(vBCD_t vra)
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to binary unsigned bytes .
Definition: vec_bcd_ppc.h:2636
vec_bcdus
static vBCD_t vec_bcdus(vBCD_t vra, vi8_t vrb)
Decimal Unsigned Shift. Shift a vector unsigned BCD value, left or right a variable amount of digits ...
Definition: vec_bcd_ppc.h:3730
vec_muleuw
static vui64_t vec_muleuw(vui32_t a, vui32_t b)
Vector multiply even unsigned words.
Definition: vec_int32_ppc.h:1007
vec_BIN2BCD
static vBCD_t vec_BIN2BCD(vui64_t val)
Convert vector unsigned doubleword binary values to Vector unsigned 16-digit BCD values.
Definition: vec_bcd_ppc.h:1706
vec_subuqm
static vui128_t vec_subuqm(vui128_t vra, vui128_t vrb)
Vector Subtract Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:7439
vb32_t
__vector __bool int vb32_t
vector of 32-bit bool int elements.
Definition: vec_common_ppc.h:228
vec_bcdcpsgn
static vBCD_t vec_bcdcpsgn(vBCD_t vra, vBCD_t vrb)
Vector copy sign BCD.
Definition: vec_bcd_ppc.h:2563
vec_bcdsub
static vBCD_t vec_bcdsub(vBCD_t a, vBCD_t b)
Subtract two Vector Signed BCD 31 digit values.
Definition: vec_bcd_ppc.h:3445
vec_xxspltd
static vui64_t vec_xxspltd(vui64_t vra, const int ctl)
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of ...
Definition: vec_int64_ppc.h:4647
vec_bcdcmpeq
static int vec_bcdcmpeq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for equal.
Definition: vec_bcd_ppc.h:2387
vb128_t
__vector __bool __int128 vb128_t
vector of one 128-bit bool __int128 element.
Definition: vec_common_ppc.h:240
vec_bcdctz
static vui8_t vec_bcdctz(vBCD_t vrb)
Vector Decimal Convert To Zoned.
Definition: vec_bcd_ppc.h:2806
vec_cmpneuq
static vb128_t vec_cmpneuq(vui128_t vra, vui128_t vrb)
Vector Compare Not Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3475
vec_unpack_Decimal128
static _Decimal128 vec_unpack_Decimal128(vf64_t lval)
Unpack a doubleword vector (vector double) into a FPR pair. (_Decimal128).
Definition: vec_bcd_ppc.h:5009
vec_bcdcfud
static vBCD_t vec_bcdcfud(vui64_t vrb)
Vector Decimal Convert From Unsigned doubleword returning up to 2x16 BCD digits.
Definition: vec_bcd_ppc.h:2046
vec_bcdslqi
static vBCD_t vec_bcdslqi(vBCD_t vra, const unsigned int _N)
Vector BCD Shift Left Signed Quadword.
Definition: vec_bcd_ppc.h:3208
vec_rdxct10e32q
static vui128_t vec_rdxct10e32q(vui64_t vra)
Vector Decimal Convert radix 10E16 digit pairs to radix 10E32 __int128 quadwords.
Definition: vec_bcd_ppc.h:4818
__VEC_U_128::dpd128
_Decimal128 dpd128
128 bit Decimal Float from pair of double float registers.
Definition: vec_common_ppc.h:264
vec_bcdcmp_eqsq
static vbBCD_t vec_bcdcmp_eqsq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for equal.
Definition: vec_bcd_ppc.h:2185
CONST_VINT128_W
#define CONST_VINT128_W(__w0, __w1, __w2, __w3)
Arrange word elements of an unsigned int initializer in high->low order. May require an explicit cast.
Definition: vec_common_ppc.h:304
vec_bcdctuq
static vui128_t vec_bcdctuq(vBCD_t vra)
Vector Decimal Convert groups of 32 BCD digits to binary unsigned quadword.
Definition: vec_bcd_ppc.h:2745
vec_slbi
static vui8_t vec_slbi(vui8_t vra, const unsigned int shb)
Vector Shift left Byte Immediate.
Definition: vec_char_ppc.h:809
vec_bcdmulh
static vBCD_t vec_bcdmulh(vBCD_t a, vBCD_t b)
Vector Signed BCD Multiply High.
Definition: vec_bcd_ppc.h:3033
vec_rdxcf10kh
static vui8_t vec_rdxcf10kh(vui16_t vra)
Vector Decimal Convert radix 10,000 Binary halfwords to pairs of radix 100 binary bytes.
Definition: vec_bcd_ppc.h:4326
vec_bcdcmpge
static int vec_bcdcmpge(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for greater than or equal.
Definition: vec_bcd_ppc.h:2417
vec_bcdsubcsq
static vBCD_t vec_bcdsubcsq(vBCD_t a, vBCD_t b)
Decimal Subtract & write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:3494
vec_cmpuq_all_ne
static int vec_cmpuq_all_ne(vui128_t vra, vui128_t vrb)
Vector Compare all Not Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:4025
vec_cbcdaddcsq
static vBCD_t vec_cbcdaddcsq(vBCD_t *cout, vBCD_t a, vBCD_t b)
Combined Decimal Add & Write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:3869
vui16_t
__vector unsigned short vui16_t
vector of 16-bit unsigned short elements.
Definition: vec_common_ppc.h:204
vec_srdi
static vui64_t vec_srdi(vui64_t vra, const unsigned int shb)
Vector Shift Right Doubleword Immediate.
Definition: vec_int64_ppc.h:3604
vec_rdxcf100b
static vui8_t vec_rdxcf100b(vui8_t vra)
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs from radix 100 binary integer bytes.
Definition: vec_bcd_ppc.h:4275
vec_vmuleud
static vui128_t vec_vmuleud(vui64_t a, vui64_t b)
Vector Multiply Even Unsigned Doublewords.
Definition: vec_int128_ppc.h:7487
vec_bcdctuh
static vui16_t vec_bcdctuh(vBCD_t vra)
Vector Decimal Convert groups of 4 BCD digits to binary unsigned halfwords.
Definition: vec_bcd_ppc.h:2659
vec_cmpuq_all_eq
static int vec_cmpuq_all_eq(vui128_t vra, vui128_t vrb)
Vector Compare all Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3804
vec_bcdcfsq
static vBCD_t vec_bcdcfsq(vi128_t vrb)
Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits.
Definition: vec_bcd_ppc.h:1992
vec_cbcdaddecsq
static vBCD_t vec_cbcdaddecsq(vBCD_t *cout, vBCD_t a, vBCD_t b, vBCD_t cin)
Combined Decimal Add Extended & write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:3944
vi128_t
__vector __int128 vi128_t
vector of one 128-bit signed __int128 element.
Definition: vec_common_ppc.h:235
vec_bcdsruqi
static vBCD_t vec_bcdsruqi(vBCD_t vra, const unsigned int _N)
Vector BCD Shift Right Unsigned Quadword immediate.
Definition: vec_bcd_ppc.h:3420
vec_bcdtruncqi
static vBCD_t vec_bcdtruncqi(vBCD_t vra, const unsigned short _N)
Decimal Truncate Quadword Immediate. Truncate a vector signed BCD value vra to N-digits,...
Definition: vec_bcd_ppc.h:3693
vec_bcdsr
static vBCD_t vec_bcdsr(vBCD_t vra, vi8_t vrb)
Decimal Shift and Round. Shift a vector signed BCD value, left or right a variable amount of digits (...
Definition: vec_bcd_ppc.h:3265
vec_bcds
static vBCD_t vec_bcds(vBCD_t vra, vi8_t vrb)
Decimal Shift. Shift a vector signed BCD value, left or right a variable amount of digits (nibbles)....
Definition: vec_bcd_ppc.h:3097
vec_bcdtrunc
static vBCD_t vec_bcdtrunc(vBCD_t vra, vui16_t vrb)
Decimal Truncate. Truncate a vector signed BCD value vra to N-digits, where N is the unsigned integer...
Definition: vec_bcd_ppc.h:3646
vec_bcdcmp_nesq
static vbBCD_t vec_bcdcmp_nesq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for not equal.
Definition: vec_bcd_ppc.h:2355
vec_bcdsubecsq
static vBCD_t vec_bcdsubecsq(vBCD_t a, vBCD_t b, vBCD_t c)
Decimal Subtract Extended & write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:3550
_BCD_CONST_MINUS_ONE
#define _BCD_CONST_MINUS_ONE
vector signed BCD constant -1.
Definition: vec_bcd_ppc.h:1574
vui64_t
__vector unsigned long long vui64_t
vector of 64-bit unsigned long long elements.
Definition: vec_common_ppc.h:208
vec_bcdmul
static vBCD_t vec_bcdmul(vBCD_t a, vBCD_t b)
Multiply two Vector Signed BCD 31 digit values.
Definition: vec_bcd_ppc.h:2949
CONST_VINT128_H
#define CONST_VINT128_H(__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7)
Arrange halfword elements of an unsigned int initializer in high->low order. May require an explicit c...
Definition: vec_common_ppc.h:309
vec_bcdcmp_gtsq
static vbBCD_t vec_bcdcmp_gtsq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for greater than.
Definition: vec_bcd_ppc.h:2253
vec_common_ppc.h
Common definitions and typedef used by the collection of Power Vector Library (pveclib) headers.
vec_bcdaddecsq
static vBCD_t vec_bcdaddecsq(vBCD_t a, vBCD_t b, vBCD_t c)
Decimal Add Extended & write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:1892
vui8_t
__vector unsigned char vui8_t
vector of 8-bit unsigned char elements.
Definition: vec_common_ppc.h:202
vec_srbi
static vui8_t vec_srbi(vui8_t vra, const unsigned int shb)
Vector Shift Right Byte Immediate.
Definition: vec_char_ppc.h:905
vec_DFP2BCD
static vBCD_t vec_DFP2BCD(_Decimal128 val)
Convert a __Decimal128 value to Vector BCD.
Definition: vec_bcd_ppc.h:1748
_BCD_CONST_ZERO
#define _BCD_CONST_ZERO
vector signed BCD constant +0.
Definition: vec_bcd_ppc.h:1576
vec_bcddiv
static vBCD_t vec_bcddiv(vBCD_t a, vBCD_t b)
Divide a Vector Signed BCD 31 digit value by another BCD value.
Definition: vec_bcd_ppc.h:2870
vec_bcdsrrqi
static vBCD_t vec_bcdsrrqi(vBCD_t vra, const unsigned int _N)
Vector BCD Shift Right and Round Signed Quadword Immediate.
Definition: vec_bcd_ppc.h:3364
vec_rdxct10E16d
static vui64_t vec_rdxct10E16d(vui32_t vra)
Vector Decimal Convert radix 100,000,000 digit word pairs to radix 10E16 binary integer doublewords.
Definition: vec_bcd_ppc.h:4769
vec_subudm
static vui64_t vec_subudm(vui64_t a, vui64_t b)
Vector Subtract Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:3746
_BCD_CONST_PLUS_NINES
#define _BCD_CONST_PLUS_NINES
vector signed BCD constant +9s.
Definition: vec_bcd_ppc.h:1570
vec_bcdcmple
static int vec_bcdcmple(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for less than or equal.
Definition: vec_bcd_ppc.h:2477
vec_int128_ppc.h
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...
vec_srqi
static vui128_t vec_srqi(vui128_t vra, const unsigned int shb)
Vector Shift Right Quadword Immediate.
Definition: vec_int128_ppc.h:7154
vec_setbool_bcdsq
static vb128_t vec_setbool_bcdsq(vBCD_t vra)
Vector Set Bool from Signed BCD Quadword.
Definition: vec_bcd_ppc.h:4928
vec_bcdadd
static vBCD_t vec_bcdadd(vBCD_t a, vBCD_t b)
Decimal Add Signed Modulo Quadword.
Definition: vec_bcd_ppc.h:1789
vec_rdxct100b
static vui8_t vec_rdxct100b(vui8_t vra)
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to radix 100 binary integer bytes.
Definition: vec_bcd_ppc.h:4623
vec_BCD2DFP
static _Decimal128 vec_BCD2DFP(vBCD_t val)
Convert a Vector Signed BCD value to __Decimal128.
Definition: vec_bcd_ppc.h:1663
vec_signbit_bcdsq
static int vec_signbit_bcdsq(vBCD_t vra)
Vector Sign bit from Signed BCD Quadword.
Definition: vec_bcd_ppc.h:4976
vui128_t
__vector unsigned __int128 vui128_t
vector of one 128-bit unsigned __int128 element.
Definition: vec_common_ppc.h:237
vec_bcdctud
static vui64_t vec_bcdctud(vBCD_t vra)
Vector Decimal Convert groups of 16 BCD digits to binary unsigned doublewords.
Definition: vec_bcd_ppc.h:2711
vec_msumudm
static vui128_t vec_msumudm(vui64_t a, vui64_t b, vui128_t c)
Vector Multiply-Sum Unsigned Doubleword Modulo.
Definition: vec_int128_ppc.h:5202
vec_vmuloud
static vui128_t vec_vmuloud(vui64_t a, vui64_t b)
Vector Multiply Odd Unsigned Doublewords.
Definition: vec_int128_ppc.h:7733
__VEC_U_128
Union used to transfer 128-bit data between vector and non-vector types.
Definition: vec_common_ppc.h:256
vec_rdxcf10e32q
static vui64_t vec_rdxcf10e32q(vui128_t vra)
Vector Decimal Convert radix 10**32 Binary quadword to pairs of radix 10**16 binary doublewords.
Definition: vec_bcd_ppc.h:4489
vec_zndctuq
static vui128_t vec_zndctuq(vui8_t zone00, vui8_t zone16)
Vector Zoned Decimal Convert 32 digits to binary unsigned quadword.
Definition: vec_bcd_ppc.h:5049
CONST_VINT128_DW
#define CONST_VINT128_DW(__dw0, __dw1)
Initializer for 128-bits vector, as two unsigned long long elements in high->low order....
Definition: vec_common_ppc.h:298
vec_mulouw
static vui64_t vec_mulouw(vui32_t a, vui32_t b)
Vector multiply odd unsigned words.
Definition: vec_int32_ppc.h:1043
vec_mulubm
static vui8_t vec_mulubm(vui8_t vra, vui8_t vrb)
Vector Multiply Unsigned Byte Modulo.
Definition: vec_char_ppc.h:664
vec_bcdcmp_gesq
static vbBCD_t vec_bcdcmp_gesq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for greater than or equal.
Definition: vec_bcd_ppc.h:2219
vi8_t
__vector signed char vi8_t
vector of 8-bit signed char elements.
Definition: vec_common_ppc.h:211
_BCD_CONST_PLUS_ONE
#define _BCD_CONST_PLUS_ONE
vector signed BCD constant +1.
Definition: vec_bcd_ppc.h:1572
vec_bcdsubesqm
static vBCD_t vec_bcdsubesqm(vBCD_t a, vBCD_t b, vBCD_t c)
Decimal Subtract Extended Signed Modulo Quadword.
Definition: vec_bcd_ppc.h:3613
vec_setb_sq
static vb128_t vec_setb_sq(vi128_t vra)
Vector Set Bool from Signed Quadword.
Definition: vec_int128_ppc.h:6576
vec_bcdcfuq
static vBCD_t vec_bcdcfuq(vui128_t vra)
Vector Decimal Convert From Unsigned Quadword returning up to 32 BCD digits.
Definition: vec_bcd_ppc.h:2083
vec_bcdcfz
static vBCD_t vec_bcdcfz(vui8_t vrb)
Vector Decimal Convert From Zoned.
Definition: vec_bcd_ppc.h:2126
vec_bcdaddcsq
static vBCD_t vec_bcdaddcsq(vBCD_t a, vBCD_t b)
Decimal Add & write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:1836
vec_BCD2BIN
static vui64_t vec_BCD2BIN(vBCD_t val)
Convert vector of 2 x unsigned 16-digit BCD values to vector 2 x doubleword binary values.
Definition: vec_bcd_ppc.h:1622
vec_bcdutruncqi
static vBCD_t vec_bcdutruncqi(vBCD_t vra, const unsigned short _N)
Decimal Unsigned Truncate Quadword Immediate. Truncate a vector unsigned BCD value vra to N-digits,...
Definition: vec_bcd_ppc.h:3825
vec_cbcdsubcsq
static vBCD_t vec_cbcdsubcsq(vBCD_t *cout, vBCD_t a, vBCD_t b)
Combined Decimal Subtract & Write Carry Signed Quadword.
Definition: vec_bcd_ppc.h:4135
vec_bcdcmpgt
static int vec_bcdcmpgt(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for greater than.
Definition: vec_bcd_ppc.h:2447
CONST_VINT128_B
#define CONST_VINT128_B(_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15)
Arrange byte elements of an unsigned int initializer in high->low order. May require an explicit cast.
Definition: vec_common_ppc.h:316
vec_mrgald
static vui64_t vec_mrgald(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic Low Doublewords.
Definition: vec_int64_ppc.h:2736
vec_quantize0_Decimal128
static _Decimal128 vec_quantize0_Decimal128(_Decimal128 val)
Quantize (truncate) a _Decimal128 value before convert to BCD.
Definition: vec_bcd_ppc.h:4235
vec_bcdctsq
static vi128_t vec_bcdctsq(vBCD_t vra)
Vector Decimal Convert to Signed Quadword.
Definition: vec_bcd_ppc.h:2597
vec_mrgahd
static vui64_t vec_mrgahd(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic High Doublewords.
Definition: vec_int64_ppc.h:2710
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206
CONST_VINT128_DW128
#define CONST_VINT128_DW128(__dw0, __dw1)
A vector unsigned __int128 initializer, as two unsigned long long elements in high->low order.
Definition: vec_common_ppc.h:301
vec_mulhud
static vui64_t vec_mulhud(vui64_t vra, vui64_t vrb)
Vector Multiply High Unsigned Doubleword.
Definition: vec_int128_ppc.h:5277
vec_bcddive
static vBCD_t vec_bcddive(vBCD_t a, vBCD_t b)
Decimal Divide Extended.
Definition: vec_bcd_ppc.h:2900
vec_bcdctuw
static vui32_t vec_bcdctuw(vBCD_t vra)
Vector Decimal Convert groups of 8 BCD digits to binary unsigned words.
Definition: vec_bcd_ppc.h:2684
_BCD_CONST_SIGN_MASK
#define _BCD_CONST_SIGN_MASK
vector BCD sign mask in bits 124:127.
Definition: vec_bcd_ppc.h:1578
VEC_HW_L_DWH
#define VEC_HW_L_DWH
Element index for lowest order hword of the high dword.
Definition: vec_common_ppc.h:340
vec_adduqm
static vui128_t vec_adduqm(vui128_t a, vui128_t b)
Vector Add Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:2739
vec_rdxcfzt100b
static vui8_t vec_rdxcfzt100b(vui8_t zone00, vui8_t zone16)
Vector Decimal Convert Zoned Decimal digit pairs to radix 100 binary integer bytes....
Definition: vec_bcd_ppc.h:4559
vec_addudm
static vui64_t vec_addudm(vui64_t a, vui64_t b)
Vector Add Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:1261
vf64_t
__vector double vf64_t
vector of 64-bit double elements.
Definition: vec_common_ppc.h:221
vec_srq
static vui128_t vec_srq(vui128_t vra, vui128_t vrb)
Vector Shift Right Quadword.
Definition: vec_int128_ppc.h:7114
vec_mulhuq
static vui128_t vec_mulhuq(vui128_t a, vui128_t b)
Vector Multiply High Unsigned Quadword.
Definition: vec_int128_ppc.h:5387
vec_bcdsrqi
static vBCD_t vec_bcdsrqi(vBCD_t vra, const unsigned int _N)
Vector BCD Shift Right Signed Quadword Immediate.
Definition: vec_bcd_ppc.h:3335
__VEC_U_128::vx4
vui32_t vx4
128 bit Vector of 4 unsigned int elements.
Definition: vec_common_ppc.h:273
vec_setbool_bcdinv
static vb128_t vec_setbool_bcdinv(vBCD_t vra)
Vector Set Bool from Signed BCD Quadword if invalid.
Definition: vec_bcd_ppc.h:4857
__VEC_U_128::vf2
vf64_t vf2
128 bit Vector of 2 double float elements.
Definition: vec_common_ppc.h:279
vBCD_t
#define vBCD_t
vector signed BCD integer of up to 31 decimal digits.
Definition: vec_bcd_ppc.h:1565
vec_rdxcf100mw
static vui16_t vec_rdxcf100mw(vui32_t vra)
Vector Decimal Convert radix 10**8 Binary words to pairs of radix 10,000 binary halfwords.
Definition: vec_bcd_ppc.h:4372
vec_rdxct100mw
static vui32_t vec_rdxct100mw(vui16_t vra)
Vector Decimal Convert radix 10,000 digit halfword pairs to radix 100,000,000 binary integer words.
Definition: vec_bcd_ppc.h:4720
vec_bcdcmp_lesq
static vbBCD_t vec_bcdcmp_lesq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for less than or equal.
Definition: vec_bcd_ppc.h:2287
vec_bcdutrunc
static vBCD_t vec_bcdutrunc(vBCD_t vra, vui16_t vrb)
Decimal Unsigned Truncate. Truncate a vector unsigned BCD value vra to N-digits, where N is the unsig...
Definition: vec_bcd_ppc.h:3781
vec_bcdsetsgn
static vBCD_t vec_bcdsetsgn(vBCD_t vrb)
Vector Set preferred BCD Sign.
Definition: vec_bcd_ppc.h:3158
vec_slqi
static vui128_t vec_slqi(vui128_t vra, const unsigned int shb)
Vector Shift Left Quadword Immediate.
Definition: vec_int128_ppc.h:6748
vec_rdxct10kh
static vui16_t vec_rdxct10kh(vui8_t vra)
Vector Decimal Convert radix 100 digit pairs to radix 10,000 binary integer halfwords.
Definition: vec_bcd_ppc.h:4672
vec_cbcdmul
static vBCD_t vec_cbcdmul(vBCD_t *p_high, vBCD_t a, vBCD_t b)
Combined Vector Signed BCD Multiply High/Low.
Definition: vec_bcd_ppc.h:4045
vec_slq
static vui128_t vec_slq(vui128_t vra, vui128_t vrb)
Vector Shift Left Quadword.
Definition: vec_int128_ppc.h:6707
vec_bcdsluqi
static vBCD_t vec_bcdsluqi(vBCD_t vra, const unsigned int _N)
Vector BCD Shift Left Unsigned Quadword Immediate.
Definition: vec_bcd_ppc.h:3238
VEC_W_L
#define VEC_W_L
Element index for lowest order word.
Definition: vec_common_ppc.h:328
vec_pack_Decimal128
static vf64_t vec_pack_Decimal128(_Decimal128 lval)
Pack a FPR pair (_Decimal128) to a doubleword vector (vector double).
Definition: vec_bcd_ppc.h:4199
vec_mul10euq
static vui128_t vec_mul10euq(vui128_t a, vui128_t cin)
Vector Multiply by 10 Extended Unsigned Quadword.
Definition: vec_int128_ppc.h:4903
vec_bcdcmp_ltsq
static vbBCD_t vec_bcdcmp_ltsq(vBCD_t vra, vBCD_t vrb)
Vector Compare Signed BCD Quadword for less than.
Definition: vec_bcd_ppc.h:2321
vec_char_ppc.h
Header package containing a collection of 128-bit SIMD operations over 8-bit integer (char) elements.
VEC_BYTE_L_DWH
#define VEC_BYTE_L_DWH
Element index for lowest order byte of the high dword.
Definition: vec_common_ppc.h:346
VEC_HW_L
#define VEC_HW_L
Element index for lowest order hword.
Definition: vec_common_ppc.h:342