POWER Vector Library Manual  1.0.4
vec_f128_ppc.h
1 /*
2  Copyright (c) [2017-2018] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_f128_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Apr 11, 2016
21  */
22 
3638 #ifndef VEC_F128_PPC_H_
3639 #define VEC_F128_PPC_H_
3640 
3641 #include <pveclib/vec_common_ppc.h>
3642 #include <pveclib/vec_int128_ppc.h>
3643 #include <pveclib/vec_f64_ppc.h>
3644 
3645 
3646 /* __float128 was added in GCC 6.0. But only with -mfloat128.
3647  Later compilers typedef __float128 to __ieee128 and
3648  long double to __ibm128. The intent was to allow the switch of
3649  long double from __ibm128 to __ieee128 (someday).
3650 
3651  Clang does not define __FLOAT128__ or __float128 without both
3652  -mcpu=power9 and -mfloat128.
3653  So far clang does not support/define the __ibm128 type. */
3654 #ifdef __FLOAT128__
3655 typedef __float128 __Float128;
3656 #ifndef __clang__
3657 typedef __float128 __binary128;
3658 typedef __float128 __ieee128;
3659 typedef __ibm128 __IBM128;
3660 #else
3661 /* Clang started defining __FLOAT128__ and does not allow redefining
3662  __float128 or __ieee128. Worse, it will give errors if you try to
3663  use either type. So define __binary128 as if __FLOAT128__ is not
3664  defined. */
3665 typedef vui128_t __binary128;
3666 /* Clang does not define __ibm128 over IBM long double.
3667  So define it here. */
3668 typedef long double __IBM128;
3669 #endif
3670 #else
3671 /* Before GCC 6.0 (or without -mfloat128) we need to fake it. */
3681 #ifndef __clang__
3682 // Clang will not allow redefining __float128 even if it is not enabled
3686 #endif
3687 
3689 typedef long double __IBM128;
3690 #endif
3691 
3694 typedef union
3695  {
3696  vui8_t vx16;
3697  vui16_t vx8;
3698  vui32_t vx4;
3699  vui64_t vx2;
3700  vui128_t vx1;
3701  __binary128 vf1;
3711  unsigned __int128 ix1;
3712  } __VF_128;
3713 
3715 static inline __binary128 vec_xfer_vui32t_2_bin128 (vui32_t f128);
3716 static inline int vec_all_isnanf128 (__binary128 f128);
3717 static inline vb128_t vec_isnanf128 (__binary128 f128);
3718 static inline vb128_t vec_isunorderedf128 (__binary128 vfa, __binary128 vfb);
3719 static inline vb128_t vec_setb_qp (__binary128 f128);
3720 static inline __binary128 vec_xsiexpqp (vui128_t sig, vui64_t exp);
3721 static inline vui64_t vec_xsxexpqp (__binary128 f128);
3722 static inline vui128_t vec_xsxsigqp (__binary128 f128);
3723 static inline vui64_t vec_xxxexpqpp (__binary128 vfa, __binary128 vfb);
3725 
3738 static inline vui64_t
3739 vec_const64_f128_128 (void)
3740 {
3741  // const vui32_t dw_128 = CONST_VINT128_W(0, 0, 0, 128);
3742  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3743 #if defined (_ARCH_PWR8)
3744  // Generate {64, 64} from count leading zeros of {0, 0}
3745  vui64_t dw64 = vec_clzd((vui64_t) q_zero);
3746  // Generate {128, 128}
3747  return vec_addudm (dw64, dw64);
3748 #else
3749  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3750  vui32_t signmask;
3751  signmask = vec_sl (q_ones, q_ones);
3752  signmask = vec_sld (q_zero, signmask, 1);
3753  return vec_mrgald ((vui128_t) signmask, (vui128_t) signmask);
3754 #endif
3755 }
3756 
3769 static inline vui32_t
3770 vec_const128_f128_128 (void)
3771 {
3772  // const vui32_t signmask = CONST_VINT128_W(0, 0, 0, 128);
3773  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3774  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3775  vui32_t signmask;
3776  signmask = vec_sl (q_ones, q_ones);
3777  return vec_sld (q_zero, signmask, 1);
3778 }
3779 
3805 static inline vui64_t
3806 vec_mask64_f128exp (void)
3807 {
3808  //const vui32_t expmask = CONST_VINT128_W (0, 0x7fff, 0, 0x7fff);
3809  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3810  vui32_t expmask;
3811  expmask = (vui32_t) vec_splat_u8 (-8);
3812  expmask = vec_sld (q_zero, expmask, 4);
3813  return (vui64_t) vec_packpx (expmask, expmask);
3814 }
3815 
3828 static inline vui32_t
3829 vec_mask128_f128exp (void)
3830 {
3831  // const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
3832  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3833  vui32_t expmask;
3834 
3835  expmask = (vui32_t) vec_splat_u8 (-8);
3836  expmask = vec_sld (expmask, q_zero, 12);
3837 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
3838  return (vui32_t) vec_packpx (q_zero, expmask);
3839 #else
3840  return (vui32_t) vec_packpx (expmask, q_zero);
3841 #endif
3842 }
3843 
3844 
3857 static inline vui32_t
3858 vec_mask128_f128mag (void)
3859 {
3860  // const vui32_t magmask = CONST_VINT128_W (0x7fffffff, -1, -1, -1);
3861  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3862  return (vui32_t) vec_srqi ((vui128_t) q_ones, 1);
3863 }
3864 
3877 static inline vui32_t
3878 vec_mask128_f128sig (void)
3879 {
3880  // const vui32_t sigmask = CONST_VINT128_W (0x0000ffff, -1, -1, -1);
3881  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3882  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3883  return vec_sld (q_zero, q_ones, 14);
3884 }
3885 
3898 static inline vui32_t
3899 vec_mask128_f128sign (void)
3900 {
3901  // const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0, 0);
3902  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3903  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3904  vui32_t signmask;
3905  signmask = vec_sl (q_ones, q_ones);
3906  return vec_sld (signmask, q_zero, 12);
3907 }
3908 
3921 static inline vui32_t
3922 vec_mask128_f128Cbit (void)
3923 {
3924  // const vui32_t carry = CONST_VINT128_W (0x00020000, 0, 0, 0);
3925  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3926  vui32_t carry = vec_splat_u32 (2);
3927  return vec_sld (carry, q_zero, 14);
3928 }
3929 
3942 static inline vui32_t
3943 vec_mask128_f128Lbit (void)
3944 {
3945  // const vui32_t hidden = CONST_VINT128_W (0x00010000, 0, 0, 0);
3946  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3947  vui32_t hidden = vec_splat_u32 (1);
3948  return vec_sld (hidden, q_zero, 14);
3949 }
3950 
3963 static inline vui32_t
3964 vec_mask128_f128Qbit (void)
3965 {
3966  // const vui32_t QNaNbit = CONST_VINT128_W (0x00008000, 0, 0, 0);
3967  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
3968  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
3969  vui32_t QNaNbit;
3970  QNaNbit = vec_sl (q_ones, q_ones);
3971  return vec_sld (QNaNbit, q_zero, 10);
3972 }
3973 
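/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the generator names restored above (vec_mask128_f128sign,
   vec_mask128_f128exp, vec_mask128_f128sig); the helper name is
   hypothetical. The three masks are disjoint and together cover all
   128 bits of the IEEE binary128 layout. */
static inline int
example_f128_masks_cover_format (void)
{
  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
  vui32_t signmask = vec_mask128_f128sign ();
  vui32_t expmask = vec_mask128_f128exp ();
  vui32_t sigmask = vec_mask128_f128sig ();
  // sign-bit | exponent | significand == all ones
  vui32_t all = vec_or (vec_or (signmask, expmask), sigmask);
  return vec_all_eq (all, q_ones);
}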
 3991  static inline __binary128
 3992  vec_sel_bin128_2_bin128 (__binary128 vfa, __binary128 vfb, vb128_t mask)
 3993  {
3994  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
3995  && !defined (_ARCH_PWR9)
3996  // Work around for GCC PR 100085
3997  __binary128 result;
3998  #ifdef __VSX__
3999  __asm__(
4000  "xxsel %x0,%x1,%x2,%x3"
4001  : "=wa" (result)
4002  : "wa" (vfa), "wa" (vfb), "wa" (mask)
4003  : );
4004  #else
4005  __asm__(
4006  "vsel %0,%1,%2,%3"
4007  : "=v" (result)
4008  : "v" (vfa), "v" (vfb), "v" (mask)
4009  : );
4010  #endif
4011  return result;
4012  #else
4013  __VF_128 ua, ub;
4014  vui32_t result;
4015 
4016  ua.vf1 = vfa;
4017  ub.vf1 = vfb;
4018 
4019  result = vec_sel (ua.vx4, ub.vx4, (vb32_t) mask);
4020  return vec_xfer_vui32t_2_bin128 (result);
4021  #endif
4022  }
4023 
 4039  static inline vui32_t
 4040  vec_and_bin128_2_vui32t (__binary128 f128, vui32_t mask)
 4041  {
4042  vui32_t result;
4043  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4044  && !defined (_ARCH_PWR9)
4045  // Work around for GCC PR 100085
4046  #ifdef __VSX__
4047  __asm__(
4048  "xxland %x0,%x1,%x2"
4049  : "=wa" (result)
4050  : "wa" (f128), "wa" (mask)
4051  : );
4052  #else
4053  __asm__(
4054  "vand %0,%1,%2"
4055  : "=v" (result)
4056  : "v" (f128), "v" (mask)
4057  : );
4058  #endif
4059  #else
4060  __VF_128 vunion;
4061 
4062  vunion.vf1 = f128;
4063 
4064  result = (vec_and (vunion.vx4, mask));
4065  #endif
4066  return result;
4067  }
4068 
 4084  static inline vui32_t
 4085  vec_andc_bin128_2_vui32t (__binary128 f128, vui32_t mask)
 4086  {
4087  vui32_t result;
4088  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4089  && !defined (_ARCH_PWR9)
4090  // Work around for GCC PR 100085
4091  #ifdef __VSX__
4092  __asm__(
4093  "xxlandc %x0,%x1,%x2"
4094  : "=wa" (result)
4095  : "wa" (f128), "wa" (mask)
4096  : );
4097  #else
4098  __asm__(
4099  "vandc %0,%1,%2"
4100  : "=v" (result)
4101  : "v" (f128), "v" (mask)
4102  : );
4103  #endif
4104  #else
4105  __VF_128 vunion;
4106 
4107  vunion.vf1 = f128;
4108 
4109  result = (vec_andc (vunion.vx4, mask));
4110  #endif
4111  return result;
4112  }
4113 
 4129  static inline vui32_t
 4130  vec_or_bin128_2_vui32t (__binary128 f128, vui32_t mask)
 4131  {
4132  vui32_t result;
4133  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4134  && !defined (_ARCH_PWR9)
4135  // Work around for GCC PR 100085
4136  #ifdef __VSX__
4137  __asm__(
4138  "xxlor %x0,%x1,%x2"
4139  : "=wa" (result)
4140  : "wa" (f128), "wa" (mask)
4141  : );
4142  #else
4143  __asm__(
4144  "vor %0,%1,%2"
4145  : "=v" (result)
4146  : "v" (f128), "v" (mask)
4147  : );
4148  #endif
4149  #else
4150  __VF_128 vunion;
4151 
4152  vunion.vf1 = f128;
4153 
4154  result = (vec_or (vunion.vx4, mask));
4155  #endif
4156  return result;
4157  }
4158 
 4174  static inline vui32_t
 4175  vec_xor_bin128_2_vui32t (__binary128 f128, vui32_t mask)
 4176  {
4177  vui32_t result;
4178  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4179  && !defined (_ARCH_PWR9)
4180  // Work around for GCC PR 100085
4181  #ifdef __VSX__
4182  __asm__(
4183  "xxlxor %x0,%x1,%x2"
4184  : "=wa" (result)
4185  : "wa" (f128), "wa" (mask)
4186  : );
4187  #else
4188  __asm__(
4189  "vxor %0,%1,%2"
4190  : "=v" (result)
4191  : "v" (f128), "v" (mask)
4192  : );
4193  #endif
4194  #else
4195  __VF_128 vunion;
4196 
4197  vunion.vf1 = f128;
4198 
4199  result = (vec_xor (vunion.vx4, mask));
4200  #endif
4201  return result;
4202  }
4203 
 4219  static inline vui128_t
 4220  vec_andc_bin128_2_vui128t (__binary128 f128, vui128_t mask)
 4221  {
4222  vui128_t result;
4223  #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4224  && !defined (_ARCH_PWR9)
4225  // Work around for GCC PR 100085
4226  #ifdef __VSX__
4227  __asm__(
4228  "xxlandc %x0,%x1,%x2"
4229  : "=wa" (result)
4230  : "wa" (f128), "wa" (mask)
4231  : );
4232  #else
4233  __asm__(
4234  "vandc %0,%1,%2"
4235  : "=v" (result)
4236  : "v" (f128), "v" (mask)
4237  : );
4238  #endif
4239  #else
4240  __VF_128 vunion;
4241 
4242  vunion.vf1 = f128;
4243  // vec_andc does not accept vector __int128 type
4244  result = (vui128_t) vec_andc (vunion.vx4, (vui32_t) mask);
4245  #endif
4246  return result;
4247  }
4248 
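/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the helper names restored above; example_f128_is_neg is
   hypothetical. The logical helpers operate on the raw bits of a
   __binary128 while it stays in a vector register, so a sign-bit test
   needs no __float128 arithmetic support. Note this reports true for
   -0.0 and negative NaNs as well. */
static inline int
example_f128_is_neg (__binary128 f128)
{
  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
  vui32_t tmp = vec_and_bin128_2_vui32t (f128, signmask);
  return !vec_all_eq (tmp, q_zero);
}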
4262 static inline vui8_t
4263 vec_xfer_bin128_2_vui8t (__binary128 f128)
4264 {
4265  vui8_t result;
4266 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4267  && !defined (_ARCH_PWR9)
4268  // Work around for GCC PR 100085
4269 #ifdef __VSX__
4270  __asm__(
4271  "xxlor %x0,%x1,%x1"
4272  : "=wa" (result)
4273  : "wa" (f128)
4274  : );
4275 #else
4276  __asm__(
4277  "vor %0,%1,%1"
4278  : "=v" (result)
4279  : "v" (f128)
4280  : );
4281 #endif
4282 #else
4283  __VF_128 vunion;
4284 
4285  vunion.vf1 = f128;
4286 
4287  result = (vunion.vx16);
4288 #endif
4289  return result;
4290 }
4291 
4305 static inline vui16_t
4306 vec_xfer_bin128_2_vui16t (__binary128 f128)
4307 {
4308  __VF_128 vunion;
4309 
4310  vunion.vf1 = f128;
4311 
4312  return (vunion.vx8);
4313 }
4314 
4328 static inline vui32_t
4329 vec_xfer_bin128_2_vui32t (__binary128 f128)
4330 {
4331  vui32_t result;
4332 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4333  && !defined (_ARCH_PWR9)
4334  // Work around for GCC PR 100085
4335 #ifdef __VSX__
4336  __asm__(
4337  "xxlor %x0,%x1,%x1"
4338  : "=wa" (result)
4339  : "wa" (f128)
4340  : );
4341 #else
4342  __asm__(
4343  "vor %0,%1,%1"
4344  : "=v" (result)
4345  : "v" (f128)
4346  : );
4347 #endif
4348 #else
4349  __VF_128 vunion;
4350 
4351  vunion.vf1 = f128;
4352 
4353  result = (vunion.vx4);
4354 #endif
4355  return result;
4356 }
4357 
4373 static inline vui64_t
4374 vec_mrgh_bin128_2_vui64t (__binary128 vfa, __binary128 vfb)
4375 {
4376  vui64_t result;
4377 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4378  && !defined (_ARCH_PWR9) && defined (__VSX__)
4379  // Work around for GCC PR 100085
4380  __asm__(
4381  "xxmrghd %x0,%x1,%x2"
4382  : "=wa" (result)
4383  : "wa" (vfa), "wa" (vfb)
4384  : );
4385 #else
4386  __VF_128 vunion_a, vunion_b;
4387 
4388  vunion_a.vf1 = vfa;
4389  vunion_b.vf1 = vfb;
4390 
4391  result = vec_mrgahd (vunion_a.vx1, vunion_b.vx1);
4392 #endif
4393  return result;
4394 }
4395 
4411 static inline vui64_t
4412 vec_mrgl_bin128_2_vui64t (__binary128 vfa, __binary128 vfb)
4413 {
4414  vui64_t result;
4415 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4416  && !defined (_ARCH_PWR9) && defined (__VSX__)
4417  // Work around for GCC PR 100085
4418  __asm__(
4419  "xxmrgld %x0,%x1,%x2"
4420  : "=wa" (result)
4421  : "wa" (vfa), "wa" (vfb)
4422  : );
4423 #else
4424  __VF_128 vunion_a, vunion_b;
4425 
4426  vunion_a.vf1 = vfa;
4427  vunion_b.vf1 = vfb;
4428 
4429  result = vec_mrgald (vunion_a.vx1, vunion_b.vx1);
4430 #endif
4431  return result;
4432 }
4433 
4447 static inline vui64_t
4448 vec_xfer_bin128_2_vui64t (__binary128 f128)
4449 {
4450  vui64_t result;
4451 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4452  && !defined (_ARCH_PWR9)
4453  // Work around for GCC PR 100085
4454 #ifdef __VSX__
4455  __asm__(
4456  "xxlor %x0,%x1,%x1"
4457  : "=wa" (result)
4458  : "wa" (f128)
4459  : );
4460 #else
4461  __asm__(
4462  "vor %0,%1,%1"
4463  : "=v" (result)
4464  : "v" (f128)
4465  : );
4466 #endif
4467 #else
4468  __VF_128 vunion;
4469 
4470  vunion.vf1 = f128;
4471 
4472  result = (vunion.vx2);
4473 #endif
4474  return result;
4475 }
4476 
4490 static inline vui128_t
4491 vec_xfer_bin128_2_vui128t (__binary128 f128)
4492 {
4493  vui128_t result;
4494 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && (__GNUC__ > 7) \
4495  && !defined (_ARCH_PWR9)
4496  // Work around for GCC PR 100085
4497 #ifdef __VSX__
4498  __asm__(
4499  "xxlor %x0,%x1,%x1"
4500  : "=wa" (result)
4501  : "wa" (f128)
4502  : );
4503 #else
4504  __asm__(
4505  "vor %0,%1,%1"
4506  : "=v" (result)
4507  : "v" (f128)
4508  : );
4509 #endif
4510 #else
4511  __VF_128 vunion;
4512 
4513  vunion.vf1 = f128;
4514 
4515  result = (vunion.vx1);
4516 #endif
4517  return result;
4518 }
4519 
4533 static inline __binary128
4534 vec_xfer_vui8t_2_bin128 (vui8_t f128)
4535 {
4536  __VF_128 vunion;
4537 
4538  vunion.vx16 = f128;
4539 
4540  return (vunion.vf1);
4541 }
4542 
4556 static inline __binary128
4557 vec_xfer_vui16t_2_bin128 (vui16_t f128)
4558 {
4559  __VF_128 vunion;
4560 
4561  vunion.vx8 = f128;
4562 
4563  return (vunion.vf1);
4564 }
4565 
4579 static inline __binary128
4580 vec_xfer_vui32t_2_bin128 (vui32_t f128)
4581 {
4582  __VF_128 vunion;
4583 
4584  vunion.vx4 = f128;
4585 
4586  return (vunion.vf1);
4587 }
4588 
4602 static inline __binary128
4603 vec_xfer_vui64t_2_bin128 (vui64_t f128)
4604 {
4605  __VF_128 vunion;
4606 
4607  vunion.vx2 = f128;
4608 
4609  return (vunion.vf1);
4610 }
4611 
4625 static inline __binary128
4626 vec_xfer_vui128t_2_bin128 (vui128_t f128)
4627 {
4628  __VF_128 vunion;
4629 
4630  vunion.vx1 = f128;
4631 
4632  return (vunion.vf1);
4633 }
4634 
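/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the xfer names restored above; example_f128_flip_sign is
   hypothetical. The xfer functions round-trip a __binary128 through the
   vector integer types so quadword integer/logical operations can be
   applied to its bit pattern and the result moved back. */
static inline __binary128
example_f128_flip_sign (__binary128 f128)
{
  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
  vui32_t tmp = vec_xfer_bin128_2_vui32t (f128);
  tmp = vec_xor (tmp, signmask);
  return vec_xfer_vui32t_2_bin128 (tmp);
}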
4649 static inline __binary128
4650 vec_absf128 (__binary128 f128)
4651 {
4652  __binary128 result;
4653 #if _ARCH_PWR9
4654  __asm__(
4655  "xsabsqp %0,%1;\n"
4656  : "=v" (result)
4657  : "v" (f128)
4658  :);
4659 #else
4660  vui32_t tmp;
4661  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4662 
4663  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
4664  result = vec_xfer_vui32t_2_bin128 (tmp);
4665 #endif
4666  return (result);
4667 }
4668 
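/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the names restored above; example_f128_abs_clears_sign is
   hypothetical. Quad-precision absolute value is a pure sign-bit
   operation, so it works even when the compiler provides no __float128
   arithmetic. */
static inline int
example_f128_abs_clears_sign (__binary128 vfa)
{
  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
  vui32_t tmp = vec_and_bin128_2_vui32t (vec_absf128 (vfa), signmask);
  return vec_all_eq (tmp, q_zero);
}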
4690 static inline int
4691 vec_all_isfinitef128 (__binary128 f128)
4692 {
4693 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4694  return !scalar_test_data_class (f128, 0x70);
4695 #else
4696  vui32_t tmp;
4697  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4698 
4699  tmp = vec_and_bin128_2_vui32t (f128, expmask);
4700  return !vec_all_eq(tmp, expmask);
4701 #endif
4702 }
4703 
4722 static inline int
4723 vec_all_isinff128 (__binary128 f128)
4724 {
4725 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4726  return scalar_test_data_class (f128, 0x30);
4727 #else
4728  vui32_t tmp;
4729 #if 0
4730  const vui32_t magmask = CONST_VINT128_W (0x7fffffff, -1, -1, -1);
4731  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4732  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4733 #else
4734  vui32_t magmask = vec_mask128_f128mag ();
4735  vui32_t expmask = vec_mask128_f128exp ();
4736 #endif
4737 
4738  tmp = vec_and_bin128_2_vui32t (f128, magmask);
4739  return vec_all_eq(tmp, expmask);
4740 #endif
4741 }
4742 
4762 static inline int
4763 vec_all_isnanf128 (__binary128 f128)
4764 {
4765 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4766  return scalar_test_data_class (f128, 0x40);
4767 #elif defined (_ARCH_PWR8)
4768  vui32_t tmp;
4769  vui32_t magmask = vec_mask128_f128mag ();
4770  vui32_t expmask = vec_mask128_f128exp ();
4771 
4772  tmp = vec_and_bin128_2_vui32t (f128, magmask);
4773  return vec_cmpuq_all_gt ((vui128_t) tmp, (vui128_t) expmask);
4774 #else
4775  vui32_t tmp, tmp2;
4776  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4777  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4778 
4779  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
4780  tmp2 = vec_and_bin128_2_vui32t (f128, expmask);
4781  return (vec_all_eq (tmp2, expmask) && vec_any_gt(tmp, expmask));
4782 #endif
4783 }
4784 
4806 static inline int
4807 vec_all_isnormalf128 (__binary128 f128)
4808 {
4809 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4810  return !scalar_test_data_class (f128, 0x7f);
4811 #else
4812  vui32_t tmp;
4813  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4814  const vui32_t vec_zero = CONST_VINT128_W (0, 0, 0, 0);
4815 
4816  tmp = vec_and_bin128_2_vui32t (f128, expmask);
4817  return !(vec_all_eq (tmp, expmask) || vec_all_eq(tmp, vec_zero));
4818 #endif
4819 }
4820 
4840 static inline int
4841 vec_all_issubnormalf128 (__binary128 f128)
4842 {
4843 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4844  return scalar_test_data_class (f128, 0x03);
4845 #else
4846  const vui64_t minnorm = CONST_VINT128_DW(0x0001000000000000UL, 0UL);
4847  const vui64_t vec_zero = CONST_VINT128_DW(0, 0);
4848  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4849  vui128_t tmp1;
4850 
4851  // Equivalent to vec_absf128 (f128)
4852  tmp1 = (vui128_t) vec_andc_bin128_2_vui32t (f128, signmask);
4853 
4854  return vec_cmpuq_all_gt ((vui128_t) minnorm, tmp1)
4855  && !vec_cmpuq_all_eq (tmp1, (vui128_t) vec_zero);
4856 #endif
4857 }
4858 
4880 static inline int
4881 vec_all_isunorderedf128 (__binary128 vfa, __binary128 vfb)
4882 {
4883  return (vec_all_isnanf128 (vfa) || vec_all_isnanf128 (vfb));
4884 }
4885 
4905 static inline int
4906 vec_all_iszerof128 (__binary128 f128)
4907 {
4908 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
4909  return scalar_test_data_class (f128, 0x0c);
4910 #else
4911  vui64_t tmp2;
4912  const vui64_t vec_zero = CONST_VINT128_DW(0, 0);
4913  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4914 
4915  // Equivalent to vec_absf128 (f128)
4916  tmp2 = (vui64_t) vec_andc_bin128_2_vui32t (f128, signmask);
4917 #if _ARCH_PWR8
4918  return vec_all_eq(tmp2, vec_zero);
4919 #else
4920  return vec_all_eq((vui32_t)tmp2, (vui32_t)vec_zero);
4921 #endif
4922 #endif
4923 }
4924 
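/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the predicate names restored above; example_f128_classify is
   hypothetical. The vec_all_is*f128 predicates return a scalar int and
   can drive ordinary branches, mirroring the C99 classification macros
   for __binary128. */
static inline const char *
example_f128_classify (__binary128 f128)
{
  if (vec_all_isnanf128 (f128))
    return "nan";
  if (vec_all_isinff128 (f128))
    return "inf";
  if (vec_all_iszerof128 (f128))
    return "zero";
  if (vec_all_issubnormalf128 (f128))
    return "subnormal";
  return "normal";
}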
4949 static inline __binary128
4950 vec_copysignf128 (__binary128 f128x, __binary128 f128y)
4951 {
4952  __binary128 result;
4953 #if _ARCH_PWR9
4954  __asm__(
4955  "xscpsgnqp %0,%1,%2;\n"
4956  : "=v" (result)
4957  : "v" (f128x), "v" (f128y)
4958  :);
4959 #else
4960  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
4961  vui32_t tmpx, tmpy, tmp;
4962  tmpx = vec_xfer_bin128_2_vui32t (f128x);
4963  tmpy = vec_xfer_bin128_2_vui32t (f128y);
4964 
4965  tmp = vec_sel (tmpy, tmpx, signmask);
4966  result = vec_xfer_vui32t_2_bin128 (tmp);
4967 #endif
4968  return (result);
4969 }
4970 
4975 static inline __binary128
4976 vec_const_huge_valf128 (void)
4977 {
4978  const vui32_t posinf = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4979 
4980  return vec_xfer_vui32t_2_bin128 (posinf);
4981 }
4982 
4987 static inline __binary128
4988 vec_const_inff128 (void)
4989 {
4990  const vui32_t posinf = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
4991 
4992  return vec_xfer_vui32t_2_bin128 (posinf);
4993 }
4994 
4999 static inline __binary128
5000 vec_const_nanf128 (void)
5001 {
5002  const vui32_t posnan = CONST_VINT128_W (0x7fff8000, 0, 0, 0);
5003 
5004  return vec_xfer_vui32t_2_bin128 (posnan);
5005 }
5006 
5011 static inline __binary128
5012 vec_const_nansf128 (void)
5013 {
5014  const vui32_t signan = CONST_VINT128_W (0x7fff4000, 0, 0, 0);
5015 
5016  return vec_xfer_vui32t_2_bin128 (signan);
5017 }
5018 
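/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the constant and predicate names restored above;
   example_f128_specials is hypothetical. The constant generators build
   special quad-precision values directly from their bit patterns, so no
   __float128 literal support is required. */
static inline int
example_f128_specials (void)
{
  __binary128 inf = vec_const_inff128 ();
  __binary128 qnan = vec_const_nanf128 ();
  // infinity is not finite; the default NaN reports as NaN
  return vec_all_isinff128 (inf) && !vec_all_isfinitef128 (inf)
      && vec_all_isnanf128 (qnan);
}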
5050 static inline vb128_t
5051 vec_cmpeqtoqp (__binary128 vfa, __binary128 vfb)
5052 {
5053  vb128_t result;
5054 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5055  __asm__(
5056  "xscmpeqqp %0,%1,%2;\n"
5057  : "=v" (result)
5058  : "v" (vfa), "v" (vfb)
5059  : );
5060 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5061  result= (vb128_t) vec_splat_u32 (0);
5062  if (vfa == vfb)
5063  result= (vb128_t) vec_splat_s32 (-1);
5064 #else // defined( _ARCH_PWR8 )
5065  vui128_t vra, vrb;
5066  vra = vec_xfer_bin128_2_vui128t (vfa);
5067  vrb = vec_xfer_bin128_2_vui128t (vfb);
5068  result = vec_cmpequq ( vra, vrb );
5069 #endif
5070  return result;
5071 }
5072 
5105 static inline vb128_t
5106 vec_cmpequzqp (__binary128 vfa, __binary128 vfb)
5107 {
5108  vb128_t result;
5109 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5110  __asm__(
5111  "xscmpeqqp %0,%1,%2;\n"
5112  : "=v" (result)
5113  : "v" (vfa), "v" (vfb)
5114  : );
5115 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5116  result = (vb128_t) vec_splat_u32 (0);
5117  if (vfa == vfb)
5118  result = (vb128_t) vec_splat_s32 (-1);
5119 #else // defined( _ARCH_PWR8 )
5120  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5121  vb128_t cmps, or_ab, eq_s;
5122  vui64_t vra, vrb;
5123 
5124  vra = vec_xfer_bin128_2_vui64t (vfa);
5125  vrb = vec_xfer_bin128_2_vui64t (vfb);
5126 
5127  or_ab = (vb128_t) vec_or ( vra, vrb );
5128  eq_s = vec_cmpequq ((vui128_t) or_ab, (vui128_t) signmask);
5129  cmps = vec_cmpequq ((vui128_t) vra, (vui128_t)vrb);
5130  result = (vb128_t) vec_or ((vui32_t) cmps, (vui32_t) eq_s);
5131 #endif
5132  return result;
5133 }
5134 
5168 static inline vb128_t
5169 vec_cmpequqp (__binary128 vfa, __binary128 vfb)
5170 {
5171  vb128_t result;
5172 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5173  __asm__(
5174  "xscmpeqqp %0,%1,%2;\n"
5175  : "=v" (result)
5176  : "v" (vfa), "v" (vfb)
5177  : );
5178 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5179  result = (vb128_t) vec_splat_u32 (0);
5180  if (vfa == vfb)
5181  result = (vb128_t) vec_splat_s32 (-1);
5182 #else // defined( _ARCH_PWR8 )
5183  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5184  vb128_t cmps, or_ab, eq_s;
5185  vui64_t vra, vrb;
5186  vb128_t unordered;
5187 
5188  unordered = vec_isunorderedf128 (vfa, vfb);
5189  vra = vec_xfer_bin128_2_vui64t (vfa);
5190  vrb = vec_xfer_bin128_2_vui64t (vfb);
5191 
5192  or_ab = (vb128_t) vec_or ( vra, vrb );
5193  eq_s = vec_cmpequq ((vui128_t) or_ab, (vui128_t) signmask);
5194  cmps = vec_cmpequq ((vui128_t) vra, (vui128_t) vrb);
5195  result = (vb128_t) vec_or ((vui32_t) cmps, (vui32_t) eq_s);
5196  result = (vb128_t) vec_andc ((vui32_t) result, (vui32_t) unordered);
5197 #endif
5198  return result;
5199 }
5200 
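/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the names and the vb128_t mask parameter restored above;
   example_f128_select_if_eq is hypothetical. The vec_cmp*qp compares
   return a 128-bit all-ones/all-zeros mask, which combines with
   vec_sel_bin128_2_bin128 for branch-free selection. */
static inline __binary128
example_f128_select_if_eq (__binary128 vfa, __binary128 vfb, __binary128 vfc)
{
  // if (vfa == vfb) return vfc; else return vfa; without a branch
  vb128_t eqmask = vec_cmpequqp (vfa, vfb);
  return vec_sel_bin128_2_bin128 (vfa, vfc, eqmask);
}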
5234 static inline vb128_t
5235 vec_cmpgetoqp (__binary128 vfa, __binary128 vfb)
5236 {
5237  vb128_t result;
5238 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5239  __asm__(
5240  "xscmpgeqp %0,%1,%2;\n"
5241  : "=v" (result)
5242  : "v" (vfa), "v" (vfb)
5243  : );
5244 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5245  result= (vb128_t) vec_splat_u32 (0);
5246  if (vfa >= vfb)
5247  result= (vb128_t) vec_splat_s32 (-1);
5248 #else // defined( _ARCH_PWR8 )
5249  vui128_t vfa128, vfb128;
5250  vb128_t altb, agtb;
5251  vb128_t signbool;
5252  const vui8_t shift = vec_splat_u8 (7);
5253  vui8_t splatvfa;
5254 
5255  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
5256  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
5257 
5258  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
5259  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
5260  signbool = (vb128_t) vec_sra (splatvfa, shift);
5261 
5262  agtb = vec_cmpgesq ((vi128_t) vfa128, (vi128_t) vfb128);
5263  altb = vec_cmpleuq ((vui128_t) vfa128, (vui128_t) vfb128);
5264  result = (vb128_t) vec_sel ((vui32_t)agtb, (vui32_t)altb, (vui32_t)signbool);
5265 #endif
5266  return result;
5267 }
5268 
5303 static inline vb128_t
5304 vec_cmpgeuzqp (__binary128 vfa, __binary128 vfb)
5305 {
5306  vb128_t result;
5307 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5308  __asm__(
5309  "xscmpgeqp %0,%1,%2;\n"
5310  : "=v" (result)
5311  : "v" (vfa), "v" (vfb)
5312  : );
5313 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5314  result = (vb128_t) vec_splat_u32 (0);
5315  if (vfa >= vfb)
5316  result = (vb128_t) vec_splat_s32 (-1);
5317 #else // defined( _ARCH_PWR8 )
5318  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5319  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5320  vui128_t vra, vrb;
5321  vb128_t age0, bge0;
5322  vui128_t vrap, vran;
5323  vui128_t vrbp, vrbn;
5324 
5325  vra = vec_xfer_bin128_2_vui128t (vfa);
5326  vrb = vec_xfer_bin128_2_vui128t (vfb);
5327 
5328  age0 = vec_setb_qp (vfa);
5329  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5330  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5331  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5332 
5333  bge0 = vec_setb_qp (vfb);
5334  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5335  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5336  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5337 
5338  result = vec_cmpgeuq ((vui128_t) vra, (vui128_t) vrb);
5339 #endif
5340  return result;
5341 }
5342 
5377 static inline vb128_t
5378 vec_cmpgeuqp (__binary128 vfa, __binary128 vfb)
5379 {
5380  vb128_t result;
5381 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5382  __asm__(
5383  "xscmpgeqp %0,%1,%2;\n"
5384  : "=v" (result)
5385  : "v" (vfa), "v" (vfb)
5386  : );
5387 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5388  result= (vb128_t) vec_splat_u32 (0);
5389  if (vfa >= vfb)
5390  result= (vb128_t) vec_splat_s32 (-1);
5391 #else // defined( _ARCH_PWR8 )
5392  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5393  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5394  vui128_t vra, vrb;
5395  vb128_t age0, bge0;
5396  vui128_t vrap, vran;
5397  vui128_t vrbp, vrbn;
5398  vb128_t unordered;
5399 
5400  unordered = vec_isunorderedf128 (vfa, vfb);
5401 
5402  vra = vec_xfer_bin128_2_vui128t (vfa);
5403  vrb = vec_xfer_bin128_2_vui128t (vfb);
5404 
5405  age0 = vec_setb_qp (vfa);
5406  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5407  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5408  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5409 
5410  bge0 = vec_setb_qp (vfb);
5411  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5412  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5413  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5414 
5415  result = vec_cmpgeuq ((vui128_t) vra, (vui128_t) vrb);
5416  result = (vb128_t) vec_andc ((vui32_t) result, (vui32_t) unordered);
5417 #endif
5418  return result;
5419 }
5420 
5454 static inline vb128_t
5455 vec_cmpgttoqp (__binary128 vfa, __binary128 vfb)
5456 {
5457  vb128_t result;
5458 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5459  __asm__(
5460  "xscmpgtqp %0,%1,%2;\n"
5461  : "=v" (result)
5462  : "v" (vfa), "v" (vfb)
5463  : );
5464 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5465  result= (vb128_t) vec_splat_u32 (0);
5466  if (vfa > vfb)
5467  result= (vb128_t) vec_splat_s32 (-1);
5468 #else // defined( _ARCH_PWR8 )
5469  vui128_t vfa128, vfb128;
5470  vb128_t altb, agtb;
5471  vb128_t signbool;
5472  const vui8_t shift = vec_splat_u8 (7);
5473  vui8_t splatvfa;
5474 
5475  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
5476  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
5477 
5478  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
5479  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
5480  signbool = (vb128_t) vec_sra (splatvfa, shift);
5481 
5482  agtb = vec_cmpgtsq ((vi128_t) vfa128, (vi128_t) vfb128);
5483  altb = vec_cmpltuq ((vui128_t) vfa128, (vui128_t) vfb128);
5484  result = (vb128_t) vec_sel ((vui32_t)agtb, (vui32_t)altb, (vui32_t)signbool);
5485 #endif
5486  return result;
5487 }
5488 
5523 static inline vb128_t
5524 vec_cmpgtuzqp (__binary128 vfa, __binary128 vfb)
5525 {
5526  vb128_t result;
5527 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5528  __asm__(
5529  "xscmpgtqp %0,%1,%2;\n"
5530  : "=v" (result)
5531  : "v" (vfa), "v" (vfb)
5532  : );
5533 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5534  result = (vb128_t) vec_splat_u32 (0);
5535  if (vfa > vfb)
5536  result = (vb128_t) vec_splat_s32 (-1);
5537 #else // defined( _ARCH_PWR8 )
5538  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5539  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5540  vui128_t vra, vrb;
5541  vb128_t age0, bge0;
5542  vui128_t vrap, vran;
5543  vui128_t vrbp, vrbn;
5544 
5545  vra = vec_xfer_bin128_2_vui128t (vfa);
5546  vrb = vec_xfer_bin128_2_vui128t (vfb);
5547 
5548  age0 = vec_setb_qp (vfa);
5549  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5550  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5551  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5552 
5553  bge0 = vec_setb_qp (vfb);
5554  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5555  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5556  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5557 
5558  result = vec_cmpgtuq ((vui128_t) vra, (vui128_t) vrb);
5559 #endif
5560  return result;
5561 }
5562 
5597 static inline vb128_t
5598 vec_cmpgtuqp (__binary128 vfa, __binary128 vfb)
5599 {
5600  vb128_t result;
5601 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5602  __asm__(
5603  "xscmpgtqp %0,%1,%2;\n"
5604  : "=v" (result)
5605  : "v" (vfa), "v" (vfb)
5606  : );
5607 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5608  result= (vb128_t) vec_splat_u32 (0);
5609  if (vfa > vfb)
5610  result= (vb128_t) vec_splat_s32 (-1);
5611 #else // defined( _ARCH_PWR8 )
5612  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5613  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5614  vui128_t vra, vrb;
5615  vb128_t age0, bge0;
5616  vui128_t vrap, vran;
5617  vui128_t vrbp, vrbn;
5618  vb128_t unordered;
5619 
5620  unordered = vec_isunorderedf128 (vfa, vfb);
5621 
5622  vra = vec_xfer_bin128_2_vui128t (vfa);
5623  vrb = vec_xfer_bin128_2_vui128t (vfb);
5624 
5625  age0 = vec_setb_qp (vfa);
5626  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5627  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5628  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5629 
5630  bge0 = vec_setb_qp (vfb);
5631  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5632  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5633  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5634 
5635  result = vec_cmpgtuq ((vui128_t) vra, (vui128_t) vrb);
5636  result = (vb128_t) vec_andc ((vui32_t) result, (vui32_t) unordered);
5637 #endif
5638  return result;
5639 }
5640 
5674 static inline vb128_t
5675 vec_cmpletoqp (__binary128 vfa, __binary128 vfb)
5676 {
5677  vb128_t result;
5678 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5679  __asm__(
5680  "xscmpgeqp %0,%2,%1;\n"
5681  : "=v" (result)
5682  : "v" (vfa), "v" (vfb)
5683  : );
5684 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5685  result= (vb128_t) vec_splat_u32 (0);
5686  if (vfa <= vfb)
5687  result= (vb128_t) vec_splat_s32 (-1);
5688 #else // defined( _ARCH_PWR8 )
5689  vui128_t vfa128, vfb128;
5690  vb128_t altb, agtb;
5691  vb128_t signbool;
5692  const vui8_t shift = vec_splat_u8 (7);
5693  vui8_t splatvfa;
5694 
5695  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
5696  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
5697 
5698  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
5699  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
5700  signbool = (vb128_t) vec_sra (splatvfa, shift);
5701 
5702  altb = vec_cmplesq ((vi128_t) vfa128, (vi128_t) vfb128);
5703  agtb = vec_cmpgeuq ((vui128_t) vfa128, (vui128_t) vfb128);
5704  result = (vb128_t) vec_sel ((vui32_t)altb, (vui32_t)agtb, (vui32_t)signbool);
5705 #endif
5706  return result;
5707 }
5708 
5743 static inline vb128_t
5744 vec_cmpleuzqp (__binary128 vfa, __binary128 vfb)
5745 {
5746  vb128_t result;
5747 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5748  __asm__(
5749  "xscmpgeqp %0,%2,%1;\n"
5750  : "=v" (result)
5751  : "v" (vfa), "v" (vfb)
5752  : );
5753 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5754  result = (vb128_t) vec_splat_u32 (0);
5755  if (vfa <= vfb)
5756  result = (vb128_t) vec_splat_s32 (-1);
5757 #else // defined( _ARCH_PWR8 )
5758  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5759  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5760  vui128_t vra, vrb;
5761  vb128_t age0, bge0;
5762  vui128_t vrap, vran;
5763  vui128_t vrbp, vrbn;
5764 
5765  vra = vec_xfer_bin128_2_vui128t (vfa);
5766  vrb = vec_xfer_bin128_2_vui128t (vfb);
5767 
5768  age0 = vec_setb_qp (vfa);
5769  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5770  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5771  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5772 
5773  bge0 = vec_setb_qp (vfb);
5774  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5775  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5776  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5777 
5778  result = vec_cmpleuq ((vui128_t) vra, (vui128_t) vrb);
5779 #endif
5780  return result;
5781 }
5782 
5817 static inline vb128_t
5818 vec_cmpleuqp (__binary128 vfa, __binary128 vfb)
5819 {
5820  vb128_t result;
5821 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5822  __asm__(
5823  "xscmpgeqp %0,%2,%1;\n"
5824  : "=v" (result)
5825  : "v" (vfa), "v" (vfb)
5826  : );
5827 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5828  result= (vb128_t) vec_splat_u32 (0);
5829  if (vfa <= vfb)
5830  result= (vb128_t) vec_splat_s32 (-1);
5831 #else // defined( _ARCH_PWR8 )
5832  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5833  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5834  vui128_t vra, vrb;
5835  vb128_t age0, bge0;
5836  vui128_t vrap, vran;
5837  vui128_t vrbp, vrbn;
5838  vb128_t unordered;
5839 
5840  unordered = vec_isunorderedf128 (vfa, vfb);
5841 
5842  vra = vec_xfer_bin128_2_vui128t (vfa);
5843  vrb = vec_xfer_bin128_2_vui128t (vfb);
5844 
5845  age0 = vec_setb_qp (vfa);
5846  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5847  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5848  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5849 
5850  bge0 = vec_setb_qp (vfb);
5851  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5852  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5853  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5854 
5855  result = vec_cmpleuq ((vui128_t) vra, (vui128_t) vrb);
5856  result = (vb128_t) vec_andc ((vui32_t) result, (vui32_t) unordered);
5857 #endif
5858  return result;
5859 }
5860 
5894 static inline vb128_t
5895 vec_cmplttoqp (__binary128 vfa, __binary128 vfb)
5896 {
5897  vb128_t result;
5898 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5899  __asm__(
5900  "xscmpgtqp %0,%2,%1;\n"
5901  : "=v" (result)
5902  : "v" (vfa), "v" (vfb)
5903  : );
5904 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5905  result= (vb128_t) vec_splat_u32 (0);
5906  if (vfa < vfb)
5907  result= (vb128_t) vec_splat_s32 (-1);
5908 #else // defined( _ARCH_PWR8 )
5909  vui128_t vfa128, vfb128;
5910  vb128_t altb, agtb;
5911  vb128_t signbool;
5912  const vui8_t shift = vec_splat_u8 (7);
5913  vui8_t splatvfa;
5914 
5915  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
5916  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
5917 
5918  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
5919  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
5920  signbool = (vb128_t) vec_sra (splatvfa, shift);
5921 
5922  altb = vec_cmpltsq ((vi128_t) vfa128, (vi128_t) vfb128);
5923  agtb = vec_cmpgtuq ((vui128_t) vfa128, (vui128_t) vfb128);
5924  result = (vb128_t) vec_sel ((vui32_t)altb, (vui32_t)agtb, (vui32_t)signbool);
5925 #endif
5926  return result;
5927 }
5928 
5963 static inline vb128_t
5964 vec_cmpltuzqp (__binary128 vfa, __binary128 vfb)
5965 {
5966  vb128_t result;
5967 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
5968  __asm__(
5969  "xscmpgtqp %0,%2,%1;\n"
5970  : "=v" (result)
5971  : "v" (vfa), "v" (vfb)
5972  : );
5973 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
5974  result = (vb128_t) vec_splat_u32 (0);
5975  if (vfa < vfb)
5976  result = (vb128_t) vec_splat_s32 (-1);
5977 #else // defined( _ARCH_PWR8 )
5978  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
5979  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
5980  vui128_t vra, vrb;
5981  vb128_t age0, bge0;
5982  vui128_t vrap, vran;
5983  vui128_t vrbp, vrbn;
5984 
5985  vra = vec_xfer_bin128_2_vui128t (vfa);
5986  vrb = vec_xfer_bin128_2_vui128t (vfb);
5987 
5988  age0 = vec_setb_qp (vfa);
5989  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
5990  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
5991  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
5992 
5993  bge0 = vec_setb_qp (vfb);
5994  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
5995  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
5996  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
5997 
5998  result = vec_cmpltuq ((vui128_t) vra, (vui128_t) vrb);
5999 #endif
6000  return result;
6001 }
6002 
6037 static inline vb128_t
6038 vec_cmpltuqp (__binary128 vfa, __binary128 vfb)
6039 {
6040  vb128_t result;
6041 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
6042  __asm__(
6043  "xscmpgtqp %0,%2,%1;\n"
6044  : "=v" (result)
6045  : "v" (vfa), "v" (vfb)
6046  : );
6047 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6048  result= (vb128_t) vec_splat_u32 (0);
6049  if (vfa < vfb)
6050  result= (vb128_t) vec_splat_s32 (-1);
6051 #else // defined( _ARCH_PWR8 )
6052  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6053  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6054  vui128_t vra, vrb;
6055  vb128_t age0, bge0;
6056  vui128_t vrap, vran;
6057  vui128_t vrbp, vrbn;
6058  vb128_t unordered;
6059 
6060  unordered = vec_isunorderedf128 (vfa, vfb);
6061 
6062  vra = vec_xfer_bin128_2_vui128t (vfa);
6063  vrb = vec_xfer_bin128_2_vui128t (vfb);
6064 
6065  age0 = vec_setb_qp (vfa);
6066  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6067  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6068  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6069 
6070  bge0 = vec_setb_qp (vfb);
6071  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6072  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6073  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6074 
6075  result = vec_cmpltuq ((vui128_t) vra, (vui128_t) vrb);
6076  result = (vb128_t) vec_andc ((vui32_t) result, (vui32_t) unordered);
6077 #endif
6078  return result;
6079 }
6080 
6112 static inline vb128_t
6113 vec_cmpnetoqp (__binary128 vfa, __binary128 vfb)
6114 {
6115  vb128_t result;
6116 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
6117  __asm__(
6118  "xscmpeqqp %0,%1,%2;\n"
6119  : "=v" (result)
6120  : "v" (vfa), "v" (vfb)
6121  : );
6122  result = (vb128_t) vec_nor ((vui32_t) result, (vui32_t) result);
6123 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6124  result= (vb128_t) vec_splat_u32 (0);
6125  if (vfa != vfb)
6126  result= (vb128_t) vec_splat_s32 (-1);
6127 #else // defined( _ARCH_PWR8 )
6128  vui128_t vra, vrb;
6129  vra = vec_xfer_bin128_2_vui128t (vfa);
6130  vrb = vec_xfer_bin128_2_vui128t (vfb);
6131  result = vec_cmpneuq ( vra, vrb );
6132 #endif
6133  return result;
6134 }
6135 
6168 static inline vb128_t
6169 vec_cmpneuzqp (__binary128 vfa, __binary128 vfb)
6170 {
6171  vb128_t result;
6172 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
6173  __asm__(
6174  "xscmpeqqp %0,%1,%2;\n"
6175  : "=v" (result)
6176  : "v" (vfa), "v" (vfb)
6177  : );
6178  result = (vb128_t) vec_nor ((vui32_t) result, (vui32_t) result);
6179 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6180  result = (vb128_t) vec_splat_u32 (0);
6181  if (vfa != vfb)
6182  result = (vb128_t) vec_splat_s32 (-1);
6183 #else // defined( _ARCH_PWR8 )
6184  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6185  vb128_t cmps, or_ab, eq_s;
6186  vui64_t vra, vrb;
6187 
6188  vra = vec_xfer_bin128_2_vui64t (vfa);
6189  vrb = vec_xfer_bin128_2_vui64t (vfb);
6190 
6191  or_ab = (vb128_t) vec_or ( vra, vrb );
6192  eq_s = vec_cmpequq ((vui128_t) or_ab, (vui128_t) signmask);
6193  cmps = vec_cmpequq ((vui128_t) vra, (vui128_t)vrb);
6194  result = (vb128_t) vec_nor ((vui32_t) cmps, (vui32_t) eq_s);
6195 #endif
6196  return result;
6197 }
6198 
6232 static inline vb128_t
6233 vec_cmpneuqp (__binary128 vfa, __binary128 vfb)
6234 {
6235  vb128_t result;
6236 #if defined (_ARCH_PWR10) && defined (__FLOAT128__) && (__GNUC__ >= 10)
6237  __asm__(
6238  "xscmpeqqp %0,%1,%2;\n"
6239  : "=v" (result)
6240  : "v" (vfa), "v" (vfb)
6241  : );
6242  result = (vb128_t) vec_nor ((vui32_t) result, (vui32_t) result);
6243 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6244  result = (vb128_t) vec_splat_u32 (0);
6245  if (vfa != vfb)
6246  result = (vb128_t) vec_splat_s32 (-1);
6247 #else // defined( _ARCH_PWR8 )
6248  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6249  vb128_t cmps, or_ab, eq_s;
6250  vui64_t vra, vrb;
6251  vb128_t unordered;
6252 
6253  unordered = vec_isunorderedf128 (vfa, vfb);
6254 
6255  vra = vec_xfer_bin128_2_vui64t (vfa);
6256  vrb = vec_xfer_bin128_2_vui64t (vfb);
6257 
6258  or_ab = (vb128_t) vec_or ( vra, vrb );
6259  eq_s = vec_cmpequq ((vui128_t) or_ab, (vui128_t) signmask);
6260  cmps = vec_cmpequq ((vui128_t) vra, (vui128_t) vrb);
6261  result = (vb128_t) vec_nor ((vui32_t) cmps, (vui32_t) eq_s);
6262  result = (vb128_t) vec_or ((vui32_t) result, (vui32_t) unordered);
6263 #endif
6264  return result;
6265 }
6266 
6296 static inline int
6297 vec_cmpqp_all_toeq (__binary128 vfa, __binary128 vfb)
6298 {
6299  int result;
6300 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6301  result= (vfa == vfb);
6302 #else // defined( _ARCH_PWR8 )
6303  vui128_t vra, vrb;
6304  vra = vec_xfer_bin128_2_vui128t (vfa);
6305  vrb = vec_xfer_bin128_2_vui128t (vfb);
6306  result = vec_cmpuq_all_eq ( vra, vrb );
6307 #endif
6308  return result;
6309 }
6310 
6341 static inline int
6342 vec_cmpqp_all_uzeq (__binary128 vfa, __binary128 vfb)
6343 {
6344  int result;
6345 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6346  result = (vfa == vfb);
6347 #else // defined( _ARCH_PWR8 )
6348  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6349  vb128_t or_ab;
6350  vui64_t vra, vrb;
6351 
6352  vra = vec_xfer_bin128_2_vui64t (vfa);
6353  vrb = vec_xfer_bin128_2_vui64t (vfb);
6354 
6355  or_ab = (vb128_t) vec_or ( vra, vrb );
6356  result = vec_cmpuq_all_eq ((vui128_t) or_ab, (vui128_t) signmask)
6357  || vec_cmpuq_all_eq ((vui128_t) vra, (vui128_t)vrb);
6358 #endif
6359  return result;
6360 }
6361 
6393 static inline int
6394 vec_cmpqp_all_eq (__binary128 vfa, __binary128 vfb)
6395 {
6396  int result;
6397 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6398  result = (vfa == vfb);
6399 #else // defined( _ARCH_PWR8 )
6400  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6401  vb128_t or_ab;
6402  vui64_t vra, vrb;
6403 
6404  vra = vec_xfer_bin128_2_vui64t (vfa);
6405  vrb = vec_xfer_bin128_2_vui64t (vfb);
6406 
6407  or_ab = (vb128_t) vec_or ( vra, vrb );
6408  result = (vec_cmpuq_all_eq ((vui128_t) or_ab, (vui128_t) signmask)
6409  || vec_cmpuq_all_eq ((vui128_t) vra, (vui128_t)vrb))
6410  && !vec_all_isunorderedf128 (vfa, vfb);
6411 #endif
6412  return result;
6413 }
6414 
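/* A minimal usage sketch (illustrative, not from the pveclib source),
   assuming the predicate names restored above; example_f128_equivalent
   is hypothetical. The vec_cmpqp_all_* predicates collapse the quadword
   compare to a scalar int for use in if statements; the _eq form treats
   +0.0 and -0.0 as equal and returns false for unordered operands. */
static inline int
example_f128_equivalent (__binary128 vfa, __binary128 vfb)
{
  // equal as values, or both NaN (payloads ignored)
  return vec_cmpqp_all_eq (vfa, vfb)
      || (vec_all_isnanf128 (vfa) && vec_all_isnanf128 (vfb));
}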
6446 static inline int
6447 vec_cmpqp_all_toge (__binary128 vfa, __binary128 vfb)
6448 {
6449  int result;
6450 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6451  result = (vfa >= vfb);
6452 #else // defined( _ARCH_PWR8 )
6453  vui128_t vfa128, vfb128;
6454  vb128_t altb, agtb;
6455  vb128_t signbool;
6456  const vui8_t shift = vec_splat_u8 (7);
6457  vui8_t splatvfa;
6458  vui32_t togt;
6459  const vui32_t zeros = (vui32_t) vec_splat_u32 (0);
6460 
6461  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
6462  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
6463 
6464  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
6465  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
6466  signbool = (vb128_t) vec_sra (splatvfa, shift);
6467 
6468  agtb = vec_cmpgesq ((vi128_t) vfa128, (vi128_t) vfb128);
6469  altb = vec_cmpleuq ((vui128_t) vfa128, (vui128_t) vfb128);
6470  togt = vec_sel ((vui32_t)agtb, (vui32_t)altb, (vui32_t)signbool);
6471  result = vec_all_ne (togt, zeros);
6472 #endif
6473  return result;
6474 }
6475 
6508 static inline int
6509 vec_cmpqp_all_uzge (__binary128 vfa, __binary128 vfb)
6510 {
6511  int result;
6512 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6513  result = (vfa >= vfb);
6514 #else // defined( _ARCH_PWR8 )
6515  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6516  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6517  vui128_t vra, vrb;
6518  vb128_t age0, bge0;
6519  vui128_t vrap, vran;
6520  vui128_t vrbp, vrbn;
6521 
6522  vra = vec_xfer_bin128_2_vui128t (vfa);
6523  vrb = vec_xfer_bin128_2_vui128t (vfb);
6524 
6525  age0 = vec_setb_qp (vfa);
6526  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6527  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6528  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6529 
6530  bge0 = vec_setb_qp (vfb);
6531  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6532  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6533  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6534 
6535  result = vec_cmpuq_all_ge ((vui128_t) vra, (vui128_t) vrb);
6536 #endif
6537  return result;
6538 }
6539 
6572 static inline int
6573 vec_cmpqp_all_ge (__binary128 vfa, __binary128 vfb)
6574 {
6575  int result;
6576 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6577  result = (vfa >= vfb);
6578 #else // defined( _ARCH_PWR8 )
6579  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6580  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6581  vui128_t vra, vrb;
6582  vb128_t age0, bge0;
6583  vui128_t vrap, vran;
6584  vui128_t vrbp, vrbn;
6585 
6586  vra = vec_xfer_bin128_2_vui128t (vfa);
6587  vrb = vec_xfer_bin128_2_vui128t (vfb);
6588 
6589  age0 = vec_setb_qp (vfa);
6590  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6591  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6592  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6593 
6594  bge0 = vec_setb_qp (vfb);
6595  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6596  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6597  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6598 
6599  result = vec_cmpuq_all_ge ((vui128_t) vra, (vui128_t) vrb)
6600  && !vec_all_isunorderedf128 (vfa, vfb);
6601 #endif
6602  return result;
6603 }
6604 
6636 static inline int
6637 vec_cmpqp_all_togt (__binary128 vfa, __binary128 vfb)
6638 {
6639  int result;
6640 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6641  result = (vfa > vfb);
6642 #else // defined( _ARCH_PWR8 )
6643  vui128_t vfa128, vfb128;
6644  vb128_t altb, agtb;
6645  vb128_t signbool;
6646  const vui8_t shift = vec_splat_u8 (7);
6647  vui8_t splatvfa;
6648  vui32_t togt;
6649  const vui32_t zeros = (vui32_t) vec_splat_u32 (0);
6650 
6651  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
6652  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
6653 
6654  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
6655  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
6656  signbool = (vb128_t) vec_sra (splatvfa, shift);
6657 
6658  agtb = vec_cmpgtsq ((vi128_t) vfa128, (vi128_t) vfb128);
6659  altb = vec_cmpltuq ((vui128_t) vfa128, (vui128_t) vfb128);
6660  togt = vec_sel ((vui32_t)agtb, (vui32_t)altb, (vui32_t)signbool);
6661  result = vec_all_ne (togt, zeros);
6662 #endif
6663  return result;
6664 }
6665 
6698 static inline int
6699 vec_cmpqp_all_uzgt (__binary128 vfa, __binary128 vfb)
6700 {
6701  int result;
6702 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6703  result = (vfa > vfb);
6704 #else // defined( _ARCH_PWR8 )
6705  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6706  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6707  vui128_t vra, vrb;
6708  vb128_t age0, bge0;
6709  vui128_t vrap, vran;
6710  vui128_t vrbp, vrbn;
6711 
6712  vra = vec_xfer_bin128_2_vui128t (vfa);
6713  vrb = vec_xfer_bin128_2_vui128t (vfb);
6714 
6715  age0 = vec_setb_qp (vfa);
6716  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6717  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6718  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6719 
6720  bge0 = vec_setb_qp (vfb);
6721  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6722  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6723  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6724 
6725  result = vec_cmpuq_all_gt ((vui128_t) vra, (vui128_t) vrb);
6726 #endif
6727  return result;
6728 }
6729 
6762 static inline int
6763 vec_cmpqp_all_gt (__binary128 vfa, __binary128 vfb)
6764 {
6765  int result;
6766 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6767  result = (vfa > vfb);
6768 #else // defined( _ARCH_PWR8 )
6769  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6770  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6771  vui128_t vra, vrb;
6772  vb128_t age0, bge0;
6773  vui128_t vrap, vran;
6774  vui128_t vrbp, vrbn;
6775 
6776  vra = vec_xfer_bin128_2_vui128t (vfa);
6777  vrb = vec_xfer_bin128_2_vui128t (vfb);
6778 
6779  age0 = vec_setb_qp (vfa);
6780  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6781  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6782  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6783 
6784  bge0 = vec_setb_qp (vfb);
6785  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6786  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6787  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6788 
6789  result = vec_cmpuq_all_gt ((vui128_t) vra, (vui128_t) vrb)
6790  && !vec_all_isunorderedf128 (vfa, vfb);
6791 #endif
6792  return result;
6793 }
6825 static inline int
6826 vec_cmpqp_all_tole (__binary128 vfa, __binary128 vfb)
6827 {
6828  int result;
6829 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6830  result = (vfa <= vfb);
6831 #else // defined( _ARCH_PWR8 )
6832  vui128_t vfa128, vfb128;
6833  vb128_t altb, agtb;
6834  vb128_t signbool;
6835  const vui8_t shift = vec_splat_u8 (7);
6836  vui8_t splatvfa;
6837  vui32_t tolt;
6838  const vui32_t zeros = (vui32_t) vec_splat_u32 (0);
6839 
6840  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
6841  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
6842 
6843  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
6844  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
6845  signbool = (vb128_t) vec_sra (splatvfa, shift);
6846 
6847  altb = vec_cmplesq ((vi128_t) vfa128, (vi128_t) vfb128);
6848  agtb = vec_cmpgeuq ((vui128_t) vfa128, (vui128_t) vfb128);
6849  tolt = vec_sel ((vui32_t)altb, (vui32_t)agtb, (vui32_t)signbool);
6850  result = vec_all_ne (tolt, zeros);
6851 #endif
6852  return result;
6853 }
6854 
6887 static inline int
6888 vec_cmpqp_all_uzle (__binary128 vfa, __binary128 vfb)
6889 {
6890  int result;
6891 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6892  result = (vfa <= vfb);
6893 #else // defined( _ARCH_PWR8 )
6894  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6895  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6896  vui128_t vra, vrb;
6897  vb128_t age0, bge0;
6898  vui128_t vrap, vran;
6899  vui128_t vrbp, vrbn;
6900 
6901  vra = vec_xfer_bin128_2_vui128t (vfa);
6902  vrb = vec_xfer_bin128_2_vui128t (vfb);
6903 
6904  age0 = vec_setb_qp (vfa);
6905  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6906  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6907  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6908 
6909  bge0 = vec_setb_qp (vfb);
6910  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6911  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6912  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6913 
6914  result = vec_cmpuq_all_le ((vui128_t) vra, (vui128_t) vrb);
6915 #endif
6916  return result;
6917 }
6918 
6951 static inline int
6952 vec_cmpqp_all_le (__binary128 vfa, __binary128 vfb)
6953 {
6954  int result;
6955 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
6956  result = (vfa <= vfb);
6957 #else // defined( _ARCH_PWR8 )
6958  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
6959  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
6960  vui128_t vra, vrb;
6961  vb128_t age0, bge0;
6962  vui128_t vrap, vran;
6963  vui128_t vrbp, vrbn;
6964 
6965  vra = vec_xfer_bin128_2_vui128t (vfa);
6966  vrb = vec_xfer_bin128_2_vui128t (vfb);
6967 
6968  age0 = vec_setb_qp (vfa);
6969  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
6970  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
6971  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
6972 
6973  bge0 = vec_setb_qp (vfb);
6974  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
6975  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
6976  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
6977 
6978  result = vec_cmpuq_all_le ((vui128_t) vra, (vui128_t) vrb)
6979  && !vec_all_isunorderedf128 (vfa, vfb);
6980 #endif
6981  return result;
6982 }
6983 
7015 static inline int
7016 vec_cmpqp_all_tolt (__binary128 vfa, __binary128 vfb)
7017 {
7018  int result;
7019 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7020  result = (vfa < vfb);
7021 #else // defined( _ARCH_PWR8 )
7022  vui128_t vfa128, vfb128;
7023  vb128_t altb, agtb;
7024  vb128_t signbool;
7025  const vui8_t shift = vec_splat_u8 (7);
7026  vui8_t splatvfa;
7027  vui32_t tolt;
7028  const vui32_t zeros = (vui32_t) vec_splat_u32 (0);
7029 
7030  vfa128 = vec_xfer_bin128_2_vui128t (vfa);
7031  vfb128 = vec_xfer_bin128_2_vui128t (vfb);
7032 
7033  // Replace (vfa >= 0) with (vfa < 0) == vec_setb_qp (vfa)
7034  splatvfa = vec_splat ((vui8_t) vfa128, VEC_BYTE_H);
7035  signbool = (vb128_t) vec_sra (splatvfa, shift);
7036 
7037  altb = vec_cmpltsq ((vi128_t) vfa128, (vi128_t) vfb128);
7038  agtb = vec_cmpgtuq ((vui128_t) vfa128, (vui128_t) vfb128);
7039  tolt = vec_sel ((vui32_t)altb, (vui32_t)agtb, (vui32_t)signbool);
7040  result = vec_all_ne (tolt, zeros);
7041 #endif
7042  return result;
7043 }
7044 
7077 static inline int
7078 vec_cmpqp_all_uzlt (__binary128 vfa, __binary128 vfb)
7079 {
7080  int result;
7081 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7082  result = (vfa < vfb);
7083 #else // defined( _ARCH_PWR8 )
7084  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
7085  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7086  vui128_t vra, vrb;
7087  vb128_t age0, bge0;
7088  vui128_t vrap, vran;
7089  vui128_t vrbp, vrbn;
7090 
7091  vra = vec_xfer_bin128_2_vui128t (vfa);
7092  vrb = vec_xfer_bin128_2_vui128t (vfb);
7093 
7094  age0 = vec_setb_qp (vfa);
7095  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
7096  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
7097  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
7098 
7099  bge0 = vec_setb_qp (vfb);
7100  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
7101  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
7102  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
7103 
7104  result = vec_cmpuq_all_lt ((vui128_t) vra, (vui128_t) vrb);
7105 #endif
7106  return result;
7107 }
7108 
7141 static inline int
7142 vec_cmpqp_all_lt (__binary128 vfa, __binary128 vfb)
7143 {
7144  int result;
7145 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7146  result = (vfa < vfb);
7147 #else // defined( _ARCH_PWR8 )
7148  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
7149  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7150  vui128_t vra, vrb;
7151  vb128_t age0, bge0;
7152  vui128_t vrap, vran;
7153  vui128_t vrbp, vrbn;
7154 
7155  vra = vec_xfer_bin128_2_vui128t (vfa);
7156  vrb = vec_xfer_bin128_2_vui128t (vfb);
7157 
7158  age0 = vec_setb_qp (vfa);
7159  vrap = (vui128_t) vec_xor ((vui32_t) vra, signmask);
7160  vran = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vra);
7161  vra = (vui128_t) vec_sel ((vui32_t)vrap, (vui32_t)vran, (vui32_t)age0);
7162 
7163  bge0 = vec_setb_qp (vfb);
7164  vrbp = (vui128_t) vec_xor ((vui32_t) vrb, signmask);
7165  vrbn = (vui128_t) vec_subuqm ((vui128_t) zero, (vui128_t) vrb);
7166  vrb = (vui128_t) vec_sel ((vui32_t)vrbp, (vui32_t)vrbn, (vui32_t)bge0);
7167 
7168  result = vec_cmpuq_all_lt ((vui128_t) vra, (vui128_t) vrb)
7169  && !vec_all_isunorderedf128 (vfa, vfb);
7170 #endif
7171  return result;
7172 }
7173 
7203 static inline int
7204 vec_cmpqp_all_tone (__binary128 vfa, __binary128 vfb)
7205 {
7206  int result;
7207 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7208  result= (vfa != vfb);
7209 #else // defined( _ARCH_PWR8 )
7210  vui128_t vra, vrb;
7211  vra = vec_xfer_bin128_2_vui128t (vfa);
7212  vrb = vec_xfer_bin128_2_vui128t (vfb);
7213  result = vec_cmpuq_all_ne ( vra, vrb );
7214 #endif
7215  return result;
7216 }
7217 
7248 static inline int
7249 vec_cmpqp_all_uzne (__binary128 vfa, __binary128 vfb)
7250 {
7251  int result;
7252 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7253  result = (vfa != vfb);
7254 #else // defined( _ARCH_PWR8 )
7255  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7256  vb128_t or_ab;
7257  vui64_t vra, vrb;
7258 
7259  vra = vec_xfer_bin128_2_vui64t (vfa);
7260  vrb = vec_xfer_bin128_2_vui64t (vfb);
7261 
7262  or_ab = (vb128_t) vec_or ( vra, vrb );
7263  result = vec_cmpuq_all_ne ((vui128_t) or_ab, (vui128_t) signmask)
7264  && vec_cmpuq_all_ne ((vui128_t) vra, (vui128_t)vrb);
7265 #endif
7266  return result;
7267 }
7268 
7300 static inline int
7301 vec_cmpqp_all_ne (__binary128 vfa, __binary128 vfb)
7302 {
7303  int result;
7304 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
7305  result = (vfa != vfb);
7306 #else // defined( _ARCH_PWR8 )
7307  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7308  vb128_t or_ab;
7309  vui64_t vra, vrb;
7310 
7311  vra = vec_xfer_bin128_2_vui64t (vfa);
7312  vrb = vec_xfer_bin128_2_vui64t (vfb);
7313 
7314  or_ab = (vb128_t) vec_or ( vra, vrb );
7315  result = (vec_cmpuq_all_ne ((vui128_t) or_ab, (vui128_t) signmask)
7316  && vec_cmpuq_all_ne ((vui128_t) vra, (vui128_t)vrb))
7317  || vec_all_isunorderedf128 (vfa, vfb);
7318 #endif
7319  return result;
7320 }
7321 
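The compare predicates above return a scalar int, so they drop directly into ordinary C control flow. A minimal usage sketch (hypothetical caller, not part of this header), assuming the vec_cmpqp_all_ne() defined above:

// Usage sketch (hypothetical): only copy when the value actually changed.
static inline __binary128
example_latch_if_changed (__binary128 new_val, __binary128 old_val)
{
  if (vec_cmpqp_all_ne (new_val, old_val))
    old_val = new_val;
  return old_val;
}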
7352 static inline int
7353 vec_cmpqp_exp_eq (__binary128 vfa, __binary128 vfb)
7354 {
7355 #if defined (_ARCH_PWR9) && defined (scalar_cmp_exp_gt) \
7356  && defined (__FLOAT128__) && (__GNUC__ >= 9)
7357  return scalar_cmp_exp_eq (vfa, vfb);
7358 #else
7359  vui32_t vra, vrb;
7360  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7361 
7362  vra = vec_and_bin128_2_vui32t (vfa, expmask);
7363  vrb = vec_and_bin128_2_vui32t (vfb, expmask);
7364  return vec_cmpuq_all_eq ((vui128_t) vra, (vui128_t) vrb);
7365 #endif
7366 }
7367 
7399 static inline int
7400 vec_cmpqp_exp_gt (__binary128 vfa, __binary128 vfb)
7401 {
7402 #if defined (_ARCH_PWR9) && defined (scalar_cmp_exp_gt) \
7403  && defined (__FLOAT128__) && (__GNUC__ >= 9)
7404  return scalar_cmp_exp_gt (vfa, vfb);
7405 #else
7406  vui32_t vra, vrb;
7407  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7408 
7409  vra = vec_and_bin128_2_vui32t (vfa, expmask);
7410  vrb = vec_and_bin128_2_vui32t (vfb, expmask);
7411  return vec_cmpuq_all_gt ((vui128_t) vra, (vui128_t) vrb);
7412 #endif
7413 }
7414 
7415 
7446 static inline int
7447 vec_cmpqp_exp_lt (__binary128 vfa, __binary128 vfb)
7448 {
7449 #if defined (_ARCH_PWR9) && defined (scalar_cmp_exp_gt) \
7450  && defined (__FLOAT128__) && (__GNUC__ >= 9)
7451  return scalar_cmp_exp_lt (vfa, vfb);
7452 #else
7453  vui32_t vra, vrb;
7454  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7455 
7456  vra = vec_and_bin128_2_vui32t (vfa, expmask);
7457  vrb = vec_and_bin128_2_vui32t (vfb, expmask);
7458  return vec_cmpuq_all_lt ((vui128_t) vra, (vui128_t) vrb);
7459 #endif
7460 }
7461 
7493 static inline int
7494 vec_cmpqp_exp_unordered (__binary128 vfa, __binary128 vfb)
7495 {
7496 #if defined (_ARCH_PWR9) && defined (scalar_cmp_exp_gt) \
7497  && defined (__FLOAT128__) && (__GNUC__ >= 9)
7498  return scalar_cmp_exp_unordered (vfa, vfb);
7499 #else
7500  // The masked exponent compare used by exp_eq/gt/lt above can not
7501  // distinguish NaN from Infinity, so it can not implement an
7502  // unordered test. Test the operands for NaN directly instead.
7503  return vec_all_isunorderedf128 (vfa, vfb);
7506 #endif
7507 }
7508 
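The exponent-only compares above never examine the significand, so they make cheap pre-checks. A hedged sketch (hypothetical helper), assuming the vec_cmpqp_exp_lt() and vec_cmpqp_exp_unordered() defined above:

// Usage sketch (hypothetical): return the operand with the larger
// biased exponent; fall back to vfa when either operand is NaN.
static inline __binary128
example_max_by_exp (__binary128 vfa, __binary128 vfb)
{
  if (vec_cmpqp_exp_unordered (vfa, vfb))
    return vfa;
  return vec_cmpqp_exp_lt (vfa, vfb) ? vfb : vfa;
}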
7530 static inline vb128_t
7531 vec_isfinitef128 (__binary128 f128)
7532 {
7533 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7534  vui32_t result = CONST_VINT128_W (-1, -1, -1, -1);
7535 
7536  if (scalar_test_data_class (f128, 0x70))
7537  result = CONST_VINT128_W (0, 0, 0, 0);
7538 
7539  return (vb128_t)result;
7540 #else
7541  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7542  vui32_t tmp;
7543  vb128_t tmp2, tmp3;
7544 
7545  tmp = vec_and_bin128_2_vui32t (f128, expmask);
7546  tmp2 = (vb128_t) vec_cmpeq (tmp, expmask);
7547  tmp3 = (vb128_t) vec_splat ((vui32_t) tmp2, VEC_W_H);
7548  return (vb128_t) vec_nor ((vui32_t) tmp3, (vui32_t) tmp3); // vec_not
7549 #endif
7550 }
7551 
7575 static inline int
7576 vec_isinf_signf128 (__binary128 f128)
7577 {
7578  int result;
7579 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7580  if (scalar_test_data_class (f128, 0x20))
7581  result = 1;
7582  else if (scalar_test_data_class (f128, 0x10))
7583  result = -1;
7584  else
7585  result = 0;
7586 #else
7587  vui32_t tmp, t128;
7588  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7589  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7590 
7591  t128 = vec_xfer_bin128_2_vui32t (f128);
7592  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
7593 
7594  if (vec_all_eq(tmp, expmask))
7595  {
7596  if (vec_any_gt(t128, expmask))
7597  result = -1;
7598  else
7599  result = 1;
7600  }
7601  else
7602  result = 0;
7603 #endif
7604  return (result);
7605 }
7606 
7625 static inline vb128_t
7626 vec_isinff128 (__binary128 f128)
7627 {
7628 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7629  vui32_t result = CONST_VINT128_W (0, 0, 0, 0);
7630 
7631  if (scalar_test_data_class (f128, 0x30))
7632  result = CONST_VINT128_W (-1, -1, -1, -1);
7633 
7634  return (vb128_t)result;
7635 #else
7636  vui32_t tmp;
7637  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7638  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7639 
7640  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
7641  return vec_cmpequq ((vui128_t)tmp , (vui128_t)expmask);
7642 #endif
7643 }
7644 
7665 static inline vb128_t
7666 vec_isnanf128 (__binary128 f128)
7667 {
7668 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7669  vui32_t result = CONST_VINT128_W (0, 0, 0, 0);
7670 
7671  if (scalar_test_data_class (f128, 0x40))
7672  result = CONST_VINT128_W (-1, -1, -1, -1);
7673 
7674  return (vb128_t)result;
7675 #else
7676  vui32_t tmp;
7677  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7678  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7679 
7680  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
7681  return vec_cmpgtuq ((vui128_t)tmp , (vui128_t)expmask);
7682 #endif
7683 }
7684 
7704 static inline vb128_t
7705 vec_isnormalf128 (__binary128 f128)
7706 {
7707 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7708  vui32_t result = CONST_VINT128_W (-1, -1, -1, -1);
7709 
7710  if (scalar_test_data_class (f128, 0x7f))
7711  result = CONST_VINT128_W (0, 0, 0, 0);
7712 
7713  return (vb128_t)result;
7714 #else
7715  vui32_t tmp;
7716  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
7717  const vui32_t vec_zero = CONST_VINT128_W (0, 0, 0, 0);
7718  vb128_t result;
7719 
7720  tmp = vec_and_bin128_2_vui32t (f128, expmask);
7721  result = (vb128_t) vec_nor (vec_cmpeq (tmp, expmask),
7722  vec_cmpeq (tmp, vec_zero));
7723  return (vb128_t) vec_splat ((vui32_t) result, VEC_W_H);
7724 #endif
7725 }
7726 
7746 static inline vb128_t
7747 vec_issubnormalf128 (__binary128 f128)
7748 {
7749 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7750  vui32_t result = CONST_VINT128_W (0, 0, 0, 0);
7751 
7752  if (scalar_test_data_class (f128, 0x03))
7753  result = CONST_VINT128_W (-1, -1, -1, -1);
7754 
7755  return (vb128_t)result;
7756 #else
7757  vui32_t tmp, tmpz, tmp2;
7758  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7759  const vui32_t vec_zero = CONST_VINT128_W (0, 0, 0, 0);
7760  const vui32_t minnorm = CONST_VINT128_W (0x00010000, 0, 0, 0);
7761 
7762  // Equivalent to vec_absf128 (f128)
7763  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
7764 
7765  tmp2 = (vui32_t) vec_cmpltuq ((vui128_t)tmp, (vui128_t)minnorm);
7766  tmpz = (vui32_t) vec_cmpequq ((vui128_t)tmp, (vui128_t)vec_zero);
7767  return (vb128_t) vec_andc (tmp2, tmpz);
7768 #endif
7769 }
7770 
7792 static inline vb128_t
7793 vec_isunorderedf128 (__binary128 vfa, __binary128 vfb)
7794 {
7795  return (vb128_t) vec_or ((vui32_t) vec_isnanf128 (vfa),
7796  (vui32_t) vec_isnanf128 (vfb));
7797 }
7798 
7818 static inline vb128_t
7819 vec_iszerof128 (__binary128 f128)
7820 {
7821 #if defined (_ARCH_PWR9) && defined (scalar_test_data_class) && defined (__FLOAT128__) && (__GNUC__ > 7)
7822  vui32_t result = CONST_VINT128_W (0, 0, 0, 0);
7823 
7824  if (scalar_test_data_class (f128, 0x0c))
7825  result = CONST_VINT128_W (-1, -1, -1, -1);
7826 
7827  return (vb128_t)result;
7828 #else
7829  vui128_t t128;
7830  const vui64_t vec_zero = CONST_VINT128_DW(0, 0);
7831  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7832 
7833  // Equivalent to vec_absf128 (f128)
7834  t128 = (vui128_t) vec_andc_bin128_2_vui32t (f128, signmask);
7835  return (vb128_t)vec_cmpequq (t128, (vui128_t)vec_zero);
7836 #endif
7837 }
7838 
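The classification functions above return full-width vb128_t masks rather than scalars, so they compose with the VMX logical operations. A minimal sketch (hypothetical helper), assuming the vec_isfinitef128() and vec_iszerof128() defined above:

// Usage sketch (hypothetical): all-ones when the value is finite and
// nonzero, all-zeros otherwise, with no scalar branch.
static inline vb128_t
example_is_finite_nonzero (__binary128 vf)
{
  vb128_t finite  = vec_isfinitef128 (vf);
  vb128_t is_zero = vec_iszerof128 (vf);
  return (vb128_t) vec_andc ((vui32_t) finite, (vui32_t) is_zero);
}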
7853 static inline __binary128
7854 vec_nabsf128 (__binary128 f128)
7855 {
7856  __binary128 result;
7857 #if _ARCH_PWR9
7858  __asm__(
7859  "xsnabsqp %0,%1;\n"
7860  : "=v" (result)
7861  : "v" (f128)
7862  :);
7863 #else
7864  vui32_t tmp;
7865  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7866 
7867  tmp = vec_andc_bin128_2_vui32t (f128, signmask);
7868  result = vec_xfer_vui32t_2_bin128 (tmp);
7869 #endif
7870  return (result);
7871 }
7872 
7885 static inline __binary128
7886 vec_negf128 (__binary128 f128)
7887 {
7888  __binary128 result;
7889 #if defined (_ARCH_PWR9) && (__GNUC__ > 6)
7890 #if defined (__FLOAT128__) && (__GNUC__ > 7)
7891  // Let the compilers generate and optimize code.
7892  result = -f128;
7893 #else
7894  // If the compiler supports _ARCH_PWR9, must support mnemonics.
7895  __asm__(
7896  "xsnegqp %0,%1"
7897  : "=v" (result)
7898  : "v" (f128)
7899  : );
7900 #endif
7901 #else
7902  vui32_t tmp;
7903  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
7904 
7905  tmp = vec_xor_bin128_2_vui32t (f128, signmask);
7906  result = vec_xfer_vui32t_2_bin128 (tmp);
7907 #endif
7908  return (result);
7909 }
7910 
7925 static inline __binary128
7926 vec_self128 (__binary128 vfa, __binary128 vfb, vb128_t mask)
7927 {
7928  return vec_sel_bin128_2_bin128 (vfa, vfb, mask);
7929 }
7930 
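vec_self128() pairs naturally with the mask-returning predicates above for branch-free selection. A minimal sketch (hypothetical helper), assuming vec_isnanf128() and vec_xfer_vui32t_2_bin128() from this header:

// Usage sketch (hypothetical): squash NaN inputs to +0.0; the NaN mask
// selects the zero pattern, otherwise vf passes through unchanged.
static inline __binary128
example_nan_to_zero (__binary128 vf)
{
  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
  __binary128 f_zero = vec_xfer_vui32t_2_bin128 (q_zero);
  return vec_self128 (vf, f_zero, vec_isnanf128 (vf));
}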
7952 static inline vb128_t
7953 vec_setb_qp (__binary128 f128)
7954 {
7955  vb128_t result;
7956 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
7957  __asm__(
7958  "vexpandqm %0,%1"
7959  : "=v" (result)
7960  : "v" (f128)
7961  : );
7962 #elif defined (_ARCH_PWR9) && defined (scalar_test_neg) && (__GNUC__ > 7)
7963  result = (vb128_t) {(__int128) 0};
7964 
7965  if (scalar_test_neg (f128))
7966  result = (vb128_t) {(__int128)-1};
7967 
7968  return (vb128_t)result;
7969 #else
7970  const vui8_t shift = vec_splat_u8 (7);
7971  vui8_t t128 = vec_xfer_bin128_2_vui8t (f128);
7972  vui8_t splat = vec_splat (t128, VEC_BYTE_H);
7973 
7974  result = (vb128_t) vec_sra (splat, shift);
7975 #endif
7976  return result;
7977 }
7978 
7995 static inline int
7996 vec_signbitf128 (__binary128 f128)
7997 {
7998 #if defined (_ARCH_PWR9) && defined (scalar_test_neg) && (__GNUC__ > 7)
7999  return scalar_test_neg (f128);
8000 #else
8001  vui32_t tmp;
8002  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
8003 
8004  tmp = vec_and_bin128_2_vui32t (f128, signmask);
8005  return vec_all_eq(tmp, signmask);
8006 #endif
8007 }
8008 
8034 static inline __binary128
8035 vec_xsaddqpo (__binary128 vfa, __binary128 vfb)
8036 {
8037  __binary128 result;
8038 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
8039 #if defined (__FLOAT128__) && (__GNUC__ > 8)
8040  // earlier GCC versions generate extra data moves for this.
8041  result = __builtin_addf128_round_to_odd (vfa, vfb);
8042 #else
8043  // No extra data moves here.
8044  __asm__(
8045  "xsaddqpo %0,%1,%2"
8046  : "=v" (result)
8047  : "v" (vfa), "v" (vfb)
8048  : );
8049 #endif
8050  return result;
8051 #else // defined (_ARCH_PWR7)
8052  vui64_t q_exp, a_exp, b_exp, x_exp;
8053  vui128_t q_sig, a_sig, b_sig, p_tmp, p_odd;
8054  vui128_t a_mag, b_mag;
8055  vui32_t q_sign, a_sign, b_sign;
8056  vb128_t a_lt_b;
8057  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
8058  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
8059  const vui32_t magmask = vec_mask128_f128mag();
8060  const vui64_t exp_naninf = vec_mask64_f128exp();
8061  // Vector extract the exponents from vfa, vfb
8062  x_exp = vec_xxxexpqpp (vfa, vfb);
8063  // Mask off sign bits so can use integers for magnitude compare.
8064  a_mag = (vui128_t) vec_and_bin128_2_vui32t (vfa, magmask);
8065  b_mag = (vui128_t) vec_and_bin128_2_vui32t (vfb, magmask);
8066  a_sign = vec_andc_bin128_2_vui32t (vfa, magmask);
8067  b_sign = vec_andc_bin128_2_vui32t (vfb, magmask);
8068 // if (vec_all_isfinitef128 (vfa) && vec_all_isfinitef128 (vfb))
8069 // The above can be optimized to the following
8070  if (__builtin_expect (vec_cmpud_all_lt (x_exp, exp_naninf), 1))
8071  {
8072  const vui128_t xbitmask = vec_splat_u128 (1);
8073  const vui128_t grx_mask = vec_splat_u128 (7);
8074  const vui64_t exp_min = vec_splat_u64 (1);
8075  const vui8_t t_sig_L = vec_splat_u8 (7);
8076  const vui8_t t_sig_C = vec_splat_u8 (15);
8077  const vui64_t exp_one = exp_min;
8078  const vui64_t exp_dnrm = (vui64_t) q_zero;
8079  vui128_t add_sig, sub_sig;
8080  vui128_t s_sig, x_bits;
8081  vui32_t diff_sign;
8082  vui32_t sigmask = vec_mask128_f128sig();
8083  vui32_t hidden = vec_mask128_f128Lbit();
8084  vui32_t a_norm, b_norm, x_norm;
8085  vui32_t a_s32, b_s32;
8086 
8087  // Extract the significand
8088  // Assume that the sign-bit is already masked off
8089  // Mask off the significands
8090  a_s32 = vec_and ((vui32_t) a_mag, sigmask);
8091  b_s32 = vec_and ((vui32_t) b_mag, sigmask);
8092  // Assume that exponents are already extracted and merged
8093  // Compare exponents for denormal, assume finite
8094  x_norm = (vui32_t) vec_cmpgt ((vui32_t) x_exp, q_zero);
8095  a_norm = vec_splat (x_norm, VEC_WE_1);
8096  b_norm = vec_splat (x_norm, VEC_WE_3);
8097  // For Normal QP insert (hidden) L-bit into significand
8098  a_sig = (vui128_t) vec_sel (a_s32, a_norm, hidden);
8099  b_sig = (vui128_t) vec_sel (b_s32, b_norm, hidden);
8100  // Correct exponent for zeros or denormals to E_min
8101  // will force 0 exponents for zero/denormal results later
8102  //exp_mask = vec_cmpequd (x_exp, exp_dnrm);
8103  x_exp = vec_selud ( exp_min, x_exp, (vb64_t) x_norm);
8104  // Generate the sign difference for signed 0.0
8105  q_sign = vec_xor (a_sign, b_sign);
8106  // Precondition the significands before add so the GRX bits
8107  // are in the least significant 3 bits.
8108  a_sig = vec_slqi (a_sig, 3);
8109  b_sig = vec_slqi (b_sig, 3);
8110 
8111  // If sign(vfa) != sign(vfb) will need to:
8112  // 1) Subtract instead of add significands
8113  // 2) Generate signed zeros
8114  diff_sign = (vui32_t) vec_setb_sq ((vi128_t) q_sign);
8115  // If magnitude(b) > magnitude(a) will need to swap a/b, later
8116  a_lt_b = vec_cmpltuq (a_mag, b_mag);
8117 
8118  // Now swap operands a/b if necessary so a has greater magnitude.
8119  {
8120  vui128_t a_tmp = a_sig;
8121  vui128_t b_tmp = b_sig;
8122  vui64_t x_tmp = vec_swapd (x_exp);
8123 
8124  q_sign = vec_sel (a_sign, b_sign, (vui32_t) a_lt_b);
8125 
8126  x_exp = vec_selud (x_exp, x_tmp, (vb64_t) a_lt_b);
8127  a_exp = vec_splatd (x_exp, VEC_DW_H);
8128  b_exp = vec_splatd (x_exp, VEC_DW_L);
8129  q_exp = a_exp;
8130 
8131  a_sig = vec_seluq (a_tmp, b_tmp, (vb128_t) a_lt_b);
8132  b_sig = vec_seluq (b_tmp, a_tmp, (vb128_t) a_lt_b);
8133  }
8134  // At this point we can assume that:
8135  // The magnitude (vfa) >= magnitude (vfb)
8136  // 1) Exponents (a_exp, b_exp) in the range E_min -> E_max
8137  // 2) a_exp >= b_exp
8138  // 2a) If a_exp == b_exp then a_sig >= b_sig
8139  // 2b) If a_exp > b_exp then
8140  // shift (b_sig) right by (a_exp - b_exp)
8141  // any bits shifted out of b_sig are ORed into the X-bit
8142  if (vec_cmpud_all_lt (b_exp, a_exp))
8143  {
8144  vui64_t d_exp, l_exp;
8145  vui128_t t_sig;
8146  vb128_t exp_mask;
8147  const vui64_t exp_128 = vec_const64_f128_128();
8148 
8149  d_exp = vec_subudm (a_exp, b_exp);
8150  exp_mask = (vb128_t) vec_cmpltud (d_exp, exp_128);
8151  l_exp = vec_subudm (exp_128, d_exp);
8152  t_sig = vec_srq (b_sig, (vui128_t) d_exp);
8153  x_bits = vec_slq (b_sig, (vui128_t) l_exp);
8154  t_sig = vec_seluq ((vui128_t) q_zero, t_sig, exp_mask);
8155  x_bits = vec_seluq (b_sig, x_bits, exp_mask);
8156  p_odd = vec_addcuq (x_bits, (vui128_t) q_ones);
8157  b_sig = (vui128_t) vec_or ((vui32_t) t_sig, (vui32_t) p_odd);
8158  }
8159 
8160  // If operands have the same sign then s_sig = a_sig + b_sig
8161  // Otherwise s_sig = a_sig - b_sig
8162  add_sig = vec_adduqm (a_sig, b_sig);
8163  sub_sig = vec_subuqm (a_sig, b_sig);
8164  s_sig = vec_seluq (add_sig, sub_sig, (vb128_t) diff_sign);
8165 
8166  if (__builtin_expect (vec_cmpuq_all_eq (s_sig, (vui128_t) q_zero), 0))
8167  { // Special case of both zero with different sign
8168  q_sign = vec_sel (a_sign, (vui32_t) q_zero, diff_sign);
8169  return vec_xfer_vui32t_2_bin128 (q_sign);
8170  }
8171 
8172  // Isolate the CL bits from the significand to simplify the compare
8173 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
8174  vui8_t t_sig = vec_splat ((vui8_t) s_sig, 14);
8175 #else
8176  vui8_t t_sig = vec_splat ((vui8_t) s_sig, 1);
8177 #endif
8178 // if (vec_cmpuq_all_gt (s_sig, (vui128_t) sigov))
8179  if (vec_all_gt (t_sig, t_sig_C))
8180  { // Check for carry and adjust
8181  p_odd = (vui128_t) vec_and ((vui32_t) s_sig, (vui32_t) xbitmask);
8182  s_sig = vec_srqi (s_sig, 1);
8183  s_sig = (vui128_t) vec_or ((vui32_t) s_sig, (vui32_t) p_odd);
8184  q_exp = vec_addudm (q_exp, exp_one);
8185  }
8186  else // if (vec_cmpuq_all_le (s_sig, (vui128_t) sigovt))
8187  if (vec_all_le (t_sig, t_sig_L))
8188  {
8189  // Or the significand is below normal range.
8190  // This can happen with subtraction.
8191  vui64_t c_exp, d_exp;
8192  vui128_t c_sig;
8193  const vui64_t exp_12 = vec_splat_u64 (12);
8194 
8195  c_sig = vec_clzq (s_sig);
8196  c_exp = vec_splatd ((vui64_t) c_sig, VEC_DW_L);
8197  // The IR has 12 leading zeros that should not affect the shift count.
8198  c_exp = vec_subudm (c_exp, exp_12);
8199  d_exp = vec_subudm (q_exp, (vui64_t) exp_min);
8200  d_exp = vec_minud (c_exp, d_exp);
8201  {
8202  vb64_t nrm_mask = vec_cmpgtsd ((vi64_t) q_exp, (vi64_t) exp_min);
8203  vb64_t exp_mask = vec_cmpgtud (q_exp, c_exp);
8204 
8205  c_sig = vec_slq (s_sig, (vui128_t) d_exp);
8206  q_exp = vec_subudm (q_exp, d_exp);
8207  exp_mask = (vb64_t) vec_and ((vui32_t) exp_mask, (vui32_t) nrm_mask);
8208  q_exp = vec_selud (exp_dnrm, q_exp, exp_mask);
8209  s_sig = vec_seluq (s_sig, c_sig, (vb128_t) nrm_mask);
8210  }
8211  }
8212  // Round to odd from low order GRX-bits
8213  p_tmp = (vui128_t) vec_and ((vui32_t) s_sig, (vui32_t) grx_mask);
8214  p_odd = vec_addcuq (p_tmp, (vui128_t) q_ones);
8215  q_sig = vec_srqi (s_sig, 3);
8216  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, (vui32_t) p_odd);
8217  // Check for exponent overflow -> __FLT128_MAX__
8218  if (__builtin_expect ((vec_cmpud_all_ge ( q_exp, exp_naninf)), 0))
8219  {
8220  // return maximum finite exponent and significand
8221  const vui32_t f128_max = CONST_VINT128_W(0x7ffeffff, -1, -1, -1);
8222  vui32_t f128_smax = vec_or ((vui32_t) f128_max, q_sign);
8223  return vec_xfer_vui32t_2_bin128 (f128_smax);
8224  }
8225  // Merge sign, significand, and exponent into final result
8226  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
8227  result = vec_xsiexpqp (q_sig, q_exp);
8228  return result;
8229  }
8230  else // One or both operands are NaN or Infinity
8231  {
8232  //const vui32_t q_nan = CONST_VINT128_W(0x00008000, 0, 0, 0);
8233  vui32_t q_nan = vec_mask128_f128Qbit ();
8234  // One or both operands are NaN
8235  if (vec_all_isnanf128 (vfa))
8236  {
8237  // vfa is NaN, Convert vfa to QNaN and return
8238  vui32_t vf128 = vec_or_bin128_2_vui32t (vfa, q_nan);
8239  return vec_xfer_vui32t_2_bin128 (vf128);
8240  }
8241  else if (vec_all_isnanf128 (vfb))
8242  {
8243  // vfb is NaN, Convert vfb to QNaN and return
8244  vui32_t vf128 = vec_or_bin128_2_vui32t (vfb, q_nan);
8245  return vec_xfer_vui32t_2_bin128 (vf128);
8246  }
8247  else // Or one or both operands are Infinity
8248  {
8249  a_exp = vec_splatd (x_exp, VEC_DW_H);
8250  // b_exp = vec_splatd (x_exp, VEC_DW_L);
8251  if (vec_cmpud_all_eq (x_exp, exp_naninf)
8252  && vec_cmpud_any_ne ((vui64_t) a_sign, (vui64_t) b_sign))
8253  { // Both operands infinity and opposite sign
8254  // Infinity + Infinity (opposite sign) is Default Quiet NaN
8255  return vec_const_nanf128 ();
8256  }
8257  else
8258  { // Either both operands infinity and same sign
8259  // Or one infinity and one finite
8260  if (vec_cmpud_any_eq (a_exp, exp_naninf))
8261  {
8262  // return infinity
8263  return vfa;
8264  }
8265  else
8266  {
8267  // return infinity
8268  return vfb;
8269  }
8270  }
8271  }
8272  }
8273 #endif
8274  return result;
8275 }
8276 
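Round-to-odd keeps the sticky information in the least significant bit, so an odd-rounded quad-precision intermediate can be rounded again to a narrower format without a double-rounding error. A hedged usage sketch (hypothetical helper), assuming the vec_xscvqpdpo() defined later in this header:

// Usage sketch (hypothetical): quad-precision sum as an intermediate,
// then narrow to vector double; both steps use round-to-odd.
static inline vf64_t
example_add_then_narrow (__binary128 vfa, __binary128 vfb)
{
  __binary128 sum = vec_xsaddqpo (vfa, vfb);
  return vec_xscvqpdpo (sum);
}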
8302 static inline __binary128
8303 vec_xssubqpo (__binary128 vfa, __binary128 vfb)
8304 {
8305  __binary128 result;
8306 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
8307 #if defined (__FLOAT128__) && (__GNUC__ > 8)
8308  // earlier GCC versions generate extra data moves for this.
8309  result = __builtin_subf128_round_to_odd (vfa, vfb);
8310 #else
8311  // No extra data moves here.
8312  __asm__(
8313  "xssubqpo %0,%1,%2"
8314  : "=v" (result)
8315  : "v" (vfa), "v" (vfb)
8316  : );
8317 #endif
8318  return result;
8319 #else // defined (_ARCH_PWR7)
8320  vui64_t q_exp, a_exp, b_exp, x_exp;
8321  vui128_t q_sig, a_sig, b_sig, p_tmp, p_odd;
8322  vui128_t a_mag, b_mag;
8323  vui32_t q_sign, a_sign, b_sign;
8324  vb128_t a_lt_b;
8325  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
8326  const vui32_t q_ones = CONST_VINT128_W (-1, -1, -1, -1);
8327  const vui32_t magmask = vec_mask128_f128mag();
8328  const vui64_t exp_naninf = vec_mask64_f128exp();
8329  // Vector extract the exponents from vfa, vfb
8330  x_exp = vec_xxxexpqpp (vfa, vfb);
8331  // Mask off sign bits so can use integers for magnitude compare.
8332  a_mag = (vui128_t) vec_and_bin128_2_vui32t (vfa, magmask);
8333  b_mag = (vui128_t) vec_and_bin128_2_vui32t (vfb, magmask);
8334  a_sign = vec_andc_bin128_2_vui32t (vfa, magmask);
8335  b_sign = vec_andc_bin128_2_vui32t (vfb, magmask);
8336 // if (vec_all_isfinitef128 (vfa) && vec_all_isfinitef128 (vfb))
8337 // The above can be optimized to the following
8338  if (__builtin_expect (vec_cmpud_all_lt (x_exp, exp_naninf), 1))
8339  {
8340  const vui128_t xbitmask = vec_splat_u128 (1);
8341  const vui128_t grx_mask = vec_splat_u128 (7);
8342  const vui64_t exp_min = vec_splat_u64 (1);
8343  const vui8_t t_sig_L = vec_splat_u8 (7);
8344  const vui8_t t_sig_C = vec_splat_u8 (15);
8345  const vui64_t exp_one = exp_min;
8346  const vui64_t exp_dnrm = (vui64_t) q_zero;
8347  // signmask is the complement of the magmask
8348  const vui32_t signmask = vec_nor(magmask, magmask);
8349  vui128_t add_sig, sub_sig;
8350  vui128_t s_sig, x_bits;
8351  vui32_t diff_sign;
8352  vui32_t sigmask = vec_mask128_f128sig();
8353  vui32_t hidden = vec_mask128_f128Lbit();
8354  vui32_t a_norm, b_norm, x_norm;
8355  vui32_t a_s32, b_s32;
8356 
8357  // Extract the significand
8358  // Assume that the sign-bit is already masked off
8359  // Mask off the significands
8360  a_s32 = vec_and ((vui32_t) a_mag, sigmask);
8361  b_s32 = vec_and ((vui32_t) b_mag, sigmask);
8362  // Assume that exponents are already extracted and merged
8363  // Compare exponents for denormal, assume finite
8364  x_norm = (vui32_t) vec_cmpgt ((vui32_t) x_exp, q_zero);
8365  a_norm = vec_splat (x_norm, VEC_WE_1);
8366  b_norm = vec_splat (x_norm, VEC_WE_3);
8367  // For Normal QP insert (hidden) L-bit into significand
8368  a_sig = (vui128_t) vec_sel (a_s32, a_norm, hidden);
8369  b_sig = (vui128_t) vec_sel (b_s32, b_norm, hidden);
8370  // Correct exponent for zeros or denormals to E_min
8371  // will force 0 exponents for zero/denormal results later
8372  //exp_mask = vec_cmpequd (x_exp, exp_dnrm);
8373  x_exp = vec_selud ( exp_min, x_exp, (vb64_t) x_norm);
8374  // Negate sign for subtract, then use add logic
8375  b_sign = vec_xor (signmask, b_sign);
8376  // Generate the sign difference for signed 0.0
8377  q_sign = vec_xor (a_sign, b_sign);
8378  // Precondition the significands before add so the GRX bits
8379  // are in the least significant 3 bits.
8380  a_sig = vec_slqi (a_sig, 3);
8381  b_sig = vec_slqi (b_sig, 3);
8382 
8383  // If sign(vfa) != sign(vfb) will need to:
8384  // 1) Subtract instead of add significands
8385  // 2) Generate signed zeros
8386  diff_sign = (vui32_t) vec_setb_sq ((vi128_t) q_sign);
8387  // If magnitude(b) > magnitude(a) will need to swap a/b, later
8388  a_lt_b = vec_cmpltuq (a_mag, b_mag);
8389 
8390  // Now swap operands a/b if necessary so a has greater magnitude.
8391  {
8392  vui128_t a_tmp = a_sig;
8393  vui128_t b_tmp = b_sig;
8394  vui64_t x_tmp = vec_swapd (x_exp);
8395 
8396  q_sign = vec_sel (a_sign, b_sign, (vui32_t) a_lt_b);
8397 
8398  x_exp = vec_selud (x_exp, x_tmp, (vb64_t) a_lt_b);
8399  a_exp = vec_splatd (x_exp, VEC_DW_H);
8400  b_exp = vec_splatd (x_exp, VEC_DW_L);
8401  q_exp = a_exp;
8402 
8403  a_sig = vec_seluq (a_tmp, b_tmp, (vb128_t) a_lt_b);
8404  b_sig = vec_seluq (b_tmp, a_tmp, (vb128_t) a_lt_b);
8405  }
8406  // At this point we can assume that:
8407  // The magnitude (vfa) >= magnitude (vfb)
8408  // 1) Exponents (a_exp, b_exp) in the range E_min -> E_max
8409  // 2) a_exp >= b_exp
8410  // 2a) If a_exp == b_exp then a_sig >= b_sig
8411  // 2b) If a_exp > b_exp then
8412  // shift (b_sig) right by (a_exp - b_exp)
8413  // any bits shifted out of b_sig are ORed into the X-bit
8414  if (vec_cmpud_all_lt (b_exp, a_exp))
8415  {
8416  vui64_t d_exp, l_exp;
8417  vui128_t t_sig;
8418  vb128_t exp_mask;
8419  const vui64_t exp_128 = vec_const64_f128_128();
8420 
8421  d_exp = vec_subudm (a_exp, b_exp);
8422  exp_mask = (vb128_t) vec_cmpltud (d_exp, exp_128);
8423  l_exp = vec_subudm (exp_128, d_exp);
8424  t_sig = vec_srq (b_sig, (vui128_t) d_exp);
8425  x_bits = vec_slq (b_sig, (vui128_t) l_exp);
8426  t_sig = vec_seluq ((vui128_t) q_zero, t_sig, exp_mask);
8427  x_bits = vec_seluq (b_sig, x_bits, exp_mask);
8428  p_odd = vec_addcuq (x_bits, (vui128_t) q_ones);
8429  b_sig = (vui128_t) vec_or ((vui32_t) t_sig, (vui32_t) p_odd);
8430  }
8431 
8432  // If operands have the same sign then s_sig = a_sig + b_sig
8433  // Otherwise s_sig = a_sig - b_sig
8434  add_sig = vec_adduqm (a_sig, b_sig);
8435  sub_sig = vec_subuqm (a_sig, b_sig);
8436  s_sig = vec_seluq (add_sig, sub_sig, (vb128_t) diff_sign);
8437 
8438  if (__builtin_expect (vec_cmpuq_all_eq (s_sig, (vui128_t) q_zero), 0))
8439  { // Special case of both zero with different sign
8440  q_sign = vec_sel (a_sign, (vui32_t) q_zero, diff_sign);
8441  return vec_xfer_vui32t_2_bin128 (q_sign);
8442  }
8443 
8444  // Isolate the CL bits from the significand to simplify the compare
8445 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
8446  vui8_t t_sig = vec_splat ((vui8_t) s_sig, 14);
8447 #else
8448  vui8_t t_sig = vec_splat ((vui8_t) s_sig, 1);
8449 #endif
8450 // if (vec_cmpuq_all_gt (s_sig, (vui128_t) sigov))
8451  if (vec_all_gt (t_sig, t_sig_C))
8452  { // Check for carry and adjust
8453  p_odd = (vui128_t) vec_and ((vui32_t) s_sig, (vui32_t) xbitmask);
8454  s_sig = vec_srqi (s_sig, 1);
8455  s_sig = (vui128_t) vec_or ((vui32_t) s_sig, (vui32_t) p_odd);
8456  q_exp = vec_addudm (q_exp, exp_one);
8457  }
8458  else // if (vec_cmpuq_all_le (s_sig, (vui128_t) sigovt))
8459  if (vec_all_le (t_sig, t_sig_L))
8460  {
8461  // Or the significand is below normal range.
8462  // This can happen with subtraction.
8463  vui64_t c_exp, d_exp;
8464  vui128_t c_sig;
8465  const vui64_t exp_12 = vec_splat_u64 (12);
8466 
8467  c_sig = vec_clzq (s_sig);
8468  c_exp = vec_splatd ((vui64_t) c_sig, VEC_DW_L);
8469  // The IR has 12 leading zeros that should not affect the shift count.
8470  c_exp = vec_subudm (c_exp, exp_12);
8471  d_exp = vec_subudm (q_exp, (vui64_t) exp_min);
8472  d_exp = vec_minud (c_exp, d_exp);
8473  {
8474  vb64_t nrm_mask = vec_cmpgtsd ((vi64_t) q_exp, (vi64_t) exp_min);
8475  vb64_t exp_mask = vec_cmpgtud (q_exp, c_exp);
8476 
8477  c_sig = vec_slq (s_sig, (vui128_t) d_exp);
8478  q_exp = vec_subudm (q_exp, d_exp);
8479  exp_mask = (vb64_t) vec_and ((vui32_t) exp_mask, (vui32_t) nrm_mask);
8480  q_exp = vec_selud (exp_dnrm, q_exp, exp_mask);
8481  s_sig = vec_seluq (s_sig, c_sig, (vb128_t) nrm_mask);
8482  }
8483  }
8484  // Round to odd from low order GRX-bits
8485  p_tmp = (vui128_t) vec_and ((vui32_t) s_sig, (vui32_t) grx_mask);
8486  p_odd = vec_addcuq (p_tmp, (vui128_t) q_ones);
8487  q_sig = vec_srqi (s_sig, 3);
8488  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, (vui32_t) p_odd);
8489  // Check for exponent overflow -> __FLT128_MAX__
8490  if (__builtin_expect ((vec_cmpud_all_ge ( q_exp, exp_naninf)), 0))
8491  {
8492  // return maximum finite exponent and significand
8493  const vui32_t f128_max = CONST_VINT128_W(0x7ffeffff, -1, -1, -1);
8494  vui32_t f128_smax = vec_or ((vui32_t) f128_max, q_sign);
8495  return vec_xfer_vui32t_2_bin128 (f128_smax);
8496  }
8497  // Merge sign, significand, and exponent into final result
8498  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
8499  result = vec_xsiexpqp (q_sig, q_exp);
8500  return result;
8501  }
8502  else // One or both operands are NaN or Infinity
8503  {
8504  //const vui32_t q_nan = CONST_VINT128_W(0x00008000, 0, 0, 0);
8505  vui32_t q_nan = vec_mask128_f128Qbit ();
8506  // One or both operands are NaN
8507  if (vec_all_isnanf128 (vfa))
8508  {
8509  // vfa is NaN, Convert vfa to QNaN and return
8510  vui32_t vf128 = vec_or_bin128_2_vui32t (vfa, q_nan);
8511  return vec_xfer_vui32t_2_bin128 (vf128);
8512  }
8513  else if (vec_all_isnanf128 (vfb))
8514  {
8515  // vfb is NaN, Convert vfb to QNaN and return
8516  vui32_t vf128 = vec_or_bin128_2_vui32t (vfb, q_nan);
8517  return vec_xfer_vui32t_2_bin128 (vf128);
8518  }
8519  else // Or one or both operands are Infinity
8520  {
8521  a_exp = vec_splatd (x_exp, VEC_DW_H);
8522  // b_exp = vec_splatd (x_exp, VEC_DW_L);
8523  if (vec_cmpud_all_eq (x_exp, exp_naninf)
8524  && vec_cmpud_all_eq ((vui64_t) a_sign, (vui64_t) b_sign))
8525  { // Both operands infinity and same sign
8526  // Infinity - Infinity (same sign) is Default Quiet NaN
8527  return vec_const_nanf128 ();
8528  }
8529  else
8530  { // Either both operands infinity and opposite sign
8531  // Or one infinity and one finite
8532  if (vec_cmpud_any_eq (a_exp, exp_naninf))
8533  {
8534  // return infinity
8535  return vfa;
8536  }
8537  else
8538  {
8539  // return infinity
8540  return vec_negf128(vfb);
8541  }
8542  }
8543  }
8544  }
8545 #endif
8546  return result;
8547 }
8548 
8572 static inline __binary128 vec_xscvdpqp (vf64_t f64)
8573 {
8574  __binary128 result;
8575 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
8576 #if defined (__FLOAT128__) && (__GNUC__ > 9)
8577  // earlier GCC versions generate extra data moves for this.
8578  result = f64[VEC_DW_H];
8579 #else
8580  // No extra data moves here.
8581  __asm__(
8582  "xscvdpqp %0,%1"
8583  : "=v" (result)
8584  : "v" (f64)
8585  : );
8586 #endif
8587 #elif defined (_ARCH_PWR8)
8588  vui64_t d_exp, d_sig, q_exp;
8589  vui128_t q_sig;
8590  vui32_t q_sign;
8591  const vui64_t exp_delta = (vui64_t) CONST_VINT64_DW ( (0x3fff - 0x3ff), 0 );
8592  const vui64_t d_naninf = (vui64_t) CONST_VINT64_DW ( 0x7ff, 0 );
8593  const vui64_t d_denorm = (vui64_t) CONST_VINT64_DW ( 0, 0 );
8594  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
8595 
8596 
8597  f64[VEC_DW_L] = 0.0; // clear the right most element to zero.
8598  // Extract the exponent, significand, and sign bit.
8599  d_exp = vec_xvxexpdp (f64);
8600  d_sig = vec_xvxsigdp (f64);
8601  q_sign = vec_and ((vui32_t) f64, signmask);
8602  // The extract sig operation has already tested for finite/subnormal.
8603  // So avoid testing isfinite/issubnormal again by simply testing
8604  // the extracted exponent.
8605  if (__builtin_expect (!vec_cmpud_all_eq (d_exp, d_naninf), 1))
8606  {
8607  if (__builtin_expect (!vec_cmpud_all_eq (d_exp, d_denorm), 1))
8608  {
8609  q_sig = vec_srqi ((vui128_t) d_sig, 4);
8610  q_exp = vec_addudm (d_exp, exp_delta);
8611  }
8612  else
8613  {
8614  if (vec_cmpud_all_eq (d_sig, d_denorm))
8615  {
8616  q_sig = (vui128_t) d_sig;
8617  q_exp = (vui64_t) d_exp;
8618  }
8619  else
8620  { // Must be subnormal but we need to produce a normal QP.
8621  // So need to adjust the quad exponent by the f64 denormal
8622  // exponent (-1023) and any leading '0's in the f64 sig.
8623  // There will be at least 12.
8624  vui64_t q_denorm = (vui64_t) CONST_VINT64_DW ( (0x3fff - (1023 - 12)), 0 );
8625  vui64_t f64_clz;
8626  f64_clz = vec_clzd (d_sig);
8627  d_sig = vec_vsld (d_sig, f64_clz);
8628  q_exp = vec_subudm (q_denorm, f64_clz);
8629  q_sig = vec_srqi ((vui128_t) d_sig, 15);
8630  }
8631  }
8632  }
8633  else
8634  { // isinf or isnan.
8635  q_sig = vec_srqi ((vui128_t) d_sig, 4);
8636  q_exp = (vui64_t) CONST_VINT64_DW (0x7fff, 0);
8637  }
8638  // Copy Sign-bit to QP significand before insert.
8639  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
8640  // Insert exponent into significand to complete conversion to QP
8641  result = vec_xsiexpqp (q_sig, q_exp);
8642 #else
8643  result = f64[VEC_DW_H];
8644 #endif
8645  return result;
8646 }
8647 
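A hedged round-trip sketch (hypothetical helper): widen the high double element to quad precision, operate there, then narrow back with round-to-odd, assuming vec_xsaddqpo() above and vec_xscvqpdpo() below:

// Usage sketch (hypothetical): compute 2*x for the VEC_DW_H double
// element via quad precision.
static inline vf64_t
example_double_via_quad (vf64_t vfd)
{
  __binary128 q = vec_xscvdpqp (vfd);   // converts element VEC_DW_H
  q = vec_xsaddqpo (q, q);              // any quad-precision operation
  return vec_xscvqpdpo (q);             // back to vector double
}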
8674 static inline vf64_t
8675 vec_xscvqpdpo (__binary128 f128)
8676 {
8677  vf64_t result;
8678 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
8679 #if defined (__FLOAT128__) && (__GNUC__ > 9)
8680  // The GCC runtime does not convert/round directly from __float128 to
8681  // vector double. So convert to scalar double, then copy to vector double.
8682  result = (vf64_t) { 0.0, 0.0 };
8683  result [VEC_DW_H] = __builtin_truncf128_round_to_odd (f128);
8684 #else
8685  // No extra data moves here.
8686  __asm__(
8687  "xscvqpdpo %0,%1"
8688  : "=v" (result)
8689  : "v" (f128)
8690  : );
8691 #endif
8692 #else // defined (_ARCH_PWR8)
8693  vui64_t d_exp, d_sig, x_exp;
8694  vui64_t q_exp;
8695  vui128_t q_sig;
8696  vui32_t q_sign;
8697  const vui128_t q_zero = { 0 };
8698  const vui128_t q_ones = (vui128_t) vec_splat_s32 (-1);
8699  const vui64_t qpdp_delta = (vui64_t) CONST_VINT64_DW ( (0x3fff - 0x3ff), 0 );
8700  const vui64_t exp_tiny = (vui64_t) CONST_VINT64_DW ( (0x3fff - 1022), (0x3fff - 1022) );
8701  const vui64_t exp_high = (vui64_t) CONST_VINT64_DW ( (0x3fff + 1023), (0x3fff + 1023));
8702  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
8703  const vui64_t q_naninf = (vui64_t) CONST_VINT64_DW ( 0x7fff, 0x7fff );
8704  const vui64_t d_naninf = (vui64_t) CONST_VINT64_DW ( 0x7ff, 0 );
8705 
8706  q_exp = vec_xsxexpqp (f128);
8707  x_exp = vec_splatd (q_exp, VEC_DW_H);
8708  q_sig = vec_xsxsigqp (f128);
8709  q_sign = vec_and_bin128_2_vui32t (f128, signmask);
8710  if (__builtin_expect (!vec_cmpud_all_eq (x_exp, q_naninf), 1))
8711  {
8712  if (vec_cmpud_all_ge (x_exp, exp_tiny))
8713  { // Greater than or equal to 2**-1022
8714  if (vec_cmpud_all_le (x_exp, exp_high))
8715  { // Less than or equal to 2**+1023
8716  vui64_t d_X;
8717  // Convert the significand to double with left shift 4
8718  q_sig = vec_slqi ((vui128_t) q_sig, 4);
8719  // The GRX round bits are now in bits 64-127 (DW element 1)
8720  // For round-to-odd just test for any GRX bits nonzero
8721  d_X = (vui64_t) vec_cmpgtud ((vui64_t) q_sig, (vui64_t) q_zero);
8722  d_X = vec_mrgald (q_zero, (vui128_t) d_X);
8723  d_X = (vui64_t) vec_slqi ((vui128_t) d_X, 1);
8724  d_sig = (vui64_t) vec_or ((vui32_t) q_sig, (vui32_t) d_X);
8725  d_exp = vec_subudm (q_exp, qpdp_delta);
8726  }
8727  else
8728  { // Too high to represent as double; round to odd returns __DBL_MAX__
8729  d_sig = (vui64_t) CONST_VINT64_DW (0x001fffffffffffff, 0);
8730  d_exp = (vui64_t) CONST_VINT64_DW (0x7fe, 0);
8731  }
8732  }
8733  else
8734  { // tiny
8735  vui64_t d_X;
8736  vui64_t q_delta;
8737  const vui64_t exp_tinyr = (vui64_t)
8738  CONST_VINT64_DW ( (0x3fff-(1022+53)), (0x3fff-(1022+53)));
8739  q_delta = vec_subudm (exp_tiny, x_exp);
8740  // Set double exp to denormal
8741  d_exp = (vui64_t) q_zero;
8742  if (vec_cmpud_all_gt (x_exp, exp_tinyr))
8743  {
8744  // Convert the significand to double with left shift 4
8745  // The GRX round bits are now in bits 64-127 (DW element 1)
8746  q_sig = vec_slqi ((vui128_t) q_sig, 4);
8747  d_sig = (vui64_t) vec_srq (q_sig, (vui128_t) q_delta);
8748  // For round-to-odd just test for any nonzero GRX bits.
8749  d_X = (vui64_t) vec_cmpgtud ((vui64_t) d_sig, (vui64_t) q_zero);
8750  // Generate a low order 0b1 in DW[0]
8751  d_X = vec_mrgald (q_zero, (vui128_t) d_X);
8752  d_X = (vui64_t) vec_slqi ((vui128_t) d_X, 1);
8753  d_sig = (vui64_t) vec_or ((vui32_t) d_sig, (vui32_t) d_X);
8754  }
8755  else
8756  { // tinyr
8757  // For round-to-odd just test for any nonzero GRX bits.
8758  d_X = (vui64_t) vec_addcuq (q_sig, q_ones);
8759  d_sig = (vui64_t) vec_swapd (d_X);
8760  }
8761  }
8762  }
8763  else
8764  { // isinf or isnan.
8765  const vui64_t q_quiet = CONST_VINT64_DW (0x0000800000000000, 0);
8766  vb128_t is_inf;
8767  vui128_t x_sig;
8768  is_inf = vec_cmpequq ((vui128_t) q_sig, (vui128_t) q_zero);
8769  x_sig = (vui128_t) vec_or ((vui32_t) q_sig, (vui32_t) q_quiet);
8770  q_sig = (vui128_t) vec_sel ((vui32_t)x_sig, (vui32_t)q_sig, (vui32_t)is_inf);
8771  d_sig = (vui64_t)vec_slqi (q_sig, 4);
8772  d_exp = d_naninf;
8773  }
8774 
8775  d_sig [VEC_DW_L] = 0UL;
8776  d_sig = (vui64_t) vec_or ((vui32_t) d_sig, q_sign);
8777  result = vec_xviexpdp (d_sig, d_exp);
8778 #endif
8779  return result;
8780 }
8781 
8806 static inline vui64_t
8807 vec_xscvqpudz (__binary128 f128)
8808 {
8809  vui64_t result;
8810 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
8811  __asm__(
8812  "xscvqpudz %0,%1"
8813  : "=v" (result)
8814  : "v" (f128)
8815  : );
8816 #else
8817  vui64_t q_exp, q_delta, x_exp;
8818  vui128_t q_sig;
8819  vb128_t b_sign;
8820  const vui64_t q_zero = { 0, 0 };
8821  const vui64_t q_ones = { -1, -1 };
8822  const vui64_t exp_low = (vui64_t) CONST_VINT64_DW ( 0x3fff, 0x3fff );
8823  const vui64_t exp_high = (vui64_t) CONST_VINT64_DW ( (0x3fff+64), (0x3fff+64) );
8824  const vui64_t exp_63 = (vui64_t) CONST_VINT64_DW ( (0x3fff+63), (0x3fff+63) );
8825  const vui64_t q_naninf = (vui64_t) CONST_VINT64_DW ( 0x7fff, 0x7fff );
8826 
8827  result = q_zero;
8828  q_exp = vec_xsxexpqp (f128);
8829  q_sig = vec_xsxsigqp (f128);
8830  x_exp = vec_splatd (q_exp, VEC_DW_H);
8831  b_sign = vec_setb_qp (f128);
8832  if (__builtin_expect (!vec_cmpud_all_eq (x_exp, q_naninf), 1))
8833  {
8834  if (vec_cmpud_all_ge (x_exp, exp_low)
8835  && vec_cmpud_all_eq ((vui64_t)b_sign, (vui64_t)q_zero))
8836  { // Greater than or equal to 1.0
8837  if (vec_cmpud_all_lt (x_exp, exp_high))
8838  { // Less than 2**64
8839  q_sig = vec_slqi (q_sig, 15);
8840  q_delta = vec_subudm (exp_63, x_exp);
8841  result = vec_vsrd ((vui64_t) q_sig, q_delta);
8842  }
8843  else
8844  { // set result to 2**64-1
8845  result = q_ones;
8846  }
8847  }
8848  else
8849  { // less than 1.0 or negative
8850  result = q_zero;
8851  }
8852  }
8853  else
8854  { // isinf or isnan.
8855  vb128_t is_inf;
8856  // Positive Inf returns all ones
8857  // else NaN or -Infinity returns zero
8858  is_inf = vec_cmpequq (q_sig, (vui128_t) q_zero);
8859  // result = ~NaN | (pos & Inf) -> Inf & (pos & Inf) -> pos & Inf
8860  result = (vui64_t) vec_andc ((vui32_t) is_inf, (vui32_t) b_sign);
8861  }
8862  result = vec_mrgahd ((vui128_t) result, (vui128_t) q_zero);
8863 #endif
8864  return result;
8865 }
8866 
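A minimal usage sketch (hypothetical helper, using GCC vector element access): the converted value lands in the high doubleword element, with saturation handled by the paths above:

// Usage sketch (hypothetical): truncate toward zero to unsigned 64-bit;
// negative or NaN inputs give 0, +Infinity or values >= 2**64 give all
// ones, per the code above.
static inline unsigned long long
example_qp_to_u64 (__binary128 f128)
{
  vui64_t v = vec_xscvqpudz (f128);
  return v[VEC_DW_H];
}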
8890 static inline vui128_t
8891 vec_xscvqpuqz (__binary128 f128)
8892 {
8893  vui128_t result;
8894 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
8895  __asm__(
8896  "xscvqpuqz %0,%1"
8897  : "=v" (result)
8898  : "v" (f128)
8899  : );
8900 #else
8901  vui64_t q_exp, q_delta, x_exp;
8902  vui128_t q_sig;
8903  vb128_t b_sign;
8904  const vui128_t q_zero = { 0 };
8905  const vui128_t q_ones = (vui128_t) vec_splat_s32 (-1);
8906  const vui64_t exp_low = (vui64_t) CONST_VINT64_DW ( 0x3fff, 0x3fff );
8907  const vui64_t exp_high = (vui64_t) CONST_VINT64_DW ( (0x3fff+128), (0x3fff+128) );
8908  const vui64_t exp_127 = (vui64_t) CONST_VINT64_DW ( (0x3fff+127), (0x3fff+127) );
8909  const vui64_t q_naninf = (vui64_t) CONST_VINT64_DW ( 0x7fff, 0x7fff );
8910 
8911  result = q_zero;
8912  q_exp = vec_xsxexpqp (f128);
8913  q_sig = vec_xsxsigqp (f128);
8914  x_exp = vec_splatd (q_exp, VEC_DW_H);
8915  b_sign = vec_setb_qp (f128);
8916  if (__builtin_expect (!vec_cmpud_all_eq (x_exp, q_naninf), 1))
8917  {
8918  if (vec_cmpud_all_ge (x_exp, exp_low)
8919  && vec_cmpud_all_eq ((vui64_t)b_sign, (vui64_t)q_zero))
8920  { // Greater than or equal to 1.0
8921  if (vec_cmpud_all_lt (x_exp, exp_high))
8922  { // Less than 2**128
8923  q_sig = vec_slqi (q_sig, 15);
8924  q_delta = vec_subudm (exp_127, x_exp);
8925  result = vec_srq (q_sig, (vui128_t) q_delta);
8926  }
8927  else
8928  { // set result to 2**128-1
8929  result = (vui128_t) q_ones;
8930  }
8931  }
8932  else
8933  { // less than 1.0 or negative
8934  result = (vui128_t) q_zero;
8935  }
8936  }
8937  else
8938  { // isinf or isnan.
8939  vb128_t is_inf;
8940  // Positive Inf returns all ones
8941  // else NaN or -Infinity returns zero
8942  is_inf = vec_cmpequq (q_sig, (vui128_t) q_zero);
8943  // result = ~NaN | (pos & Inf) -> Inf & (pos & Inf) -> pos & Inf
8944  result = (vui128_t) vec_andc ((vui32_t) is_inf, (vui32_t) b_sign);
8945  }
8946 #endif
8947  return result;
8948 }
8949 
8974 static inline __binary128 vec_xscvsdqp (vi64_t int64)
8975 {
8976  __binary128 result;
8977 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
8978 #if defined (__FLOAT128__) && (__GNUC__ > 9)
8979  // earlier GCC versions generate extra data moves for this.
8980  result = int64[VEC_DW_H];
8981 #else
8982  // No extra data moves here.
8983  __asm__(
8984  "xscvsdqp %0,%1"
8985  : "=v" (result)
8986  : "v" (int64)
8987  : );
8988 #endif
8989 #elif defined (_ARCH_PWR8)
8990  vui64_t d_sig, q_exp, d_sign, d_neg;
8991  vui128_t q_sig;
8992  vui32_t q_sign;
8993  const vui64_t d_zero = (vui64_t) CONST_VINT64_DW ( 0, 0 );
8994  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
8995 
8996  int64[VEC_DW_L] = 0UL; // clear the right most element to zero.
8997 
8998  if (vec_cmpud_all_eq ((vui64_t) int64, d_zero))
8999  {
9000  result = vec_xfer_vui64t_2_bin128 (d_zero);
9001  }
9002  else
9003  {
9004  // We need to produce a normal QP, so we treat the integer like a
9005  // denormal, then normalize it.
9006  // Start with the quad exponent bias + 63 then subtract the count of
9007  // leading '0's. The 64-bit magnitude has 1-63 leading '0's
9008  vui64_t q_expm = (vui64_t) CONST_VINT64_DW ((0x3fff + 63), 0 );
9009  vui64_t i64_clz;
9010  // Convert 2s complement to signed magnitude form.
9011  q_sign = vec_and ((vui32_t) int64, signmask);
9012  d_neg = vec_subudm (d_zero, (vui64_t)int64);
9013  d_sign = (vui64_t) vec_cmpequd ((vui64_t) q_sign, (vui64_t) signmask);
9014  d_sig = (vui64_t) vec_sel ((vui32_t) int64, (vui32_t) d_neg, (vui32_t) d_sign);
9015  // Count leading zeros and normalize.
9016  i64_clz = vec_clzd (d_sig);
9017  d_sig = vec_vsld (d_sig, i64_clz);
9018  q_exp = vec_subudm (q_expm, i64_clz);
9019  q_sig = vec_srqi ((vui128_t) d_sig, 15);
9020  // Copy Sign-bit to QP significand before insert.
9021  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
9022  // Insert exponent into significand to complete conversion to QP
9023  result = vec_xsiexpqp (q_sig, q_exp);
9024  }
9025 #else
9026  result = int64[VEC_DW_H];
9027 #endif
9028  return result;
9029 }
9030 
9054 static inline __binary128 vec_xscvudqp (vui64_t int64)
9055 {
9056  __binary128 result;
9057 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
9058 #if defined (__FLOAT128__) && (__GNUC__ > 9)
9059  // earlier GCC versions generate extra data moves for this.
9060  result = int64[VEC_DW_H];
9061 #else
9062  // No extra data moves here.
9063  __asm__(
9064  "xscvudqp %0,%1"
9065  : "=v" (result)
9066  : "v" (int64)
9067  : );
9068 #endif
9069 #elif defined (_ARCH_PWR8)
9070  vui64_t d_sig, q_exp;
9071  vui128_t q_sig;
9072  const vui64_t d_zero = (vui64_t) CONST_VINT64_DW ( 0, 0 );
9073 
9074  int64[VEC_DW_L] = 0UL; // clear the right most element to zero.
9075  d_sig = int64;
9076  // Quick test for 0UL as this case requires a special exponent.
9077  if (vec_cmpud_all_eq (int64, d_zero))
9078  {
9079  result = vec_xfer_vui64t_2_bin128 (d_zero);
9080  }
9081  else
9082  { // We need to produce a normal QP, so we treat the integer like a
9083  // denormal, then normalize it.
9084  // Start with the quad exponent bias + 63 then subtract the count of
9085  // leading '0's. The 64-bit sig can have 0-63 leading '0's.
9086  const vui64_t q_expm = (vui64_t) CONST_VINT64_DW ((0x3fff + 63), 0 );
9087  vui64_t i64_clz = vec_clzd (int64);
9088  d_sig = vec_vsld (int64, i64_clz);
9089  q_exp = vec_subudm (q_expm, i64_clz);
9090  q_sig = vec_srqi ((vui128_t) d_sig, 15);
9091  // Insert exponent into significand to complete conversion to QP
9092  result = vec_xsiexpqp (q_sig, q_exp);
9093  }
9094 #else
9095  result = int64[VEC_DW_H];
9096 #endif
9097  return result;
9098 }
9099 
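A minimal sketch (hypothetical helper) converting a scalar 64-bit count; the conversion is exact because any 64-bit integer fits in the 113-bit quad-precision significand:

// Usage sketch (hypothetical): place the scalar in the high doubleword
// element expected by vec_xscvudqp(), then convert.
static inline __binary128
example_u64_to_qp (unsigned long long x)
{
  vui64_t v = { 0, 0 };
  v[VEC_DW_H] = x;
  return vec_xscvudqp (v);
}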
9130 static inline __binary128 vec_xscvsqqp (vi128_t int128)
9131 {
9132  __binary128 result;
9133 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
9134  __asm__(
9135  "xscvsqqp %0,%1"
9136  : "=v" (result)
9137  : "v" (int128)
9138  : );
9139 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9140  __binary128 hi64, lo64, i_sign;
9141  __binary128 two64 = 0x1.0p64;
9142  vui128_t q_sig;
9143  vui32_t q_sign;
9144  vui128_t q_neg;
9145  vb128_t b_sign;
9146  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
9147  // Collect the sign bit of the input value.
9148  q_sign = vec_and ((vui32_t) int128, signmask);
9149  // Convert 2s complement to unsigned magnitude form.
9150  q_neg = (vui128_t) vec_negsq (int128);
9151  b_sign = vec_setb_sq (int128);
9152  q_sig = vec_seluq ((vui128_t) int128, q_neg, b_sign);
9153  // Generate a signed 0.0 to use with vec_copysignf128
9154  i_sign = vec_xfer_vui32t_2_bin128 (q_sign);
9155  // Convert the unsigned int128 magnitude to __binary128
9156  vui64_t int64 = (vui64_t) q_sig;
9157  hi64 = int64[VEC_DW_H];
9158  lo64 = int64[VEC_DW_L];
9159  result = (hi64 * two64) + lo64;
9160  // Copy the __int128's sign into the __binary128 result
9161  result = vec_copysignf128 (i_sign, result);
9162 #elif defined (_ARCH_PWR8)
9163  vui64_t q_exp;
9164  vui128_t q_sig;
9165  vui128_t q_neg;
9166  vui32_t q_sign;
9167  vb128_t b_sign;
9168  const vui128_t q_zero = (vui128_t) { 0 };
9169  const vui32_t lowmask = CONST_VINT128_W ( 0, 0, 0, 1);
9170  const vui32_t signmask = CONST_VINT128_W (0x80000000, 0, 0, 0);
9171  // Quick test for 0UL as this case requires a special exponent.
9172  if (vec_cmpuq_all_eq ((vui128_t) int128, q_zero))
9173  {
9174  result = vec_xfer_vui128t_2_bin128 (q_zero);
9175  }
9176  else
9177  { // We need to produce a normal QP, so we treat the integer like a
9178  // denormal, then normalize it.
9179  // Collect the sign bit of the input value.
9180  q_sign = vec_and ((vui32_t) int128, signmask);
9181  // Convert 2s complement to signed magnitude form.
9182  q_neg = (vui128_t) vec_negsq (int128);
9183  b_sign = vec_setb_sq (int128);
9184  q_sig = vec_seluq ((vui128_t) int128, q_neg, b_sign);
9185  // Start with the quad exponent bias + 127 then subtract the count of
9186  // leading '0's. The 128-bit sig can have 0-127 leading '0's.
9187  vui64_t q_expm = (vui64_t) CONST_VINT64_DW (0, (0x3fff + 127));
9188  vui64_t i64_clz = (vui64_t) vec_clzq (q_sig);
9189  q_sig = vec_slq (q_sig, (vui128_t) i64_clz);
9190  q_exp = vec_subudm (q_expm, i64_clz);
9191  // This is the part that might require rounding.
9192 
9193  // The Significand (including the L-bit) is right justified in
9194  // the high-order 113-bits of q_sig.
9195  // The guard, round, and sticky (GRX) bits are in the low-order
9196  // 15 bits.
9197  // The sticky-bits are the last 13 bits and are logically ORed
9198  // (or added to 0x1fff) to produce the X-bit.
9199  //
9200  // For "Round to Nearest Even".
9201  // GRX = 0b001 - 0b011; truncate
9202  // GRX = 0b100 and bit-112 is odd; round up, otherwise truncate
9203  // GRX = 0b100 - 0b111; round up
9204  // We can simplify by copying bit-112 and OR it with bit-X
9205  // Then add 0x3fff to q_sig will generate a carry into bit-112
9206  // if and only if GRX > 0b100 or (GRX == 0b100) && (bit-112 == 1)
9207  const vui32_t RXmask = CONST_VINT128_W ( 0, 0, 0, 0x3fff);
9208  vui128_t q_carry, q_sigc;
9209  vb128_t qcmask;
9210  vui32_t q_odd;
9211  // Isolate bit-112 and OR into GRX bits if q_sig is odd
9212  q_odd = (vui32_t) vec_srhi ((vui16_t)q_sig, 15);
9213  q_odd = vec_and (q_odd, lowmask);
9214  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_odd);
9215  // We add 0x3fff to GRX-bits which may carry into low order sig-bit
9216  // This may result in a carry out of bit L into bit-C.
9217  q_carry = vec_addcuq (q_sig, (vui128_t) RXmask);
9218  q_sig = vec_adduqm (q_sig, (vui128_t) RXmask);
9219  // Generate a bool mask from the carry to use in the vsel
9220  qcmask = vec_setb_cyq (q_carry);
9221  // Two cases; 1) We did carry so shift (double) left 112 bits
9222  q_sigc = vec_sldqi (q_carry, q_sig, 112);
9223  // 2) no carry so shift left 15 bits
9224  q_sig = vec_srqi ((vui128_t) q_sig, 15);
9225  // Select which based on carry
9226  q_sig = (vui128_t) vec_sel ((vui32_t) q_sig, (vui32_t) q_sigc, (vui32_t) qcmask);
9227  // Increment the exponent based on the carry
9228  q_exp = vec_addudm (q_exp, (vui64_t) q_carry);
9229 
9230  q_exp = vec_swapd (q_exp);
9231  // Copy Sign-bit to QP significand before insert.
9232  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
9233  result = vec_xsiexpqp (q_sig, q_exp);
9234  }
9235 #else
9236  result = int128[0];
9237 #endif
9238  return result;
9239 }
9240 
9271 static inline __binary128 vec_xscvuqqp (vui128_t int128)
9272 {
9273  __binary128 result;
9274 #if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
9275  __asm__(
9276  "xscvuqqp %0,%1"
9277  : "=v" (result)
9278  : "v" (int128)
9279  : );
9280 #elif defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9281  vui64_t int64 = (vui64_t) int128;
9282  __binary128 hi64, lo64;
9283  __binary128 two64 = 0x1.0p64;
9284  hi64 = int64[VEC_DW_H];
9285  lo64 = int64[VEC_DW_L];
9286  result = (hi64 * two64) + lo64;
9287 #elif defined (_ARCH_PWR8)
9288  vui64_t q_exp;
9289  vui128_t q_sig;
9290  const vui128_t q_zero = (vui128_t) { 0 };
9291  const vui32_t lowmask = CONST_VINT128_W ( 0, 0, 0, 1);
9292 
9293  q_sig = int128;
9294  // Quick test for 0UL as this case requires a special exponent.
9295  if (vec_cmpuq_all_eq (q_sig, q_zero))
9296  {
9297  result = vec_xfer_vui128t_2_bin128 (q_zero);
9298  }
9299  else
9300  { // We need to produce a normal QP, so we treat the integer like a
9301  // denormal, then normalize it.
9302  // Start with the quad exponent bias + 127 then subtract the count of
9303  // leading '0's. The 128-bit sig can have 0-127 leading '0's.
9304  vui64_t q_expm = (vui64_t) CONST_VINT64_DW (0, (0x3fff + 127));
9305  vui64_t i64_clz = (vui64_t) vec_clzq (q_sig);
9306  q_sig = vec_slq (q_sig, (vui128_t) i64_clz);
9307  q_exp = vec_subudm (q_expm, i64_clz);
9308  // This is the part that might require rounding.
9309  // The Significand (including the L-bit) is right justified in
9310  // the high-order 113-bits of q_sig.
9311  // The guard, round, and sticky (GRX) bits are in the low-order
9312  // 15 bits.
9313  // The sticky-bits are the last 13 bits and are logically ORed
9314  // (or added to 0x1fff) to produce the X-bit.
9315  //
9316  // For "Round to Nearest Even".
9317  // GRX = 0b001 - 0b011; truncate
9318  // GRX = 0b100 and bit-112 is odd; round up, otherwise truncate
9319  // GRX = 0b100 - 0b111; round up
9320  // We can simplify by copying bit-112 and OR it with bit-X
9321  // Then add 0x3fff to q_sig will generate a carry into bit-112
9322  // if and only if GRX > 0b100 or (GRX == 0b100) && (bit-112 == 1)
9323  const vui32_t RXmask = CONST_VINT128_W ( 0, 0, 0, 0x3fff);
9324  vui128_t q_carry, q_sigc;
9325  vb128_t qcmask;
9326  vui32_t q_odd;
9327  // Isolate bit-112 and OR into GRX bits if q_sig is odd
9328  q_odd = (vui32_t) vec_srhi ((vui16_t)q_sig, 15);
9329  q_odd = vec_and (q_odd, lowmask);
9330  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_odd);
9331  // We add 0x3fff to GRX-bits which may carry into low order sig-bit
9332  // This may result in a carry out of bit L into bit-C.
9333  q_carry = vec_addcuq (q_sig, (vui128_t) RXmask);
9334  q_sig = vec_adduqm (q_sig, (vui128_t) RXmask);
9335  // Generate a bool mask from the carry to use in the vsel
9336  qcmask = vec_setb_cyq (q_carry);
9337  // Two cases; 1) We did carry so shift (double) left 112 bits
9338  q_sigc = vec_sldqi (q_carry, q_sig, 112);
9339  // 2) no carry so shift left 15 bits
9340  q_sig = vec_srqi ((vui128_t) q_sig, 15);
9341  // Select which based on carry
9342  q_sig = (vui128_t) vec_sel ((vui32_t) q_sig, (vui32_t) q_sigc, (vui32_t) qcmask);
9343  // Increment the exponent based on the carry
9344  q_exp = vec_addudm (q_exp, (vui64_t) q_carry);
9345  q_exp = vec_swapd (q_exp);
9346  result = vec_xsiexpqp (q_sig, q_exp);
9347  }
9348 #else
9349  result = int128[0];
9350 #endif
9351  return result;
9352 }
9353 
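A hedged sketch (hypothetical helper) of a typical producer of full 128-bit magnitudes, assuming vec_muleud() from vec_int128_ppc.h (already included by this header); values wider than 113 significant bits are rounded to nearest even as described above:

// Usage sketch (hypothetical): convert the 128-bit product of two
// unsigned doublewords (even elements) to quad precision.
static inline __binary128
example_prod_to_qp (vui64_t a, vui64_t b)
{
  vui128_t prod = vec_muleud (a, b);
  return vec_xscvuqqp (prod);
}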
9379 static inline __binary128
9380 vec_xsmulqpo (__binary128 vfa, __binary128 vfb)
9381 {
9382  __binary128 result;
9383 #if defined (_ARCH_PWR9) && (__GNUC__ > 7)
9384 #if defined (__FLOAT128__) && (__GNUC__ > 8)
9385  // earlier GCC versions generate extra data moves for this.
9386  result = __builtin_mulf128_round_to_odd (vfa, vfb);
9387 #else
9388  // No extra data moves here.
9389  __asm__(
9390  "xsmulqpo %0,%1,%2"
9391  : "=v" (result)
9392  : "v" (vfa), "v" (vfb)
9393  : );
9394 #endif
9395  return result;
9396 #else //_ARCH_PWR8 or _ARCH_PWR7
9397  vui64_t q_exp, a_exp, b_exp, x_exp;
9398  vui128_t q_sig, a_sig, b_sig, p_sig_h, p_sig_l, p_odd;
9399  vui32_t q_sign, a_sign, b_sign;
9400  vui128_t a_mag, b_mag;
9401  const vui32_t q_zero = CONST_VINT128_W(0, 0, 0, 0);
9402  const vui32_t q_ones = CONST_VINT128_W(-1, -1, -1, -1);
9403  //const vui64_t exp_naninf = (vui64_t) { 0x7fff, 0x7fff };
9404  const vui64_t exp_naninf = vec_mask64_f128exp ();
9405  const vui32_t magmask = vec_mask128_f128mag ();
9406 
9407  // Vector extract the exponents from vfa, vfb
9408  x_exp = vec_xxxexpqpp (vfa, vfb);
9409  // Mask off sign bits so can use integers for magnitude compare.
9410  a_mag = (vui128_t) vec_and_bin128_2_vui32t (vfa, magmask);
9411  b_mag = (vui128_t) vec_and_bin128_2_vui32t (vfb, magmask);
9412  a_sign = vec_andc_bin128_2_vui32t (vfa, magmask);
9413  b_sign = vec_andc_bin128_2_vui32t (vfb, magmask);
9414  q_sign = vec_xor (a_sign, b_sign);
9415 
9416 // if (vec_all_isfinitef128 (vfa) && vec_all_isfinitef128 (vfb))
9417  if (__builtin_expect (vec_cmpud_all_lt (x_exp, exp_naninf), 1))
9418  {
9419  const vui64_t exp_dnrm = (vui64_t) q_zero;
9420  vui64_t exp_min, exp_one, exp_bias;
9421  vui128_t p_tmp;
9422  // const vui64_t exp_min, exp_one = { 1, 1 };
9423  // exp_min = exp_one = vec_splat_u64 (1);
9424  { // Extract the significands and insert the Hidden bit
9425  //const vui32_t q_zero = CONST_VINT128_W(0, 0, 0, 0);
9426  const vui32_t sigmask = vec_mask128_f128sig ();
9427  vui32_t a_s32, b_s32;
9428  vui16_t a_e16, b_e16, x_hidden;
9429  vb16_t a_norm, b_norm;
9430 
9431  //const vui32_t hidden = vec_mask128_f128Lbit();
9432  x_hidden = vec_splat_u16(1);
9433  // Assume that the operands are finite magnitudes
9434  // Mask off the significands
9435  // Applying sigmask to original inputs can save 2 cycles here
9436  a_s32 = vec_and_bin128_2_vui32t (vfa, sigmask);
9437  b_s32 = vec_and_bin128_2_vui32t (vfb, sigmask);
9438  // But still need a/b_mag for exp extract to clear sign-bit
9439  // Mask off the exponents in high halfword
9440  a_e16 = (vui16_t) vec_andc ((vui32_t) a_mag, sigmask);
9441  b_e16 = (vui16_t) vec_andc ((vui32_t) b_mag, sigmask);
9442  // Compare exponents for finite i.e. > denormal (q_zero)
9443  a_norm = vec_cmpgt (a_e16, (vui16_t) q_zero);
9444  b_norm = vec_cmpgt (b_e16, (vui16_t) q_zero);
9445  // For Normal QP insert (hidden) L-bit into significand
9446  a_sig = (vui128_t) vec_sel ((vui16_t) a_s32, x_hidden, a_norm);
9447  b_sig = (vui128_t) vec_sel ((vui16_t) b_s32, x_hidden, b_norm);
9448  }
9449 
9450  // Precondition the significands before multiply so that the
9451  // high-order 114-bits (C,L,FRACTION) of the product are right
9452  // adjusted in p_sig_h, and the low-order 112-bits are left
9453  // justified in p_sig_l.
9454  // Logically this (multiply) step could be moved after the zero
9455  // test. But this uses a lot of registers and the compiler may
9456  // see this as register pressure and decide to spill and reload
9457  // unrelated data around this block.
9458  // The zero multiply is rare so on average performance is better
9459  // if we get this started now.
9460  a_sig = vec_slqi (a_sig, 8);
9461  b_sig = vec_slqi (b_sig, 8);
9462  p_sig_l = vec_muludq (&p_sig_h, a_sig, b_sig);
9463 
9464  // check for zero significands in multiply
9465  if (__builtin_expect (
9466  (vec_all_eq((vui32_t ) a_sig, (vui32_t ) q_zero)
9467  || vec_all_eq((vui32_t ) b_sig, (vui32_t ) q_zero)),
9468  0))
9469  { // Multiply by zero, return QP signed zero
9470  result = vec_xfer_vui32t_2_bin128 (q_sign);
9471  return result;
9472  }
9473 
9474  // const vui64_t exp_min, exp_one = { 1, 1 };
9475  exp_min = exp_one = vec_splat_u64 (1);
9476  //const vui64_t exp_bias = (vui64_t) { 0x3fff, 0x3fff };
9477  exp_bias = (vui64_t) vec_srhi ((vui16_t) exp_naninf, 1);
9478  { // Compute product exponent q_exp
9479  // Operand exponents should be >= Emin for computation
9480  vb64_t exp_mask;
9481  exp_mask = vec_cmpequd (x_exp, exp_dnrm);
9482  x_exp = vec_selud (x_exp, exp_min, (vb64_t) exp_mask);
9483  // sum exponents across x_exp
9484  q_exp = vec_addudm (x_exp, vec_swapd (x_exp));
9485  // Sum includes 2 x exp_bias, so subtract 1 x exp_bias
9486  q_exp = vec_subudm (q_exp, exp_bias);
9487  }
9488 
9489  // Check for carry; shift right 1 and adjust exp +1
9490  {
9491  vb128_t carry_mask;
9492  vui128_t sig_h, sig_l;
9493  // Test Carry-bit (greater than L-bit)
9494  vui16_t sig_l_mask = vec_splat_u16(1);
9495  vui16_t t_sig = vec_splat ((vui16_t) p_sig_h, VEC_HW_H);
9496  carry_mask = (vb128_t) vec_cmpgt (t_sig, sig_l_mask);
9497  // Shift double quadword right 1 bit
9498  p_tmp = vec_sldqi (p_sig_h, p_sig_l, 120);
9499  sig_h = vec_srqi (p_sig_h, 1);
9500  sig_l = vec_slqi (p_tmp, 7);
9501  // Increment the exponent
9502  x_exp = vec_addudm (q_exp, exp_one);
9503  // Select original or normalized exp/sig
9504  p_sig_h = vec_seluq (p_sig_h, sig_h, carry_mask);
9505  p_sig_l = vec_seluq (p_sig_l, sig_l, carry_mask);
9506  q_exp = vec_selud (q_exp, x_exp, (vb64_t) carry_mask);
9507  }
9508 
9509  // There are two cases for denormal
9510  // 1) The sum of unbiased exponents is less than E_min (tiny).
9511  // 2) The significand is less than 1.0 (C and L-bits are zero).
9512  // 2a) The exponent is > E_min
9513  // 2b) The exponent is == E_min
9514  //
9515  q_sig = p_sig_h;
9516  // Check for Tiny exponent
9517  if (__builtin_expect (
9518  (vec_cmpsd_all_lt ((vi64_t) q_exp, (vi64_t) exp_min)), 0))
9519  {
9520  //const vui64_t exp_128 = (vui64_t) { 128, 128 };
9521  const vui64_t exp_128 = vec_const64_f128_128 ();
9522  const vui64_t too_tiny = (vui64_t) { 116, 116 };
9525  // const vui32_t xmask = CONST_VINT128_W(0x1fffffff, -1, -1, -1);
9526  vui32_t xmask = (vui32_t) vec_srqi ((vui128_t) q_ones, 3);
9527  vui32_t tmp;
9528 
9529  // Intermediate result is tiny, unbiased exponent < -16382
9530  //x_exp = vec_subudm ((vui64_t) exp_tiny, q_exp);
9531  x_exp = vec_subudm (exp_min, q_exp);
9532 
9533  if (vec_cmpud_all_gt ((vui64_t) x_exp, too_tiny))
9534  {
9535  // Intermediate result is too tiny, the shift will
9536  // zero the fraction and the GR-bit leaving only the
9537  // Sticky bit. The X-bit needs to include all bits
9538  // from p_sig_h and p_sig_l
9539  p_sig_l = vec_srqi (p_sig_l, 8);
9540  p_sig_l = (vui128_t) vec_or ((vui32_t) p_sig_l,
9541  (vui32_t) p_sig_h);
9542  // generate a carry into bit-2 for any nonzero bits 3-127
9543  p_sig_l = vec_adduqm (p_sig_l, (vui128_t) xmask);
9544  q_sig = (vui128_t) q_zero;
9545  p_sig_l = (vui128_t) vec_andc ((vui32_t) p_sig_l, xmask);
9546  }
9547  else
9548  { // Normal tiny, right shift may lose low order bits
9549  // from p_sig_l. So collect any 1-bits below GRX and
9550  // OR them into the X-bit, before the right shift.
9551  vui64_t l_exp;
9552 
9553  // Propagate low order bits into the sticky bit
9554  // GRX left adjusted in p_sig_l
9555  // Isolate bits below GRX (bits 3-127).
9556  tmp = vec_and ((vui32_t) p_sig_l, xmask);
9557  // generate a carry into bit-2 for any nonzero bits 3-127
9558  tmp = (vui32_t) vec_adduqm ((vui128_t) tmp, (vui128_t) xmask);
9559  // Or this with the X-bit to propagate any sticky bits into X
9560  p_sig_l = (vui128_t) vec_or ((vui32_t) p_sig_l, tmp);
9561  p_sig_l = (vui128_t) vec_andc ((vui32_t) p_sig_l, xmask);
9562 
9563  l_exp = vec_subudm (exp_128, x_exp);
9564  p_sig_l = vec_sldq (p_sig_h, p_sig_l, (vui128_t) l_exp);
9565  p_sig_h = vec_srq (p_sig_h, (vui128_t) x_exp);
9566  q_sig = p_sig_h;
9567  }
9568  // Set the exponent for denormal
9569  q_exp = exp_dnrm;
9570  }
9571  // Exponent is not tiny but significand may be denormal
9572  // Isolate sig CL bits and compare
9573  vui16_t t_sig = vec_splat ((vui16_t) p_sig_h, VEC_HW_H);
9574  if (__builtin_expect ((vec_all_eq(t_sig, (vui16_t ) q_zero)), 0))
9575  {
9576  // Is below normal range. This can happen when
9577  // multiplying a denormal by a normal.
9578  // So try to normalize the significand.
9579  //const vui64_t exp_15 = { 15, 15 };
9580  const vui64_t exp_15 = vec_splat_u64 (15);
9581  vui64_t c_exp, d_exp;
9582  vui128_t c_sig;
9583  vb64_t exp_mask;
9584  c_sig = vec_clzq (p_sig_h);
9585  c_exp = vec_splatd ((vui64_t) c_sig, VEC_DW_L);
9586  c_exp = vec_subudm (c_exp, exp_15);
9587  d_exp = vec_subudm (q_exp, exp_min);
9588  d_exp = vec_minud (c_exp, d_exp);
9589  exp_mask = vec_cmpgtud (q_exp, c_exp);
9590 
9591  // If the exponent is still above E_min there is room to normalize
9592  if (vec_cmpsd_all_gt ((vi64_t) q_exp, (vi64_t) exp_min))
9593  {
9594  // Try to normalize the significand.
9595  p_sig_h = vec_sldq (p_sig_h, p_sig_l, (vui128_t) d_exp);
9596  p_sig_l = vec_slq (p_sig_l, (vui128_t) d_exp);
9597  q_sig = p_sig_h;
9598  // Compare computed exp to shift count to normalize.
9599  //exp_mask = vec_cmpgtud (q_exp, c_exp);
9600  q_exp = vec_subudm (q_exp, d_exp);
9601  q_exp = vec_selud (exp_dnrm, q_exp, exp_mask);
9602  }
9603  else
9604  { // sig is denormal range (L-bit is 0). Set exp to zero.
9605  q_exp = exp_dnrm;
9606  }
9607  }
9608  // Merging the sign early will not affect rounding for this mode
9609  // q_ssig = vec_or ((vui32_t) q_sig, q_sign);
9610  // Round to odd from lower product bits
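  // (Adding q_ones (all ones) to p_sig_l carries out a 1 if and only if
  //  p_sig_l is nonzero, so p_odd captures the sticky state of all the
  //  discarded product bits; OR-ing it into the least-significant bit
  //  of q_sig implements the round-to-odd "jam".)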
9611  p_odd = vec_addcuq (p_sig_l, (vui128_t) q_ones);
9612  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, (vui32_t) p_odd);
9613 
9614  // Check for exponent overflow -> __FLT128_MAX__ (round to odd)
9615  if (__builtin_expect ((vec_cmpud_all_ge (q_exp, exp_naninf)), 0))
9616  {
9617  // Intermediate result is huge, unbiased exponent > 16383
9618  // so return __FLT128_MAX__ with the appropriate sign.
9619  const vui32_t f128_max = CONST_VINT128_W(0x7ffeffff, -1, -1, -1);
9620  vui32_t f128_smax = vec_or ((vui32_t) f128_max, q_sign);
9621  return vec_xfer_vui32t_2_bin128 (f128_smax);
9622  }
9623  else // combine sign, exp, and significand for return
9624  {
9625  // Merge sign, significand, and exponent into final result
9626  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
9627  vui32_t tmp, t128;
9628  // convert DW exp_naninf to QW expmask
9629  vui32_t expmask = vec_sld ((vui32_t) exp_naninf, q_zero, 14);
9630  // convert q_exp from DW to QW for QP format
9631  tmp = vec_sld ((vui32_t) q_exp, q_zero, 14);
9632  t128 = vec_sel ((vui32_t) q_sig, tmp, expmask);
9633  result = vec_xfer_vui32t_2_bin128 (t128);
9634  return result;
9635  }
9636  }
9637  else
9638  { // One or both operands are NaN or Infinity
9639  //const vui32_t q_nan = CONST_VINT128_W(0x00008000, 0, 0, 0);
9640  vui32_t q_nan = vec_mask128_f128Qbit ();
9641  vui32_t q_inf = vec_mask128_f128exp ();
9642  // One or both operands are NaN
9643  if (vec_all_isnanf128 (vfa))
9644  {
9645  // vfa is NaN, Convert vfa to QNaN and return
9646  vui32_t vf128 = vec_or_bin128_2_vui32t (vfa, q_nan);
9647  return vec_xfer_vui32t_2_bin128 (vf128);
9648  }
9649  else if (vec_all_isnanf128 (vfb))
9650  {
9651  // vfb is NaN, Convert vfb to QNaN and return
9652  vui32_t vf128 = vec_or_bin128_2_vui32t (vfb, q_nan);
9653  return vec_xfer_vui32t_2_bin128 (vf128);
9654  }
9655  else // Or one or both operands are Infinity
9656  {
9657  if (vec_cmpud_all_eq (x_exp, (vui64_t) exp_naninf))
9658  {
9659  // Infinity x Infinity == signed Infinity
9660  q_sig = (vui128_t) q_inf;
9661  }
9662  else
9663  {
9664  // One Infinity and one finite value; check for 0.0
9665  if (vec_cmpuq_all_eq (a_mag, (vui128_t) q_zero)
9666  || vec_cmpuq_all_eq (b_mag, (vui128_t) q_zero))
9667  {
9668  // Infinity x Zero is the default Quiet NaN
9669  return vec_const_nanf128 ();
9670  }
9671  else // an Infinity and a Nonzero finite number
9672  {
9673  // Return Infinity with product sign.
9674  q_sig = (vui128_t) q_inf;
9675  }
9676  }
9677  // Merge sign, exp/sig into final result
9678  q_sig = (vui128_t) vec_or ((vui32_t) q_sig, q_sign);
9679  return vec_xfer_vui128t_2_bin128 (q_sig);
9680  }
9681  }
9682 #endif
9683  return result;
9684 }
9685 
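// Illustrative usage sketch (not part of vec_f128_ppc.h): a thin wrapper
// around vec_xsmulqpo() above. Round-to-odd is typically used for
// intermediate results that will be rounded again later, since it avoids
// double rounding. The example_* name is hypothetical.
static inline __binary128
example_mul_qpo (__binary128 vfa, __binary128 vfb)
{
  // POWER9 compiles this to xsmulqpo; POWER7/8 use the soft-float path above.
  return vec_xsmulqpo (vfa, vfb);
}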
9715 static inline __binary128
9716 vec_xsiexpqp (vui128_t sig, vui64_t exp)
9717 {
9718  __binary128 result;
9719 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9720  __asm__(
9721  "xsiexpqp %0,%1,%2"
9722  : "=v" (result)
9723  : "v" (sig), "v" (exp)
9724  : );
9725 
9726 #else
9727  vui32_t tmp, t128;
9728  vui32_t expmask = vec_mask128_f128exp();
9729 
9730  tmp = vec_sld ((vui32_t) exp, (vui32_t) exp, 6);
9731  t128 = vec_sel ((vui32_t) sig, tmp, expmask);
9732  result = vec_xfer_vui32t_2_bin128 (t128);
9733 #endif
9734  return result;
9735 }
9736 
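// Illustrative usage sketch (not part of vec_f128_ppc.h): build the
// quad-precision value 2.0 from an explicit significand and biased
// exponent with vec_xsiexpqp(). The example_* name is hypothetical.
static inline __binary128
example_qp_two (void)
{
  // Significand 1.0: L-bit (0x0001 in the top halfword), zero fraction.
  const vui128_t sig = (vui128_t) CONST_VINT128_DW (0x0001000000000000UL, 0);
  // Biased exponent for 2.0 is 0x4000 (bias 0x3fff plus 1).
  const vui64_t exp = CONST_VINT64_DW (0x4000, 0x4000);
  return vec_xsiexpqp (sig, exp);
}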
9764 static inline vui64_t
9765 vec_xsxexpqp (__binary128 f128)
9766 {
9767  vui64_t result;
9768 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9769  __asm__(
9770  "xsxexpqp %0,%1"
9771  : "=v" (result)
9772  : "v" (f128)
9773  : );
9774 
9775 #else
9776  vui32_t tmp;
9777  vui32_t expmask = vec_mask128_f128exp();
9778 
9779  tmp = vec_and_bin128_2_vui32t (f128, expmask);
9780  result = (vui64_t) vec_sld (tmp, tmp, 10);
9781 #endif
9782  return result;
9783 }
9784 
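// Illustrative usage sketch (not part of vec_f128_ppc.h): remove the
// binary128 exponent bias (0x3fff) from the extracted exponent, deriving
// the bias the same way vec_xsmulqpo() derives exp_bias above. Only the
// high doubleword of the result is meaningful. The example_* name is
// hypothetical.
static inline vi64_t
example_qp_unbiased_exp (__binary128 f128)
{
  const vui64_t exp_naninf = vec_mask64_f128exp (); // {0x7fff, 0x7fff}
  const vui64_t exp_bias = (vui64_t) vec_srhi ((vui16_t) exp_naninf, 1); // {0x3fff, 0x3fff}
  vui64_t x_exp = vec_xsxexpqp (f128);
  return (vi64_t) vec_subudm (x_exp, exp_bias);
}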
9813 static inline vui128_t
9814 vec_xsxsigqp (__binary128 f128)
9815 {
9816  vui128_t result;
9817 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9818  __asm__(
9819  "xsxsigqp %0,%1"
9820  : "=v" (result)
9821  : "v" (f128)
9822  : );
9823 #else
9824  vui32_t t128, tmp, normal;
9825  const vui32_t zero = CONST_VINT128_W (0, 0, 0, 0);
9826 #if 1
9827  const vui32_t sigmask = vec_mask128_f128sig();
9828  const vui32_t expmask = vec_mask128_f128exp();
9829  const vui32_t hidden = vec_mask128_f128Lbit();
9830 #else
9831  const vui32_t sigmask = CONST_VINT128_W (0x0000ffff, -1, -1, -1);
9832  const vui32_t expmask = CONST_VINT128_W (0x7fff0000, 0, 0, 0);
9833  const vui32_t hidden = CONST_VINT128_W (0x00010000, 0, 0, 0);
9834 #endif
9835 
9836  // Check if f128 is normal. Normal values need the hidden bit
9837  // restored to the significand. We use a simpler sequence here as
9838  // vec_isnormalf128 does more than we need.
9839  tmp = vec_and_bin128_2_vui32t (f128, expmask);
9840  normal = (vui32_t) vec_nor (vec_cmpeq (tmp, expmask),
9841  vec_cmpeq (tmp, zero));
9842  t128 = vec_and_bin128_2_vui32t (f128, sigmask);
9843  result = (vui128_t) vec_sel (t128, normal, hidden);
9844 #endif
9845  return result;
9846 }
9847 
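// Illustrative usage sketch (not part of vec_f128_ppc.h): decompose a
// quad-precision value with vec_xsxexpqp()/vec_xsxsigqp() and reassemble
// it with vec_xsiexpqp(). For finite values this reproduces the magnitude;
// the sign is cleared because vec_xsxsigqp() masks it off. The example_*
// name is hypothetical.
static inline __binary128
example_qp_round_trip (__binary128 f128)
{
  vui64_t exp = vec_xsxexpqp (f128);   // biased exponent, high doubleword
  vui128_t sig = vec_xsxsigqp (f128);  // significand with L-bit restored
  return vec_xsiexpqp (sig, exp);
}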
9870 static inline vui64_t
9871 vec_xxxexpqpp (__binary128 vfa, __binary128 vfb)
9872 {
9873  vui64_t result;
9874 #if defined (_ARCH_PWR9) && defined (__FLOAT128__) && (__GNUC__ > 7)
9875  vui64_t exp_a, exp_b;
9876  __asm__(
9877  "xsxexpqp %0,%2;"
9878  "xsxexpqp %1,%3"
9879  : "=v" (exp_a), "=v" (exp_b)
9880  : "v" (vfa), "v" (vfb)
9881  : );
9882  result = vec_mrgahd ((vui128_t) exp_a, (vui128_t) exp_b);
9883 #else
9884  vui32_t tmp, rtmp, exp_mask;
9885  //const vui32_t expmask = CONST_VINT128_W (0, 0x7fff, 0, 0x7fff);
9886  exp_mask = (vui32_t) vec_mask64_f128exp();
9887  tmp = (vui32_t) vec_mrgh_bin128_2_vui64t (vfa, vfb);
9888  rtmp = vec_sld (tmp, tmp, 10);
9889  result = (vui64_t) vec_and (rtmp, exp_mask);
9890 #endif
9891  return result;
9892 }
9893 
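// Illustrative usage sketch (not part of vec_f128_ppc.h): use the exponent
// pair to test that both operands are finite, mirroring the check at the
// start of vec_xsmulqpo() above. The example_* name is hypothetical.
static inline int
example_both_finite (__binary128 vfa, __binary128 vfb)
{
  const vui64_t exp_naninf = vec_mask64_f128exp (); // {0x7fff, 0x7fff}
  vui64_t x_exp = vec_xxxexpqpp (vfa, vfb);
  // True only if both biased exponents are below the NaN/Infinity encoding.
  return vec_cmpud_all_lt (x_exp, exp_naninf);
}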
9894 #endif /* VEC_F128_PPC_H_ */
vec_cmpltuq
static vb128_t vec_cmpltuq(vui128_t vra, vui128_t vrb)
Vector Compare Less Than Unsigned Quadword.
Definition: vec_int128_ppc.h:3406
vec_absf128
static __binary128 vec_absf128(__binary128 f128)
Absolute Quad-Precision.
Definition: vec_f128_ppc.h:4650
vec_cmpqp_all_uzgt
static int vec_cmpqp_all_uzgt(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6699
vec_isinff128
static vb128_t vec_isinff128(__binary128 f128)
Return a 128-bit vector boolean true if the __float128 value is infinity.
Definition: vec_f128_ppc.h:7626
vec_cmpqp_exp_lt
static int vec_cmpqp_exp_lt(__binary128 vfa, __binary128 vfb)
Vector Compare Exponents Quad-Precision for Less Than.
Definition: vec_f128_ppc.h:7447
VEC_W_H
#define VEC_W_H
Element index for highest order word.
Definition: vec_common_ppc.h:326
vec_cmplesq
static vb128_t vec_cmplesq(vi128_t vra, vi128_t vrb)
Vector Compare Less Than or Equal Signed Quadword.
Definition: vec_int128_ppc.h:3267
vec_cmpud_all_le
static int vec_cmpud_all_le(vui64_t a, vui64_t b)
Vector Compare all Less than equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2287
vec_cmpqp_all_uzne
static int vec_cmpqp_all_uzne(__binary128 vfa, __binary128 vfb)
Vector Compare all Not-Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:7249
vec_cmpgtuq
static vb128_t vec_cmpgtuq(vui128_t vra, vui128_t vrb)
Vector Compare Greater Than Unsigned Quadword.
Definition: vec_int128_ppc.h:3227
vec_xfer_vui32t_2_bin128
static __binary128 vec_xfer_vui32t_2_bin128(vui32_t f128)
Transfer a vector unsigned int to __binary128 scalar.
Definition: vec_f128_ppc.h:4580
vec_subuqm
static vui128_t vec_subuqm(vui128_t vra, vui128_t vrb)
Vector Subtract Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:7439
vec_negsq
static vi128_t vec_negsq(vi128_t int128)
Vector Negate Signed Quadword.
Definition: vec_int128_ppc.h:6234
vec_cmpuq_all_lt
static int vec_cmpuq_all_lt(vui128_t vra, vui128_t vrb)
Vector Compare any Less Than Unsigned Quadword.
Definition: vec_int128_ppc.h:3980
vec_cmpqp_all_gt
static int vec_cmpqp_all_gt(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6763
vec_issubnormalf128
static vb128_t vec_issubnormalf128(__binary128 f128)
Return 128-bit vector boolean true value, if the __float128 value is subnormal (denormal).
Definition: vec_f128_ppc.h:7747
vb32_t
__vector __bool int vb32_t
vector of 32-bit bool int elements.
Definition: vec_common_ppc.h:228
vec_mask128_f128Qbit
static vui32_t vec_mask128_f128Qbit(void)
Generate Quadword QNaN-bit mask Immediate.
Definition: vec_f128_ppc.h:3964
vec_xvxexpdp
static vui64_t vec_xvxexpdp(vf64_t vrb)
Vector Extract Exponent Double-Precision.
Definition: vec_f64_ppc.h:1713
vec_cmpgtsq
static vb128_t vec_cmpgtsq(vi128_t vra, vi128_t vrb)
Vector Compare Greater Than Signed Quadword.
Definition: vec_int128_ppc.h:3178
VEC_BYTE_H
#define VEC_BYTE_H
Element index for highest order byte.
Definition: vec_common_ppc.h:350
vec_xfer_vui64t_2_bin128
static __binary128 vec_xfer_vui64t_2_bin128(vui64_t f128)
Transfer a vector unsigned long long to __binary128 scalar.
Definition: vec_f128_ppc.h:4603
vb128_t
__vector __bool __int128 vb128_t
vector of one 128-bit bool __int128 element.
Definition: vec_common_ppc.h:240
vec_sldq
static vui128_t vec_sldq(vui128_t vrw, vui128_t vrx, vui128_t vrb)
Vector Shift Left Double Quadword.
Definition: vec_int128_ppc.h:6613
vec_xfer_bin128_2_vui64t
static vui64_t vec_xfer_bin128_2_vui64t(__binary128 f128)
Transfer function from a __binary128 scalar to a vector long long int.
Definition: vec_f128_ppc.h:4448
vec_cmpeqtoqp
static vb128_t vec_cmpeqtoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:5051
vec_iszerof128
static vb128_t vec_iszerof128(__binary128 f128)
Return 128-bit vector boolean true value, if the value that is +-0.0.
Definition: vec_f128_ppc.h:7819
vec_cmpneuq
static vb128_t vec_cmpneuq(vui128_t vra, vui128_t vrb)
Vector Compare Not Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3475
vec_xscvuqqp
static __binary128 vec_xscvuqqp(vui128_t int128)
VSX Scalar Convert Unsigned-Quadword to Quad-Precision format.
Definition: vec_f128_ppc.h:9271
vec_cmpqp_all_uzeq
static int vec_cmpqp_all_uzeq(__binary128 vfa, __binary128 vfb)
Vector Compare all Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6342
vec_mask128_f128sign
static vui32_t vec_mask128_f128sign(void)
Generate Quadword Quad-Precision Sign-bit mask.
Definition: vec_f128_ppc.h:3899
vec_cmpgeuqp
static vb128_t vec_cmpgeuqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than or Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5378
vec_cmpuq_all_ge
static int vec_cmpuq_all_ge(vui128_t vra, vui128_t vrb)
Vector Compare any Greater Than or Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3848
vec_cmpud_all_lt
static int vec_cmpud_all_lt(vui64_t a, vui64_t b)
Vector Compare all Less than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2311
vec_cmpqp_all_toge
static int vec_cmpqp_all_toge(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than Or Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:6447
vec_minud
static vui64_t vec_minud(vui64_t vra, vui64_t vrb)
Vector Minimum Unsigned Doubleword.
Definition: vec_int64_ppc.h:2663
vec_xsiexpqp
static __binary128 vec_xsiexpqp(vui128_t sig, vui64_t exp)
Scalar Insert Exponent Quad-Precision.
Definition: vec_f128_ppc.h:9716
vec_cmpltuzqp
static vb128_t vec_cmpltuzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5964
CONST_VINT128_W
#define CONST_VINT128_W(__w0, __w1, __w2, __w3)
Arrange word elements of an unsigned int initializer in high->low order. May require an explicit cast.
Definition: vec_common_ppc.h:304
vec_cmpqp_exp_unordered
static int vec_cmpqp_exp_unordered(__binary128 vfa, __binary128 vfb)
Vector Compare Exponents Quad-Precision for Unordered.
Definition: vec_f128_ppc.h:7494
vec_cmpqp_all_uzlt
static int vec_cmpqp_all_uzlt(__binary128 vfa, __binary128 vfb)
Vector Compare all Less Than (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:7078
vec_xviexpdp
static vf64_t vec_xviexpdp(vui64_t sig, vui64_t exp)
Vector Insert Exponent Double-Precision.
Definition: vec_f64_ppc.h:1665
vec_isnormalf128
static vb128_t vec_isnormalf128(__binary128 f128)
Return 128-bit vector boolean true if the __float128 value is normal (Not NaN, Inf,...
Definition: vec_f128_ppc.h:7705
vec_cmpgetoqp
static vb128_t vec_cmpgetoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than or Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:5235
vec_cmpqp_all_tone
static int vec_cmpqp_all_tone(__binary128 vfa, __binary128 vfb)
Vector Compare all Not-Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:7204
__VF_128
Union used to transfer 128-bit data between vector and __float128 types.
Definition: vec_f128_ppc.h:3694
vec_cmpequzqp
static vb128_t vec_cmpequzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5106
vec_cmpuq_all_ne
static int vec_cmpuq_all_ne(vui128_t vra, vui128_t vrb)
Vector Compare all Not Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:4025
CONST_VINT64_DW
#define CONST_VINT64_DW(__dw0, __dw1)
Arrange elements of dword initializer in high->low order.
Definition: vec_common_ppc.h:295
__VF_128::vx8
vui16_t vx8
union field of vector unsigned short elements.
Definition: vec_f128_ppc.h:3699
vui16_t
__vector unsigned short vui16_t
vector of 16-bit unsigned short elements.
Definition: vec_common_ppc.h:204
vec_xfer_bin128_2_vui128t
static vui128_t vec_xfer_bin128_2_vui128t(__binary128 f128)
Transfer function from a __binary128 scalar to a vector __int128.
Definition: vec_f128_ppc.h:4491
vec_xscvudqp
static __binary128 vec_xscvudqp(vui64_t int64)
VSX Scalar Convert Unsigned-Doubleword to Quad-Precision format.
Definition: vec_f128_ppc.h:9054
__VF_128::ix1
unsigned __int128 ix1
union field of __int128 elements.
Definition: vec_f128_ppc.h:3711
vec_isfinitef128
static vb128_t vec_isfinitef128(__binary128 f128)
Return 128-bit vector boolean true if the __float128 value is Finite (Not NaN nor Inf).
Definition: vec_f128_ppc.h:7531
vec_cmpleuqp
static vb128_t vec_cmpleuqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than or Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5818
vec_cmpuq_all_eq
static int vec_cmpuq_all_eq(vui128_t vra, vui128_t vrb)
Vector Compare all Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3804
vec_cmpletoqp
static vb128_t vec_cmpletoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than or Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:5675
vi128_t
__vector __int128 vi128_t
vector of one 128-bit signed __int128 element.
Definition: vec_common_ppc.h:235
vec_cmpud_all_eq
static int vec_cmpud_all_eq(vui64_t a, vui64_t b)
Vector Compare all Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2193
vec_isnanf128
static vb128_t vec_isnanf128(__binary128 f128)
Return 128-bit vector boolean true if the __float128 value is Not a Number (NaN).
Definition: vec_f128_ppc.h:7666
vec_cmpltuqp
static vb128_t vec_cmpltuqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6038
vec_all_isfinitef128
static int vec_all_isfinitef128(__binary128 f128)
Return true if the __float128 value is Finite (Not NaN nor Inf).
Definition: vec_f128_ppc.h:4691
vec_xxxexpqpp
static vui64_t vec_xxxexpqpp(__binary128 vfa, __binary128 vfb)
Vector Extract Exponent Quad-Precision Pair.
Definition: vec_f128_ppc.h:9871
vec_cmpgtuzqp
static vb128_t vec_cmpgtuzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5524
__binary128
vf128_t __binary128
Define __binary128 if not defined by the compiler. Same as __float128 for PPC.
Definition: vec_f128_ppc.h:3680
vec_cmpqp_all_tolt
static int vec_cmpqp_all_tolt(__binary128 vfa, __binary128 vfb)
Vector Compare All Less Than (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:7016
__VF_128::vf1
__binary128 vf1
union field of __float128 elements.
Definition: vec_f128_ppc.h:3709
vec_mask128_f128Cbit
static vui32_t vec_mask128_f128Cbit(void)
Generate Quadword C-bit mask Immediate.
Definition: vec_f128_ppc.h:3922
vui64_t
__vector unsigned long long vui64_t
vector of 64-bit unsigned long long elements.
Definition: vec_common_ppc.h:208
__VF_128::vx1
vui128_t vx1
union field of vector unsigned __int128 elements.
Definition: vec_f128_ppc.h:3705
vec_common_ppc.h
Common definitions and typedef used by the collection of Power Vector Library (pveclib) headers.
vec_splatd
static vui64_t vec_splatd(vui64_t vra, const int ctl)
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of ...
Definition: vec_int64_ppc.h:3382
vec_mask64_f128exp
static vui64_t vec_mask64_f128exp(void)
Generate Doubleword Quad-Precision exponent mask.
Definition: vec_f128_ppc.h:3806
vui8_t
__vector unsigned char vui8_t
vector of 8-bit unsigned char elements.
Definition: vec_common_ppc.h:202
vec_nabsf128
static __binary128 vec_nabsf128(__binary128 f128)
Negative Absolute value Quad-Precision.
Definition: vec_f128_ppc.h:7854
vec_cmpud_all_gt
static int vec_cmpud_all_gt(vui64_t a, vui64_t b)
Vector Compare all Greater Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:2255
vec_cmpqp_all_ne
static int vec_cmpqp_all_ne(__binary128 vfa, __binary128 vfb)
Vector Compare all Not-Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:7301
vec_cmpqp_all_ge
static int vec_cmpqp_all_ge(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than Or Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6573
vec_xscvqpuqz
static vui128_t vec_xscvqpuqz(__binary128 f128)
VSX Scalar Convert with round to zero Quad-Precision to Unsigned Quadword.
Definition: vec_f128_ppc.h:8891
vec_subudm
static vui64_t vec_subudm(vui64_t a, vui64_t b)
Vector Subtract Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:3746
vec_cmpqp_exp_eq
static int vec_cmpqp_exp_eq(__binary128 vfa, __binary128 vfb)
Vector Compare Quad-Precision Exponents for Equal.
Definition: vec_f128_ppc.h:7353
vec_vsld
static vui64_t vec_vsld(vui64_t vra, vui64_t vrb)
Vector Shift Left Doubleword.
Definition: vec_int64_ppc.h:4238
vec_xsxsigqp
static vui128_t vec_xsxsigqp(__binary128 f128)
Scalar Extract Significand Quad-Precision.
Definition: vec_f128_ppc.h:9814
vec_int128_ppc.h
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...
vec_srqi
static vui128_t vec_srqi(vui128_t vra, const unsigned int shb)
Vector Shift Right Quadword Immediate.
Definition: vec_int128_ppc.h:7154
vec_const_huge_valf128
static __binary128 vec_const_huge_valf128()
return a positive infinity.
Definition: vec_f128_ppc.h:4976
__VF_128::vx16
vui8_t vx16
union field of vector unsigned char elements.
Definition: vec_f128_ppc.h:3697
vec_cmpneuzqp
static vb128_t vec_cmpneuzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Not Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6169
vec_splat_u128
static vui128_t vec_splat_u128(const int sim)
Vector Splat Immediate Unsigned Quadword. Extend an unsigned integer constant across the quadword elem...
Definition: vec_int128_ppc.h:6914
vec_srhi
static vui16_t vec_srhi(vui16_t vra, const unsigned int shb)
Vector Shift Right Halfword Immediate.
Definition: vec_int16_ppc.h:1093
vec_clzq
static vui128_t vec_clzq(vui128_t vra)
Vector Count Leading Zeros Quadword for unsigned __int128 elements.
Definition: vec_int128_ppc.h:2918
vec_xor_bin128_2_vui32t
static vui32_t vec_xor_bin128_2_vui32t(__binary128 f128, vui32_t mask)
Transfer a quadword from a __binary128 scalar to a vector int and logical Exclusive OR with mask.
Definition: vec_f128_ppc.h:4175
vec_mask128_f128Lbit
static vui32_t vec_mask128_f128Lbit(void)
Generate Quadword L-bit mask Immediate.
Definition: vec_f128_ppc.h:3943
vec_mask128_f128exp
static vui32_t vec_mask128_f128exp(void)
Generate Quadword Quad-Precision exponent mask.
Definition: vec_f128_ppc.h:3829
vec_mrgh_bin128_2_vui64t
static vui64_t vec_mrgh_bin128_2_vui64t(__binary128 vfa, __binary128 vfb)
Merge High and Transfer function from a pair of __binary128 scalars to a vector long long int.
Definition: vec_f128_ppc.h:4374
vec_muludq
static vui128_t vec_muludq(vui128_t *mulu, vui128_t a, vui128_t b)
Vector Multiply Unsigned Double Quadword.
Definition: vec_int128_ppc.h:5734
vec_cmpud_any_eq
static int vec_cmpud_any_eq(vui64_t a, vui64_t b)
Vector Compare any Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2365
VEC_WE_3
#define VEC_WE_3
Element index for vector splat word 3.
Definition: vec_common_ppc.h:336
vec_mask128_f128sig
static vui32_t vec_mask128_f128sig(void)
Generate Quadword Quad-Precision significand mask.
Definition: vec_f128_ppc.h:3878
vec_xssubqpo
static __binary128 vec_xssubqpo(__binary128 vfa, __binary128 vfb)
VSX Scalar Subtract Quad-Precision using round to Odd.
Definition: vec_f128_ppc.h:8303
vec_cmpqp_all_toeq
static int vec_cmpqp_all_toeq(__binary128 vfa, __binary128 vfb)
Vector Compare all Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:6297
vec_addcuq
static vui128_t vec_addcuq(vui128_t a, vui128_t b)
Vector Add & write Carry Unsigned Quadword.
Definition: vec_int128_ppc.h:2568
vec_cmpuq_all_le
static int vec_cmpuq_all_le(vui128_t vra, vui128_t vrb)
Vector Compare any Less Than or Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3936
vec_xscvsdqp
static __binary128 vec_xscvsdqp(vi64_t int64)
VSX Scalar Convert Signed-Doubleword to Quad-Precision format.
Definition: vec_f128_ppc.h:8974
vui128_t
__vector unsigned __int128 vui128_t
vector of one 128-bit unsigned __int128 element.
Definition: vec_common_ppc.h:237
vb64_t
__vector __bool long long vb64_t
vector of 64-bit bool long long elements.
Definition: vec_common_ppc.h:230
__Float128
vf128_t __Float128
Define __Float128 if not defined by the compiler. Same as __float128 for PPC.
Definition: vec_f128_ppc.h:3677
vec_cmpnetoqp
static vb128_t vec_cmpnetoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Not Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:6113
__VF_128::vx4
vui32_t vx4
union field of vector unsigned int elements.
Definition: vec_f128_ppc.h:3701
vec_clzd
static vui64_t vec_clzd(vui64_t vra)
Vector Count Leading Zeros Doubleword for unsigned long long elements.
Definition: vec_int64_ppc.h:1313
vec_andc_bin128_2_vui128t
static vui128_t vec_andc_bin128_2_vui128t(__binary128 f128, vui128_t mask)
Transfer a quadword from a __binary128 scalar to a vector __int128 and logical AND Complement with ma...
Definition: vec_f128_ppc.h:4220
vec_cmpltsq
static vb128_t vec_cmpltsq(vi128_t vra, vi128_t vrb)
Vector Compare Less Than Signed Quadword.
Definition: vec_int128_ppc.h:3357
vec_all_isinff128
static int vec_all_isinff128(__binary128 f128)
Return true if the __float128 value is infinity.
Definition: vec_f128_ppc.h:4723
vec_setb_cyq
static vb128_t vec_setb_cyq(vui128_t vcy)
Vector Set Bool from Quadword Carry.
Definition: vec_int128_ppc.h:6509
vi64_t
__vector long long vi64_t
vector of 64-bit signed long long elements.
Definition: vec_common_ppc.h:217
vec_or_bin128_2_vui32t
static vui32_t vec_or_bin128_2_vui32t(__binary128 f128, vui32_t mask)
Transfer a quadword from a __binary128 scalar to a vector int and logical OR with mask.
Definition: vec_f128_ppc.h:4130
vec_xfer_bin128_2_vui16t
static vui16_t vec_xfer_bin128_2_vui16t(__binary128 f128)
Transfer function from a __binary128 scalar to a vector short int.
Definition: vec_f128_ppc.h:4306
CONST_VINT128_DW
#define CONST_VINT128_DW(__dw0, __dw1)
Initializer for 128-bits vector, as two unsigned long long elements in high->low order....
Definition: vec_common_ppc.h:298
vec_const64_f128_128
static vui64_t vec_const64_f128_128(void)
Generate doubleword splat constant 128.
Definition: vec_f128_ppc.h:3739
vec_cmpgttoqp
static vb128_t vec_cmpgttoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:5455
vb16_t
__vector __bool short vb16_t
vector of 16-bit bool short elements.
Definition: vec_common_ppc.h:226
vec_vsrd
static vui64_t vec_vsrd(vui64_t vra, vui64_t vrb)
Vector Shift Right Doubleword.
Definition: vec_int64_ppc.h:4377
vec_andc_bin128_2_vui32t
static vui32_t vec_andc_bin128_2_vui32t(__binary128 f128, vui32_t mask)
Transfer a quadword from a __binary128 scalar to a vector int and logical AND Complement with mask.
Definition: vec_f128_ppc.h:4085
vec_setb_qp
static vb128_t vec_setb_qp(__binary128 f128)
Vector Set Bool from Quadword Floating-point.
Definition: vec_f128_ppc.h:7953
vec_cmpuq_all_gt
static int vec_cmpuq_all_gt(vui128_t vra, vui128_t vrb)
Vector Compare any Greater Than Unsigned Quadword.
Definition: vec_int128_ppc.h:3892
vec_const128_f128_128
static vui32_t vec_const128_f128_128(void)
Generate Quadword constant 128.
Definition: vec_f128_ppc.h:3770
vec_cmpgtsd
static vb64_t vec_cmpgtsd(vi64_t a, vi64_t b)
Vector Compare Greater Than Signed Doubleword.
Definition: vec_int64_ppc.h:1571
vec_xfer_vui16t_2_bin128
static __binary128 vec_xfer_vui16t_2_bin128(vui16_t f128)
Transfer a vector unsigned short to __binary128 scalar.
Definition: vec_f128_ppc.h:4557
vec_xvxsigdp
static vui64_t vec_xvxsigdp(vf64_t vrb)
Vector Extract Significand Double-Precision.
Definition: vec_f64_ppc.h:1762
vec_setb_sq
static vb128_t vec_setb_sq(vi128_t vra)
Vector Set Bool from Signed Quadword.
Definition: vec_int128_ppc.h:6576
vec_cmpqp_all_uzge
static int vec_cmpqp_all_uzge(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than Or Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6509
vec_cmplttoqp
static vb128_t vec_cmplttoqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:5895
VEC_DW_L
#define VEC_DW_L
Element index for low order dword.
Definition: vec_common_ppc.h:324
vec_cmpgeuzqp
static vb128_t vec_cmpgeuzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than Or Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5304
vf128_t
vui128_t vf128_t
vector of 128-bit binary128 element. Same as __float128 for PPC.
Definition: vec_f128_ppc.h:3674
vec_sel_bin128_2_bin128
static __binary128 vec_sel_bin128_2_bin128(__binary128 vfa, __binary128 vfb, vb128_t mask)
Select and Transfer from one of two __binary128 scalars under a 128-bit mask. The result is a __binar...
Definition: vec_f128_ppc.h:3992
vec_selud
static vui64_t vec_selud(vui64_t vra, vui64_t vrb, vb64_t vrc)
Vector Select Unsigned Doubleword.
Definition: vec_int64_ppc.h:3354
vec_xscvsqqp
static __binary128 vec_xscvsqqp(vi128_t int128)
VSX Scalar Convert Signed-Quadword to Quad-Precision format.
Definition: vec_f128_ppc.h:9130
vec_const_inff128
static __binary128 vec_const_inff128()
return a positive infinity.
Definition: vec_f128_ppc.h:4988
vec_cmpqp_all_tole
static int vec_cmpqp_all_tole(__binary128 vfa, __binary128 vfb)
Vector Compare All Less Than Or Equal (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:6826
VEC_WE_1
#define VEC_WE_1
Element index for vector splat word 1.
Definition: vec_common_ppc.h:332
vec_f64_ppc.h
Header package containing a collection of 128-bit SIMD operations over 64-bit double-precision floati...
vec_cmpequq
static vb128_t vec_cmpequq(vui128_t vra, vui128_t vrb)
Vector Compare Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3043
vec_cmpleuq
static vb128_t vec_cmpleuq(vui128_t vra, vui128_t vrb)
Vector Compare Less Than or Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3316
__VF_128::vbool1
vb128_t vbool1
union field of vector __bool __int128 elements.
Definition: vec_f128_ppc.h:3707
vec_cmpqp_all_eq
static int vec_cmpqp_all_eq(__binary128 vfa, __binary128 vfb)
Vector Compare all Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6394
vec_mask128_f128mag
static vui32_t vec_mask128_f128mag(void)
Generate Quadword Quad-Precision magnitude mask.
Definition: vec_f128_ppc.h:3858
vec_cmpud_any_ne
static int vec_cmpud_any_ne(vui64_t a, vui64_t b)
Vector Compare any Not Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2508
vec_mrgald
static vui64_t vec_mrgald(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic Low Doublewords.
Definition: vec_int64_ppc.h:2736
vec_cmpgtuqp
static vb128_t vec_cmpgtuqp(__binary128 vfa, __binary128 vfb)
Vector Compare Greater Than (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5598
vec_cmpqp_exp_gt
static int vec_cmpqp_exp_gt(__binary128 vfa, __binary128 vfb)
Vector Compare Exponents Quad-Precision for Greater Than.
Definition: vec_f128_ppc.h:7400
vec_negf128
static __binary128 vec_negf128(__binary128 f128)
Negate the sign bit of a __float128 input and return the resulting __float128 value.
Definition: vec_f128_ppc.h:7886
vec_xfer_bin128_2_vui8t
static vui8_t vec_xfer_bin128_2_vui8t(__binary128 f128)
Transfer function from a __binary128 scalar to a vector char.
Definition: vec_f128_ppc.h:4263
vec_all_issubnormalf128
static int vec_all_issubnormalf128(__binary128 f128)
Return true if the __float128 value is subnormal (denormal).
Definition: vec_f128_ppc.h:4841
vec_cmpequqp
static vb128_t vec_cmpequqp(__binary128 vfa, __binary128 vfb)
Vector Compare Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5169
vec_mrgahd
static vui64_t vec_mrgahd(vui128_t vra, vui128_t vrb)
Vector Merge Algebraic High Doublewords.
Definition: vec_int64_ppc.h:2710
vec_isinf_signf128
static int vec_isinf_signf128(__binary128 f128)
Return true (nonzero) value if the __float128 value is infinity. If infinity, indicate the sign as +1...
Definition: vec_f128_ppc.h:7576
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206
vec_splat_u64
static vui64_t vec_splat_u64(const int sim)
Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword...
Definition: vec_int64_ppc.h:3495
vec_all_isunorderedf128
static int vec_all_isunorderedf128(__binary128 vfa, __binary128 vfb)
Return true if either __float128 value (vra, vrb) is NaN.
Definition: vec_f128_ppc.h:4881
vec_cmpltud
static vb64_t vec_cmpltud(vui64_t a, vui64_t b)
Vector Compare less Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:1771
vec_copysignf128
static __binary128 vec_copysignf128(__binary128 f128x, __binary128 f128y)
Copy the sign bit from f128x and merge with the magnitude from f128y. The merged result is returned a...
Definition: vec_f128_ppc.h:4950
vec_sldqi
static vui128_t vec_sldqi(vui128_t vrw, vui128_t vrx, const unsigned int shb)
Vector Shift Left Double Quadword Immediate.
Definition: vec_int128_ppc.h:6649
vec_and_bin128_2_vui32t
static vui32_t vec_and_bin128_2_vui32t(__binary128 f128, vui32_t mask)
Transfer a quadword from a __binary128 scalar to a vector int and logical AND with a mask.
Definition: vec_f128_ppc.h:4040
vec_mrgl_bin128_2_vui64t
static vui64_t vec_mrgl_bin128_2_vui64t(__binary128 vfa, __binary128 vfb)
Merge Low and Transfer function from a pair of __binary128 scalars to a vector long long int.
Definition: vec_f128_ppc.h:4412
__IBM128
long double __IBM128
Define __IBM128 if not defined by the compiler. Same as old long double for PPC.
Definition: vec_f128_ppc.h:3689
vec_adduqm
static vui128_t vec_adduqm(vui128_t a, vui128_t b)
Vector Add Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:2739
vec_xsaddqpo
static __binary128 vec_xsaddqpo(__binary128 vfa, __binary128 vfb)
VSX Scalar Add Quad-Precision using round to Odd.
Definition: vec_f128_ppc.h:8035
vec_cmpud_all_ge
static int vec_cmpud_all_ge(vui64_t a, vui64_t b)
Vector Compare all Greater Than or Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:2223
vec_xfer_vui8t_2_bin128
static __binary128 vec_xfer_vui8t_2_bin128(vui8_t f128)
Transfer a vector unsigned char to __binary128 scalar.
Definition: vec_f128_ppc.h:4534
vec_xsxexpqp
static vui64_t vec_xsxexpqp(__binary128 f128)
Scalar Extract Exponent Quad-Precision.
Definition: vec_f128_ppc.h:9765
vec_cmpgtud
static vb64_t vec_cmpgtud(vui64_t a, vui64_t b)
Vector Compare Greater Than Unsigned Doubleword.
Definition: vec_int64_ppc.h:1622
__VF_128::vx2
vui64_t vx2
union field of vector unsigned long long elements.
Definition: vec_f128_ppc.h:3703
vec_addudm
static vui64_t vec_addudm(vui64_t a, vui64_t b)
Vector Add Unsigned Doubleword Modulo.
Definition: vec_int64_ppc.h:1261
vf64_t
__vector double vf64_t
vector of 64-bit double elements.
Definition: vec_common_ppc.h:221
vec_xsmulqpo
static __binary128 vec_xsmulqpo(__binary128 vfa, __binary128 vfb)
VSX Scalar Multiply Quad-Precision using round to Odd.
Definition: vec_f128_ppc.h:9380
vec_isunorderedf128
static vb128_t vec_isunorderedf128(__binary128 vfa, __binary128 vfb)
Return 128-bit vector boolean true value, if either __float128 value (vra, vrb) is NaN.
Definition: vec_f128_ppc.h:7793
vec_all_isnormalf128
static int vec_all_isnormalf128(__binary128 f128)
Return true if the __float128 value is normal (Not NaN, Inf, denormal, or zero).
Definition: vec_f128_ppc.h:4807
vec_cmpsd_all_gt
static int vec_cmpsd_all_gt(vi64_t a, vi64_t b)
Vector Compare all Greater Than Signed Doubleword.
Definition: vec_int64_ppc.h:1909
vec_srq
static vui128_t vec_srq(vui128_t vra, vui128_t vrb)
Vector Shift Right Quadword.
Definition: vec_int128_ppc.h:7114
vec_cmpqp_all_togt
static int vec_cmpqp_all_togt(__binary128 vfa, __binary128 vfb)
Vector Compare all Greater Than (Total-order) Quad-Precision.
Definition: vec_f128_ppc.h:6637
vec_xfer_vui128t_2_bin128
static __binary128 vec_xfer_vui128t_2_bin128(vui128_t f128)
Transfer a vector unsigned __int128 to __binary128 scalar.
Definition: vec_f128_ppc.h:4626
vec_cmpqp_all_lt
static int vec_cmpqp_all_lt(__binary128 vfa, __binary128 vfb)
Vector Compare all Less Than (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:7142
vec_xscvqpudz
static vui64_t vec_xscvqpudz(__binary128 f128)
VSX Scalar Convert with round to zero Quad-Precision to Unsigned doubleword.
Definition: vec_f128_ppc.h:8807
vec_cmpneuqp
static vb128_t vec_cmpneuqp(__binary128 vfa, __binary128 vfb)
Vector Compare Not Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6233
vec_cmpsd_all_lt
static int vec_cmpsd_all_lt(vi64_t a, vi64_t b)
Vector Compare all Less than Signed Doubleword.
Definition: vec_int64_ppc.h:1965
vec_self128
static __binary128 vec_self128(__binary128 vfa, __binary128 vfb, vb128_t mask)
Select and Transfer from one of two __binary128 scalars under a 128-bit mask. The result is a __binar...
Definition: vec_f128_ppc.h:7926
vec_swapd
static vui64_t vec_swapd(vui64_t vra)
Vector doubleword swap. Exchange the high and low doubleword elements of a vector.
Definition: vec_int64_ppc.h:3789
vec_const_nanf128
static __binary128 vec_const_nanf128()
return a quiet NaN.
Definition: vec_f128_ppc.h:5000
vec_xscvqpdpo
static vf64_t vec_xscvqpdpo(__binary128 f128)
VSX Scalar Convert with round Quad-Precision to Double-Precision (using round to odd).
Definition: vec_f128_ppc.h:8675
vec_all_isnanf128
static int vec_all_isnanf128(__binary128 f128)
Return true if the __float128 value is Not a Number (NaN).
Definition: vec_f128_ppc.h:4763
vec_slqi
static vui128_t vec_slqi(vui128_t vra, const unsigned int shb)
Vector Shift Left Quadword Immediate.
Definition: vec_int128_ppc.h:6748
vec_xscvdpqp
static __binary128 vec_xscvdpqp(vf64_t f64)
VSX Scalar Convert Double-Precision to Quad-Precision format.
Definition: vec_f128_ppc.h:8572
vec_cmpqp_all_le
static int vec_cmpqp_all_le(__binary128 vfa, __binary128 vfb)
Vector Compare all Less Than Or Equal (Unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6952
vec_cmpgesq
static vb128_t vec_cmpgesq(vi128_t vra, vi128_t vrb)
Vector Compare Greater Than or Equal Signed Quadword.
Definition: vec_int128_ppc.h:3089
vec_cmpleuzqp
static vb128_t vec_cmpleuzqp(__binary128 vfa, __binary128 vfb)
Vector Compare Less Than or Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:5744
vec_slq
static vui128_t vec_slq(vui128_t vra, vui128_t vrb)
Vector Shift Left Quadword.
Definition: vec_int128_ppc.h:6707
vec_xfer_bin128_2_vui32t
static vui32_t vec_xfer_bin128_2_vui32t(__binary128 f128)
Transfer function from a __binary128 scalar to a vector int.
Definition: vec_f128_ppc.h:4329
vec_cmpequd
static vb64_t vec_cmpequd(vui64_t a, vui64_t b)
Vector Compare Equal Unsigned Doubleword.
Definition: vec_int64_ppc.h:1451
VEC_DW_H
#define VEC_DW_H
Element index for high order dword.
Definition: vec_common_ppc.h:322
vec_all_iszerof128
static int vec_all_iszerof128(__binary128 f128)
Return true if the __float128 value is +-0.0.
Definition: vec_f128_ppc.h:4906
__float128
vf128_t __float128
Define __float128 if not defined by the compiler. Same as __float128 for PPC.
Definition: vec_f128_ppc.h:3685
vec_cmpgeuq
static vb128_t vec_cmpgeuq(vui128_t vra, vui128_t vrb)
Vector Compare Greater Than or Equal Unsigned Quadword.
Definition: vec_int128_ppc.h:3138
vec_signbitf128
static int vec_signbitf128(__binary128 f128)
Return int boolean true if the __float128 value is negative (sign bit is '1').
Definition: vec_f128_ppc.h:7996
vec_cmpqp_all_uzle
static int vec_cmpqp_all_uzle(__binary128 vfa, __binary128 vfb)
Vector Compare all Less Than Or Equal (Zero-unordered) Quad-Precision.
Definition: vec_f128_ppc.h:6888
VEC_HW_H
#define VEC_HW_H
Element index for highest order hword.
Definition: vec_common_ppc.h:338
vec_const_nansf128
static __binary128 vec_const_nansf128()
return a signaling NaN.
Definition: vec_f128_ppc.h:5012
vec_seluq
static vui128_t vec_seluq(vui128_t vra, vui128_t vrb, vb128_t vrc)
Vector Select Unsigned Quadword.
Definition: vec_int128_ppc.h:6482