POWER Vector Library Manual  1.0.4
vec_f32_ppc.h
1 /*
2  Copyright (c) [2017] IBM Corporation.
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_f32_ppc.h
17 
18  Contributors:
19  IBM Corporation, Steven Munroe
20  Created on: Apr 13, 2016
21  */
22 
23 #ifndef VEC_F32_PPC_H_
24 #define VEC_F32_PPC_H_
25 
202 #include <pveclib/vec_common_ppc.h>
203 #include <pveclib/vec_int128_ppc.h>
204 
206 static inline vf64_t
207 vec_vglfsso (float *array, const long long offset0,
208  const long long offset1);
209 static inline vf64_t
210 vec_vlxsspx (const signed long long ra, const float *rb);
211 static inline void
212 vec_vsstfsso (vf64_t xs, float *array,
213  const long long offset0, const long long offset1);
214 static inline void
215 vec_vstxsspx (vf64_t xs, const signed long long ra, float *rb);
217 
219 typedef vf32_t __vbinary32;
220 
231 static inline vf32_t
232 vec_absf32 (vf32_t vf32x)
233 {
234 #if _ARCH_PWR7
235  /* Requires VSX but eliminates a const load. */
236  return vec_abs (vf32x);
237 #else
238  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
239  0x80000000, 0x80000000);
240  return (vf32_t)vec_andc ((vui32_t)vf32x, signmask);
241 #endif
242 }
243 
263 static inline int
264 vec_all_isfinitef32 (vf32_t vf32)
265 {
266  vui32_t tmp;
267 #if _ARCH_PWR9
268  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
269 #ifdef vec_test_data_class
270  tmp = (vui32_t)vec_test_data_class (vf32, 0x70);
271 #else
272  __asm__(
273  "xvtstdcsp %x0,%x1,0x70;\n"
274  : "=wa" (tmp)
275  : "wa" (vf32)
276  :);
277 #endif
278  return vec_all_eq(tmp, vec_zero);
279 #else
280  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
281  0x7f800000);
282  tmp = vec_and ((vui32_t)vf32, expmask);
283  return !vec_any_eq(tmp, expmask);
284 #endif
285 }
286 
305 static inline int
306 vec_all_isinff32 (vf32_t vf32)
307 {
308  vui32_t tmp;
309 
310 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
311  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
312 #ifdef vec_test_data_class
313  tmp = (vui32_t)vec_test_data_class (vf32, 0x30);
314 #else
315  __asm__(
316  "xvtstdcsp %x0,%x1,0x30;\n"
317  : "=wa" (tmp)
318  : "wa" (vf32)
319  :);
320 #endif
321  return vec_all_eq(tmp, vec_ones);
322 #else
323  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
324  0x7f800000);
325  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
326  0x80000000);
327  tmp = vec_andc ((vui32_t)vf32, signmask);
328  return vec_all_eq(tmp, expmask);
329 #endif
330 }
331 
351 static inline int
352 vec_all_isnanf32 (vf32_t vf32)
353 {
354  vui32_t tmp;
355 
356 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
357  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
358 #ifdef vec_test_data_class
359  tmp = (vui32_t)vec_test_data_class (vf32, 0x40);
360 #else
361  __asm__(
362  "xvtstdcsp %x0,%x1,0x40;\n"
363  : "=wa" (tmp)
364  : "wa" (vf32)
365  :);
366 #endif
367  return vec_all_eq(tmp, vec_ones);
368 #else
369  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
370  0x7f800000);
371  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
372  0x80000000);
373  tmp = vec_andc ((vui32_t)vf32, signmask);
374  return vec_all_gt(tmp, expmask);
375 #endif
376 }
377 
398 static inline int
399 vec_all_isnormalf32 (vf32_t vf32)
400 {
401  vui32_t tmp;
402  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
403 #if _ARCH_PWR9
404 #ifdef vec_test_data_class
405  tmp = (vui32_t)vec_test_data_class (vf32, 0x7f);
406 #else
407  __asm__(
408  "xvtstdcsp %x0,%x1,0x7f;\n"
409  : "=wa" (tmp)
410  : "wa" (vf32)
411  :);
412 #endif
413  return vec_all_eq(tmp, vec_zero);
414 #else
415  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
416  0x7f800000);
417  tmp = vec_and ((vui32_t) vf32, expmask);
418  return !(vec_any_eq (tmp, expmask) || vec_any_eq(tmp, vec_zero));
419 #endif
420 }
421 
441 static inline int
442 vec_all_issubnormalf32 (vf32_t vf32)
443 {
444  vui32_t tmp;
445 
446 #if _ARCH_PWR9
447  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
448 #ifdef vec_test_data_class
449  tmp = (vui32_t)vec_test_data_class (vf32, 0x03);
450 #else
451  __asm__(
452  "xvtstdcsp %x0,%x1,0x03;\n"
453  : "=wa" (tmp)
454  : "wa" (vf32)
455  :);
456 #endif
457  return vec_all_eq(tmp, vec_ones);
458 #else
459  const vui32_t explow = CONST_VINT128_W(0x00800000, 0x00800000, 0x00800000,
460  0x00800000);
461  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
462  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
463  0x80000000);
464  tmp = vec_andc ((vui32_t)vf32, signmask);
465  return vec_all_lt (tmp, explow) && vec_all_ne (tmp, vec_zero);
466 #endif
467 }
468 
488 static inline int
489 vec_all_iszerof32 (vf32_t vf32)
490 {
491  vui32_t tmp;
492 
493 #if _ARCH_PWR9
494  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
495 #ifdef vec_test_data_class
496  tmp = (vui32_t)vec_test_data_class (vf32, 0x0c);
497 #else
498  __asm__(
499  "xvtstdcsp %x0,%x1,0x0c;\n"
500  : "=wa" (tmp)
501  : "wa" (vf32)
502  :);
503 #endif
504  return vec_all_eq(tmp, vec_ones);
505 #else
506  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
507  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
508  0x80000000);
509  tmp = vec_andc ((vui32_t)vf32, signmask);
510  return vec_all_eq(tmp, vec_zero);
511 #endif
512 }
513 
533 static inline int
534 vec_any_isfinitef32 (vf32_t vf32)
535 {
536  vui32_t tmp;
537 #if _ARCH_PWR9
538  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
539 #ifdef vec_test_data_class
540  tmp = (vui32_t)vec_test_data_class (vf32, 0x70);
541 #else
542  __asm__(
543  "xvtstdcsp %x0,%x1,0x70;\n"
544  : "=wa" (tmp)
545  : "wa" (vf32)
546  :);
547 #endif
548  return vec_any_eq(tmp, vec_zero);
549 #else
550  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
551  0x7f800000);
552  tmp = vec_and ((vui32_t)vf32, expmask);
553  return !vec_all_eq(tmp, expmask);
554 #endif
555 }
556 
574 static inline int
575 vec_any_isinff32 (vf32_t vf32)
576 {
577  vui32_t tmp;
578 
579 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
580  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
581 #ifdef vec_test_data_class
582  tmp = (vui32_t)vec_test_data_class (vf32, 0x30);
583 #else
584  __asm__(
585  "xvtstdcsp %x0,%x1,0x30;\n"
586  : "=wa" (tmp)
587  : "wa" (vf32)
588  :);
589 #endif
590  return vec_any_eq(tmp, vec_ones);
591 #else
592  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
593  0x7f800000);
594  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
595  0x80000000);
596  tmp = vec_andc ((vui32_t)vf32, signmask);
597  return vec_any_eq(tmp, expmask);
598 #endif
599 }
600 
620 static inline int
621 vec_any_isnanf32 (vf32_t vf32)
622 {
623  vui32_t tmp;
624 
625 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
626  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
627 #ifdef vec_test_data_class
628  tmp = (vui32_t)vec_test_data_class (vf32, 0x40);
629 #else
630  __asm__(
631  "xvtstdcsp %x0,%x1,0x40;\n"
632  : "=wa" (tmp)
633  : "wa" (vf32)
634  :);
635 #endif
636  return vec_any_eq(tmp, vec_ones);
637 #else
638  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
639  0x7f800000);
640  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
641  0x80000000);
642  tmp = vec_andc ((vui32_t)vf32, signmask);
643  return vec_any_gt(tmp, expmask);
644 #endif
645 }
646 
667 static inline int
668 vec_any_isnormalf32 (vf32_t vf32)
669 {
670  vui32_t tmp;
671  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
672 #if _ARCH_PWR9
673 #ifdef vec_test_data_class
674  tmp = (vui32_t)vec_test_data_class (vf32, 0x7f);
675 #else
676  __asm__(
677  "xvtstdcsp %x0,%x1,0x7f;\n"
678  : "=wa" (tmp)
679  : "wa" (vf32)
680  :);
681 #endif
682  return vec_any_eq(tmp, vec_zero);
683 #else
684  vui32_t res;
685  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
686  0x7f800000);
687  tmp = vec_and ((vui32_t) vf32, expmask);
688  res = (vui32_t) vec_nor (vec_cmpeq (tmp, expmask), vec_cmpeq (tmp, vec_zero));
689 
690  return vec_any_gt(res, vec_zero);
691 #endif
692 }
693 
712 static inline int
713 vec_any_issubnormalf32 (vf32_t vf32)
714 {
715  vui32_t tmp;
716 
717 #if _ARCH_PWR9
718  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
719 #ifdef vec_test_data_class
720  tmp = (vui32_t)vec_test_data_class (vf32, 0x03);
721 #else
722  __asm__(
723  "xvtstdcsp %x0,%x1,0x03;\n"
724  : "=wa" (tmp)
725  : "wa" (vf32)
726  :);
727 #endif
728  return vec_any_eq(tmp, vec_ones);
729 #else
730  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
731  0x80000000);
732  const vui32_t explow = CONST_VINT128_W(0x00800000, 0x00800000, 0x00800000,
733  0x00800000);
734  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
735  vui32_t tmpz, tmp2;
736  vb32_t vsubnorm;
737 
738  tmp2 = vec_andc ((vui32_t)vf32, signmask);
739  tmp = (vui32_t) vec_cmplt(tmp2, explow);
740  tmpz = (vui32_t) vec_cmpeq (tmp2, vec_zero);
741  vsubnorm = (vb32_t ) vec_andc (tmp, tmpz);
742  return vec_any_ne(vsubnorm, vec_zero);
743 #endif
744 }
745 
765 static inline int
766 vec_any_iszerof32 (vf32_t vf32)
767 {
768  vui32_t tmp;
769 
770 #if _ARCH_PWR9
771  const vui32_t vec_ones = CONST_VINT128_W(-1, -1, -1, -1);
772 #ifdef vec_test_data_class
773  tmp = (vui32_t)vec_test_data_class (vf32, 0x0c);
774 #else
775  __asm__(
776  "xvtstdcsp %x0,%x1,0x0c;\n"
777  : "=wa" (tmp)
778  : "wa" (vf32)
779  :);
780 #endif
781  return vec_any_eq(tmp, vec_ones);
782 #else
783  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
784  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
785  0x80000000);
786  tmp = vec_andc ((vui32_t)vf32, signmask);
787  return vec_any_eq(tmp, vec_zero);
788 #endif
789 }
790 
816 static inline vf32_t
817 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
818 {
819 #if _ARCH_PWR7
820 #ifdef PVECLIB_CPSGN_FIXED
821  return (vec_cpsgn (vf32x, vf32y));
822 #else
823  vf32_t result;
824  __asm__(
825  "xvcpsgnsp %x0,%x1,%x2;\n"
826  : "=wa" (result)
827  : "wa" (vf32x), "wa" (vf32y)
828  :);
829  return (result);
830 #endif
831 #else
832  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
833  0x80000000, 0x80000000);
834  vf32_t result;
835 
836  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
837  return (result);
838 #endif
839 }
840 
863 static inline vb32_t
864 vec_isfinitef32 (vf32_t vf32)
865 {
866  vb32_t tmp2;
867 #if defined (_ARCH_PWR9)
868 #ifdef vec_test_data_class
869  tmp2 = vec_test_data_class (vf32, 0x70);
870 #else
871  __asm__(
872  "xvtstdcsp %x0,%x1,0x70;\n"
873  : "=wa" (tmp2)
874  : "wa" (vf32)
875  :);
876 #endif
877  return vec_nor (tmp2, tmp2); // vec_not
878 #else
879  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
880  0x7f800000);
881  vui32_t tmp;
882 
883  tmp = vec_and ((vui32_t)vf32, expmask);
884  tmp2 = vec_cmpeq (tmp, expmask);
885  return vec_nor (tmp2, tmp2); // vec_not
886 #endif
887 }
888 
907 static inline vb32_t
908 vec_isinff32 (vf32_t vf32)
909 {
910  vb32_t result;
911 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
912 #ifdef vec_test_data_class
913  result = vec_test_data_class (vf32, 0x30);
914 #else
915  __asm__(
916  "xvtstdcsp %x0,%x1,0x30;\n"
917  : "=wa" (result)
918  : "wa" (vf32)
919  :);
920 #endif
921 #else
922  vui32_t tmp;
923  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
924  0x7f800000);
925  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
926  0x80000000);
927  tmp = vec_andc ((vui32_t)vf32, signmask);
928  result = vec_cmpeq (tmp, expmask);
929 #endif
930  return (result);
931 }
932 
949 static inline vb32_t
950 vec_isnanf32 (vf32_t vf32)
951 {
952  vb32_t result;
953 #if _ARCH_PWR9 && !(defined(__clang__) && __clang_major__ < 9)
954 #ifdef vec_test_data_class
955  result = vec_test_data_class (vf32, 0x40);
956 #else
957  __asm__(
958  "xvtstdcsp %x0,%x1,0x40;\n"
959  : "=wa" (result)
960  : "wa" (vf32)
961  :);
962 #endif
963 #else
964  vui32_t tmp2;
965  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
966  0x7f800000);
967  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
968  0x80000000);
969  tmp2 = vec_andc ((vui32_t)vf32, signmask);
970  result = vec_cmpgt (tmp2, expmask);
971 #endif
972  return (result);
973 }
974 
995 static inline vb32_t
996 vec_isnormalf32 (vf32_t vf32)
997 {
998 #if _ARCH_PWR9
999  vb32_t tmp2;
1000 #ifdef vec_test_data_class
1001  tmp2 = vec_test_data_class (vf32, 0x7f);
1002 #else
1003  __asm__(
1004  "xvtstdcsp %x0,%x1,0x7f;\n"
1005  : "=wa" (tmp2)
1006  : "wa" (vf32)
1007  :);
1008 #endif
1009  return vec_nor (tmp2, tmp2); // vec_not
1010 #else
1011  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000, 0x7f800000,
1012  0x7f800000);
1013  const vui32_t veczero = CONST_VINT128_W(0, 0, 0, 0);
1014  vui32_t tmp;
1015 
1016  tmp = vec_and ((vui32_t) vf32, expmask);
1017  return vec_nor (vec_cmpeq (tmp, expmask), vec_cmpeq (tmp, veczero));
1018 #endif
1019 }
1020 
1040 static inline vb32_t
1041 vec_issubnormalf32 (vf32_t vf32)
1042 {
1043  vb32_t result;
1044 
1045 #if _ARCH_PWR9
1046 #ifdef vec_test_data_class
1047  result = vec_test_data_class (vf32, 0x03);
1048 #else
1049  __asm__(
1050  "xvtstdcsp %x0,%x1,0x03;\n"
1051  : "=wa" (result)
1052  : "wa" (vf32)
1053  :);
1054 #endif
1055 #else
1056  vui32_t tmp, tmpz, tmp2;
1057  const vui32_t explow = CONST_VINT128_W(0x00800000, 0x00800000, 0x00800000,
1058  0x00800000);
1059  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
1060  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
1061  0x80000000);
1062  tmp2 = vec_andc ((vui32_t)vf32, signmask);
1063  tmp = (vui32_t) vec_cmplt(tmp2, explow);
1064  tmpz = (vui32_t) vec_cmpeq (tmp2, vec_zero);
1065  result = (vb32_t) vec_andc (tmp, tmpz);
1066 #endif
1067  return (result);
1068 }
1069 
1089 static inline vb32_t
1090 vec_iszerof32 (vf32_t vf32)
1091 {
1092  vb32_t result;
1093 #if _ARCH_PWR9
1094 #ifdef vec_test_data_class
1095  result = vec_test_data_class (vf32, 0x0c);
1096 #else
1097  __asm__(
1098  "xvtstdcsp %x0,%x1,0x0c;\n"
1099  : "=wa" (result)
1100  : "wa" (vf32)
1101  :);
1102 #endif
1103 #else
1104  vui32_t tmp2;
1105  const vui32_t vec_zero = CONST_VINT128_W(0, 0, 0, 0);
1106  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000, 0x80000000,
1107  0x80000000);
1108  tmp2 = vec_andc ((vui32_t)vf32, signmask);
1109  result = vec_cmpeq (tmp2, vec_zero);
1110 #endif
1111  return (result);
1112 }
1113 
1136 static inline vb32_t
1137 vec_setb_sp (vf32_t vra)
1138 {
1139  return vec_setb_sw ((vi32_t) vra);
1140 }
1141 
1162 static inline vf32_t
1163 vec_vgl4fsso (float *array, const long long offset0,
1164  const long long offset1, const long long offset2,
1165  const long long offset3)
1166 {
1167  vf32_t result;
1168 
1169 #ifdef _ARCH_PWR8
1170  vui64_t re0, re1, re2, re3;
1171  re0 = vec_vlxsiwzx (offset0, (unsigned int *) array);
1172  re1 = vec_vlxsiwzx (offset1, (unsigned int *) array);
1173  re2 = vec_vlxsiwzx (offset2, (unsigned int *) array);
1174  re3 = vec_vlxsiwzx (offset3, (unsigned int *) array);
1175  /* Need to handle endian as the vec_vlxsiwzx result is always left
1176  * justified in VR, while element [0] may be left or right. */
1177 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1178  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1179  * support doubleword vec_merge. */
1180  re0 = vec_xxpermdi (re0, re2, 3);
1181  re1 = vec_xxpermdi (re1, re3, 3);
1182  result = (vf32_t) vec_mergee ((vui32_t) re0, (vui32_t) re1);
1183 #else
1184  re0 = vec_xxpermdi (re0, re2, 0);
1185  re1 = vec_xxpermdi (re1, re3, 0);
1186  result = (vf32_t) vec_mergeo ((vui32_t) re0, (vui32_t) re1);
1187 #endif
1188 #else // _ARCH_PWR7
1189  vf32_t xte0, xte1, xte2, xte3;
1190  vui8_t perm0, perm1, perm2, perm3;
1191 
1192  perm0 = vec_lvsl (offset0, array);
1193  xte0 = vec_lde (offset0, array);
1194  xte0 = vec_perm (xte0, xte0, perm0);
1195 
1196  perm1 = vec_lvsl (offset1, array);
1197  xte1 = vec_lde (offset1, array);
1198  xte1 = vec_perm (xte1, xte1, perm1);
1199 
1200  perm2 = vec_lvsl (offset2, array);
1201  xte2 = vec_lde (offset2, array);
1202  xte2 = vec_perm (xte2, xte2, perm2);
1203 
1204  perm3 = vec_lvsl (offset3, array);
1205  xte3 = vec_lde (offset3, array);
1206  xte3 = vec_perm (xte3, xte3, perm3);
1207 
1208  xte0 = vec_mergeh (xte0, xte2);
1209  xte1 = vec_mergeh (xte1, xte3);
1210  result = vec_mergeh (xte0, xte1);
1211 #endif
1212  return (vf32_t)result;
1213 }
1214 
1234 static inline
1235 vf32_t
1236 vec_vgl4fswo (float *array, vi32_t vra)
1237 {
1238  vf32_t r;
1239 
1240 #ifdef _ARCH_PWR8
1241  vi64_t off01, off23;
1242 
1243  off01 = vec_vupkhsw (vra);
1244  off23 = vec_vupklsw (vra);
1245 
1246  r = vec_vgl4fsso (array, off01[0], off01[1], off23[0], off23[1]);
1247 #else
1248  // Need to explicitly manage the VR/GPR xfer for PWR7
1249  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1250  signed int off0, off1, off2, off3;
1251 
1252  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
1253  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
1254  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
1255  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
1256 
1257  r = vec_vgl4fsso (array, off0, off1, off2, off3);
1258 #endif
1259  return r;
1260 }
1261 
1285 static inline
1286 vf32_t
1287 vec_vgl4fswsx (float *array, vi32_t vra,
1288  const unsigned char scale)
1289 {
1290  vf32_t r;
1291 
1292 #ifdef _ARCH_PWR8
1293  vi64_t off01, off23;
1294  vi64_t lshift = vec_splats ((long long) (2+ scale));
1295 
1296  off01 = vec_vupkhsw (vra);
1297  off23 = vec_vupklsw (vra);
1298 
1299  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1300  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1301 
1302  r = vec_vgl4fsso (array, off01[0], off01[1], off23[0], off23[1]);
1303 #else
1304  // Need to explicitly manage the VR/GPR xfer for PWR7
1305  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1306  signed long long off0, off1, off2, off3;
1307 
1308  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2+ scale);
1309  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2+ scale);
1310  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2+ scale);
1311  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2+ scale);
1312 
1313  r = vec_vgl4fsso (array, off0, off1, off2, off3);
1314 #endif
1315  return r;
1316 }
1317 
1338 static inline
1339 vf32_t
1340 vec_vgl4fswx (float *array, vi32_t vra)
1341 {
1342  vf32_t r;
1343 
1344 #ifdef _ARCH_PWR8
1345  vi64_t off01, off23;
1346  vi64_t lshift = vec_splats ((long long) (2));
1347 
1348  off01 = vec_vupkhsw (vra);
1349  off23 = vec_vupklsw (vra);
1350 
1351  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1352  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1353 
1354  r = vec_vgl4fsso (array, off01[0], off01[1], off23[0], off23[1]);
1355 #else
1356  // Need to explicitly manage the VR/GPR xfer for PWR7
1357  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1358  signed long long off0, off1, off2, off3;
1359 
1360  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
1361  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
1362  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
1363  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
1364 
1365  r = vec_vgl4fsso (array, off0, off1, off2, off3);
1366 #endif
1367  return r;
1368 }
1369 
1387 static inline
1388 vf64_t
1389 vec_vglfsdo (float *array, vi64_t vra)
1390 {
1391  vf64_t result;
1392 
1393 #ifdef _ARCH_PWR8
1394  result = vec_vglfsso (array, vra[0], vra[1]);
1395 #else
1396  // Need to explicitly manage the VR/GPR xfer for PWR7
1397  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1398 
1399  result = vec_vglfsso (array, scalar_extract_uint64_from_high_uint128(gprp),
1400  scalar_extract_uint64_from_low_uint128(gprp));
1401 #endif
1402  return result;
1403 }
1404 
1423 static inline
1424 vf64_t
1425 vec_vglfsdsx (float *array, vi64_t vra,
1426  const unsigned char scale)
1427 {
1428  vf64_t r;
1429 
1430 #ifdef _ARCH_PWR8
1431  vi64_t lshift = vec_splats ((long long) (2 + scale));
1432  vi64_t offset;
1433 
1434  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1435  r = vec_vglfsso (array, offset[0], offset[1]);
1436 #else
1437  long long offset0, offset1;
1438  // Need to explicitly manage the VR/GPR xfer for PWR7
1439  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1440  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1441  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1442 
1443  r = vec_vglfsso (array, offset0, offset1);
1444 #endif
1445  return r;
1446 }
1447 
1467 static inline
1468 vf64_t
1469 vec_vglfsdx (float *array, vi64_t vra)
1470 {
1471  vf64_t r;
1472 
1473 #ifdef _ARCH_PWR8
1474  vi64_t lshift = vec_splats ((long long) 2);
1475  vi64_t offset;
1476 
1477  offset = (vi64_t) __pvec_vsld (vra, (vui64_t) lshift);
1478  r = vec_vglfsso (array, offset[0], offset[1]);
1479 #else
1480  long long offset0, offset1;
1481  // Need to explicitly manage the VR/GPR xfer for PWR7
1482  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1483  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1484  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1485 
1486  r = vec_vglfsso (array, offset0, offset1);
1487 #endif
1488  return r;
1489 }
1490 
1508 static inline vf64_t
1509 vec_vglfsso (float *array, const long long offset0,
1510  const long long offset1)
1511 {
1512  vf64_t re0, re1, result;
1513 
1514  re0 = vec_vlxsspx (offset0, array);
1515  re1 = vec_vlxsspx (offset1, array);
1516  /* Need to handle endian as the vec_vlxsspx result is always left
1517  * justified in VR, while element [0] may be left or right. */
1518 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1519  /* Can't use vec_mergeo here as GCC 7 (AT11) and earlier don't
1520  * support doubleword vec_merge. */
1521  result = vec_xxpermdi (re0, re1, 3);
1522 #else
1523 #ifdef _ARCH_PWR7
1524  result = vec_xxpermdi (re0, re1, 0);
1525 #else
1526  re0 = (vf64_t) vec_sld (re0, re0, 8);
1527  result = (vf64_t) vec_sld (re0, re1, 8);
1528 #endif
1529 #endif
1530  return result;
1531 }
1532 
1572 static inline vf64_t
1573 vec_vlxsspx (const signed long long ra, const float *rb)
1574 {
1575  vf64_t xt;
1576 
1577 #if (defined(__clang__) && __clang_major__ < 8)
1578  __VEC_U_128 t;
1579 
1580  float *p = (float *)((char *)rb + ra);
1581  // Splat the load, otherwise some compilers will treat this as dead code.
1582  t.vf2[0] = t.vf2[1] = *p;
1583  xt = t.vf2;
1584 #elif _ARCH_PWR8
1585  if (__builtin_constant_p (ra) && (ra < 32760) && (ra >= -32768)
1586  && ((ra & 3) == 0))
1587  {
1588  #if defined (_ARCH_PWR9)
1589  __asm__(
1590  "lxssp%X1 %0,%1;"
1591  : "=v" (xt)
1592  : "m" (*(float*)((char *)rb + ra))
1593  : );
1594  #else
1595  if (ra == 0)
1596  {
1597  __asm__(
1598  "lxsspx %x0,%y1;"
1599  : "=wa" (xt)
1600  : "Z" (*rb)
1601  : );
1602  } else {
1603  unsigned long long rt;
1604  __asm__(
1605  "li %0,%1;"
1606  : "=r" (rt)
1607  : "I" (ra)
1608  : );
1609  __asm__(
1610  "lxsspx %x0,%y1;"
1611  : "=wa" (xt)
1612  : "Z" (*(float *)((char *)rb+rt))
1613  : );
1614  }
1615  #endif
1616  } else {
1617  __asm__(
1618  "lxsspx %x0,%y1;"
1619  : "=wa" (xt)
1620  : "Z" (*(float *)((char *)rb+ra))
1621  : );
1622  }
1623 #else // _ARCH_PWR7
1624  __VEC_U_128 t;
1625 
1626  float *p = (float *)((char *)rb + ra);
1627  // Let the compiler generate a Load Float Single Indexed
1628  t.vf2[0] = t.vf2[1] = *p;
1629  xt = t.vf2;
1630 #endif
1631  return xt;
1632 }
1633 
1651 static inline void
1652 vec_vsst4fsso (vf32_t xs, float *array,
1653  const long long offset0, const long long offset1,
1654  const long long offset2, const long long offset3)
1655 {
1656  vf32_t xs0, xs1, xs2, xs3;
1657 
1658  xs0 = vec_splat (xs, 0);
1659  xs1 = vec_splat (xs, 1);
1660  xs2 = vec_splat (xs, 2);
1661  xs3 = vec_splat (xs, 3);
1662  vec_ste (xs0, offset0, array);
1663  vec_ste (xs1, offset1, array);
1664  vec_ste (xs2, offset2, array);
1665  vec_ste (xs3, offset3, array);
1666 }
1667 
1685 static inline void
1686 vec_vsst4fswo (vf32_t xs, float *array,
1687  vi32_t vra)
1688 {
1689 #ifdef _ARCH_PWR8
1690  vi64_t off01, off23;
1691 
1692  off01 = vec_vupkhsw (vra);
1693  off23 = vec_vupklsw (vra);
1694 
1695  vec_vsst4fsso (xs, array, off01[0], off01[1], off23[0], off23[1]);
1696 #else
1697  // Need to explicitly manage the VR/GPR xfer for PWR7
1698  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1699  signed int off0, off1, off2, off3;
1700 
1701  off0 = scalar_extract_uint64_from_high_uint128(gprp) >> 32;
1702  off1 = (int) scalar_extract_uint64_from_high_uint128(gprp);
1703  off2 = scalar_extract_uint64_from_low_uint128(gprp) >> 32;
1704  off3 = (int) scalar_extract_uint64_from_low_uint128(gprp);
1705 
1706  vec_vsst4fsso (xs, array, off0, off1, off2, off3);
1707 #endif
1708 }
1709 
1731 static inline void
1732 vec_vsst4fswsx (vf32_t xs, float *array,
1733  vi32_t vra, const unsigned char scale)
1734 {
1735 #ifdef _ARCH_PWR8
1736  vi64_t off01, off23;
1737  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
1738 
1739  off01 = vec_vupkhsw (vra);
1740  off23 = vec_vupklsw (vra);
1741 
1742  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1743  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1744 
1745  vec_vsst4fsso (xs, array, off01[0], off01[1], off23[0], off23[1]);
1746 #else
1747  // Need to explicitly manage the VR/GPR xfer for PWR7
1748  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1749  signed int off0, off1, off2, off3;
1750 
1751  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << (2 + scale);
1752  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << (2 + scale);
1753  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << (2 + scale);
1754  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << (2 + scale);
1755 
1756  vec_vsst4fsso (xs, array, off0, off1, off2, off3);
1757 #endif
1758 }
1759 
1779 static inline void
1780 vec_vsst4fswx (vf32_t xs, float *array,
1781  vi32_t vra)
1782 {
1783 #ifdef _ARCH_PWR8
1784  vi64_t off01, off23;
1785  vui64_t lshift = vec_splats ((unsigned long long) 2);
1786 
1787  off01 = vec_vupkhsw (vra);
1788  off23 = vec_vupklsw (vra);
1789 
1790  off01 = (vi64_t) __pvec_vsld (off01, (vui64_t) lshift);
1791  off23 = (vi64_t) __pvec_vsld (off23, (vui64_t) lshift);
1792 
1793  vec_vsst4fsso (xs, array, off01[0], off01[1], off23[0], off23[1]);
1794 #else
1795  // Need to explicitly manage the VR/GPR xfer for PWR7
1796  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1797  signed int off0, off1, off2, off3;
1798 
1799  off0 = (scalar_extract_uint64_from_high_uint128(gprp) >> 32) << 2;
1800  off1 = ((int) scalar_extract_uint64_from_high_uint128(gprp)) << 2;
1801  off2 = (scalar_extract_uint64_from_low_uint128(gprp) >> 32) << 2;
1802  off3 = ((int) scalar_extract_uint64_from_low_uint128(gprp)) << 2;
1803 
1804  vec_vsst4fsso (xs, array, off0, off1, off2, off3);
1805 #endif
1806 }
1807 
1824 static inline void
1825 vec_vsstfsdo (vf64_t xs, float *array, vi64_t vra)
1826 {
1827 #ifdef _ARCH_PWR8
1828  vec_vsstfsso (xs, array, vra[0], vra[1]);
1829 #else
1830  // Need to explicitly manage the VR/GPR xfer for PWR7
1831  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1832 
1833  vec_vsstfsso (xs, array,
1834  scalar_extract_uint64_from_high_uint128(gprp),
1835  scalar_extract_uint64_from_low_uint128(gprp));
1836 #endif
1837 }
1838 
1857 static inline void
1858 vec_vsstfsdsx (vf64_t xs, float *array, vi64_t vra,
1859  const unsigned char scale)
1860 {
1861 #ifdef _ARCH_PWR8
1862  vui64_t lshift = vec_splats ((unsigned long long) (2 + scale));
1863  vui64_t offset;
1864 
1865  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1866  vec_vsstfsso (xs, array, offset[0], offset[1]);
1867 #else
1868  long long offset0, offset1;
1869  // Need to explicitly manage the VR/GPR xfer for PWR7
1870  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1871  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << (2 + scale);
1872  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << (2 + scale);
1873 
1874  vec_vsstfsso (xs, array, offset0, offset1);
1875 #endif
1876 }
1877 
1894 static inline void
1895 vec_vsstfsdx (vf64_t xs, float *array, vi64_t vra)
1896 {
1897 #ifdef _ARCH_PWR8
1898  vui64_t lshift = vec_splats ((unsigned long long) 2);
1899  vui64_t offset;
1900 
1901  offset = (vui64_t) __pvec_vsld (vra, (vui64_t) lshift);
1902  vec_vsstfsso (xs, array, offset[0], offset[1]);
1903 #else
1904  long long offset0, offset1;
1905  // Need to explicitly manage the VR/GPR xfer for PWR7
1906  unsigned __int128 gprp = vec_transfer_vui128t_to_uint128 ((vui128_t) vra);
1907  offset0 = scalar_extract_uint64_from_high_uint128(gprp) << 2;
1908  offset1 = scalar_extract_uint64_from_low_uint128(gprp) << 2;
1909 
1910  vec_vsstfsso (xs, array, offset0, offset1);
1911 #endif
1912 }
1913 
1930 static inline void
1931 vec_vsstfsso (vf64_t xs, float *array,
1932  const long long offset0, const long long offset1)
1933 {
1934  vf64_t xs0, xs1;
1935 
1936  xs0 = xs;
1937  // xs1 = vec_xxswapd (xs);
1938 #ifdef _ARCH_PWR7
1939  xs1 = vec_xxpermdi (xs, xs, 2);
1940 #else
1941  xs1 = vec_sld (xs0, xs0, 8);
1942 #endif
1943  /* Need to handle endian as vec_vstxsspx always stores the
1944  * left doubleword of the VSR, while doubleword element [0] may be in
1945  * the left or right doubleword. */
1946 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1947  vec_vstxsspx (xs0, offset1, array);
1948  vec_vstxsspx (xs1, offset0, array);
1949 #else
1950  vec_vstxsspx (xs0, offset0, array);
1951  vec_vstxsspx (xs1, offset1, array);
1952 #endif
1953 }
1954 
1983 static inline void
1984 vec_vstxsspx (vf64_t xs, const signed long long ra, float *rb)
1985 {
1986 #if (defined(__clang__) && __clang_major__ < 8)
1987  __VEC_U_128 t;
1988  float *p = (float *)((char *)rb + ra);
1989  t.vf2 = xs;
1990  *p = t.vf2[0];
1991 #elif _ARCH_PWR8
1992  if (__builtin_constant_p (ra) && (ra < 32760) && (ra >= -32768)
1993  && ((ra & 3) == 0))
1994  {
1995 #if defined (_ARCH_PWR9)
1996  __asm__(
1997  "stxssp%X0 %1,%0;"
1998  : "=m" (*(float*)((char *)rb + ra))
1999  : "v" (xs)
2000  : );
2001 #else
2002  if (ra == 0)
2003  {
2004  __asm__(
2005  "stxsspx %x1,%y0;"
2006  : "=Z" (*rb)
2007  : "wa" (xs)
2008  : );
2009  } else {
2010  unsigned long long rt;
2011  __asm__(
2012  "li %0,%1;"
2013  : "=r" (rt)
2014  : "I" (ra)
2015  : );
2016  __asm__(
2017  "stxsspx %x1,%y0;"
2018  : "=Z" (*(float *)((char *)rb+rt))
2019  : "wa" (xs)
2020  : );
2021  }
2022 #endif
2023  } else {
2024  __asm__(
2025  "stxsspx %x1,%y0;"
2026  : "=Z" (*(float *)((char *)rb+ra))
2027  : "wa" (xs)
2028  : );
2029  }
2030 #else //_ARCH_PWR7
2031  // Let the compiler generate a Store Float Single Indexed
2032  __VEC_U_128 t;
2033  float *p = (float *)((char *)rb + ra);
2034  t.vf2 = xs;
2035  *p = t.vf2[0];
2036 #endif
2037 }
2038 
2066 static inline vf32_t
2067 vec_xviexpsp (vui32_t sig, vui32_t exp)
2068 {
2069  vf32_t result;
2070 #if defined (_ARCH_PWR9) && defined (__VSX__) && (__GNUC__ > 7)
2071 #if defined (vec_insert_exp)
2072  result = vec_insert_exp (sig, exp);
2073 #else
2074  __asm__(
2075  "xviexpsp %x0,%x1,%x2"
2076  : "=wa" (result)
2077  : "wa" (sig), "wa" (exp)
2078  : );
2079 #endif
2080 #else
2081  vui32_t tmp;
2082  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000,
2083  0x7f800000, 0x7f800000);
2084 
2085  tmp = vec_slwi (exp, 23);
2086  result = (vf32_t) vec_sel ((vui32_t) sig, tmp, expmask);
2087 #endif
2088  return result;
2089 }
2090 
2115 static inline vui32_t
2116 vec_xvxexpsp (vf32_t vrb)
2117 {
2118  vui32_t result;
2119 #if defined (_ARCH_PWR9) && defined (__VSX__) && (__GNUC__ > 7)
2120 #if defined (vec_extract_exp)
2121  result = vec_extract_exp (vrb);
2122 #else
2123  __asm__(
2124  "xvxexpsp %x0,%x1"
2125  : "=wa" (result)
2126  : "wa" (vrb)
2127  : );
2128 #endif
2129 #else
2130  vui32_t tmp;
2131  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000,
2132  0x7f800000, 0x7f800000);
2133 
2134  tmp = vec_and ((vui32_t) vrb, expmask);
2135  result = vec_srwi (tmp, 23);
2136 #endif
2137  return result;
2138 }
2139 
2165 static inline vui32_t
2166 vec_xvxsigsp (vf32_t vrb)
2167 {
2168  vui32_t result;
2169 #if defined (_ARCH_PWR9) && defined (__VSX__) && (__GNUC__ > 7)
2170 #if defined (vec_extract_sig)
2171  result = vec_extract_sig (vrb);
2172 #else
2173  __asm__(
2174  "xvxsigsp %x0,%x1"
2175  : "=wa" (result)
2176  : "wa" (vrb)
2177  : );
2178 #endif
2179 #else
2180  vui32_t t128, tmp;
2181  vui32_t normal;
2182  const vui32_t zero = CONST_VINT128_W(0, 0, 0, 0);
2183  const vui32_t sigmask = CONST_VINT128_W(0x007fffff, 0x007fffff,
2184  0x007fffff, 0x007fffff);
2185  const vui32_t expmask = CONST_VINT128_W(0x7f800000, 0x7f800000,
2186  0x7f800000, 0x7f800000);
2187  const vui32_t hidden = CONST_VINT128_W(0x00800000, 0x00800000,
2188  0x00800000, 0x00800000);
2189 
2190  // Check if vrb is normal. Normal values need the hidden bit
2191  // restored to the significand. We use a simpler sequence here as
2192  // vec_isnormalf32 does more than we need.
2193  tmp = vec_and ((vui32_t) vrb, expmask);
2194  normal = vec_nor ((vui32_t) vec_cmpeq (tmp, expmask),
2195  (vui32_t) vec_cmpeq (tmp, zero));
2196  t128 = vec_and ((vui32_t) vrb, sigmask);
2197  result = (vui32_t) vec_sel (t128, normal, hidden);
2198 #endif
2199  return result;
2200 }
2201 
2202 #endif /* VEC_F32_PPC_H_ */
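
A minimal usage sketch for the classification and magnitude helpers defined above, assuming the installed pveclib headers and GCC-style vector literals and subscripting; the test values and variable names are illustrative only:

#include <stdio.h>
#include <math.h>
#include <pveclib/vec_f32_ppc.h>

int
main (void)
{
  /* One NaN, one infinity, one subnormal, and one ordinary element.  */
  vf32_t vx = { NAN, INFINITY, 1.0e-42f, -2.5f };
  vf32_t vmag = vec_absf32 (vx);
  vui32_t nan_mask = (vui32_t) vec_isnanf32 (vx);

  printf ("any NaN:    %d\n", vec_any_isnanf32 (vx));
  printf ("all finite: %d\n", vec_all_isfinitef32 (vx));

  /* Inspect individual lanes via GCC vector subscripting.  */
  for (int i = 0; i < 4; i++)
    printf ("lane %d: |x| = %g, isnan mask = 0x%08x\n",
            i, (double) vmag[i], nan_mask[i]);
  return 0;
}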
vec_isfinitef32
static vb32_t vec_isfinitef32(vf32_t vf32)
Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf).
Definition: vec_f32_ppc.h:864
vec_vlxsspx
static vf64_t vec_vlxsspx(const signed long long ra, const float *rb)
Vector Load Scalar Single Float Indexed.
Definition: vec_f32_ppc.h:1573
vec_xvxsigsp
static vui32_t vec_xvxsigsp(vf32_t vrb)
Vector Extract Significand Single-Precision.
Definition: vec_f32_ppc.h:2166
vec_vsstfsdo
static void vec_vsstfsdo(vf64_t xs, float *array, vi64_t vra)
Vector Scatter-Store Float Singles to Vector Doubleword Offsets.
Definition: vec_f32_ppc.h:1825
scalar_extract_uint64_from_high_uint128
static unsigned long long scalar_extract_uint64_from_high_uint128(unsigned __int128 gprp)
Extract the high doubleword from a __int128 scalar.
Definition: vec_common_ppc.h:503
vec_xviexpsp
static vf32_t vec_xviexpsp(vui32_t sig, vui32_t exp)
Vector Insert Exponent Single-Precision.
Definition: vec_f32_ppc.h:2067
vb32_t
__vector __bool int vb32_t
vector of 32-bit bool int elements.
Definition: vec_common_ppc.h:228
vf32_t
__vector float vf32_t
vector of 32-bit float elements.
Definition: vec_common_ppc.h:219
vec_setb_sw
static vb32_t vec_setb_sw(vi32_t vra)
Vector Set Bool from Signed Word.
Definition: vec_int32_ppc.h:1273
vec_all_isnormalf32
static int vec_all_isnormalf32(vf32_t vf32)
Return true if all of 4x32-bit vector float values are normal (Not NaN, Inf, denormal,...
Definition: vec_f32_ppc.h:399
vec_vstxsspx
static void vec_vstxsspx(vf64_t xs, const signed long long ra, float *rb)
Vector Store Scalar Single Float Indexed.
Definition: vec_f32_ppc.h:1984
CONST_VINT128_W
#define CONST_VINT128_W(__w0, __w1, __w2, __w3)
Arrange word elements of an unsigned int initializer in high->low order. May require an explicit cast.
Definition: vec_common_ppc.h:304
vec_transfer_vui128t_to_uint128
static unsigned __int128 vec_transfer_vui128t_to_uint128(vui128_t vra)
Transfer a vector unsigned __int128 to __int128 scalar.
Definition: vec_common_ppc.h:420
vec_all_issubnormalf32
static int vec_all_issubnormalf32(vf32_t vf32)
Return true if all of 4x32-bit vector float values is subnormal (denormal).
Definition: vec_f32_ppc.h:442
vec_all_isnanf32
static int vec_all_isnanf32(vf32_t vf32)
Return true if all of 4x32-bit vector float values are NaN.
Definition: vec_f32_ppc.h:352
vec_any_isinff32
static int vec_any_isinff32(vf32_t vf32)
Return true if any 4x32-bit vector float values are infinity.
Definition: vec_f32_ppc.h:575
vec_vglfsdsx
static vf64_t vec_vglfsdsx(float *array, vi64_t vra, const unsigned char scale)
Vector Gather-Load Single Floats from Vector Doubleword Scaled Indexes.
Definition: vec_f32_ppc.h:1425
vec_vsstfsso
static void vec_vsstfsso(vf64_t xs, float *array, const long long offset0, const long long offset1)
Vector Scatter-Store Float Singles to Scalar Offsets.
Definition: vec_f32_ppc.h:1931
vui64_t
__vector unsigned long long vui64_t
vector of 64-bit unsigned long long elements.
Definition: vec_common_ppc.h:208
vec_vsst4fswx
static void vec_vsst4fswx(vf32_t xs, float *array, vi32_t vra)
Vector Scatter-Store 4 Float Singles to Vector Word Indexes.
Definition: vec_f32_ppc.h:1780
vec_copysignf32
static vf32_t vec_copysignf32(vf32_t vf32x, vf32_t vf32y)
Copy the sign bit from vf32x merged with magnitude from vf32y and return the resulting vector float v...
Definition: vec_f32_ppc.h:817
vec_common_ppc.h
Common definitions and typedef used by the collection of Power Vector Library (pveclib) headers.
vec_xvxexpsp
static vui32_t vec_xvxexpsp(vf32_t vrb)
Vector Extract Exponent Single-Precision.
Definition: vec_f32_ppc.h:2116
vui8_t
__vector unsigned char vui8_t
vector of 8-bit unsigned char elements.
Definition: vec_common_ppc.h:202
vi32_t
__vector int vi32_t
vector of 32-bit signed int elements.
Definition: vec_common_ppc.h:215
vec_absf32
static vf32_t vec_absf32(vf32_t vf32x)
Vector float absolute value.
Definition: vec_f32_ppc.h:232
vec_vsst4fswo
static void vec_vsst4fswo(vf32_t xs, float *array, vi32_t vra)
Vector Scatter-Store 4 Float Singles to Vector Word Offsets.
Definition: vec_f32_ppc.h:1686
vec_int128_ppc.h
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...
vec_all_isfinitef32
static int vec_all_isfinitef32(vf32_t vf32)
Return true if all 4x32-bit vector float values are Finite (Not NaN nor Inf).
Definition: vec_f32_ppc.h:264
vec_srwi
static vui32_t vec_srwi(vui32_t vra, const unsigned int shb)
Vector Shift Right Word Immediate.
Definition: vec_int32_ppc.h:1405
vec_vsst4fswsx
static void vec_vsst4fswsx(vf32_t xs, float *array, vi32_t vra, const unsigned char scale)
Vector Scatter-Store 4 Float Singles to Vector Word Scaled Indexes.
Definition: vec_f32_ppc.h:1732
vec_any_issubnormalf32
static int vec_any_issubnormalf32(vf32_t vf32)
Return true if any of 4x32-bit vector float values is subnormal (denormal).
Definition: vec_f32_ppc.h:713
vui128_t
__vector unsigned __int128 vui128_t
vector of one 128-bit unsigned __int128 element.
Definition: vec_common_ppc.h:237
vec_vupklsw
static vi64_t vec_vupklsw(vi32_t vra)
Vector Unpack Low Signed Word.
Definition: vec_int32_ppc.h:3028
vec_any_isfinitef32
static int vec_any_isfinitef32(vf32_t vf32)
Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf).
Definition: vec_f32_ppc.h:534
__VEC_U_128
Union used to transfer 128-bit data between vector and non-vector types.
Definition: vec_common_ppc.h:256
vec_iszerof32
static vb32_t vec_iszerof32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float value that is +-0.0.
Definition: vec_f32_ppc.h:1090
vec_setb_sp
static vb32_t vec_setb_sp(vf32_t vra)
Vector Set Bool from Sign, Single Precision.
Definition: vec_f32_ppc.h:1137
vi64_t
__vector long long vi64_t
vector of 64-bit signed long long elements.
Definition: vec_common_ppc.h:217
vec_vgl4fswx
static vf32_t vec_vgl4fswx(float *array, vi32_t vra)
Vector Gather-Load 4 Words from Vector Word Indexes.
Definition: vec_f32_ppc.h:1340
vec_vglfsso
static vf64_t vec_vglfsso(float *array, const long long offset0, const long long offset1)
Vector Gather-Load Float Single from scalar Offsets.
Definition: vec_f32_ppc.h:1509
vec_vsstfsdsx
static void vec_vsstfsdsx(vf64_t xs, float *array, vi64_t vra, const unsigned char scale)
Vector Scatter-Store Words to Vector Doubleword Scaled Indexes.
Definition: vec_f32_ppc.h:1858
vec_all_isinff32
static int vec_all_isinff32(vf32_t vf32)
Return true if all 4x32-bit vector float values are infinity.
Definition: vec_f32_ppc.h:306
vec_vglfsdo
static vf64_t vec_vglfsdo(float *array, vi64_t vra)
Vector Gather-Load Single Floats from Vector Doubleword Offsets.
Definition: vec_f32_ppc.h:1389
vec_any_isnanf32
static int vec_any_isnanf32(vf32_t vf32)
Return true if any of 4x32-bit vector float values are NaN.
Definition: vec_f32_ppc.h:621
vec_vupkhsw
static vi64_t vec_vupkhsw(vi32_t vra)
Vector Unpack High Signed Word.
Definition: vec_int32_ppc.h:2950
vec_vgl4fswo
static vf32_t vec_vgl4fswo(float *array, vi32_t vra)
Vector Gather-Load 4 Words from Vector Word Offsets.
Definition: vec_f32_ppc.h:1236
vec_any_iszerof32
static int vec_any_iszerof32(vf32_t vf32)
Return true if any of 4x32-bit vector float values are +-0.0.
Definition: vec_f32_ppc.h:766
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206
scalar_extract_uint64_from_low_uint128
static unsigned long long scalar_extract_uint64_from_low_uint128(unsigned __int128 gprp)
Extract the low doubleword from a __int128 scalar.
Definition: vec_common_ppc.h:490
vec_vgl4fsso
static vf32_t vec_vgl4fsso(float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
Vector Gather-Load 4 Words from scalar Offsets.
Definition: vec_f32_ppc.h:1163
vec_vlxsiwzx
static vui64_t vec_vlxsiwzx(const signed long long ra, const unsigned int *rb)
Vector Load Scalar Integer Word and Zero Indexed.
Definition: vec_int32_ppc.h:2109
vf64_t
__vector double vf64_t
vector of 64-bit double elements.
Definition: vec_common_ppc.h:221
vec_isnormalf32
static vb32_t vec_isnormalf32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN,...
Definition: vec_f32_ppc.h:996
vec_issubnormalf32
static vb32_t vec_issubnormalf32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float value that is subnormal (denormal).
Definition: vec_f32_ppc.h:1041
vec_all_iszerof32
static int vec_all_iszerof32(vf32_t vf32)
Return true if all of 4x32-bit vector float values are +-0.0.
Definition: vec_f32_ppc.h:489
__VEC_U_128::vf2
vf64_t vf2
128 bit Vector of 2 double float elements.
Definition: vec_common_ppc.h:279
vec_slwi
static vui32_t vec_slwi(vui32_t vra, const unsigned int shb)
Vector Shift left Word Immediate.
Definition: vec_int32_ppc.h:1309
vec_vsst4fsso
static void vec_vsst4fsso(vf32_t xs, float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
Vector Scatter-Store 4 Float Singles to Scalar Offsets.
Definition: vec_f32_ppc.h:1652
vec_isinff32
static vb32_t vec_isinff32(vf32_t vf32)
Return 4x32-bit vector boolean true values for each float, if infinity.
Definition: vec_f32_ppc.h:908
vec_any_isnormalf32
static int vec_any_isnormalf32(vf32_t vf32)
Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal,...
Definition: vec_f32_ppc.h:668
vec_vsstfsdx
static void vec_vsstfsdx(vf64_t xs, float *array, vi64_t vra)
Vector Scatter-Store Words to Vector Doubleword Indexes.
Definition: vec_f32_ppc.h:1895
vec_isnanf32
static vb32_t vec_isnanf32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float NaN value.
Definition: vec_f32_ppc.h:950
__vbinary32
vf32_t __vbinary32
typedef __vbinary32 to vector of 4x float elements.
Definition: vec_f32_ppc.h:219
vec_vgl4fswsx
static vf32_t vec_vgl4fswsx(float *array, vi32_t vra, const unsigned char scale)
Vector Gather-Load 4 Words from Vector Word Scaled Indexes.
Definition: vec_f32_ppc.h:1287
vec_vglfsdx
static vf64_t vec_vglfsdx(float *array, vi64_t vra)
Vector Gather-Load Single Floats from Vector Doubleword Indexes.
Definition: vec_f32_ppc.h:1469
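
As a further hedged sketch, the word gather/scatter pair can be exercised as follows; the array contents and offsets are illustrative, and the offsets are byte offsets (word index * 4), matching the vec_vgl4fswo and vec_vsst4fswo definitions above:

#include <stdio.h>
#include <pveclib/vec_f32_ppc.h>

int
main (void)
{
  float src[8] = { 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f };
  float dst[8] = { 0.0f };
  /* Byte offsets of word elements 1, 3, 4 and 6.  */
  vi32_t offs = { 1 * 4, 3 * 4, 4 * 4, 6 * 4 };
  vf32_t gathered;

  /* Gather four non-contiguous floats, then scatter them back out.  */
  gathered = vec_vgl4fswo (src, offs);
  vec_vsst4fswo (gathered, dst, offs);

  for (int i = 0; i < 4; i++)
    printf ("gathered[%d] = %g\n", i, (double) gathered[i]);
  for (int i = 0; i < 8; i++)
    printf ("dst[%d] = %g\n", i, (double) dst[i]);
  return 0;
}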