/* vec__common__ppc_8h_source.html */

/*

 Copyright (c) [2017, 2018] IBM Corporation.


 Licensed under the Apache License, Version 2.0 (the "License");

 you may not use this file except in compliance with the License.

 You may obtain a copy of the License at


    http://www.apache.org/licenses/LICENSE-2.0


 Unless required by applicable law or agreed to in writing, software

 distributed under the License is distributed on an "AS IS" BASIS,

 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 See the License for the specific language governing permissions and

 limitations under the License.


 vec_common_ppc.h


 Contributors:

      IBM Corporation, Steven Munroe

 */


#ifndef VEC_COMMON_PPC_H_

#define VEC_COMMON_PPC_H_


#include <stdint.h>

#include <altivec.h>


/* Short names for the AltiVec/VSX vector types used by this header.
   Naming: v + (ui = unsigned int, i = signed int, f = float,
   b = bool) + element width in bits.  */

/* Vector of unsigned char elements.  */
typedef __vector unsigned char vui8_t;
/* Vector of unsigned short elements.  */
typedef __vector unsigned short vui16_t;
/* Vector of unsigned int elements.  */
typedef __vector unsigned int vui32_t;
/* Vector of unsigned long long elements.  */
typedef __vector unsigned long long vui64_t;

/* Vector of signed char elements.  */
typedef __vector signed char vi8_t;
/* Vector of short elements.  */
typedef __vector short vi16_t;
/* Vector of int elements.  */
typedef __vector int vi32_t;
/* Vector of long long elements.  */
typedef __vector long long vi64_t;
/* Vector of float elements.  */
typedef __vector float vf32_t;
/* Vector of double elements.  */
typedef __vector double vf64_t;

/* Vector of __bool char elements.  */
typedef __vector __bool char vb8_t;
/* Vector of __bool short elements.  */
typedef __vector __bool short vb16_t;
/* Vector of __bool int elements.  */
typedef __vector __bool int vb32_t;
/* Vector of __bool long long elements.  */
typedef __vector __bool long long vb64_t;


/* 128-bit vector types.  GCC did not get vector __int128 until GCC 4.8,
   so the build may define PVECLIB_DISABLE_INT128 to fall back to
   vectors of int; the type names vi128_t, vui128_t and vb128_t are
   declared in every configuration.  */
#ifndef PVECLIB_DISABLE_INT128

/* Vector of one signed __int128 element.  */
typedef __vector __int128 vi128_t;
/* Vector of one unsigned __int128 element.  */
typedef __vector unsigned __int128 vui128_t;
#ifndef PVECLIB_DISABLE_BOOLINT128

/* Vector of one __bool __int128 element.  */
typedef __vector __bool __int128 vb128_t;
#else

/* vector __bool __int128 is disabled (PVECLIB_DISABLE_BOOLINT128):
   use a vector of __bool int under the same name.  */
typedef __vector __bool int vb128_t;
#endif
#else

/* vector __int128 is disabled (PVECLIB_DISABLE_INT128): all three
   128-bit names are declared as vectors of int-sized elements.  */
typedef __vector int vi128_t;
typedef __vector unsigned int vui128_t;
typedef __vector __bool int vb128_t;
#endif


/* Union giving scalar and vector views of the same storage.  The
   transfer and extract/insert helpers in this header write one member
   and read another to convert between representations.  */
typedef union
{
  /* Signed __int128 scalar view.  */
  signed __int128 i128;
  /* Unsigned __int128 scalar view.  */
  unsigned __int128 ui128;
#ifndef PVECLIB_DISABLE_DFP

  /* _Decimal128 view; omitted when PVECLIB_DISABLE_DFP is defined.  */
  _Decimal128 dpd128;
#endif

  /* long double view.  */
  long double ldbl128;
  /* Vector of unsigned char view.  */
  vui8_t vx16;
  /* Vector of unsigned short view.  */
  vui16_t vx8;
  /* Vector of unsigned int view.  */
  vui32_t vx4;
  /* Vector of unsigned long long view.  */
  vui64_t vx2;
  /* 128-bit vector (vui128_t) view.  */
  vui128_t vx1;
  /* Vector of double view.  */
  vf64_t vf2;
  /* Pair of 64-bit scalars.  The field order is swapped by byte order:
     'lower' is declared first on little endian and second on big
     endian.  */
  struct
  {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    uint64_t lower;
    uint64_t upper;
#else
    uint64_t upper;
    uint64_t lower;
#endif
  } ulong;
} __VEC_U_128;


/* Endian-aware vector constant initializers and element indices.

   The CONST_VINT* macros take their arguments in the same order on
   every target.  On little endian the initializer list is emitted in
   reverse, so the first argument lands in the highest-numbered
   element; on big endian the list is emitted as written.

   CONST_VINT128_* expand to a typed vector compound literal;
   CONST_VINT64_DW, CONST_VINT32_W, CONST_VINT16_H and CONST_VINT8_B
   expand to a bare brace-enclosed initializer list.

   The VEC_* constants are element indices that track the same
   reversal (e.g. the element that receives the first argument of
   CONST_VINT64_DW is index VEC_DW_H).

   Each multi-line macro keeps its replacement list on the line
   directly after the backslash: a blank line there would end the
   macro with an empty body.  */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

/* Initializer list for 2 doublewords.  */
#define CONST_VINT64_DW(__dw0, __dw1) {__dw1, __dw0}

/* vui64_t constant from 2 doublewords.  */
#define CONST_VINT128_DW(__dw0, __dw1) (vui64_t){__dw1, __dw0}

/* vui128_t constant from 2 doublewords.  */
#define CONST_VINT128_DW128(__dw0, __dw1) (vui128_t)((vui64_t){__dw1, __dw0})

/* vui32_t constant from 4 words.  */
#define CONST_VINT128_W(__w0, __w1, __w2, __w3) (vui32_t){__w3, __w2, __w1, __w0}

/* Initializer list for 4 words.  */
#define CONST_VINT32_W(__w0, __w1, __w2, __w3) {__w3, __w2, __w1, __w0}

/* vui16_t constant from 8 halfwords.  */
#define CONST_VINT128_H(__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7) \
    (vui16_t){__hw7, __hw6, __hw5, __hw4, __hw3, __hw2, __hw1, __hw0}

/* Initializer list for 8 halfwords.  */
#define CONST_VINT16_H(__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7) \
    {__hw7, __hw6, __hw5, __hw4, __hw3, __hw2, __hw1, __hw0}

/* vui8_t constant from 16 bytes.  */
#define CONST_VINT128_B(_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15) \
    (vui8_t){_b15, _b14, _b13, _b12, _b11, _b10, _b9, _b8, _b7, _b6, _b5, _b4, _b3, _b2, _b1, _b0}

/* Initializer list for 16 bytes.  */
#define CONST_VINT8_B(_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15) \
    {_b15, _b14, _b13, _b12, _b11, _b10, _b9, _b8, _b7, _b6, _b5, _b4, _b3, _b2, _b1, _b0}

/* Index of the doubleword that receives the first / last argument.  */
#define VEC_DW_H 1
#define VEC_DW_L 0

/* Index of the word that receives the first / last argument.  */
#define VEC_W_H 3
#define VEC_W_L 0

/* Index of the word that receives argument 0, 1, 2, 3.  */
#define VEC_WE_0 3
#define VEC_WE_1 2
#define VEC_WE_2 1
#define VEC_WE_3 0

/* Index of the halfword that receives the first argument, the fourth
   argument (last halfword of the first doubleword), and the last
   argument.  */
#define VEC_HW_H 7
#define VEC_HW_L_DWH 4
#define VEC_HW_L 0

/* Index of the byte that receives the last argument, the eighth
   argument (last byte of the first doubleword), and the last byte of
   the second doubleword.  */
#define VEC_BYTE_L 0
#define VEC_BYTE_L_DWH 8
#define VEC_BYTE_L_DWL 0

/* Index of the byte that receives the first / second argument.  */
#define VEC_BYTE_H 15
#define VEC_BYTE_HHW 14

#else

/* Big endian: arguments map to elements in the order written.  */
#define CONST_VINT64_DW(__dw0, __dw1) {__dw0, __dw1}
#define CONST_VINT128_DW(__dw0, __dw1) (vui64_t){__dw0, __dw1}
#define CONST_VINT128_DW128(__dw0, __dw1) (vui128_t)((vui64_t){__dw0, __dw1})
#define CONST_VINT128_W(__w0, __w1, __w2, __w3) (vui32_t){__w0, __w1, __w2, __w3}
#define CONST_VINT32_W(__w0, __w1, __w2, __w3) {__w0, __w1, __w2, __w3}

#define CONST_VINT128_H(__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7) \
    (vui16_t){__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7}

#define CONST_VINT16_H(__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7) \
    {__hw0, __hw1, __hw2, __hw3, __hw4, __hw5, __hw6, __hw7}

#define CONST_VINT128_B(_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15) \
    (vui8_t){_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15}

#define CONST_VINT8_B(_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15) \
    {_b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7, _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15}

/* Element indices, big endian numbering (same meanings as the
   little endian definitions above).  */
#define VEC_DW_H 0
#define VEC_DW_L 1
#define VEC_W_H 0
#define VEC_W_L 3
#define VEC_WE_0 0
#define VEC_WE_1 1
#define VEC_WE_2 2
#define VEC_WE_3 3
#define VEC_HW_H 0
#define VEC_HW_L_DWH 3
#define VEC_HW_L 7
#define VEC_BYTE_L 15
#define VEC_BYTE_L_DWH 7
#define VEC_BYTE_L_DWL 15
#define VEC_BYTE_H 0
#define VEC_BYTE_HHW 1

#endif


/* Constant tables declared here and defined outside this header.
   NOTE(review): the names suggest tables of powers of 10 held as
   128-bit vectors (vtipowof10, vtifrexpof10) -- confirm the contents
   and bounds against the defining source.  */
extern const vui128_t vtipowof10[];
extern const vui128_t vtifrexpof10[];

#ifndef PVECLIB_DISABLE_DFP

/* _Decimal128 table, only declared when DFP support is enabled.
   NOTE(review): presumably powers of 2 in decimal float -- confirm
   against the defining source.  */
extern const _Decimal128 decpowof2[];
#endif


/* Transfer a 128-bit vector value to a scalar unsigned __int128.
 *
 * vra: the 128-bit vector to transfer.
 * Returns the bits of vra viewed through the ui128 member of
 * __VEC_U_128.
 *
 * The path taken is selected at compile time:
 *  - _ARCH_PWR8 or clang: plain union write/read.
 *  - _ARCH_PWR7: explicit doubleword stores via inline asm, then a
 *    union read.
 *  - otherwise: plain union write, then a union read.
 */
static inline unsigned __int128
vec_transfer_vui128t_to_uint128 (vui128_t vra)
{
  __VEC_U_128 t;
  unsigned __int128 result;
#if defined(_ARCH_PWR8) || defined (__clang__)
  // PWR8/9 should generate Move From VSR Doubleword instructions.
  t.vx1 = vra;
  result = t.ui128;
#else
#ifdef  _ARCH_PWR7
  /* PWR7 and earlier must transfer through storage.  This requires
   * care as we want to avoid load-hit-store flushes in the pipeline.
   * First split the vector into a pair of dword FPRs (vra_u, vra_l).
   * vra_u is vra itself; vra_l is vra with its doublewords permuted
   * (vec_xxpermdi selector 2), presumably so the other doubleword is
   * in the position stxsdx stores -- confirm against the selector
   * encoding.  */
  vui64_t vra_u = (vui64_t) vra;
  vui64_t vra_l = vec_xxpermdi ((vui64_t) vra, (vui64_t) vra, 2);
  /* Store this pair as adjacent dwords, followed by a group ending
   * nop. This prevents the hardware from dispatching the stores in the
   * same cycle as the following loads (a guaranteed pipeline flush).
   * Also the load addresses and data size will match these stores and
   * increase the possibility of store forwarding from the store queue.
   *
   * Operands: %0 = t.ulong.lower, %1 = t.ulong.upper (memory outputs),
   * %2 = vra_l, %3 = vra_u (VSX register inputs).  So vra_l is stored
   * to 'lower' and vra_u to 'upper'; "ori 2,2,0" is the nop.
   */
  __asm__(
      "stxsdx %x2,%y0;"
      "stxsdx %x3,%y1;"
      "ori  2,2,0;"
      : "=Z" (t.ulong.lower),
        "=Z" (t.ulong.upper)
      : "wa" (vra_l), "wa" (vra_u)
      : );
#else //_ARCH_PWR6/970
  /* Just have to go through storage and let the hardware deal with
   * load/store ordering. */
  t.vx1 = vra;
#endif
  // Load the dwords into a pair of GPRs for the __int128 result.
  // Shared by the PWR7 and PWR6/970 paths above.
  result = t.ui128;
#endif
  return (result);
}


/* Transfer a scalar unsigned __int128 to a 128-bit vector.
 *
 * gprp: the 128-bit scalar value to transfer.
 * Returns the same bits viewed through the vx1 (vui128_t) member of
 * __VEC_U_128.
 */
static inline vui128_t
vec_transfer_uint128_to_vui128t (unsigned __int128 gprp)
{
  __VEC_U_128 xfer;

  /* Write the scalar view, read back the vector view.  */
  xfer.ui128 = gprp;

  return (xfer.vx1);
}


/* Extract the low-order 64 bits of a 128-bit unsigned scalar.
 *
 * gprp: the 128-bit value.
 * Returns bits 0..63 (the least significant doubleword) of gprp.
 *
 * Uses an integer conversion rather than a union store/reload: the
 * conversion reduces modulo 2**64, so the result does not depend on
 * byte order or on the layout of any overlay type.
 */
static inline unsigned long long
scalar_extract_uint64_from_low_uint128 (unsigned __int128 gprp)
{
  return (uint64_t) gprp;
}


/* Extract the high-order 64 bits of a 128-bit unsigned scalar.
 *
 * gprp: the 128-bit value.
 * Returns bits 64..127 (the most significant doubleword) of gprp.
 *
 * Uses a logical right shift rather than a union store/reload, so the
 * result does not depend on byte order or on the layout of any
 * overlay type.
 */
static inline unsigned long long
scalar_extract_uint64_from_high_uint128 (unsigned __int128 gprp)
{
  return (uint64_t) (gprp >> 64);
}


/* Combine two 64-bit unsigned scalars into one 128-bit unsigned scalar.
 *
 * high: value for bits 64..127 of the result.
 * low:  value for bits 0..63 of the result.
 * Returns (high * 2**64) + low.
 *
 * Built with shift and OR rather than a union store/reload, so the
 * result does not depend on byte order or on the layout of any
 * overlay type.
 */
static inline unsigned __int128
scalar_insert_uint64_to_uint128 (unsigned long long high,
                                 unsigned long long low)
{
  /* Widen before shifting: a 64-bit shift of a 64-bit operand would
     be undefined.  */
  return (((unsigned __int128) (uint64_t) high) << 64) | (uint64_t) low;
}


#endif /* VEC_COMMON_PPC_H_ */