POWER Vector Library Manual  1.0.4
Typedefs | Functions
vec_f32_ppc.h File Reference

Header package containing a collection of 128-bit SIMD operations over 4x32-bit floating point elements. More...

#include <pveclib/vec_common_ppc.h>
#include <pveclib/vec_int128_ppc.h>

Go to the source code of this file.

Typedefs

typedef vf32_t __vbinary32
 typedef __vbinary32 to vector of 4 xfloat elements.
 

Functions

static vf32_t vec_absf32 (vf32_t vf32x)
 Vector float absolute value. More...
 
static int vec_all_isfinitef32 (vf32_t vf32)
 Return true if all 4x32-bit vector float values are Finite (Not NaN nor Inf). More...
 
static int vec_all_isinff32 (vf32_t vf32)
 Return true if all 4x32-bit vector float values are infinity. More...
 
static int vec_all_isnanf32 (vf32_t vf32)
 Return true if all of 4x32-bit vector float values are NaN. More...
 
static int vec_all_isnormalf32 (vf32_t vf32)
 Return true if all of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero). More...
 
static int vec_all_issubnormalf32 (vf32_t vf32)
 Return true if all of 4x32-bit vector float values are subnormal (denormal). More...
 
static int vec_all_iszerof32 (vf32_t vf32)
 Return true if all of 4x32-bit vector float values are +-0.0. More...
 
static int vec_any_isfinitef32 (vf32_t vf32)
 Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf). More...
 
static int vec_any_isinff32 (vf32_t vf32)
 Return true if any 4x32-bit vector float values are infinity. More...
 
static int vec_any_isnanf32 (vf32_t vf32)
 Return true if any of 4x32-bit vector float values are NaN. More...
 
static int vec_any_isnormalf32 (vf32_t vf32)
 Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero). More...
 
static int vec_any_issubnormalf32 (vf32_t vf32)
 Return true if any of 4x32-bit vector float values are subnormal (denormal). More...
 
static int vec_any_iszerof32 (vf32_t vf32)
 Return true if any of 4x32-bit vector float values are +-0.0. More...
 
static vf32_t vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 Copy the sign bit from vf32x merged with magnitude from vf32y and return the resulting vector float values. More...
 
static vb32_t vec_isfinitef32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf). More...
 
static vb32_t vec_isinff32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values for each float, if infinity. More...
 
static vb32_t vec_isnanf32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values, for each float NaN value. More...
 
static vb32_t vec_isnormalf32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN, Inf, denormal, or zero). More...
 
static vb32_t vec_issubnormalf32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values, for each float value that is subnormal (denormal). More...
 
static vb32_t vec_iszerof32 (vf32_t vf32)
 Return 4x32-bit vector boolean true values, for each float value that is +-0.0. More...
 
static vb32_t vec_setb_sp (vf32_t vra)
 Vector Set Bool from Sign, Single Precision. More...
 
static vf32_t vec_vgl4fsso (float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
 Vector Gather-Load 4 Words from scalar Offsets. More...
 
static vf32_t vec_vgl4fswo (float *array, vi32_t vra)
 Vector Gather-Load 4 Words from Vector Word Offsets. More...
 
static vf32_t vec_vgl4fswsx (float *array, vi32_t vra, const unsigned char scale)
 Vector Gather-Load 4 Words from Vector Word Scaled Indexes. More...
 
static vf32_t vec_vgl4fswx (float *array, vi32_t vra)
 Vector Gather-Load 4 Words from Vector Word Indexes. More...
 
static vf64_t vec_vglfsdo (float *array, vi64_t vra)
 Vector Gather-Load Single Floats from Vector Doubleword Offsets. More...
 
static vf64_t vec_vglfsdsx (float *array, vi64_t vra, const unsigned char scale)
 Vector Gather-Load Single Floats from Vector Doubleword Scaled Indexes. More...
 
static vf64_t vec_vglfsdx (float *array, vi64_t vra)
 Vector Gather-Load Single Floats from Vector Doubleword Indexes. More...
 
static vf64_t vec_vglfsso (float *array, const long long offset0, const long long offset1)
 Vector Gather-Load Float Single from scalar Offsets. More...
 
static vf64_t vec_vlxsspx (const signed long long ra, const float *rb)
 Vector Load Scalar Single Float Indexed. More...
 
static void vec_vsst4fsso (vf32_t xs, float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
 Vector Scatter-Store 4 Float Singles to Scalar Offsets. More...
 
static void vec_vsst4fswo (vf32_t xs, float *array, vi32_t vra)
 Vector Scatter-Store 4 Float Singles to Vector Word Offsets. More...
 
static void vec_vsst4fswsx (vf32_t xs, float *array, vi32_t vra, const unsigned char scale)
 Vector Scatter-Store 4 Float Singles to Vector Word Scaled Indexes. More...
 
static void vec_vsst4fswx (vf32_t xs, float *array, vi32_t vra)
 Vector Scatter-Store 4 Float Singles to Vector Word Indexes. More...
 
static void vec_vsstfsdo (vf64_t xs, float *array, vi64_t vra)
 Vector Scatter-Store Float Singles to Vector Doubleword Offsets. More...
 
static void vec_vsstfsdsx (vf64_t xs, float *array, vi64_t vra, const unsigned char scale)
 Vector Scatter-Store Words to Vector Doubleword Scaled Indexes. More...
 
static void vec_vsstfsdx (vf64_t xs, float *array, vi64_t vra)
 Vector Scatter-Store Words to Vector Doubleword Indexes. More...
 
static void vec_vsstfsso (vf64_t xs, float *array, const long long offset0, const long long offset1)
 Vector Scatter-Store Float Singles to Scalar Offsets. More...
 
static void vec_vstxsspx (vf64_t xs, const signed long long ra, float *rb)
 Vector Store Scalar Single Float Indexed. More...
 
static vf32_t vec_xviexpsp (vui32_t sig, vui32_t exp)
 Vector Insert Exponent Single-Precision. More...
 
static vui32_t vec_xvxexpsp (vf32_t vrb)
 Vector Extract Exponent Single-Precision. More...
 
static vui32_t vec_xvxsigsp (vf32_t vrb)
 Vector Extract Significand Single-Precision. More...
 

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 4x32-bit floating point elements.

Most vector float (32-bit float) operations are implemented with PowerISA VMX instructions either defined by the original VMX (a.k.a. Altivec) or added to later versions of the PowerISA. POWER8 added the Vector Scalar Extended (VSX) with access to additional vector registers (64 total) and operations. Most of these operations (compiler built-ins, or intrinsics) are defined in <altivec.h> and described in the compiler documentation.

Note
The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example if you compile with -mcpu=power7, some of the wordwise pack, unpack and merge operations useful for conversions are not defined and the equivalent vec_perm and permute control must be used instead. This header will provide the appropriate substitutions, will generate the minimum code, appropriate for the target, and produce correct results.
Most ppc64le compilers will default to -mcpu=power8 if not specified.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an inline assembler implementation for older compilers that do not provide the built-ins.

POWER9 adds useful vector float operations, including: test data class, extract exponent, extract significand, and insert exponent. These operations are common in math library implementations.

Note
GCC 7.3 defines vector forms of the test data class, extract significand, and extract/insert_exp for float and double. These built-ins are not defined in GCC 6.4. See compiler documentation. These are useful operations and can be implemented in a few vector logical instructions for earlier machines.

So it is reasonable for this header to provide vector forms of the floating point classification functions (isnormal/subnormal/finite/inf/nan/zero, etc.). These functions can be implemented directly using (one or more) POWER9 instructions, or a few vector logical and integer compare instructions for POWER7/8. Each is comfortably small enough to be in-lined and inherently faster than the equivalent POSIX or compiler built-in runtime scalar functions.

This header covers operations that are any of the following:

Examples

For example, the classification functions can be used to implement the math library functions sine and cosine. The POSIX specification requires that special input values are processed without raising extraneous floating point exceptions and return specific floating point values in response. Consider first the sin() function.

The following code example uses functions from this header to address the POSIX requirements for special values input to a vectorized sinf():

test_vec_sinf32 (vf32_t value)
{
const vf32_t vec_f0 = { 0.0, 0.0, 0.0, 0.0 };
const vui32_t vec_f32_qnan =
{ 0x7f800001, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
vf32_t result;
vb32_t normmask, infmask;
normmask = vec_isnormalf32 (value);
if (vec_any_isnormalf32 (value))
{
// replace non-normal input values with safe values.
vf32_t safeval = vec_sel (vec_f0, value, normmask);
// body of vec_sin(safeval) computation elided for this example.
}
else
result = value;
// merge non-normal input values back into result
result = vec_sel (value, result, normmask);
// Inf input value elements return quiet-nan
infmask = vec_isinff32 (value);
result = vec_sel (result, (vf32_t) vec_f32_qnan, infmask);
return result;
}

The code generated for this fragment runs between 24 (-mcpu=power9) and 40 (-mcpu=power8) instructions. The normal execution path is 14 to 25 instructions respectively.

Another example is the cos() function.

The following code example uses functions from this header to address the POSIX requirements for special values input to vectorized cosf():

test_vec_cosf32 (vf32_t value)
{
vf32_t result;
const vf32_t vec_f0 = { 0.0, 0.0, 0.0, 0.0 };
const vf32_t vec_f1 = { 1.0, 1.0, 1.0, 1.0 };
const vui32_t vec_f32_qnan =
{ 0x7f800001, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
vb32_t finitemask, infmask, zeromask;
finitemask = vec_isfinitef32 (value);
if (vec_any_isfinitef32 (value))
{
// replace non-finite input values with safe values
vf32_t safeval = vec_sel (vec_f0, value, finitemask);
// body of vec_cos(safeval) computation elided for this example
}
else
result = value;
// merge non-finite input values back into result
result = vec_sel (value, result, finitemask);
// Set +-0.0 input elements to exactly 1.0 in result
zeromask = vec_iszerof32 (value);
result = vec_sel (result, vec_f1, zeromask);
// Set Inf input elements to quiet-nan in result
infmask = vec_isinff32 (value);
result = vec_sel (result, (vf32_t) vec_f32_qnan, infmask);
return result;
}

Neither example raises floating point exceptions or sets errno, as appropriate for a vector math library.

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absf32()

static vf32_t vec_absf32 ( vf32_t  vf32x)
inlinestatic

Vector float absolute value.

processor Latency Throughput
power8 6-7 2/cycle
power9 2 2/cycle
Parameters
vf32xvector float values containing the magnitudes.
Returns
vector absolute values of 4x float elements of vf32x.

◆ vec_all_isfinitef32()

static int vec_all_isfinitef32 ( vf32_t  vf32)
inlinestatic

Return true if all 4x32-bit vector float values are Finite (Not NaN nor Inf).

A IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 4-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
an int containing 0 or 1.

◆ vec_all_isinff32()

static int vec_all_isinff32 ( vf32_t  vf32)
inlinestatic

Return true if all 4x32-bit vector float values are infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and a significand of all zeros. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
boolean int, true if all 4 float values are infinity

◆ vec_all_isnanf32()

static int vec_all_isnanf32 ( vf32_t  vf32)
inlinestatic

Return true if all of 4x32-bit vector float values are NaN.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if all of 4 vector float values are NaN.

◆ vec_all_isnormalf32()

static int vec_all_isnormalf32 ( vf32_t  vf32)
inlinestatic

Return true if all of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 1/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if all of 4 vector float values are normal.

◆ vec_all_issubnormalf32()

static int vec_all_issubnormalf32 ( vf32_t  vf32)
inlinestatic

Return true if all of 4x32-bit vector float values are subnormal (denormal).

A IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 10-30 1/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if all of 4 vector float values are subnormal.

◆ vec_all_iszerof32()

static int vec_all_iszerof32 ( vf32_t  vf32)
inlinestatic

Return true if all of 4x32-bit vector float values are +-0.0.

A IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if all of 4 vector float values are +/- zero.

◆ vec_any_isfinitef32()

static int vec_any_isfinitef32 ( vf32_t  vf32)
inlinestatic

Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf).

A IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 4-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
an int containing 0 or 1.

◆ vec_any_isinff32()

static int vec_any_isinff32 ( vf32_t  vf32)
inlinestatic

Return true if any 4x32-bit vector float values are infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and a significand of all zeros.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
boolean int, true if any of 4 float values are infinity

◆ vec_any_isnanf32()

static int vec_any_isnanf32 ( vf32_t  vf32)
inlinestatic

Return true if any of 4x32-bit vector float values are NaN.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if any of 4 vector float values are NaN.

◆ vec_any_isnormalf32()

static int vec_any_isnormalf32 ( vf32_t  vf32)
inlinestatic

Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 10-24 1/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if any of 4 vector float values are normal.

◆ vec_any_issubnormalf32()

static int vec_any_issubnormalf32 ( vf32_t  vf32)
inlinestatic

Return true if any of 4x32-bit vector float values are subnormal (denormal).

A IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 10-18 1/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if any of 4 vector float values are subnormal.

◆ vec_any_iszerof32()

static int vec_any_iszerof32 ( vf32_t  vf32)
inlinestatic

Return true if any of 4x32-bit vector float values are +-0.0.

A IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-20 2/cycle
power9 6 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a boolean int, true if any of 4 vector float values are +/- zero.

◆ vec_copysignf32()

static vf32_t vec_copysignf32 ( vf32_t  vf32x,
vf32_t  vf32y 
)
inlinestatic

Copy the sign bit from vf32x merged with magnitude from vf32y and return the resulting vector float values.

Note
This operation was patterned after the intrinsic vec_cpsgn (altivec.h) introduced for POWER7 and VSX. It turns out the original (GCC 4.9) compiler implementation reversed the operands and does not match the PowerISA or the Vector Intrinsic Programming Reference manuals. Subsequent compilers and PVECLIB implementations replicated this (operand order) error. This has now been reported as a bug against the compilers, which are in the process of applying fixes and distributing updates. This version of PVECLIB is updated to match the Vector Intrinsic Programming Reference. This implementation is independent of the compilers' update status.
processor Latency Throughput
power8 6-7 2/cycle
power9 2 2/cycle
Parameters
vf32xvector float values containing the sign bits.
vf32yvector float values containing the magnitudes.
Returns
vector float values with magnitude from vf32y and the sign of vf32x.

◆ vec_isfinitef32()

static vb32_t vec_isfinitef32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf).

An IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The implementation uses the vec_cmpeq conditional to generate the predicate mask for NaN / Inf and then inverts it for the finite condition. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-15 2/cycle
power9 5 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isinff32()

static vb32_t vec_isinff32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values for each float, if infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and a significand of all zeros.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 4-13 2/cycle
power9 3 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isnanf32()

static vb32_t vec_isnanf32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values, for each float NaN value.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

processor Latency Throughput
power8 4-13 2/cycle
power9 3 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isnormalf32()

static vb32_t vec_isnormalf32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-15 1/cycle
power9 5 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_issubnormalf32()

static vb32_t vec_issubnormalf32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values, for each float value that is subnormal (denormal).

A IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 6-16 1/cycle
power9 3 1/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_iszerof32()

static vb32_t vec_iszerof32 ( vf32_t  vf32)
inlinestatic

Return 4x32-bit vector boolean true values, for each float value that is +-0.0.

A IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note
This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.
processor Latency Throughput
power8 4-13 2/cycle
power9 5 2/cycle
Parameters
vf32a vector of __binary32 values.
Returns
a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_setb_sp()

static vb32_t vec_setb_sp ( vf32_t  vra)
inlinestatic

Vector Set Bool from Sign, Single Precision.

For each float, propagate the sign bit to all 32-bits of that word. The result is vector bool int reflecting the sign bit of each 32-bit float.

The resulting mask can be used in masking and select operations.

Note
This operation will set the sign mask regardless of data class, while the Vector Test Data Class will not distinguish between +/- NaN.
processor Latency Throughput
power8 2-9 2/cycle
power9 2-8 2/cycle
Parameters
vraVector float.
Returns
vector bool int reflecting the sign bits of each float value.

◆ vec_vgl4fsso()

static vf32_t vec_vgl4fsso ( float *  array,
const long long  offset0,
const long long  offset1,
const long long  offset2,
const long long  offset3 
)
inlinestatic

Vector Gather-Load 4 Words from scalar Offsets.

For each scalar offset[0,1,2,3], load the word from the effective address formed by *(char*)array+offset[0-3]. Merge resulting float single word elements [0,1,2,3] and return the resulting vector.

processor Latency Throughput
power8 10 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of float words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.
offset2Scalar (64-bit) byte offset from &array.
offset3Scalar (64-bit) byte offset from &array.
Returns
vector word containing word elements [0-3] loaded from *(char*)array+offset[0-3].

◆ vec_vgl4fswo()

static vf32_t vec_vgl4fswo ( float *  array,
vi32_t  vra 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Offsets.

For each signed word element [i] of vra, load the float single word element at *(char*)array+vra[i]. Merge those word elements [0-3] and return the resulting vector.

Note
Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.
processor Latency Throughput
power8 14 1/cycle
power9 15 1/cycle
Parameters
arrayPointer to array of float words.
vraVector of signed word (32-bit) byte offsets from &array.
Returns
vector word containing word elements [0-3], each loaded from *(char*)array+vra[0-3].

◆ vec_vgl4fswsx()

static vf32_t vec_vgl4fswsx ( float *  array,
vi32_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Scaled Indexes.

For each signed word element [i] of vra, load the float single word element at array[vra[i] << scale]. Merge those word elements [0-3] and return the resulting vector.

Note
Signed word indexes are expanded (unpacked) to doublewords and shifted left (2+scale) bits before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.
processor Latency Throughput
power8 16-25 1/cycle
power9 18-27 1/cycle
Parameters
arrayPointer to array of float words.
vraVector of signed word (32-bit) indexes.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector word containing word elements [0-3] each loaded from array[vra[0-3] << scale].

◆ vec_vgl4fswx()

static vf32_t vec_vgl4fswx ( float *  array,
vi32_t  vra 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Indexes.

For word element [i] of vra, load the float single word element at array[vra[i]]. Merge those word elements [0-3] and return the resulting vector.

Note
Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.
processor Latency Throughput
power8 16-25 1/cycle
power9 18-27 1/cycle
Parameters
arrayPointer to array of float words.
vraVector of signed word (32-bit) indexes.
Returns
vector word containing word elements [0-3], each loaded from array[vra[0-3]].

◆ vec_vglfsdo()

static vf64_t vec_vglfsdo ( float *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Offsets.

For each doubleword element [0-1] of vra, load the float single word element at *(char*)array+vra[i] expanding them to float double format. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 12 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of float singles.
vraVector of doubleword (64-bit) byte offsets from &array.
Returns
vector doubleword elements [0,1] loaded from expanded float single words at *(char*)array+vra[i].

◆ vec_vglfsdsx()

static vf64_t vec_vglfsdsx ( float *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Scaled Indexes.

For each doubleword element [0-1] of vra, load the float single word element at array[vra[i] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of float.
vraVector of doubleword indexes from &array.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector doubleword elements [0,1] loaded from the float single words at array[vra[0,1]<<scale].

◆ vec_vglfsdx()

static vf64_t vec_vglfsdx ( float *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Indexes.

For each doubleword element [0-1] of vra, load the float single word element at array[vra[i]]. Merge doubleword elements [0,1] and return the resulting vector.

Note
As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.
processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of float.
vraVector of doubleword indexes from &array.
Returns
vector doubleword elements [0,1] loaded from float single words at array[vra[0,1]].

◆ vec_vglfsso()

static vf64_t vec_vglfsso ( float *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Gather-Load Float Single from scalar Offsets.

For each scalar offset[0|1], load the float single element at *(char*)array+offset[0|1] expanding them to float double format. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 7 2/cycle
power9 11 2/cycle
Parameters
arrayPointer to array of floats.
offset0Scalar (64-bit) byte offsets from &array.
offset1Scalar (64-bit) byte offsets from &array.
Returns
vector double containing elements loaded from *(char*)array+offset0 and *(char*)array+offset1.

◆ vec_vlxsspx()

static vf64_t vec_vlxsspx ( const signed long long  ra,
const float *  rb 
)
inlinestatic

Vector Load Scalar Single Float Indexed.

Load doubleword[0] of vector xt as a scalar (double float formatted) single float word from the effective address formed by rb+ra. The operand rb is a pointer to an array of float words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vf64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note
The leftmost doubleword is the single float value, expanded and formatted as a double float. The rightmost doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsspx instruction can load directly into any of the 64 VSRs, while expanding the single float word value into float double format, in a single operation. Both simplify merging elements for gather operations.

Note
The lxsspx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lfs[x] and xxpermdi to move the result from VSR/FPR range to VSR/VR range if needed.
processor Latency Throughput
power8 5 2/cycle
power9 8 2/cycle
Parameters
raconst doubleword index (offset/displacement).
rbconst pointer to an array of floats.
Returns
The word stored at (ra + rb) is expanded from single to double float format and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vsst4fsso()

static void vec_vsst4fsso ( vf32_t  xs,
float *  array,
const long long  offset0,
const long long  offset1,
const long long  offset2,
const long long  offset3 
)
inlinestatic

Vector Scatter-Store 4 Float Singles to Scalar Offsets.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array+offset[i].

processor Latency Throughput
power8 6 1/cycle
power9 4 2/cycle
Parameters
xsVector float elements to scatter store.
arrayPointer to array of float words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.
offset2Scalar (64-bit) byte offset from &array.
offset3Scalar (64-bit) byte offset from &array.

◆ vec_vsst4fswo()

static void vec_vsst4fswo ( vf32_t  xs,
float *  array,
vi32_t  vra 
)
inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Offsets.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array+vra[i].

Note
Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.
processor Latency Throughput
power8 10 1/cycle
power9 12 2/cycle
Parameters
xsVector float elements to scatter store.
arrayPointer to array of float words.
vraVector of signed word (32-bit) byte offsets from &array.

◆ vec_vsst4fswsx()

static void vec_vsst4fswsx ( vf32_t  xs,
float *  array,
vi32_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Scaled Indexes.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array[vra[i]<<scale].

Note
Signed word indexes are expanded (unpacked) to doublewords, then shifted left (2+scale) bits, before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.
processor Latency Throughput
power8 12-21 1/cycle
power9 15-24 2/cycle
Parameters
xsVector float elements to scatter store.
arrayPointer to array of float words.
vraVector of signed word (32-bit) indexes from array.
scale8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsst4fswx()

static void vec_vsst4fswx ( vf32_t  xs,
float *  array,
vi32_t  vra 
)
inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Indexes.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array[vra[i]].

Note
Signed word indexes are expanded (unpacked) to doublewords, then shifted left 2 bits, before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.
processor Latency Throughput
power8 12-21 1/cycle
power9 15-24 2/cycle
Parameters
xsVector float elements to scatter store.
arrayPointer to array of float words.
vraVector of signed word (32-bit) indexes from array.

◆ vec_vsstfsdo()

static void vec_vsstfsdo ( vf64_t  xs,
float *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Floats Singles to Vector Doubleword Offsets.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at *(char*)array+vra[i].

processor Latency Throughput
power8 8 1/cycle
power9 9 2/cycle
Parameters
xsVector doubleword elements to scatter store as float single words.
arrayPointer to array of float words.
vraVector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstfsdsx()

static void vec_vsstfsdsx ( vf64_t  xs,
float *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Scatter-Store Float Singles to Vector Doubleword Scaled Indexes.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at array[vra[i]<<scale].

processor Latency Throughput
power8 10-19 1/cycle
power9 10-19 1/cycle
Parameters
xsVector doubleword elements to scatter store as float single words.
arrayPointer to array of float words.
vraVector of doubleword (64-bit) indexes from &array.
scale8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstfsdx()

static void vec_vsstfsdx ( vf64_t  xs,
float *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Float Singles to Vector Doubleword Indexes.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at array[vra[i]].

processor Latency Throughput
power8 10-19 1/cycle
power9 10-19 1/cycle
Parameters
xsVector doubleword elements to scatter store as float single words.
arrayPointer to array of float words.
vraVector of doubleword (64-bit) indexes from &array.

◆ vec_vsstfsso()

static void vec_vsstfsso ( vf64_t  xs,
float *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Scatter-Store Float Singles to Scalar Offsets.

For each scalar offset[0-1], store the doubleword element xs[i], converted to float single word format, at *(char*)array+offset[0|1].

processor Latency Throughput
power8 3 1/cycle
power9 3 2/cycle
Parameters
xsVector doubleword elements to scatter store as float single words.
arrayPointer to array of float words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.

◆ vec_vstxsspx()

static void vec_vstxsspx ( vf64_t  xs,
const signed long long  ra,
float *  rb 
)
inlinestatic

Vector Store Scalar Single Float Indexed.

Stores doubleword float element 0 of vector xs as a scalar float word at the effective address formed by rb+ra. The operand rb is a pointer to an array of float. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be word aligned (integer multiple of 4).

This operation is an alternate form of vector store element (vec_ste), with the added simplification that data is always left justified in the vector. Another advantage, for Power8 and later, is that the stxsspx instruction can store directly from any of the 64 VSRs. Both simplify scatter operations.

Note
The stxsspx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will move the source (xs) from VSR/VR range to VSR/FPR range if needed, then use stfs[x].
processor Latency Throughput
power8 0 - 2 2/cycle
power9 0 - 2 4/cycle
Parameters
xsvector doubleword element 0 to be stored as single float.
raconst doubleword index (offset/displacement).
rbconst pointer to an array of floats.

◆ vec_xviexpsp()

static vf32_t vec_xviexpsp ( vui32_t  sig,
vui32_t  exp 
)
inlinestatic

Vector Insert Exponent Single-Precision.

For each word of sig and exp, merge the sign (bit 0) and significand (bits 9:31) from sig with the 8-bit exponent from exp (bits 24:31). The exponent is merged into bits 1:8 of the final result. The result is returned as a Vector Single-Precision floating point value.

Note
This operation is equivalent to the POWER9 xviexpsp instruction and the built-in vec_insert_exp. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.
processor Latency Throughput
power8 6-15 2/cycle
power9 2 4/cycle
Parameters
sigVector unsigned int containing the Sign Bit and 23-bit significand.
expVector unsigned int containing the 8-bit exponent.
Returns
a vf32_t value where the exponent bits (1:8) of sig are replaced from bits 24:31 of exp.

◆ vec_xvxexpsp()

static vui32_t vec_xvxexpsp ( vf32_t  vrb)
inlinestatic

Vector Extract Exponent Single-Precision.

For each word of vrb, extract the single-precision exponent (bits 1:8) and right justify it to bits 24:31 of the result vector word. The result is returned as a vector unsigned integer value.

Note
This operation is equivalent to the POWER9 xvxexpsp instruction and the built-in vec_extract_exp. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.
processor Latency Throughput
power8 6-15 2/cycle
power9 2 4/cycle
Parameters
vrbvector float value.
Returns
vector unsigned int containing the 8-bit exponent right justified in each word

◆ vec_xvxsigsp()

static vui32_t vec_xvxsigsp ( vf32_t  vrb)
inlinestatic

Vector Extract Significand Single-Precision.

For each word of vrb, extract the single-precision significand (bits 9:31) and restore the implied (hidden) bit (bit 8) if the single-precision value is normal (not zero, subnormal, Infinity or NaN). The result is returned as a vector unsigned int value with up to 24 bits of significance.

Note
This operation is equivalent to the POWER9 xvxsigsp instruction and the built-in vec_extract_sig. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.
processor Latency Throughput
power8 8-17 1/cycle
power9 3 2/cycle
Parameters
vrbvector float value.
Returns
vector unsigned int containing the significand.
vec_isfinitef32
static vb32_t vec_isfinitef32(vf32_t vf32)
Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf).
Definition: vec_f32_ppc.h:864
vb32_t
__vector __bool int vb32_t
vector of 32-bit bool int elements.
Definition: vec_common_ppc.h:228
vf32_t
__vector float vf32_t
vector of 32-bit float elements.
Definition: vec_common_ppc.h:219
vec_any_isfinitef32
static int vec_any_isfinitef32(vf32_t vf32)
Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf).
Definition: vec_f32_ppc.h:534
vec_iszerof32
static vb32_t vec_iszerof32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float value that is +-0.0.
Definition: vec_f32_ppc.h:1090
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206
vec_isnormalf32
static vb32_t vec_isnormalf32(vf32_t vf32)
Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN,...
Definition: vec_f32_ppc.h:996
vec_isinff32
static vb32_t vec_isinff32(vf32_t vf32)
Return 4x32-bit vector boolean true values for each float, if infinity.
Definition: vec_f32_ppc.h:908
vec_any_isnormalf32
static int vec_any_isnormalf32(vf32_t vf32)
Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal,...
Definition: vec_f32_ppc.h:668