Header package containing a collection of 128-bit SIMD operations over 4x32-bit floating point elements. More...

#include <pveclib/vec_common_ppc.h>
#include <pveclib/vec_int128_ppc.h>

Typedefs
typedef vf32_t	__vbinary32
	typedef __vbinary32 to vector of 4 x float elements.

Functions
static vf32_t	vec_absf32 (vf32_t vf32x)
	Vector float absolute value. More...

static int	vec_all_isfinitef32 (vf32_t vf32)
	Return true if all 4x32-bit vector float values are Finite (Not NaN nor Inf). More...

static int	vec_all_isinff32 (vf32_t vf32)
	Return true if all 4x32-bit vector float values are infinity. More...

static int	vec_all_isnanf32 (vf32_t vf32)
	Return true if all of 4x32-bit vector float values are NaN. More...

static int	vec_all_isnormalf32 (vf32_t vf32)
	Return true if all of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero). More...

static int	vec_all_issubnormalf32 (vf32_t vf32)
	Return true if all of 4x32-bit vector float values are subnormal (denormal). More...

static int	vec_all_iszerof32 (vf32_t vf32)
	Return true if all of 4x32-bit vector float values are +-0.0. More...

static int	vec_any_isfinitef32 (vf32_t vf32)
	Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf). More...

static int	vec_any_isinff32 (vf32_t vf32)
	Return true if any 4x32-bit vector float values are infinity. More...

static int	vec_any_isnanf32 (vf32_t vf32)
	Return true if any of 4x32-bit vector float values are NaN. More...

static int	vec_any_isnormalf32 (vf32_t vf32)
	Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero). More...

static int	vec_any_issubnormalf32 (vf32_t vf32)
	Return true if any of 4x32-bit vector float values are subnormal (denormal). More...

static int	vec_any_iszerof32 (vf32_t vf32)
	Return true if any of 4x32-bit vector float values are +-0.0. More...

static vf32_t	vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
	Copy the sign bit from vf32x merged with magnitude from vf32y and return the resulting vector float values. More...

static vb32_t	vec_isfinitef32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf). More...

static vb32_t	vec_isinff32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values for each float, if infinity. More...

static vb32_t	vec_isnanf32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values, for each float NaN value. More...

static vb32_t	vec_isnormalf32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN, Inf, denormal, or zero). More...

static vb32_t	vec_issubnormalf32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values, for each float value that is subnormal (denormal). More...

static vb32_t	vec_iszerof32 (vf32_t vf32)
	Return 4x32-bit vector boolean true values, for each float value that is +-0.0. More...

static vb32_t	vec_setb_sp (vf32_t vra)
	Vector Set Bool from Sign, Single Precision. More...

static vf32_t	vec_vgl4fsso (float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
	Vector Gather-Load 4 Words from scalar Offsets. More...

static vf32_t	vec_vgl4fswo (float *array, vi32_t vra)
	Vector Gather-Load 4 Words from Vector Word Offsets. More...

static vf32_t	vec_vgl4fswsx (float *array, vi32_t vra, const unsigned char scale)
	Vector Gather-Load 4 Words from Vector Word Scaled Indexes. More...

static vf32_t	vec_vgl4fswx (float *array, vi32_t vra)
	Vector Gather-Load 4 Words from Vector Word Indexes. More...

static vf64_t	vec_vglfsdo (float *array, vi64_t vra)
	Vector Gather-Load Single Floats from Vector Doubleword Offsets. More...

static vf64_t	vec_vglfsdsx (float *array, vi64_t vra, const unsigned char scale)
	Vector Gather-Load Single Floats from Vector Doubleword Scaled Indexes. More...

static vf64_t	vec_vglfsdx (float *array, vi64_t vra)
	Vector Gather-Load Single Floats from Vector Doubleword Indexes. More...

static vf64_t	vec_vglfsso (float *array, const long long offset0, const long long offset1)
	Vector Gather-Load Float Single from scalar Offsets. More...

static vf64_t	vec_vlxsspx (const signed long long ra, const float *rb)
	Vector Load Scalar Single Float Indexed. More...

static void	vec_vsst4fsso (vf32_t xs, float *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
	Vector Scatter-Store 4 Float Singles to Scalar Offsets. More...

static void	vec_vsst4fswo (vf32_t xs, float *array, vi32_t vra)
	Vector Scatter-Store 4 Float Singles to Vector Word Offsets. More...

static void	vec_vsst4fswsx (vf32_t xs, float *array, vi32_t vra, const unsigned char scale)
	Vector Scatter-Store 4 Float Singles to Vector Word Scaled Indexes. More...

static void	vec_vsst4fswx (vf32_t xs, float *array, vi32_t vra)
	Vector Scatter-Store 4 Float Singles to Vector Word Indexes. More...

static void	vec_vsstfsdo (vf64_t xs, float *array, vi64_t vra)
	Vector Scatter-Store Float Singles to Vector Doubleword Offsets. More...

static void	vec_vsstfsdsx (vf64_t xs, float *array, vi64_t vra, const unsigned char scale)
	Vector Scatter-Store Float Singles to Vector Doubleword Scaled Indexes. More...

static void	vec_vsstfsdx (vf64_t xs, float *array, vi64_t vra)
	Vector Scatter-Store Float Singles to Vector Doubleword Indexes. More...

static void	vec_vsstfsso (vf64_t xs, float *array, const long long offset0, const long long offset1)
	Vector Scatter-Store Float Singles to Scalar Offsets. More...

static void	vec_vstxsspx (vf64_t xs, const signed long long ra, float *rb)
	Vector Store Scalar Single Float Indexed. More...

static vf32_t	vec_xviexpsp (vui32_t sig, vui32_t exp)
	Vector Insert Exponent Single-Precision. More...

static vui32_t	vec_xvxexpsp (vf32_t vrb)
	Vector Extract Exponent Single-Precision. More...

static vui32_t	vec_xvxsigsp (vf32_t vrb)
	Vector Extract Significand Single-Precision. More...

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 4x32-bit floating point elements.

Most vector float (32-bit float) operations are implemented with PowerISA VMX instructions either defined by the original VMX (a.k.a. Altivec) or added to later versions of the PowerISA. POWER7 added the Vector Scalar Extended (VSX) with access to additional vector registers (64 total) and operations. Most of these operations (compiler built-ins, or intrinsics) are defined in <altivec.h> and described in the compiler documentation.

Note: The compiler disables associated <altivec.h> built-ins if the -mcpu target does not enable the specific instruction. For example, if you compile with -mcpu=power7, some of the wordwise pack, unpack and merge operations useful for conversions are not defined and the equivalent vec_perm and permute control must be used instead. This header will provide the appropriate substitutions, will generate the minimum code appropriate for the target, and produce correct results. Most ppc64le compilers will default to -mcpu=power8 if not specified.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an inline assembler implementation for older compilers that do not provide the built-ins.

POWER9 adds useful vector float operations, including: test data class, extract exponent, extract significand, and insert exponent. These operations are common in math library implementations.

Note: GCC 7.3 defines vector forms of the test data class, extract significand, and extract/insert_exp for float and double. These built-ins are not defined in GCC 6.4. See compiler documentation. These are useful operations and can be implemented in a few vector logical instructions for earlier machines.

So it is reasonable for this header to provide vector forms of the floating point classification functions (isnormal/subnormal/finite/inf/nan/zero, etc.). These functions can be implemented directly using (one or more) POWER9 instructions, or a few vector logical and integer compare instructions for POWER7/8. Each is comfortably small enough to be in-lined and inherently faster than the equivalent POSIX or compiler built-in runtime scalar functions.

This header covers operations that are any of the following:

Implemented in hardware instructions in newer processors, but useful to programmers on slightly older processors (even if the equivalent function requires more instructions). Examples include the floating point test data class, extract exponent, extract significand, and insert exponent operations.
Defined in the OpenPOWER ABI but not yet defined in <altivec.h> provided by available compilers in common use. Examples include vector float even/odd.
Providing special vector float tests for special conditions without generating extraneous floating-point exceptions. This is important for implementing vectorized forms of ISO C99 Math functions.
Commonly used operations that are not covered by the ABI or <altivec.h>, and that require multiple instructions or are not obvious.

Examples

For example: using the classification functions for implementing the math library functions sine and cosine. The POSIX specification requires that special input values are processed without raising extraneous floating point exceptions and return specific floating point values in response. For example the sin() function.

If the input value is NaN then return a NaN.
If the input value is +-0.0 then return value.
If the input value is subnormal then return value.
If the input value is +-Inf then return a NaN.
Otherwise compute and return sin(value).

The following code example uses functions from this header to address the POSIX requirements for special values input to a vectorized sinf():

// Example: special-value handling for a vectorized sinf().
// Non-normal inputs (NaN, +-0.0, subnormal) are returned unchanged and
// Inf inputs return a quiet NaN, using only the classification masks so
// that no extraneous floating-point exceptions are raised.
vf32_t
test_vec_sinf32 (vf32_t value)
{
  const vf32_t vec_f0 = { 0.0, 0.0, 0.0, 0.0 };
  // Quiet NaN in every element: 0x7fc00000 has the quiet bit (0x00400000)
  // set. An encoding such as 0x7f800001 would be a signaling NaN.
  const vui32_t vec_f32_qnan =
    { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
  vf32_t result;
  vb32_t normmask, infmask;

  normmask = vec_isnormalf32 (value);
  if (vec_any_isnormalf32 (value))
    {
      // replace non-normal input values with safe values.
      vf32_t safeval = vec_sel (vec_f0, value, normmask);
      // body of vec_sin(safeval) computation elided for this example.
      // The elided computation must assign result.
    }
  else
    result = value;

  // merge non-normal input values back into result
  result = vec_sel (value, result, normmask);
  // Inf input value elements return quiet-nan
  infmask = vec_isinff32 (value);
  result = vec_sel (result, (vf32_t) vec_f32_qnan, infmask);

  return result;
}

The code generated for this fragment runs between 24 (-mcpu=power9) and 40 (-mcpu=power8) instructions. The normal execution path is 14 to 25 instructions respectively.

Another example is the cos() function.

If the input value is NaN then return a NaN.
If the input value is +-0.0 then return 1.0.
If the input value is +-Inf then return a NaN.
Otherwise compute and return cos(value).

The following code example uses functions from this header to address the POSIX requirements for special values input to vectorized cosf():

// Example: special-value handling for a vectorized cosf().
// NaN inputs are returned unchanged, +-0.0 inputs return exactly 1.0 and
// Inf inputs return a quiet NaN, using only the classification masks so
// that no extraneous floating-point exceptions are raised.
vf32_t
test_vec_cosf32 (vf32_t value)
{
  vf32_t result;
  const vf32_t vec_f0 = { 0.0, 0.0, 0.0, 0.0 };
  const vf32_t vec_f1 = { 1.0, 1.0, 1.0, 1.0 };
  // Quiet NaN in every element: 0x7fc00000 has the quiet bit (0x00400000)
  // set. An encoding such as 0x7f800001 would be a signaling NaN.
  const vui32_t vec_f32_qnan =
    { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
  vb32_t finitemask, infmask, zeromask;

  finitemask = vec_isfinitef32 (value);
  if (vec_any_isfinitef32 (value))
    {
      // replace non-finite input values with safe values
      vf32_t safeval = vec_sel (vec_f0, value, finitemask);
      // body of vec_cos(safeval) computation elided for this example
      // The elided computation must assign result.
    }
  else
    result = value;

  // merge non-finite input values back into result
  result = vec_sel (value, result, finitemask);
  // Set +-0.0 input elements to exactly 1.0 in result
  zeromask = vec_iszerof32 (value);
  result = vec_sel (result, vec_f1, zeromask);
  // Set Inf input elements to quiet-nan in result
  infmask = vec_isinff32 (value);
  result = vec_sel (result, (vf32_t) vec_f32_qnan, infmask);

  return result;
}

Neither example raises floating point exceptions or sets errno, as appropriate for a vector math library.

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absf32()

static vf32_t vec_absf32 ( vf32_t vf32x )

inlinestatic

Vector float absolute value.

processor	Latency	Throughput
power8	6-7	2/cycle
power9	2	2/cycle

Parameters

vf32x vector float values containing the magnitudes.

Returns: vector absolute values of 4x float elements of vf32x.

◆ vec_all_isfinitef32()

static int vec_all_isfinitef32 ( vf32_t vf32 )

inlinestatic

Return true if all 4x32-bit vector float values are Finite (Not NaN nor Inf).

An IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	4-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: an int containing 0 or 1.

◆ vec_all_isinff32()

static int vec_all_isinff32 ( vf32_t vf32 )

inlinestatic

Return true if all 4x32-bit vector float values are infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and significand of all zeros. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: boolean int, true if all 4 float values are infinity

◆ vec_all_isnanf32()

static int vec_all_isnanf32 ( vf32_t vf32 )

inlinestatic

Return true if all of 4x32-bit vector float values are NaN.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if all of 4 vector float values are NaN.

◆ vec_all_isnormalf32()

static int vec_all_isnormalf32 ( vf32_t vf32 )

inlinestatic

Return true if all of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	1/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if all of 4 vector float values are normal.

◆ vec_all_issubnormalf32()

static int vec_all_issubnormalf32 ( vf32_t vf32 )

inlinestatic

Return true if all of 4x32-bit vector float values are subnormal (denormal).

An IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	10-30	1/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if all of 4 vector float values are subnormal.

◆ vec_all_iszerof32()

static int vec_all_iszerof32 ( vf32_t vf32 )

inlinestatic

Return true if all of 4x32-bit vector float values are +-0.0.

An IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if all of 4 vector float values are +/- zero.

◆ vec_any_isfinitef32()

static int vec_any_isfinitef32 ( vf32_t vf32 )

inlinestatic

Return true if any 4x32-bit vector float values are Finite (Not NaN nor Inf).

An IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	4-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: an int containing 0 or 1.

◆ vec_any_isinff32()

static int vec_any_isinff32 ( vf32_t vf32 )

inlinestatic

Return true if any 4x32-bit vector float values are infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and significand of all zeros.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: boolean int, true if any of 4 float values are infinity

◆ vec_any_isnanf32()

static int vec_any_isnanf32 ( vf32_t vf32 )

inlinestatic

Return true if any of 4x32-bit vector float values are NaN.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if any of 4 vector float values are NaN.

◆ vec_any_isnormalf32()

static int vec_any_isnormalf32 ( vf32_t vf32 )

inlinestatic

Return true if any of 4x32-bit vector float values are normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	10-24	1/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if any of 4 vector float values are normal.

◆ vec_any_issubnormalf32()

static int vec_any_issubnormalf32 ( vf32_t vf32 )

inlinestatic

Return true if any of 4x32-bit vector float values are subnormal (denormal).

An IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	10-18	1/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if any of 4 vector float values are subnormal.

◆ vec_any_iszerof32()

static int vec_any_iszerof32 ( vf32_t vf32 )

inlinestatic

Return true if any of 4x32-bit vector float values are +-0.0.

An IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-20	2/cycle
power9	6	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a boolean int, true if any of 4 vector float values are +/- zero.

◆ vec_copysignf32()

static vf32_t vec_copysignf32	(	vf32_t	vf32x,
		vf32_t	vf32y
	)

inlinestatic

Copy the sign bit from vf32x merged with magnitude from vf32y and return the resulting vector float values.

Note: This operation was patterned after the intrinsic vec_cpsgn (altivec.h) introduced for POWER7 and VSX. It turns out the original (GCC 4.9) compiler implementation reversed the operands and does not match the PowerISA or the Vector Intrinsic Programming Reference manuals. Subsequent compilers and PVECLIB implementations replicated this (operand order) error. This has now been reported as bug against the compilers, which are in the process of applying fixes and distributing updates. This version of PVECLIB is updated to match the Vector Intrinsic Programming Reference. This implementation is independent of the compilers update status.

processor	Latency	Throughput
power8	6-7	2/cycle
power9	2	2/cycle

Parameters

vf32x	vector float values containing the sign bits.
vf32y	vector float values containing the magnitudes.

Returns: vector float values with magnitude from vf32y and the sign of vf32x.

◆ vec_isfinitef32()

static vb32_t vec_isfinitef32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values for each float element that is Finite (Not NaN nor Inf).

An IEEE Binary32 finite value has an exponent between 0x000 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value. The implementation uses the vec_cmpeq conditional to generate the predicate mask for NaN / Inf and then inverts this for the finite condition. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-15	2/cycle
power9	5	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isinff32()

static vb32_t vec_isinff32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values for each float, if infinity.

An IEEE Binary32 infinity has an exponent of 0x7f8 and significand of all zeros.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	4-13	2/cycle
power9	3	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isnanf32()

static vb32_t vec_isnanf32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values, for each float NaN value.

An IEEE Binary32 NaN value has an exponent of 0x7f8 and a nonzero significand. The sign bit is ignored.

processor	Latency	Throughput
power8	4-13	2/cycle
power9	3	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_isnormalf32()

static vb32_t vec_isnormalf32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values, for each float value, if normal (Not NaN, Inf, denormal, or zero).

An IEEE Binary32 normal value has an exponent between 0x008 and 0x7f0 (a 0x7f8 indicates NaN or Inf). The significand can be any value (except 0 if the exponent is zero). The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-15	1/cycle
power9	5	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_issubnormalf32()

static vb32_t vec_issubnormalf32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values, for each float value that is subnormal (denormal).

An IEEE Binary32 subnormal has an exponent of 0x000 and a nonzero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	6-16	1/cycle
power9	3	1/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_iszerof32()

static vb32_t vec_iszerof32 ( vf32_t vf32 )

inlinestatic

Return 4x32-bit vector boolean true values, for each float value that is +-0.0.

An IEEE Binary32 zero has an exponent of 0x000 and a zero significand. The sign bit is ignored.

Note: This function will not raise VXSNAN or VXVC (FE_INVALID) exceptions. A normal float compare can.

processor	Latency	Throughput
power8	4-13	2/cycle
power9	5	2/cycle

Parameters

vf32	a vector of __binary32 values.

Returns: a vector boolean int, each containing all 0s(false) or 1s(true).

◆ vec_setb_sp()

static vb32_t vec_setb_sp ( vf32_t vra )

inlinestatic

Vector Set Bool from Sign, Single Precision.

For each float, propagate the sign bit to all 32-bits of that word. The result is vector bool int reflecting the sign bit of each 32-bit float.

The resulting mask can be used in masking and select operations.

Note: This operation will set the sign mask regardless of data class, while the Vector Test Data Class will not distinguish between +/- NaN.

processor	Latency	Throughput
power8	2-9	2/cycle
power9	2-8	2/cycle

Parameters

vra	Vector float.

Returns: vector bool int reflecting the sign bits of each float value.

◆ vec_vgl4fsso()

static vf32_t vec_vgl4fsso	(	float *	array,
		const long long	offset0,
		const long long	offset1,
		const long long	offset2,
		const long long	offset3
	)

inlinestatic

Vector Gather-Load 4 Words from scalar Offsets.

For each scalar offset[0,1,2,3], load the word from the effective address formed by *(char*)array+offset[0-3]. Merge resulting float single word elements [0,1,2,3] and return the resulting vector.

processor	Latency	Throughput
power8	10	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of float words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.
offset2	Scalar (64-bit) byte offset from &array.
offset3	Scalar (64-bit) byte offset from &array.

Returns: vector word containing word elements [0-3] loaded from *(char*)array+offset[0-3].

◆ vec_vgl4fswo()

static vf32_t vec_vgl4fswo	(	float *	array,
		vi32_t	vra
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Offsets.

For each signed word element [i] of vra, load the float single word element at *(char*)array+vra[i]. Merge those word elements [0-3] and return the resulting vector.

Note: Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.

processor	Latency	Throughput
power8	14	1/cycle
power9	15	1/cycle

Parameters

array	Pointer to array of float words.
vra	Vector of signed word (32-bit) byte offsets from &array.

Returns: vector word containing word elements [0-3], each loaded from *(char*)array+vra[0-3].

◆ vec_vgl4fswsx()

static vf32_t vec_vgl4fswsx	(	float *	array,
		vi32_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Scaled Indexes.

For each signed word element [i] of vra, load the float single word element at array[vra[i] << scale]. Merge those word elements [0-3] and return the resulting vector.

Note: Signed word indexes are expanded (unpacked) to doublewords and shifted left (2+scale) bits before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.

processor	Latency	Throughput
power8	16-25	1/cycle
power9	18-27	1/cycle

Parameters

array	Pointer to array of float words.
vra	Vector of signed word (32-bit) indexes.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector word containing word elements [0-3] each loaded from array[vra[0-3] << scale].

◆ vec_vgl4fswx()

static vf32_t vec_vgl4fswx	(	float *	array,
		vi32_t	vra
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Indexes.

For word element [i] of vra, load the float single word element at array[vra[i]]. Merge those word elements [0-3] and return the resulting vector.

Note: Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.

processor	Latency	Throughput
power8	16-25	1/cycle
power9	18-27	1/cycle

Parameters

array	Pointer to array of float words.
vra	Vector of signed word (32-bit) indexes.

Returns: vector word containing word elements [0-3], each loaded from array[vra[0-3]].

◆ vec_vglfsdo()

static vf64_t vec_vglfsdo	(	float *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Offsets.

For each doubleword element [0-1] of vra, load the float single word element at *(char*)array+vra[i] expanding them to float double format. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	12	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of float singles.
vra	Vector of doubleword (64-bit) byte offsets from &array.

Returns: vector doubleword elements [0,1] loaded from expanded float single words at *(char*)array+vra[i].

◆ vec_vglfsdsx()

static vf64_t vec_vglfsdsx	(	float *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Scaled Indexes.

For each doubleword element [0-1] of vra, load the float single word element at array[vra[i] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of float.
vra	Vector of doubleword indexes from &array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector doubleword elements [0,1] loaded from the float single words at array[vra[0,1]<<scale].

◆ vec_vglfsdx()

static vf64_t vec_vglfsdx	(	float *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Single Floats from Vector Doubleword Indexes.

For each doubleword element [0-1] of vra, load the float single word element at array[vra[i]]. Merge doubleword elements [0,1] and return the resulting vector.

Note: As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of float.
vra	Vector of doubleword indexes from &array.

Returns: vector doubleword elements [0,1] loaded from float single words at array[vra[0,1]].

◆ vec_vglfsso()

static vf64_t vec_vglfsso	(	float *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Gather-Load Float Single from scalar Offsets.

For each scalar offset[0|1], load the float single element at *(char*)array+offset[0|1] expanding them to float double format. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	7	2/cycle
power9	11	2/cycle

Parameters

array	Pointer to array of floats.
offset0	Scalar (64-bit) byte offsets from &array.
offset1	Scalar (64-bit) byte offsets from &array.

Returns: vector double containing elements loaded from *(char*)array+offset0 and *(char*)array+offset1.

◆ vec_vlxsspx()

static vf64_t vec_vlxsspx	(	const signed long long	ra,
		const float *	rb
	)

inlinestatic

Vector Load Scalar Single Float Indexed.

Load doubleword[0] of vector xt as a scalar (double float formatted) single float word from the effective address formed by rb+ra. The operand rb is a pointer to an array of float words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vf64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note: The leftmost doubleword is the single float value, expanded and formatted as a double float. The rightmost doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsspx instruction can load directly into any of the 64 VSRs, while expanding the single float word value into float double format, in a single operation. Both simplify merging elements for gather operations.

Note: The lxsspx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lfs[x] and xxpermdi to move the result from VSR/FPR range to VSR/VR range if needed.

processor	Latency	Throughput
power8	5	2/cycle
power9	8	2/cycle

Parameters

ra	const doubleword index (offset/displacement).
rb	const pointer to an array of floats.

Returns: The word stored at (ra + rb) is expanded from single to double float format and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vsst4fsso()

static void vec_vsst4fsso	(	vf32_t	xs,
		float *	array,
		const long long	offset0,
		const long long	offset1,
		const long long	offset2,
		const long long	offset3
	)

inlinestatic

Vector Scatter-Store 4 Float Singles to Scalar Offsets.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array+offset[i].

processor	Latency	Throughput
power8	6	1/cycle
power9	4	2/cycle

Parameters

xs	Vector float elements to scatter store.
array	Pointer to array of float words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.
offset2	Scalar (64-bit) byte offset from &array.
offset3	Scalar (64-bit) byte offset from &array.

◆ vec_vsst4fswo()

static void vec_vsst4fswo	(	vf32_t	xs,
		float *	array,
		vi32_t	vra
	)

inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Offsets.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array+vra[i].

Note: Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.

processor	Latency	Throughput
power8	10	1/cycle
power9	12	2/cycle

Parameters

xs	Vector float elements to scatter store.
array	Pointer to array of float words.
vra	Vector of signed word (32-bit) byte offsets from &array.

◆ vec_vsst4fswsx()

static void vec_vsst4fswsx	(	vf32_t	xs,
		float *	array,
		vi32_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Indexes.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array[vra[i]<<scale].

Note: Signed word indexes are expanded (unpacked) to doublewords and then shifted left (2+scale) bits before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.

processor	Latency	Throughput
power8	12-21	1/cycle
power9	15-24	2/cycle

Parameters

xs	Vector float elements to scatter store.
array	Pointer to array of float words.
vra	Vector of signed word (32-bit) indexes from array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsst4fswx()

static void vec_vsst4fswx	(	vf32_t	xs,
		float *	array,
		vi32_t	vra
	)

inlinestatic

Vector Scatter-Store 4 Float Singles to Vector Word Indexes.

For each float word element [0-3] of xs, store the float element xs[i] at *(char*)array[vra[i]].

Note: Signed word indexes are expanded (unpacked) to doublewords and then shifted left 2 bits before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.

processor	Latency	Throughput
power8	12-21	1/cycle
power9	15-24	2/cycle

Parameters

xs	Vector float elements to scatter store.
array	Pointer to array of float words.
vra	Vector of signed word (32-bit) indexes from array.

◆ vec_vsstfsdo()

static void vec_vsstfsdo	(	vf64_t	xs,
		float *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Float Singles to Vector Doubleword Offsets.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at *(char*)array+vra[i].

processor	Latency	Throughput
power8	8	1/cycle
power9	9	2/cycle

Parameters

xs	Vector doubleword elements to scatter store as float single words.
array	Pointer to array of float words.
vra	Vector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstfsdsx()

static void vec_vsstfsdsx	(	vf64_t	xs,
		float *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Scatter-Store Float Singles to Vector Doubleword Scaled Indexes.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at array[vra[i]<<scale].

processor	Latency	Throughput
power8	10-19	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector doubleword elements to scatter store as float single words.
array	Pointer to array of float words.
vra	Vector of doubleword (64-bit) indexes from &array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstfsdx()

static void vec_vsstfsdx	(	vf64_t	xs,
		float *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Float Singles to Vector Doubleword Indexes.

For each doubleword element [0-1] of vra, store the doubleword float element xs[i], converted to float single word format, at array[vra[i]].

processor	Latency	Throughput
power8	10-19	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector doubleword elements to scatter store as float single words.
array	Pointer to array of float words.
vra	Vector of doubleword (64-bit) indexes from &array.

◆ vec_vsstfsso()

static void vec_vsstfsso	(	vf64_t	xs,
		float *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Scatter-Store Float Singles to Scalar Offsets.

For each scalar offset[0|1], store the doubleword element xs[0|1], converted to float single word format, at *(char*)array+offset[0|1].

processor	Latency	Throughput
power8	3	1/cycle
power9	3	2/cycle

Parameters

xs	Vector doubleword elements to scatter store as float single words.
array	Pointer to array of float words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.

◆ vec_vstxsspx()

static void vec_vstxsspx	(	vf64_t	xs,
		const signed long long	ra,
		float *	rb
	)

inlinestatic

Vector Store Scalar Single Float Indexed.

Stores doubleword float element 0 of vector xs as a scalar float word at the effective address formed by rb+ra. The operand rb is a pointer to an array of float. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be word aligned (integer multiple of 4).

This operation is an alternate form of vector store element (vec_ste), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the stxsspx instruction can store directly from any of the 64 VSRs. Both simplify scatter operations.

Note: The stxsspx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will move the source (xs) from VSR/VR range to VSR/FPR range if needed, then use stfs[x].

processor	Latency	Throughput
power8	0 - 2	2/cycle
power9	0 - 2	4/cycle

Parameters

xs	vector doubleword element 0 to be stored as single float.
ra	const doubleword index (offset/displacement).
rb	pointer to an array of floats (the store destination).

◆ vec_xviexpsp()

static vf32_t vec_xviexpsp	(	vui32_t	sig,
		vui32_t	exp
	)

inlinestatic

Vector Insert Exponent Single-Precision.

For each word of sig and exp, merge the sign (bit 0) and significand (bits 9:31) from sig with the 8-bit exponent from exp (bits 24:31). The exponent is merged into bits 1:8 of the final result. The result is returned as a Vector Single-Precision floating point value.

Note: This operation is equivalent to the POWER9 xviexpsp instruction and the built-in vec_insert_exp. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.

processor	Latency	Throughput
power8	6-15	2/cycle
power9	2	4/cycle

Parameters

sig	Vector unsigned int containing the Sign Bit and 23-bit significand.
exp	Vector unsigned int containing the 8-bit exponent.

Returns: a vf32_t value where the exponent bits (1:8) of sig are replaced from bits 24:31 of exp.

◆ vec_xvxexpsp()

static vui32_t vec_xvxexpsp ( vf32_t vrb )

inlinestatic

Vector Extract Exponent Single-Precision.

For each word of vrb, extract the single-precision exponent (bits 1:8) and right justify it to bits 24:31 of the result vector word. The result is returned as a vector unsigned integer value.

Note: This operation is equivalent to the POWER9 xvxexpsp instruction and the built-in vec_extract_exp. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.

processor	Latency	Throughput
power8	6-15	2/cycle
power9	2	4/cycle

Parameters

vrb	vector float value.

Returns: vector unsigned int containing the 8-bit exponent right justified in each word

◆ vec_xvxsigsp()

static vui32_t vec_xvxsigsp ( vf32_t vrb )

inlinestatic

Vector Extract Significand Single-Precision.

For each word of vrb, extract the single-precision significand (bits 9:31) and restore the implied (hidden) bit (bit 8) if the single-precision value is normal (not zero, subnormal, Infinity or NaN). The result is returned as a vector unsigned int value with up to 24 bits of significance.

Note: This operation is equivalent to the POWER9 xvxsigsp instruction and the built-in vec_extract_sig. These require a POWER9-enabled compiler targeting -mcpu=power9 and are not available for older compilers nor POWER8 and earlier. This function provides this operation for all VSX-enabled platforms.

processor	Latency	Throughput
power8	8-17	1/cycle
power9	3	2/cycle

Parameters

vrb	vector float value.

Returns: vector unsigned int containing the significand.

Typedefs

Functions

Detailed Description

Examples

Performance data.

Function Documentation

◆ vec_absf32()

◆ vec_all_isfinitef32()

◆ vec_all_isinff32()

◆ vec_all_isnanf32()

◆ vec_all_isnormalf32()

◆ vec_all_issubnormalf32()

◆ vec_all_iszerof32()

◆ vec_any_isfinitef32()

◆ vec_any_isinff32()

◆ vec_any_isnanf32()

◆ vec_any_isnormalf32()

◆ vec_any_issubnormalf32()

◆ vec_any_iszerof32()

◆ vec_copysignf32()

◆ vec_isfinitef32()

◆ vec_isinff32()

◆ vec_isnanf32()

◆ vec_isnormalf32()

◆ vec_issubnormalf32()

◆ vec_iszerof32()

◆ vec_setb_sp()

◆ vec_vgl4fsso()

◆ vec_vgl4fswo()

◆ vec_vgl4fswsx()

◆ vec_vgl4fswx()

◆ vec_vglfsdo()

◆ vec_vglfsdsx()

◆ vec_vglfsdx()

◆ vec_vglfsso()

◆ vec_vlxsspx()

◆ vec_vsst4fsso()

◆ vec_vsst4fswo()

◆ vec_vsst4fswsx()

◆ vec_vsst4fswx()

◆ vec_vsstfsdo()

◆ vec_vsstfsdsx()

◆ vec_vsstfsdx()

◆ vec_vsstfsso()

◆ vec_vstxsspx()

◆ vec_xviexpsp()

◆ vec_xvxexpsp()

◆ vec_xvxsigsp()