Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements. More...

#include <pveclib/vec_int16_ppc.h>

Functions
static vui32_t	vec_absduw (vui32_t vra, vui32_t vrb)
	Vector Absolute Difference Unsigned Word. More...

static vui32_t	vec_clzw (vui32_t vra)
	Vector Count Leading Zeros word. More...

static vui32_t	vec_ctzw (vui32_t vra)
	Vector Count Trailing Zeros word. More...

static vui32_t	vec_mrgahw (vui64_t vra, vui64_t vrb)
	Vector Merge Algebraic High Words. More...

static vui32_t	vec_mrgalw (vui64_t vra, vui64_t vrb)
	Vector merge Algebraic low words. More...

static vui32_t	vec_mrgew (vui32_t vra, vui32_t vrb)
	Vector Merge Even Words. More...

static vui32_t	vec_mrgow (vui32_t vra, vui32_t vrb)
	Vector Merge Odd Words. More...

static vi64_t	vec_mulesw (vi32_t a, vi32_t b)
	Vector multiply even signed words. More...

static vi64_t	vec_mulosw (vi32_t a, vi32_t b)
	Vector multiply odd signed words. More...

static vui64_t	vec_muleuw (vui32_t a, vui32_t b)
	Vector multiply even unsigned words. More...

static vui64_t	vec_mulouw (vui32_t a, vui32_t b)
	Vector multiply odd unsigned words. More...

static vi32_t	vec_mulhsw (vi32_t vra, vi32_t vrb)
	Vector Multiply High Signed Word. More...

static vui32_t	vec_mulhuw (vui32_t vra, vui32_t vrb)
	Vector Multiply High Unsigned Word. More...

static vui32_t	vec_muluwm (vui32_t a, vui32_t b)
	Vector Multiply Unsigned Word Modulo. More...

static vui32_t	vec_popcntw (vui32_t vra)
	Vector Population Count word. More...

static vui32_t	vec_revbw (vui32_t vra)
	byte reverse each word of a vector unsigned int. More...

static vb32_t	vec_setb_sw (vi32_t vra)
	Vector Set Bool from Signed Word. More...

static vui32_t	vec_slwi (vui32_t vra, const unsigned int shb)
	Vector Shift left Word Immediate. More...

static vi32_t	vec_srawi (vi32_t vra, const unsigned int shb)
	Vector Shift Right Algebraic Word Immediate. More...

static vui32_t	vec_srwi (vui32_t vra, const unsigned int shb)
	Vector Shift Right Word Immediate. More...

static vui32_t	vec_vgl4wso (unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
	Vector Gather-Load 4 Words from scalar Offsets. More...

static vui32_t	vec_vgl4wwo (unsigned int *array, vi32_t vra)
	Vector Gather-Load 4 Words from Vector Word Offsets. More...

static vui32_t	vec_vgl4wwsx (unsigned int *array, vi32_t vra, const unsigned char scale)
	Vector Gather-Load 4 Words from Vector Word Scaled Indexes. More...

static vui32_t	vec_vgl4wwx (unsigned int *array, vi32_t vra)
	Vector Gather-Load 4 Words from Vector Word Indexes. More...

static vi64_t	vec_vglswso (signed int *array, const long long offset0, const long long offset1)
	Vector Gather-Load Signed Word from Scalar Offsets. More...

static vi64_t	vec_vglswdo (signed int *array, vi64_t vra)
	Vector Gather-Load Signed Words from Vector Doubleword Offsets. More...

static vi64_t	vec_vglswdsx (signed int *array, vi64_t vra, const unsigned char scale)
	Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes. More...

static vi64_t	vec_vglswdx (signed int *array, vi64_t vra)
	Vector Gather-Load Signed Words from Vector Doubleword Indexes. More...

static vui64_t	vec_vgluwso (unsigned int *array, const long long offset0, const long long offset1)
	Vector Gather-Load Unsigned Word from Scalar Offsets. More...

static vui64_t	vec_vgluwdo (unsigned int *array, vi64_t vra)
	Vector Gather-Load Unsigned Words from Vector Doubleword Offsets. More...

static vui64_t	vec_vgluwdsx (unsigned int *array, vi64_t vra, const unsigned char scale)
	Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes. More...

static vui64_t	vec_vgluwdx (unsigned int *array, vi64_t vra)
	Vector Gather-Load Unsigned Words from Vector Doubleword Indexes. More...

static vi64_t	vec_vlxsiwax (const signed long long ra, const signed int *rb)
	Vector Load Scalar Integer Word Algebraic Indexed. More...

static vui64_t	vec_vlxsiwzx (const signed long long ra, const unsigned int *rb)
	Vector Load Scalar Integer Word and Zero Indexed. More...

static vui64_t	vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
	Vector Multiply-Add2 Even Unsigned Words. More...

static vui64_t	vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
	Vector Multiply-Add2 Odd Unsigned Words. More...

static vui64_t	vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
	Vector Multiply-Add Even Unsigned Words. More...

static vui64_t	vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c)
	Vector Multiply-Add Odd Unsigned Words. More...

static vui64_t	vec_vmsumuwm (vui32_t a, vui32_t b, vui64_t c)
	Vector Multiply-Sum Unsigned Word Modulo. More...

static vui64_t	vec_vmuleuw (vui32_t vra, vui32_t vrb)
	Vector Multiply Even Unsigned words. More...

static vui64_t	vec_vmulouw (vui32_t vra, vui32_t vrb)
	Vector Multiply Odd Unsigned Words. More...

static void	vec_vsst4wso (vui32_t xs, unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
	Vector Scatter-Store 4 words to Scalar Offsets. More...

static void	vec_vsst4wwo (vui32_t xs, unsigned int *array, vi32_t vra)
	Vector Scatter-Store 4 words to Vector Word Offsets. More...

static void	vec_vsst4wwsx (vui32_t xs, unsigned int *array, vi32_t vra, const unsigned char scale)
	Vector Scatter-Store 4 words to Vector Word Scaled Indexes. More...

static void	vec_vsst4wwx (vui32_t xs, unsigned int *array, vi32_t vra)
	Vector Scatter-Store 4 words to Vector Word Indexes. More...

static void	vec_vsstwdo (vui64_t xs, unsigned int *array, vi64_t vra)
	Vector Scatter-Store Words to Vector Doubleword Offsets. More...

static void	vec_vsstwdsx (vui64_t xs, unsigned int *array, vi64_t vra, const unsigned char scale)
	Vector Scatter-Store Words to Vector Doubleword Scaled Indexes. More...

static void	vec_vsstwdx (vui64_t xs, unsigned int *array, vi64_t vra)
	Vector Scatter-Store Words to Vector Doubleword Indexes. More...

static void	vec_vsstwso (vui64_t xs, unsigned int *array, const long long offset0, const long long offset1)
	Vector Scatter-Store Words to Scalar Offsets. More...

static void	vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb)
	Vector Store Scalar Integer Word Indexed. More...

static vi32_t	vec_vsum2sw (vi32_t vra, vi32_t vrb)
	Vector Sum-across Half Signed Word Saturate. More...

static vi32_t	vec_vsumsw (vi32_t vra, vi32_t vrb)
	Vector Sum-across Signed Word Saturate. More...

static vi64_t	vec_vupkhsw (vi32_t vra)
	Vector Unpack High Signed Word. More...

static vui64_t	vec_vupkhuw (vui32_t vra)
	Vector Unpack High Unsigned Word. More...

static vi64_t	vec_vupklsw (vi32_t vra)
	Vector Unpack Low Signed Word. More...

static vui64_t	vec_vupkluw (vui32_t vra)
	Vector Unpack Low Unsigned Word. More...

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

Most vector int (32-bit integer word) operations are implemented with PowerISA VMX instructions either defined by the original VMX (AKA Altivec) or added to later versions of the PowerISA. Vector word-wise merge, shift, and splat operations were added with VSX in PowerISA 2.06B (POWER7). PowerISA 2.07B (POWER8) added several useful word wise operations (multiply, merge even/odd, count leading zeros, population count) not included in the original VMX. PowerISA 3.0B (POWER9) adds several more (compare not equal, count trailing zeros, extend sign, extract/insert, and parity). Most of these intrinsic (compiler built-ins) operations are defined in <altivec.h> and described in the compiler documentation.

Note: The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example, if you compile with -mcpu=power7, vec_vclz and vec_vclzw will not be defined. As another example, if you compile with -mcpu=power8, vec_revb will not be defined. This header provides the appropriate substitutions, will generate the minimum code appropriate for the target, and produce correct results. Most ppc64le compilers will default to -mcpu=power8 if not specified.

The newly introduced vector operations imply some useful composite operations. For example, we can make the vector multiply even/odd/modulo word operations available for older compilers. And provide implementations for older (POWER7 and earlier) processors using the original VMX operations.

This header covers operations that are either:

Implemented in hardware instructions for later processors and useful to programmers, on slightly older processors, even if the equivalent function requires more instructions. Examples include the multiply even/odd/modulo word operations.
Defined in the OpenPOWER ABI but not yet defined in <altivec.h> provided by available compilers in common use. Examples include Count Leading Zeros, Population Count and Byte Reverse.
Commonly used operations, not covered by the ABI or <altivec.h>, and require multiple instructions or are not obvious. Examples include the shift immediate, merge algebraic high/low, and multiply high operations.

Recent Additions

Added vec_vmaddeuw(), vec_vmaddouw(), vec_vmadd2euw(), and vec_vmadd2ouw() as an optimization for the vector multiply quadword implementations on POWER8.

Endian problems with word operations

It would be useful to provide a vector multiply high word (return the high order 32-bits of the 64-bit product) operation. This can be used for multiplicative inverse (effectively integer divide) operations. Neither integer multiply high nor divide are available as vector instructions. However the multiply high word operation can be composed from the existing multiply even/odd word operations followed by the vector merge even word instruction.

As a prerequisite we need to provide the merge even/odd word operations for older compilers and an implementation for older (POWER7) processors. Fortunately vector merge operations are just a special case of vector permute. So the POWER7 (and earlier) implementation can use vec_perm and appropriate selection vectors to provide these merge operations.

But this is complicated by little-endian (LE) support as specified in the OpenPOWER ABI and as implemented in the compilers. Little-endian changes the effective vector element numbering and the location of even and odd elements. This means that the vector built-ins provided by altivec.h may not generate the instructions you would expect.

See also: General Endian Issues

The OpenPOWER ABI provides a helpful table of Endian Sensitive Operations. For vec_mergee (vmrgew) it specifies:

Swap inputs and use vmrgow, for LE.

Also for vec_mule (vmuleuw, vmulesw):

Replace with vmulouw and so on, for LE.

Also for vec_perm (vperm) it specifies:

For LE, Swap input arguments and complement the selection vector.

The above is just a sampling of a larger list of Endian Sensitive Operations.

So the obvious coding for Vector Multiply High Word:

vui32_t
test_mulhw (vui32_t vra, vui32_t vrb)
{
  return vec_mergee ((vui32_t)vec_mule (vra, vrb),
                     (vui32_t)vec_mulo (vra, vrb));
}

Would produce the expected code and correct results when compiled for BE:

<test_mulhw>:
        vmuleuw v0,v2,v3
        vmulouw v2,v2,v3
        vmrgew  v2,v0,v2
        blr

But it produces the following, incorrect, code for LE:

<test_mulhw>:
        vmulouw v0,v2,v3
        vmuleuw v2,v2,v3
        vmrgow  v2,v2,v0
        blr

The compiler swapped the multiplies even for odd and odd for even. That is somewhat mitigated by swapping the input arguments in the merge. But changing the merge from even to odd actually returns the low order 32-bits of the product. This is not the correct result for multiply high.

This header provides implementations of vector merge even/odd word (vec_mrgew() and vec_mrgow()) that support older compilers and older (POWER7) processors. Similarly for the multiply even/odd unsigned/signed word instructions (vec_mulesw(), vec_mulosw(), vec_muleuw() and vec_mulouw()). These implementations include the mandated LE transforms.

Vector Merge Algebraic High Word example

This header also provides the higher level operations Vector Merge Algebraic High/low Word (vec_mrgahw() and vec_mrgalw()). These implementations generate the correct merge even/odd word instruction for the operation independent of endian.

Note: The parameters are vector unsigned long (vui64_t) to match results from vec_muleuw() and vec_mulouw().

static inline vui32_t
vec_mrgahw (vui64_t vra, vui64_t vrb)
{
  vui32_t res;
#ifdef _ARCH_PWR8
#ifdef vec_vmrgew // Use altivec.h builtins
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // really want vmrgew here! So do the opposite.
  res = vec_vmrgow ((vui32_t)vrb, (vui32_t)vra);
#else
  res = vec_vmrgew ((vui32_t)vra, (vui32_t)vrb);
#endif
#else // Generate vmrgew directly in assembler
  __asm__(
      "vmrgew %0,%1,%2;\n"
      : "=v" (res)
      : "v" (vra),
      "v" (vrb)
      : );
#endif
#else // POWER7 and earlier, Assume BE only
  const vui32_t vconstp =
      CONST_VINT32_W(0x00010203,  0x10111213, 0x08090a0b,  0x18191a1b);
  res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
#endif
  return (res);
}

The implementation is a bit complicated so that it can nullify the unwanted LE transformation of vec_vmrgew(), in addition to handling older compilers and processors.

Vector Multiply High Unsigned Word example

Now we can implement Vector Multiply High Unsigned Word (vec_mulhuw()):

static inline vui32_t
vec_mulhuw (vui32_t vra, vui32_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgahw (vec_mulouw (vra, vrb), vec_muleuw (vra, vrb));
#else
  return vec_mrgahw (vec_muleuw (vra, vrb), vec_mulouw (vra, vrb));
#endif
}

Again the implementation is more complicated than expected as we still have to nullify the LE transformation associated with multiply even/odd.

The good news is all this complexity is contained within pveclib and the generated code is still just 3 instructions.

vmulouw v0,v2,v3
vmuleuw v2,v2,v3
vmrgew  v2,v2,v0

Vector Word Examples

Suppose we have a requirement to convert an array of 32-bit time-interval values to timespec format. For simplicity we will also assume that the array is nicely (Quadword) aligned and an integer multiple of 4 words.

The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume that the lower 32-bits of the TimeBase is sufficient to compute intervals (~8.38 seconds). TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.

The timespec format is a struct of unsigned int fields for seconds and microseconds. So the task is to convert the 512MHz TimeBase intervals to microseconds and then split the integer seconds and microseconds for the timespec.

First the TimeBase to microseconds conversion is simply (1000000 / 512000000) which reduces to (1 / 512) or divide by 512. The vector unit does not provide integer divide but luckily, 512 is a power of 2 and we can shift right. If we don't care for the niceties of rounding we can simply shift right 9 bits:

tb_usec = vec_srwi (*tb++, 9);

But if we decide that rounding is important we can leverage the Vector Average Unsigned Word (vavguw) instruction. Here we need to add 256 (512 / 2 = 256) to the timeBase interval before we shift right.

But we need to reverse engineer the vavguw operation to get the results we want. For each word, vavguw computes the sum of A and B plus 1, then shifts the 33-bit sum right 1 bit. We can effectively round by passing the rounding factor as the B operand to the vec_avg() built-in. But we get a +1 and 1 bit right shift for free. So in this case the rounding constant is 256-1 = 255. And we only need to shift an additional 8 bits to complete the conversion:

const vui32_t rnd_512 =
  { (256-1), (256-1), (256-1), (256-1) };
// Convert 512MHz timebase to microseconds with rounding.
tmp = vec_avg (*tb++, rnd_512);
tb_usec = vec_srwi (tmp, 8);

Note: vec_avg() is an existing altivec.h generic built-in.

Next we need to separate TimeBase microseconds into the integer seconds and microseconds. Normally scalar code would use integer divide/modulo by 1000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?

Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 32-bit fraction and we have a multiply high (vec_mulhuw()) operation. Multiplying a 32-bit unsigned integer by a 32-bit unsigned fraction generates a 64-bit product with 32-bits above (integer) and below (fraction) the radix point. The high 32-bits of the product is the integer quotient.

It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full range of dividends may require pre-scaling, post-shifting, and sometimes a corrective addition. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.

See also: "Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

In the chapter above;

Figure 10-2 Computing the magic number for unsigned division.

provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, an "add" indicator, and the shift amount). For the divisor 1000000 this is { 1125899907, 0, 18 }:

the multiplier is 1125899907.
no corrective add of the dividend is required.
the final shift is 18-bits right.

const vui32_t mul_invs_1m =
  { 1125899907, 1125899907, 1125899907, 1125899907 };
const int shift_1m = 18;
 
tmp = vec_mulhuw (tb_usec, mul_invs_1m);
seconds = vec_srwi (tmp, shift_1m);

Now we need to compute the remainder to get microseconds.

const vui32_t usec_sec =
  { 1000000, 1000000, 1000000, 1000000 };
 
tmp = vec_muluwm (seconds, usec_sec);
useconds = vec_sub (tb_usec, tmp);

Finally we need to merge the vectors of seconds and useconds into vectors of timespec.

timespec1 = vec_mergeh (seconds, useconds);

timespec2 = vec_mergel (seconds, useconds);

Note: vec_sub(), vec_mergeh(), and vec_mergel() are existing altivec.h generic built-ins.

Vectorized TimeBase conversion example

Here is the complete vectorized TimeBase to timespec conversion example:

void
example_convert_timebase (vui32_t *tb, vui32_t *timespec, int n)
{
  const vui32_t rnd_512 =
    { (256-1), (256-1), (256-1), (256-1) };
  // Magic numbers for multiplicative inverse to divide by 1,000,000
  // are 1125899907 and shift right 18 bits.
  const vui32_t mul_invs_1m =
    { 1125899907, 1125899907, 1125899907, 1125899907 };
  const int shift_1m = 18;
  // Need const for microseconds/second to extract remainder.
  const vui32_t usec_sec =
    { 1000000, 1000000, 1000000, 1000000 };
  vui32_t tmp, tb_usec, seconds, useconds;
  vui32_t timespec1, timespec2;
  int i;
 
  for (i = 0; i < n; i++)
    {
      // Convert 512MHz timebase to microseconds with rounding.
      tmp = vec_avg (*tb++, rnd_512);
      tb_usec = vec_srwi (tmp, 8);
      // extract integer seconds from tb_usec.
      tmp = vec_mulhuw (tb_usec, mul_invs_1m);
      seconds = vec_srwi (tmp, shift_1m);
      // Extract remainder microseconds.
      tmp = vec_muluwm (seconds, usec_sec);
      useconds = vec_sub (tb_usec, tmp);
      // Use merge high/low to interleave seconds and useconds in timespec.
      timespec1 = vec_mergeh (seconds, useconds);
      timespec2 = vec_mergel (seconds, useconds);
      // Store timespec.
      *timespec++ = timespec1;
      *timespec++ = timespec2;
    }
}

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absduw()

static vui32_t vec_absduw	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Absolute Difference Unsigned Word.

Compute the absolute difference for each word. For each unsigned word, subtract VRB[i] from VRA[i] and return the absolute value of the difference.

processor	Latency	Throughput
power8	4	1/cycle
power9	3	2/cycle

Parameters

vra	vector of 4 x unsigned words
vrb	vector of 4 x unsigned words

Returns: vector of the absolute differences.

◆ vec_clzw()

static vui32_t vec_clzw ( vui32_t vra )

inlinestatic

Vector Count Leading Zeros word.

Count the number of leading '0' bits (0-32) within each word element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Word instruction vclzw. Otherwise use sequence of pre 2.07 VMX instructions. SIMDized count leading zeros inspired by:

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Figure 5-12.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as 4 x 32-bit integer (words) elements.

Returns: 128-bit vector with the Leading Zeros count for each word element.

◆ vec_ctzw()

static vui32_t vec_ctzw ( vui32_t vra )

inlinestatic

Vector Count Trailing Zeros word.

Count the number of trailing '0' bits (0-32) within each word element of a 128-bit vector.

For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Word instruction vctzw. Otherwise use a sequence of pre ISA 3.0 VMX instructions. SIMDized count Trailing zeros inspired by:

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.

processor	Latency	Throughput
power8	6-8	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as 4 x 32-bit integer (words) elements.

Returns: 128-bit vector with the Trailing Zeros count for each word element.

◆ vec_mrgahw()

static vui32_t vec_mrgahw	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Merge Algebraic High Words.

Merge only the high words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Even Word operation that is not modified for endian.

For example merge the high 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply high unsigned word.

Note: This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned long.
vrb	128-bit vector unsigned long.

Returns: A vector merge from only the high words of the 4 x Algebraic doublewords across vra and vrb.

◆ vec_mrgalw()

static vui32_t vec_mrgalw	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector merge Algebraic low words.

Merge only the low words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Odd Word operation that is not modified for endian.

For example merge the low 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply low unsigned word (multiply unsigned word modulo).

Note: This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned long.
vrb	128-bit vector unsigned long.

Returns: A vector merge from only the low words of the 4 x Algebraic doublewords across vra and vrb.

◆ vec_mrgew()

static vui32_t vec_mrgew	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Merge Even Words.

Merge the even word elements from the concatenation of 2 x vectors (vra and vrb).

res[0] = vra[0];
res[1] = vrb[0];
res[2] = vra[2];
res[3] = vrb[2];

The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.

Returns: A vector merge from only the even words of vra and vrb.

◆ vec_mrgow()

static vui32_t vec_mrgow	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Merge Odd Words.

Merge the odd word elements from the concatenation of 2 x vectors (vra and vrb).

res[0] = vra[1];
res[1] = vrb[1];
res[2] = vra[3];
res[3] = vrb[3];

The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.

Returns: A vector merge from only the odd words of vra and vrb.

◆ vec_mulesw()

static vi64_t vec_mulesw	(	vi32_t	a,
		vi32_t	b
	)

inlinestatic

Vector multiply even signed words.

Multiply the even words of two vector signed int values and return the signed long product of the even words.

For POWER8 and later we can use the vmulesw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_muleuw().

Here we start with an unsigned vec_muleuw product, then correct the high 32-bits of the product to signed. Based on: Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

a	128-bit vector signed int.
b	128-bit vector signed int.

Returns: vector signed long product of the even words of a and b.

◆ vec_muleuw()

static vui64_t vec_muleuw	(	vui32_t	a,
		vui32_t	b
	)

inlinestatic

Vector multiply even unsigned words.

Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.

For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.

Returns: vector unsigned long product of the even words of a and b.

◆ vec_mulhsw()

static vi32_t vec_mulhsw	(	vi32_t	vra,
		vi32_t	vrb
	)

inlinestatic

Vector Multiply High Signed Word.

Multiply the corresponding word elements of two vector signed int values and return the high order 32-bits, for each 64-bit product element.

processor	Latency	Throughput
power8	9	1/cycle
power9	9	1/cycle

Parameters

vra	128-bit vector signed int.
vrb	128-bit vector signed int.

Returns: vector of the high order 32-bits of the product of the word elements from vra and vrb.

◆ vec_mulhuw()

static vui32_t vec_mulhuw	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Multiply High Unsigned Word.

Multiply the corresponding word elements of two vector unsigned int values and return the high order 32-bits, from each 64-bit product.

processor	Latency	Throughput
power8	9	1/cycle
power9	9	1/cycle

Note: This operation can be used to effectively perform a divide by multiplying by the scaled multiplicative inverse (reciprocal).

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.

Returns: vector of the high order 32-bits of the unsigned product of the word elements from vra and vrb.

◆ vec_mulosw()

static vi64_t vec_mulosw	(	vi32_t	a,
		vi32_t	b
	)

inlinestatic

Vector multiply odd signed words.

Multiply the odd words of two vector signed int values and return the signed long product of the odd words.

For POWER8 and later we can use the vmulosw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_mulouw().

Here we start with an unsigned vec_mulouw product, then correct the high-order 32-bits of the product to signed. Based on: Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

a	128-bit vector signed int.
b	128-bit vector signed int.

Returns: vector signed long product of the odd words of a and b.

◆ vec_mulouw()

static vui64_t vec_mulouw	(	vui32_t	a,
		vui32_t	b
	)

inlinestatic

Vector multiply odd unsigned words.

Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.

For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.

Returns: vector unsigned long product of the odd words of a and b.

◆ vec_muluwm()

static vui32_t vec_muluwm	(	vui32_t	a,
		vui32_t	b
	)

inlinestatic

Vector Multiply Unsigned Word Modulo.

Multiply the corresponding word elements of two vector unsigned int values and return the low order 32-bits of the 64-bit product for each element.

Note: vec_muluwm can be used for unsigned or signed integers. It is the vector equivalent of Multiply Low Word.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.

Returns: vector of the low order 32-bits of the unsigned product of the word elements from a and b.

◆ vec_popcntw()

static vui32_t vec_popcntw ( vui32_t vra )

inlinestatic

Vector Population Count word.

Count the number of '1' bits (0-32) within each word element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Population Count Word instruction. Otherwise use the pveclib vec_popcntb to count each byte then sum across with Vector Sum across Quarter Unsigned Byte Saturate.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as 4 x 32-bit integer (words) elements.

Returns: 128-bit vector with the population count for each word element.

◆ vec_revbw()

static vui32_t vec_revbw ( vui32_t vra )

inlinestatic

byte reverse each word of a vector unsigned int.

For each word of the input vector, reverse the order of bytes / octets within the word.

processor	Latency	Throughput
power8	2-11	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector unsigned int.

Returns: a 128-bit vector with the bytes of each word reversed.

◆ vec_setb_sw()

static vb32_t vec_setb_sw ( vi32_t vra )

inlinestatic

Vector Set Bool from Signed Word.

For each word, propagate the sign bit to all 32-bits of that word. The result is vector bool int reflecting the sign bit of each 32-bit word.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	Vector signed int.

Returns: vector bool int reflecting the sign bits of each word.

◆ vec_slwi()

static vui32_t vec_slwi	(	vui32_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift left Word Immediate.

Shift left each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.

processor	Latency	Throughput
power8	4-11	2/cycle
power9	5-11	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned int.
shb	shift amount in the range 0-31.

Returns: 128-bit vector unsigned int, shifted left shb bits.

◆ vec_srawi()

static vi32_t vec_srawi	(	vi32_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Algebraic Word Immediate.

Shift Right Algebraic each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return the sign bit propagated to each bit of each element.

processor	Latency	Throughput
power8	4-11	2/cycle
power9	5-11	2/cycle

Parameters

vra	a 128-bit vector treated as a vector signed int.
shb	shift amount in the range 0-31.

Returns: 128-bit vector signed int, shifted right shb bits.

◆ vec_srwi()

static vui32_t vec_srwi	(	vui32_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Word Immediate.

Shift right each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.

processor	Latency	Throughput
power8	4-11	2/cycle
power9	5-11	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned int.
shb	shift amount in the range 0-31.

Returns: 128-bit vector unsigned int, shifted right shb bits.

◆ vec_vgl4wso()

static vui32_t vec_vgl4wso	(	unsigned int *	array,
		const long long	offset0,
		const long long	offset1,
		const long long	offset2,
		const long long	offset3
	)

inlinestatic

Vector Gather-Load 4 Words from scalar Offsets.

For each scalar offset[0,1,2,3], load the word from the effective address formed by *(char*)array+offset[0-3]. Merge resulting word elements [0,1,2,3] and return the resulting vector.

processor	Latency	Throughput
power8	10	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of integer words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.
offset2	Scalar (64-bit) byte offset from &array.
offset3	Scalar (64-bit) byte offset from &array.

Returns: vector word containing word elements [0-3] loaded from *(char*)array+offset[0-3].

◆ vec_vgl4wwo()

static vui32_t vec_vgl4wwo	(	unsigned int *	array,
		vi32_t	vra
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Offsets.

For each signed word element [i] of vra, load the word element at *(char*)array+vra[i]. Merge those word elements and return the resulting vector.

processor	Latency	Throughput
power8	14	1/cycle
power9	15	1/cycle

Parameters

array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) byte offsets from &array.

Returns: vector word containing word elements [0-3], each loaded from *(char*)array+vra[0-3].

◆ vec_vgl4wwsx()

static vui32_t vec_vgl4wwsx	(	unsigned int *	array,
		vi32_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Scaled Indexes.

For each signed word element [i] of vra, load the word element at array[vra[i] << scale]. Merge those word elements and return the resulting vector.

Note: Signed word indexes are expanded (unpacked) to doublewords before shifting left 2+scale bits. This converts each index to a 64-bit offset for effective address calculation.

processor	Latency	Throughput
power8	16-25	1/cycle
power9	18-27	1/cycle

Parameters

array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) indexes.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector word containing word elements [0-3] each loaded from array[vra[0-3] << scale].

◆ vec_vgl4wwx()

static vui32_t vec_vgl4wwx	(	unsigned int *	array,
		vi32_t	vra
	)

inlinestatic

Vector Gather-Load 4 Words from Vector Word Indexes.

For word element [i] of vra, load the word element at array[vra[i]]. Merge those word elements and return the resulting vector.

Note: Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.

processor	Latency	Throughput
power8	16-25	1/cycle
power9	18-27	1/cycle

Parameters

array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) indexes.

Returns: vector word containing word elements [0-3], each loaded from array[vra[0-3]].

◆ vec_vglswdo()

static vi64_t vec_vglswdo	(	signed int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Offsets.

For each doubleword element [i] of vra, load the sign extended word element at *(char*)array+vra[i]. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	12	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of signed words.
vra	Vector of doubleword (64-bit) byte offsets from &array.

Returns: vector doubleword elements [0,1] loaded from sign extended words at *(char*)array+vra[0,1].

◆ vec_vglswdsx()

static vi64_t vec_vglswdsx	(	signed int *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, load the sign extended word element at array[vra[i] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of signed words.
vra	Vector of doubleword indexes from &array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector doubleword elements [0,1] loaded from the sign extended words at array[vra[0,1]<<scale].

◆ vec_vglswdx()

static vi64_t vec_vglswdx	(	signed int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Indexes.

For each doubleword element [i] of vra, load the sign extended word element at array[vra[i]]. Merge doubleword elements [0,1] and return the resulting vector.

Note: As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of signed words.
vra	Vector of doubleword indexes from &array.

Returns: vector doubleword elements [0,1] loaded from sign extended words at array[vra[0,1]].

◆ vec_vglswso()

static vi64_t vec_vglswso	(	signed int *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Gather-Load Signed Word from Scalar Offsets.

For each scalar offset[0|1], load the signed word (sign extended) from the effective address formed by *(char*)array+offset[0|1]. Merge resulting doubleword elements and return the resulting vector.

processor	Latency	Throughput
power8	7	1/cycle
power9	8	1/cycle

Parameters

array	Pointer to array of words.
offset0	Scalar (64-bit) byte offsets from &array.
offset1	Scalar (64-bit) byte offsets from &array.

Returns: vector doubleword elements [0,1] loaded from sign extended words at *(char*)array+offset[0,1].

◆ vec_vgluwdo()

static vui64_t vec_vgluwdo	(	unsigned int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Offsets.

For each doubleword element [0,1] of vra, load the zero extended word element at *(char*)array+vra[0,1]. Merge those doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	12	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of unsigned words.
vra	Vector of doubleword (64-bit) byte offsets from &array.

Returns: vector doubleword elements [0,1] loaded from zero extended words at *(char*)array+vra[0,1].

◆ vec_vgluwdsx()

static vui64_t vec_vgluwdsx	(	unsigned int *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes.

For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of unsigned words.
vra	Vector of doubleword indexes from &array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector doubleword elements [0,1] loaded from zero extended words at array[vra[0,1]<<scale].

◆ vec_vgluwdx()

static vui64_t vec_vgluwdx	(	unsigned int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Indexes.

For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1]]. Merge those doubleword elements [0,1] and return the resulting vector.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of unsigned words.
vra	Vector of doubleword indexes from &array.

Returns: Vector doubleword [0,1] loaded from zero extended words at array[vra[0,1]].

◆ vec_vgluwso()

static vui64_t vec_vgluwso	(	unsigned int *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Gather-Load Unsigned Word from Scalar Offsets.

For each scalar offset[0,1], load the unsigned word (zero extended) from the effective address formed by *(char*)array+offset[0,1]. Merge resulting doubleword [0,1] elements and return the resulting vector.

processor	Latency	Throughput
power8	7	1/cycle
power9	8	1/cycle

Parameters

array	Pointer to array of words.
offset0	Scalar (64-bit) byte offsets from &array.
offset1	Scalar (64-bit) byte offsets from &array.

Returns: vector doubleword elements [0,1] loaded from zero extended words at *(char*)array+offset[0,1].

◆ vec_vlxsiwax()

static vi64_t vec_vlxsiwax	(	const signed long long	ra,
		const signed int *	rb
	)

inlinestatic

Vector Load Scalar Integer Word Algebraic Indexed.

Load the left most doubleword of vector xt as a scalar sign extended word from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vi64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note: The right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwax instruction combines load with sign extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.

Note: The lxsiwax instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lvewx.

processor	Latency	Throughput
power8	5	2/cycle
power9	5	2/cycle

Parameters

ra	const doubleword index (offset/displacement).
rb	const word pointer to an array of integers.

Returns: The word stored at (ra + rb) is sign extended and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vlxsiwzx()

static vui64_t vec_vlxsiwzx	(	const signed long long	ra,
		const unsigned int *	rb
	)

inlinestatic

Vector Load Scalar Integer Word and Zero Indexed.

Load the left most doubleword of vector xt as a scalar unsigned word (zero extended to doubleword) from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note: the right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwzx instruction combines load with zero extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.

Note: The lxsiwzx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lvewx.

processor	Latency	Throughput
power8	5	2/cycle
power9	5	2/cycle

Parameters

ra	const doubleword index (offset/displacement).
rb	const word pointer to an array of integers.

Returns: The word stored at (ra + rb) is zero extended and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vmadd2euw()

static vui64_t vec_vmadd2euw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c,
		vui32_t	d
	)

inlinestatic

Vector Multiply-Add2 Even Unsigned Words.

Note: this implementation exists in vec_int64_ppc.h::vec_vmadd2euw() as it requires vec_addudm().

◆ vec_vmadd2ouw()

static vui64_t vec_vmadd2ouw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c,
		vui32_t	d
	)

inlinestatic

Vector Multiply-Add2 Odd Unsigned Words.

Note: this implementation exists in vec_int64_ppc.h::vec_vmadd2ouw() as it requires vec_addudm().

◆ vec_vmaddeuw()

static vui64_t vec_vmaddeuw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c
	)

inlinestatic

Vector Multiply-Add Even Unsigned Words.

Note: this implementation exists in vec_int64_ppc.h::vec_vmaddeuw() as it requires vec_addudm().

◆ vec_vmaddouw()

static vui64_t vec_vmaddouw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c
	)

inlinestatic

Vector Multiply-Add Odd Unsigned Words.

Note: this implementation exists in vec_int64_ppc.h::vec_vmaddouw() as it requires vec_addudm().

◆ vec_vmsumuwm()

static vui64_t vec_vmsumuwm	(	vui32_t	a,
		vui32_t	b,
		vui64_t	c
	)

inlinestatic

Vector Multiply-Sum Unsigned Word Modulo.

Note: this implementation exists in vec_int64_ppc.h::vec_vmsumuwm() as it requires vec_addudm().

◆ vec_vmuleuw()

static vui64_t vec_vmuleuw	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Multiply Even Unsigned words.

Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.

For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

Note: This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.

Returns: vector unsigned long product of the even words of vra and vrb.

◆ vec_vmulouw()

static vui64_t vec_vmulouw	(	vui32_t	vra,
		vui32_t	vrb
	)

inlinestatic

Vector Multiply Odd Unsigned Words.

Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.

For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.

Returns: vector unsigned long product of the odd words of vra and vrb.

◆ vec_vsst4wso()

static void vec_vsst4wso	(	vui32_t	xs,
		unsigned int *	array,
		const long long	offset0,
		const long long	offset1,
		const long long	offset2,
		const long long	offset3
	)

inlinestatic

Vector Scatter-Store 4 words to Scalar Offsets.

For each word element [i] of xs, store the element xs[i] at *(char*)array+offset[i].

processor	Latency	Throughput
power8	6	1/cycle
power9	4	2/cycle

Parameters

xs	Vector integer word elements to scatter store.
array	Pointer to array of integer words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.
offset2	Scalar (64-bit) byte offset from &array.
offset3	Scalar (64-bit) byte offset from &array.

◆ vec_vsst4wwo()

static void vec_vsst4wwo	(	vui32_t	xs,
		unsigned int *	array,
		vi32_t	vra
	)

inlinestatic

Vector Scatter-Store 4 words to Vector Word Offsets.

For each word element [i] of xs, store the element xs[i] at *(char*)array+vra[i].

Note: Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.

processor	Latency	Throughput
power8	10	1/cycle
power9	12	2/cycle

Parameters

xs	Vector integer word elements to scatter store.
array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) byte offsets from &array.

◆ vec_vsst4wwsx()

static void vec_vsst4wwsx	(	vui32_t	xs,
		unsigned int *	array,
		vi32_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Scatter-Store 4 words to Vector Word Scaled Indexes.

For each word element [i] of xs, store the element xs[i] at array[vra[i]<<scale].

Note: Signed word indexes are expanded (unpacked) to doublewords and shifted left (2+scale) bits before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.

processor	Latency	Throughput
power8	12-21	1/cycle
power9	15-24	2/cycle

Parameters

xs	Vector integer word elements to scatter store.
array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) indexes from array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsst4wwx()

static void vec_vsst4wwx	(	vui32_t	xs,
		unsigned int *	array,
		vi32_t	vra
	)

inlinestatic

Vector Scatter-Store 4 words to Vector Word Indexes.

For each word element [i] of xs, store the element xs[i] at array[vra[i]].

Note: Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.

processor	Latency	Throughput
power8	12-21	1/cycle
power9	15-24	2/cycle

Parameters

xs	Vector integer word elements to scatter store.
array	Pointer to array of integer words.
vra	Vector of signed word (32-bit) indexes from array.

◆ vec_vsstwdo()

static void vec_vsstwdo	(	vui64_t	xs,
		unsigned int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Words to Vector Doubleword Offsets.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at *(char*)array+vra[i].

processor	Latency	Throughput
power8	8	1/cycle
power9	9	2/cycle

Parameters

xs	Vector doubleword elements to scatter store low order words of each doubleword.
array	Pointer to array of integer words.
vra	Vector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstwdsx()

static void vec_vsstwdsx	(	vui64_t	xs,
		unsigned int *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Scatter-Store Words to Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at array[vra[i]<<scale].

processor	Latency	Throughput
power8	10-19	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector doubleword elements to scatter store low order words of each doubleword.
array	Pointer to array of integer words.
vra	Vector of doubleword (64-bit) indexes from &array.
scale	8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstwdx()

static void vec_vsstwdx	(	vui64_t	xs,
		unsigned int *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Words to Vector Doubleword Indexes.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at array[vra[i]].

processor	Latency	Throughput
power8	10-19	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector doubleword elements to scatter store low order words of each doubleword.
array	Pointer to array of integer words.
vra	Vector of doubleword (64-bit) indexes from &array.

◆ vec_vsstwso()

static void vec_vsstwso	(	vui64_t	xs,
		unsigned int *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Scatter-Store Words to Scalar Offsets.

For each doubleword element [i] of xs, store the low order word of doubleword element xs[i] at *(char*)array+offset[0|1].

processor	Latency	Throughput
power8	3	1/cycle
power9	3	2/cycle

Parameters

xs	Vector doubleword elements to scatter store low order words of each doubleword.
array	Pointer to array of integer words.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.

◆ vec_vstxsiwx()

static void vec_vstxsiwx	(	vui32_t	xs,
		const signed long long	ra,
		unsigned int *	rb
	)

inlinestatic

Vector Store Scalar Integer Word Indexed.

Stores word element 1 of vector xs as a scalar word at the effective address formed by rb+ra. The operand rb is a pointer to an array of integer words. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be word aligned (integer multiple of 4).

This operation is an alternate form of vector store element (vec_ste), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the stxsiwx instruction can store directly from any of the 64 VSRs. Both simplify scatter operations.

Note: The stxsiwx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use stvewx.

processor	Latency	Throughput
power8	0 - 2	2/cycle
power9	0 - 2	4/cycle

Parameters

xs	vector whose word element 1 is to be stored.
ra	const doubleword index (offset/displacement).
rb	word pointer to an array of integers.

◆ vec_vsum2sw()

static vi32_t vec_vsum2sw	(	vi32_t	vra,
		vi32_t	vrb
	)

inlinestatic

Vector Sum-across Half Signed Word Saturate.

Sum across adjacent signed words within doublewords from vra and word addends from vrb. This is effectively the vec_sum2s built-in operation (vsum2sws instruction) without the endian sensitive modifications mandated by the ABI.

This is useful for computing the final doubleword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 58:63 of the doubleword element (word elements 1 and 3).

For vec_sum2s and little endian the ABI mandates that the addend words from vrb used for the sum be from little endian word elements 1 and 3 (vector elements 0 and 2). The ABI also mandates that saturated word sum results are in little endian elements 1 and 3 (vector elements 0 and 2). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsum2sws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.

This also leaves the sums in bits 26:31 of the doubleword element and out of position for doubleword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sum2s.

Note: This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

vra	Vector signed int as adjacent words within doublewords.
vrb	Vector signed int where odd words are summed with adjacent words from vra.

Returns: Vector signed int with even words set to 0 and odd words containing the word sums within doublewords.

◆ vec_vsumsw()

static vi32_t vec_vsumsw	(	vi32_t	vra,
		vi32_t	vrb
	)

inlinestatic

Vector Sum-across Signed Word Saturate.

Sum across the 4 signed words from vra and word element 3 from vrb. This is effectively the vec_sums built-in operation (vsumsws instruction) without the endian sensitive modifications mandated by the ABI.

This is useful for computing the final quadword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 121:127 of the quadword (word element 3).

For vec_sums and little endian the ABI mandates that the addend word from vrb used for the sum be from little endian word element 3 (vector element 0). The ABI also mandates that the saturated word sum result is in little endian element 3 (vector element 0). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsumsws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.

This also leaves the sums in bits 25:31 of the quadword and out of position for quadword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sums.

Note: This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	7	2/cycle
power9	7	2/cycle

Parameters

vra	Vector signed int as words within quadword.
vrb	Vector signed int where word element 3 is summed with words from vra.

Returns: Vector signed int with words 0-2 set to 0 and word element 3 containing the word sums.

◆ vec_vupkhsw()

static vi64_t vec_vupkhsw ( vi32_t vra )

inlinestatic

Vector Unpack High Signed Word.

From the word source in vra. For each integer word [i] from 0 to 1, sign extend to 64-bit and place in doubleword element [i] of the result vector.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: This operation is the equivalent of the generic vec_unpackh for type vector signed int. However vec_unpackh (for this type) is not available for _ARCH_PWR7 and earlier versions of GCC. This PVECLIB operation is available to both. Use vec_vupkhsw naming but only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 4 x signed integers.

Returns: 128-bit vector treated as 2 x signed long long integers.

◆ vec_vupkhuw()

static vui64_t vec_vupkhuw ( vui32_t vra )

inlinestatic

Vector Unpack High Unsigned Word.

From the word source in vra. For each integer word [i] from 0 to 1, zero extend to 64-bit and place in doubleword element [i] of the result vector.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-4	2/cycle

Note: vec_vupkhuw does not exist in <altivec.h> nor as an instruction in the PowerISA. But it is easy to construct using vec_mergeh and a zero vector.

Parameters

vra	a 128-bit vector treated as 4 x unsigned integers.

Returns: 128-bit vector treated as 2 x unsigned long long integers.

◆ vec_vupklsw()

static vi64_t vec_vupklsw ( vi32_t vra )

inlinestatic

Vector Unpack Low Signed Word.

From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), sign extend to 64-bit and place in doubleword element [i] of the result vector.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Use vec_vupklsw naming but only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 4 x signed integers.

Returns: 128-bit vector treated as 2 x signed long long integers.

◆ vec_vupkluw()

static vui64_t vec_vupkluw ( vui32_t vra )

inlinestatic

Vector Unpack Low Unsigned Word.

From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), zero extend to 64-bit and place in doubleword element [i] of the result vector.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-4	2/cycle

Note: vec_vupkluw does not exist in <altivec.h> nor as an instruction in the PowerISA. But it is easy to construct using vec_mergel and a zero vector.

Parameters

vra	a 128-bit vector treated as 4 x unsigned integers.

Returns: 128-bit vector treated as 2 x unsigned long long integers.

Functions

Detailed Description

Recent Additions

Endian problems with word operations

Vector Merge Algebraic High Word example

Vector Multiply High Unsigned Word example

Vector Word Examples

Vectorized TimeBase conversion example

Performance data.

Function Documentation

◆ vec_absduw()

◆ vec_clzw()

◆ vec_ctzw()

◆ vec_mrgahw()

◆ vec_mrgalw()

◆ vec_mrgew()

◆ vec_mrgow()

◆ vec_mulesw()

◆ vec_muleuw()

◆ vec_mulhsw()

◆ vec_mulhuw()

◆ vec_mulosw()

◆ vec_mulouw()

◆ vec_muluwm()

◆ vec_popcntw()

◆ vec_revbw()

◆ vec_setb_sw()

◆ vec_slwi()

◆ vec_srawi()

◆ vec_srwi()

◆ vec_vgl4wso()

◆ vec_vgl4wwo()

◆ vec_vgl4wwsx()

◆ vec_vgl4wwx()

◆ vec_vglswdo()

◆ vec_vglswdsx()

◆ vec_vglswdx()

◆ vec_vglswso()

◆ vec_vgluwdo()

◆ vec_vgluwdsx()

◆ vec_vgluwdx()

◆ vec_vgluwso()

◆ vec_vlxsiwax()

◆ vec_vlxsiwzx()

◆ vec_vmadd2euw()

◆ vec_vmadd2ouw()

◆ vec_vmaddeuw()

◆ vec_vmaddouw()

◆ vec_vmsumuwm()

◆ vec_vmuleuw()

◆ vec_vmulouw()

◆ vec_vsst4wso()

◆ vec_vsst4wwo()

◆ vec_vsst4wwsx()

◆ vec_vsst4wwx()

◆ vec_vsstwdo()

◆ vec_vsstwdsx()

◆ vec_vsstwdx()

◆ vec_vsstwso()

◆ vec_vstxsiwx()

◆ vec_vsum2sw()

◆ vec_vsumsw()

◆ vec_vupkhsw()

◆ vec_vupkhuw()

◆ vec_vupklsw()

◆ vec_vupkluw()