POWER Vector Library Manual  1.0.4
Functions
vec_int32_ppc.h File Reference

Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements. More...

#include <pveclib/vec_int16_ppc.h>

Go to the source code of this file.

Functions

static vui32_t vec_absduw (vui32_t vra, vui32_t vrb)
 Vector Absolute Difference Unsigned Word. More...
 
static vui32_t vec_clzw (vui32_t vra)
 Vector Count Leading Zeros word. More...
 
static vui32_t vec_ctzw (vui32_t vra)
 Vector Count Trailing Zeros word. More...
 
static vui32_t vec_mrgahw (vui64_t vra, vui64_t vrb)
 Vector Merge Algebraic High Words. More...
 
static vui32_t vec_mrgalw (vui64_t vra, vui64_t vrb)
 Vector merge Algebraic low words. More...
 
static vui32_t vec_mrgew (vui32_t vra, vui32_t vrb)
 Vector Merge Even Words. More...
 
static vui32_t vec_mrgow (vui32_t vra, vui32_t vrb)
 Vector Merge Odd Words. More...
 
static vi64_t vec_mulesw (vi32_t a, vi32_t b)
 Vector multiply even signed words. More...
 
static vi64_t vec_mulosw (vi32_t a, vi32_t b)
 Vector multiply odd signed words. More...
 
static vui64_t vec_muleuw (vui32_t a, vui32_t b)
 Vector multiply even unsigned words. More...
 
static vui64_t vec_mulouw (vui32_t a, vui32_t b)
 Vector multiply odd unsigned words. More...
 
static vi32_t vec_mulhsw (vi32_t vra, vi32_t vrb)
 Vector Multiply High Signed Word. More...
 
static vui32_t vec_mulhuw (vui32_t vra, vui32_t vrb)
 Vector Multiply High Unsigned Word. More...
 
static vui32_t vec_muluwm (vui32_t a, vui32_t b)
 Vector Multiply Unsigned Word Modulo. More...
 
static vui32_t vec_popcntw (vui32_t vra)
 Vector Population Count word. More...
 
static vui32_t vec_revbw (vui32_t vra)
 byte reverse each word of a vector unsigned int. More...
 
static vb32_t vec_setb_sw (vi32_t vra)
 Vector Set Bool from Signed Word. More...
 
static vui32_t vec_slwi (vui32_t vra, const unsigned int shb)
 Vector Shift left Word Immediate. More...
 
static vi32_t vec_srawi (vi32_t vra, const unsigned int shb)
 Vector Shift Right Algebraic Word Immediate. More...
 
static vui32_t vec_srwi (vui32_t vra, const unsigned int shb)
 Vector Shift Right Word Immediate. More...
 
static vui32_t vec_vgl4wso (unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
 Vector Gather-Load 4 Words from scalar Offsets. More...
 
static vui32_t vec_vgl4wwo (unsigned int *array, vi32_t vra)
 Vector Gather-Load 4 Words from Vector Word Offsets. More...
 
static vui32_t vec_vgl4wwsx (unsigned int *array, vi32_t vra, const unsigned char scale)
 Vector Gather-Load 4 Words from Vector Word Scaled Indexes. More...
 
static vui32_t vec_vgl4wwx (unsigned int *array, vi32_t vra)
 Vector Gather-Load 4 Words from Vector Word Indexes. More...
 
static vi64_t vec_vglswso (signed int *array, const long long offset0, const long long offset1)
 Vector Gather-Load Signed Word from Scalar Offsets. More...
 
static vi64_t vec_vglswdo (signed int *array, vi64_t vra)
 Vector Gather-Load Signed Words from Vector Doubleword Offsets. More...
 
static vi64_t vec_vglswdsx (signed int *array, vi64_t vra, const unsigned char scale)
 Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes. More...
 
static vi64_t vec_vglswdx (signed int *array, vi64_t vra)
 Vector Gather-Load Signed Words from Vector Doubleword Indexes. More...
 
static vui64_t vec_vgluwso (unsigned int *array, const long long offset0, const long long offset1)
 Vector Gather-Load Unsigned Word from Scalar Offsets. More...
 
static vui64_t vec_vgluwdo (unsigned int *array, vi64_t vra)
 Vector Gather-Load Unsigned Words from Vector Doubleword Offsets. More...
 
static vui64_t vec_vgluwdsx (unsigned int *array, vi64_t vra, const unsigned char scale)
 Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes. More...
 
static vui64_t vec_vgluwdx (unsigned int *array, vi64_t vra)
 Vector Gather-Load Unsigned Words from Vector Doubleword Indexes. More...
 
static vi64_t vec_vlxsiwax (const signed long long ra, const signed int *rb)
 Vector Load Scalar Integer Word Algebraic Indexed. More...
 
static vui64_t vec_vlxsiwzx (const signed long long ra, const unsigned int *rb)
 Vector Load Scalar Integer Word and Zero Indexed. More...
 
static vui64_t vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
 Vector Multiply-Add2 Even Unsigned Words. More...
 
static vui64_t vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
 Vector Multiply-Add2 Odd Unsigned Words. More...
 
static vui64_t vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
 Vector Multiply-Add Even Unsigned Words. More...
 
static vui64_t vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c)
 Vector Multiply-Add Odd Unsigned Words. More...
 
static vui64_t vec_vmsumuwm (vui32_t a, vui32_t b, vui64_t c)
 Vector Multiply-Sum Unsigned Word Modulo. More...
 
static vui64_t vec_vmuleuw (vui32_t vra, vui32_t vrb)
 Vector Multiply Even Unsigned words. More...
 
static vui64_t vec_vmulouw (vui32_t vra, vui32_t vrb)
 Vector Multiply Odd Unsigned Words. More...
 
static void vec_vsst4wso (vui32_t xs, unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3)
 Vector Scatter-Store 4 words to Scalar Offsets. More...
 
static void vec_vsst4wwo (vui32_t xs, unsigned int *array, vi32_t vra)
 Vector Scatter-Store 4 words to Vector Word Offsets. More...
 
static void vec_vsst4wwsx (vui32_t xs, unsigned int *array, vi32_t vra, const unsigned char scale)
 Vector Scatter-Store 4 words to Vector Word Scaled Indexes. More...
 
static void vec_vsst4wwx (vui32_t xs, unsigned int *array, vi32_t vra)
 Vector Scatter-Store 4 words to Vector Word Indexes. More...
 
static void vec_vsstwdo (vui64_t xs, unsigned int *array, vi64_t vra)
 Vector Scatter-Store Words to Vector Doubleword Offsets. More...
 
static void vec_vsstwdsx (vui64_t xs, unsigned int *array, vi64_t vra, const unsigned char scale)
 Vector Scatter-Store Words to Vector Doubleword Scaled Indexes. More...
 
static void vec_vsstwdx (vui64_t xs, unsigned int *array, vi64_t vra)
 Vector Scatter-Store Words to Vector Doubleword Indexes. More...
 
static void vec_vsstwso (vui64_t xs, unsigned int *array, const long long offset0, const long long offset1)
 Vector Scatter-Store Words to Scalar Offsets. More...
 
static void vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb)
 Vector Store Scalar Integer Word Indexed. More...
 
static vi32_t vec_vsum2sw (vi32_t vra, vi32_t vrb)
 Vector Sum-across Half Signed Word Saturate. More...
 
static vi32_t vec_vsumsw (vi32_t vra, vi32_t vrb)
 Vector Sum-across Signed Word Saturate. More...
 
static vi64_t vec_vupkhsw (vi32_t vra)
 Vector Unpack High Signed Word. More...
 
static vui64_t vec_vupkhuw (vui32_t vra)
 Vector Unpack High Unsigned Word. More...
 
static vi64_t vec_vupklsw (vi32_t vra)
 Vector Unpack Low Signed Word. More...
 
static vui64_t vec_vupkluw (vui32_t vra)
 Vector Unpack Low Unsigned Word. More...
 

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

Most vector int (32-bit integer word) operations are implemented with PowerISA VMX instructions either defined by the original VMX (AKA Altivec) or added to later versions of the PowerISA. Vector word-wise merge, shift, and splat operations were added with VSX in PowerISA 2.06B (POWER7). PowerISA 2.07B (POWER8) added several useful word wise operations (multiply, merge even/odd, count leading zeros, population count) not included in the original VMX. PowerISA 3.0B (POWER9) adds several more (compare not equal, count trailing zeros, extend sign, extract/insert, and parity). Most of these intrinsic (compiler built-ins) operations are defined in <altivec.h> and described in the compiler documentation.

Note
The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example if you compile with -mcpu=power7, vec_vclz and vec_vclzw will not be defined. Another example if you compile with -mcpu=power8, vec_revb will not be defined. This header provides the appropriate substitutions, will generate the minimum code, appropriate for the target, and produce correct results.
Most ppc64le compilers will default to -mcpu=power8 if not specified.

The newly introduced vector operations imply some useful composite operations. For example, we can make the vector multiply even/odd/modulo word operations available for older compilers, and provide implementations for older (POWER7 and earlier) processors using the original VMX operations.

This header covers operations that are either:

Recent Additions

Added vec_vmaddeuw(), vec_vmaddouw(), vec_vmadd2euw(), and vec_vmadd2ouw() as an optimization for the vector multiply quadword implementations on POWER8.

Endian problems with word operations

It would be useful to provide a vector multiply high word (return the high order 32-bits of the 64-bit product) operation. This can be used for multiplicative inverse (effectively integer divide) operations. Neither integer multiply high nor divide are available as vector instructions. However the multiply high word operation can be composed from the existing multiply even/odd word operations followed by the vector merge even word instruction.

As a prerequisite we need to provide the merge even/odd word operations for older compilers and an implementation for older (POWER7) processors. Fortunately vector merge operations are just a special case of vector permute. So the POWER7 (and earlier) implementation can use vec_perm and appropriate selection vectors to provide these merge operations.

But this is complicated by little-endian (LE) support as specified in the OpenPOWER ABI and as implemented in the compilers. Little-endian changes the effective vector element numbering and the location of even and odd elements. This means that the vector built-ins provided by altivec.h may not generate the instructions you would expect.

See also
General Endian Issues

The OpenPOWER ABI provides a helpful table of Endian Sensitive Operations. For vec_mergee (vmrgew) it specifies:

Swap inputs and use vmrgow, for LE.

Also for vec_mule (vmuleuw, vmulesw):

Replace with vmulouw and so on, for LE.

Also for vec_perm (vperm) it specifies:

For LE, Swap input arguments and complement the selection vector.

The above is just a sampling of a larger list of Endian Sensitive Operations.

So the obvious coding for Vector Multiply High Word:

test_mulhw (vui32_t vra, vui32_t vrb)
{
return vec_mergee ((vui32_t)vec_mule (vra, vrb),
(vui32_t)vec_mulo (vra, vrb));
}

Would produce the expected code and correct results when compiled for BE:

<test_mulhw>:
vmuleuw v0,v2,v3
vmulouw v2,v2,v3
vmrgew v2,v0,v2
blr

But it produces the following, incorrect code for LE:

<test_mulhw>:
vmulouw v0,v2,v3
vmuleuw v2,v2,v3
vmrgow v2,v2,v0
blr

The compiler swapped the multiplies even for odd and odd for even. That is somewhat mitigated by swapping the input arguments in the merge. But changing the merge from even to odd actually returns the low order 32-bits of the product. This is not the correct result for multiply high.

This header provides implementations of vector merge even/odd word (vec_mrgew() and vec_mrgow()) that support older compilers and older (POWER7) processors. Similarly for the multiply Even/odd unsigned/signed word instructions (vec_mulesw(), vec_mulosw(), vec_muleuw() and vec_mulouw()). These implementations include the mandated LE transforms.

Vector Merge Algebraic High Word example

This header also provides the higher level operations Vector Merge Algebraic High/low Word (vec_mrgahw() and vec_mrgalw()). These implementations generate the correct merge even/odd word instruction for the operation independent of endian.

Note
The parameters are vector unsigned long (vui64_t) to match results from vec_muleuw() and vec_mulouw().
static inline vui32_t
vec_mrgahw (vui64_t vra, vui64_t vrb)
{
vui32_t res;
#ifdef _ARCH_PWR8
#ifdef vec_vmrgew // Use altivec.h builtins
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// really want vmrgew here! So do the opposite.
res = vec_vmrgow ((vui32_t)vrb, (vui32_t)vra);
#else
res = vec_vmrgew ((vui32_t)vra, (vui32_t)vrb);
#endif
#else // Generate vmrgew directly in assembler
__asm__(
"vmrgew %0,%1,%2;\n"
: "=v" (res)
: "v" (vra),
"v" (vrb)
: );
#endif
#else // POWER7 and earlier, Assume BE only
const vui32_t vconstp =
CONST_VINT32_W(0x00010203, 0x10111213, 0x08090a0b, 0x18191a1b);
res = (vui32_t) vec_perm ((vui8_t) vra, (vui8_t) vrb, (vui8_t) vconstp);
#endif
return (res);
}

The implementation is a bit complicated so that it can nullify the unwanted LE transformation of vec_vmrgew(), in addition to handling older compilers and processors.

Vector Multiply High Unsigned Word example

Now we can implement Vector Multiply High Unsigned Word (vec_mulhuw()):

static inline vui32_t
vec_mulhuw (vui32_t vra, vui32_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return vec_mrgahw (vec_mulouw (vra, vrb), vec_muleuw (vra, vrb));
#else
return vec_mrgahw (vec_muleuw (vra, vrb), vec_mulouw (vra, vrb));
#endif
}

Again the implementation is more complicated than expected as we still have to nullify the LE transformation associated with multiply even/odd.

The good news is all this complexity is contained within pveclib and the generated code is still just 3 instructions.

vmulouw v0,v2,v3
vmuleuw v2,v2,v3
vmrgew v2,v2,v0

Vector Word Examples

Suppose we have a requirement to convert an array of 32-bit time-interval values to timespec format. For simplicity we will also assume that the array is nicely (Quadword) aligned and an integer multiple of 4 words.

The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume that the lower 32-bits of the TimeBase is sufficient to compute intervals (~8.38 seconds). TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.

The timespec format is a struct of unsigned int fields for seconds and microseconds. So the task is to convert the 512MHz TimeBase intervals to microseconds and then split the integer seconds and microseconds for the timespec.

First the TimeBase to microseconds conversion is simply (1000000 / 512000000) which reduces to (1 / 512) or divide by 512. The vector unit does not provide integer divide but luckily, 512 is a power of 2 and we can shift right. If we don't care for the niceties of rounding we can simply shift right 9 bits:

tb_usec = vec_srwi (*tb++, 9);

But if we decide that rounding is important we can leverage the Vector Average Unsigned Word (vavguw) instruction. Here we need to add 256 (512 / 2 = 256) to the TimeBase interval before we shift right.

But we need to reverse engineer the vavguw operation to get the results we want. For each word, vavguw computes the sum of A and B plus 1, then shifts the 33-bit sum right 1 bit. We can effectively round by passing the rounding factor as the B operand to the vec_avg() built-in. But we get a +1 and 1 bit right shift for free. So in this case the rounding constant is 256-1 = 255. And we only need to shift an additional 8 bits to complete the conversion:

const vui32_t rnd_512 =
{ (256-1), (256-1), (256-1), (256-1) };
// Convert 512MHz timebase to microseconds with rounding.
tmp = vec_avg (*tb++, rnd_512);
tb_usec = vec_srwi (tmp, 8);
Note
vec_avg() is an existing altivec.h generic built-in.

Next we need to separate TimeBase microseconds into the integer seconds and microseconds. Normally scalar codes would use integer divide/modulo by 1000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?

Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 32-bit fraction and we have a multiply high (vec_mulhuw()) operation. Multiplying a 32-bit unsigned integer by a 32-bit unsigned fraction generates a 64-bit product with 32-bits above (integer) and below (fraction) the radix point. The high 32-bits of the product is the integer quotient.

It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full input range may require pre-scaling, post-shifting, and sometimes a corrective addition. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.

See also
"Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

In the chapter above;

Figure 10-2 Computing the magic number for unsigned division.

provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, an "add" indicator, and the shift amount). For the divisor 1000000 this is { 1125899907, 0, 18 }:

const vui32_t mul_invs_1m =
{ 1125899907, 1125899907, 1125899907, 1125899907 };
const int shift_1m = 18;
tmp = vec_mulhuw (tb_usec, mul_invs_1m);
seconds = vec_srwi (tmp, shift_1m);

Now we need to compute the remainder to get microseconds.

const vui32_t usec_sec =
{ 1000000, 1000000, 1000000, 1000000 };
tmp = vec_muluwm (seconds, usec_sec);
useconds = vec_sub (tb_usec, tmp);

Finally we need to merge the vectors of seconds and useconds into vectors of timespec.

timespec1 = vec_mergeh (seconds, useconds);
timespec2 = vec_mergel (seconds, useconds);
Note
vec_sub(), vec_mergeh(), and vec_mergel() are existing altivec.h generic built-ins.

Vectorized TimeBase conversion example

Here is the complete vectorized TimeBase to timespec conversion example:

void
example_convert_timebase (vui32_t *tb, vui32_t *timespec, int n)
{
const vui32_t rnd_512 =
{ (256-1), (256-1), (256-1), (256-1) };
// Magic numbers for multiplicative inverse to divide by 1,000,000
// are 1125899907 and shift right 18 bits.
const vui32_t mul_invs_1m =
{ 1125899907, 1125899907, 1125899907, 1125899907 };
const int shift_1m = 18;
// Need const for microseconds/second to extract remainder.
const vui32_t usec_sec =
{ 1000000, 1000000, 1000000, 1000000 };
vui32_t tmp, tb_usec, seconds, useconds;
vui32_t timespec1, timespec2;
int i;
for (i = 0; i < n; i++)
{
// Convert 512MHz timebase to microseconds with rounding.
tmp = vec_avg (*tb++, rnd_512);
tb_usec = vec_srwi (tmp, 8);
// extract integer seconds from tb_usec.
tmp = vec_mulhuw (tb_usec, mul_invs_1m);
seconds = vec_srwi (tmp, shift_1m);
// Extract remainder microseconds.
tmp = vec_muluwm (seconds, usec_sec);
useconds = vec_sub (tb_usec, tmp);
// Use merge high/low to interleave seconds and useconds in timespec.
timespec1 = vec_mergeh (seconds, useconds);
timespec2 = vec_mergel (seconds, useconds);
// Store timespec.
*timespec++ = timespec1;
*timespec++ = timespec2;
}
}

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absduw()

static vui32_t vec_absduw ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Absolute Difference Unsigned Word.

Compute the absolute difference for each word. For each unsigned word, subtract VRB[i] from VRA[i] and return the absolute value of the difference.

processor Latency Throughput
power8 4 1/cycle
power9 3 2/cycle
Parameters
vravector of 4 x unsigned words
vrbvector of 4 x unsigned words
Returns
vector of the absolute differences.

◆ vec_clzw()

static vui32_t vec_clzw ( vui32_t  vra)
inlinestatic

Vector Count Leading Zeros word.

Count the number of leading '0' bits (0-32) within each word element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Word instruction vclzw. Otherwise use sequence of pre 2.07 VMX instructions. SIMDized count leading zeros inspired by:

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Figure 5-12.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector treated as 4 x 32-bit integer (words) elements.
Returns
128-bit vector with the Leading Zeros count for each word element.

◆ vec_ctzw()

static vui32_t vec_ctzw ( vui32_t  vra)
inlinestatic

Vector Count Trailing Zeros word.

Count the number of trailing '0' bits (0-32) within each word element of a 128-bit vector.

For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Word instruction vctzw. Otherwise use a sequence of pre ISA 3.0 VMX instructions. SIMDized count Trailing zeros inspired by:

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.

processor Latency Throughput
power8 6-8 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector treated as 4 x 32-bit integer (words) elements.
Returns
128-bit vector with the Trailing Zeros count for each word element.

◆ vec_mrgahw()

static vui32_t vec_mrgahw ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Merge Algebraic High Words.

Merge only the high words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Even Word operation that is not modified for endian.

For example merge the high 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply high unsigned word.

Note
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned long.
vrb128-bit vector unsigned long.
Returns
A vector merge from only the high words of the 4 x Algebraic doublewords across vra and vrb.

◆ vec_mrgalw()

static vui32_t vec_mrgalw ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector merge Algebraic low words.

Merge only the low words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Odd Word operation that is not modified for endian.

For example merge the low 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply low unsigned word (multiply unsigned word modulo).

Note
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned long.
vrb128-bit vector unsigned long.
Returns
A vector merge from only the low words of the 4 x Algebraic doublewords across vra and vrb.

◆ vec_mrgew()

static vui32_t vec_mrgew ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Merge Even Words.

Merge the even word elements from the concatenation of 2 x vectors (vra and vrb).

  • res[0] = vra[0];
  • res[1] = vrb[0];
  • res[2] = vra[2];
  • res[3] = vrb[2];

The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
Returns
A vector merge from only the even words of vra and vrb.

◆ vec_mrgow()

static vui32_t vec_mrgow ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Merge Odd Words.

Merge the odd word elements from the concatenation of 2 x vectors (vra and vrb).

  • res[0] = vra[1];
  • res[1] = vrb[1];
  • res[2] = vra[3];
  • res[3] = vrb[3];

The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
Returns
A vector merge from only the odd words of vra and vrb.

◆ vec_mulesw()

static vi64_t vec_mulesw ( vi32_t  a,
vi32_t  b 
)
inlinestatic

Vector multiply even signed words.

Multiply the even words of two vector signed int values and return the signed long product of the even words.

For POWER8 and later we can use the vmulesw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_muleuw().

Here we start with an unsigned vec_muleuw product, then correct the high 32-bits of the product to signed. Based on: Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.

processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
a128-bit vector signed int.
b128-bit vector signed int.
Returns
vector signed long product of the even words of a and b.

◆ vec_muleuw()

static vui64_t vec_muleuw ( vui32_t  a,
vui32_t  b 
)
inlinestatic

Vector multiply even unsigned words.

Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.

For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
Returns
vector unsigned long product of the even words of a and b.

◆ vec_mulhsw()

static vi32_t vec_mulhsw ( vi32_t  vra,
vi32_t  vrb 
)
inlinestatic

Vector Multiply High Signed Word.

Multiply the corresponding word elements of two vector signed int values and return the high order 32-bits, for each 64-bit product element.

processor Latency Throughput
power8 9 1/cycle
power9 9 1/cycle
Parameters
vra128-bit vector signed int.
vrb128-bit vector signed int.
Returns
vector of the high order 32-bits of the product of the word elements from vra and vrb.

◆ vec_mulhuw()

static vui32_t vec_mulhuw ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Multiply High Unsigned Word.

Multiply the corresponding word elements of two vector unsigned int values and return the high order 32-bits, from each 64-bit product.

processor Latency Throughput
power8 9 1/cycle
power9 9 1/cycle
Note
This operation can be used to effectively perform a divide by multiplying by the scaled multiplicative inverse (reciprocal).

Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
Returns
vector of the high order 32-bits of the unsigned product of the word elements from vra and vrb.

◆ vec_mulosw()

static vi64_t vec_mulosw ( vi32_t  a,
vi32_t  b 
)
inlinestatic

Vector multiply odd signed words.

Multiply the odd words of two vector signed int values and return the signed long product of the odd words.

For POWER8 and later we can use the vmulosw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_mulouw().

Here we start with an unsigned vec_mulouw product, then correct the high-order 32-bits of the product to signed. Based on: Warren, Henry S. Jr and Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.

processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
a128-bit vector signed int.
b128-bit vector signed int.
Returns
vector signed long product of the odd words of a and b.

◆ vec_mulouw()

static vui64_t vec_mulouw ( vui32_t  a,
vui32_t  b 
)
inlinestatic

Vector multiply odd unsigned words.

Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.

For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
Returns
vector unsigned long product of the odd words of a and b.

◆ vec_muluwm()

static vui32_t vec_muluwm ( vui32_t  a,
vui32_t  b 
)
inlinestatic

Vector Multiply Unsigned Word Modulo.

Multiply the corresponding word elements of two vector unsigned int values and return the low order 32-bits of the 64-bit product for each element.

Note
vec_muluwm can be used for unsigned or signed integers. It is the vector equivalent of Multiply Low Word.
processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
Returns
vector of the low order 32-bits of the unsigned product of the word elements from a and b.

◆ vec_popcntw()

static vui32_t vec_popcntw ( vui32_t  vra)
inlinestatic

Vector Population Count word.

Count the number of '1' bits (0-32) within each word element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Population Count Word instruction. Otherwise use the pveclib vec_popcntb to count each byte then sum across with Vector Sum across Quarter Unsigned Byte Saturate.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector treated as 4 x 32-bit integer (words) elements.
Returns
128-bit vector with the population count for each word element.

◆ vec_revbw()

static vui32_t vec_revbw ( vui32_t  vra)
inlinestatic

byte reverse each word of a vector unsigned int.

For each word of the input vector, reverse the order of bytes / octets within the word.

processor Latency Throughput
power8 2-11 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector unsigned int.
Returns
a 128-bit vector with the bytes of each word reversed.

◆ vec_setb_sw()

static vb32_t vec_setb_sw ( vi32_t  vra)
inlinestatic

Vector Set Bool from Signed Word.

For each word, propagate the sign bit to all 32-bits of that word. The result is vector bool int reflecting the sign bit of each 32-bit word.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraVector signed int.
Returns
vector bool int reflecting the sign bits of each word.

◆ vec_slwi()

static vui32_t vec_slwi ( vui32_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift left Word Immediate.

Shift left each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.

processor Latency Throughput
power8 4-11 2/cycle
power9 5-11 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned int.
shbshift amount in the range 0-31.
Returns
128-bit vector unsigned int, shifted left shb bits.

◆ vec_srawi()

static vi32_t vec_srawi ( vi32_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift Right Algebraic Word Immediate.

Shift Right Algebraic each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return the sign bit propagated to each bit of each element.

processor Latency Throughput
power8 4-11 2/cycle
power9 5-11 2/cycle
Parameters
vraa 128-bit vector treated as a vector signed int.
shbshift amount in the range 0-31.
Returns
128-bit vector signed int, shifted right shb bits.

◆ vec_srwi()

static vui32_t vec_srwi ( vui32_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift Right Word Immediate.

Shift right each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.

processor Latency Throughput
power8 4-11 2/cycle
power9 5-11 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned int.
shbshift amount in the range 0-31.
Returns
128-bit vector unsigned int, shifted right shb bits.

◆ vec_vgl4wso()

static vui32_t vec_vgl4wso ( unsigned int *  array,
const long long  offset0,
const long long  offset1,
const long long  offset2,
const long long  offset3 
)
inlinestatic

Vector Gather-Load 4 Words from scalar Offsets.

For each scalar offset[0,1,2,3], load the word from the effective address formed by *(char*)array+offset[0-3]. Merge resulting word elements [0,1,2,3] and return the resulting vector.

processor Latency Throughput
power8 10 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of integer words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.
offset2Scalar (64-bit) byte offset from &array.
offset3Scalar (64-bit) byte offset from &array.
Returns
vector word containing word elements [0-3] loaded from *(char*)array+offset[0-3].

◆ vec_vgl4wwo()

static vui32_t vec_vgl4wwo ( unsigned int *  array,
vi32_t  vra 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Offsets.

For each signed word element [i] of vra, load the word element at *(char*)array+vra[i]. Merge those word elements and return the resulting vector.

processor Latency Throughput
power8 14 1/cycle
power9 15 1/cycle
Parameters
arrayPointer to array of integer words.
vraVector of signed word (32-bit) byte offsets from &array.
Returns
vector word containing word elements [0-3], each loaded from *(char*)array+vra[0-3].

◆ vec_vgl4wwsx()

static vui32_t vec_vgl4wwsx ( unsigned int *  array,
vi32_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Scaled Indexes.

For each signed word element [i] of vra, load the word element at array[vra[i] << scale]. Merge those word elements and return the resulting vector.

Note
Signed word indexes are expanded (unpacked) to doublewords before shifting left 2+scale bits. This converts each index to a 64-bit offset for effective address calculation.
processor Latency Throughput
power8 16-25 1/cycle
power9 18-27 1/cycle
Parameters
arrayPointer to array of integer words.
vraVector of signed word (32-bit) indexes.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector word containing word elements [0-3] each loaded from array[vra[0-3] << scale].

◆ vec_vgl4wwx()

static vui32_t vec_vgl4wwx ( unsigned int *  array,
vi32_t  vra 
)
inlinestatic

Vector Gather-Load 4 Words from Vector Word Indexes.

For word element [i] of vra, load the word element at array[vra[i]]. Merge those word elements and return the resulting vector.

Note
Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.
processor Latency Throughput
power8 16-25 1/cycle
power9 18-27 1/cycle
Parameters
arrayPointer to array of integer words.
vraVector of signed word (32-bit) indexes.
Returns
vector word containing word elements [0-3], each loaded from array[vra[0-3]].

◆ vec_vglswdo()

static vi64_t vec_vglswdo ( signed int *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Offsets.

For each doubleword element [i] of vra, load the sign extended word element at *(char*)array+vra[i]. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 12 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of signed words.
vraVector of doubleword (64-bit) byte offsets from &array.
Returns
vector doubleword elements [0,1] loaded from sign extended words at *(char*)array+vra[0,1].

◆ vec_vglswdsx()

static vi64_t vec_vglswdsx ( signed int *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, load the sign extended word element at array[vra[i] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of signed words.
vraVector of doubleword indexes from &array.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector doubleword elements [0,1] loaded from the sign extended words at array[vra[0,1]<<scale].

◆ vec_vglswdx()

static vi64_t vec_vglswdx ( signed int *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Signed Words from Vector Doubleword Indexes.

For each doubleword element [i] of vra, load the sign extended word element at array[vra[i]]. Merge doubleword elements [0,1] and return the resulting vector.

Note
As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.
processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of signed words.
vraVector of doubleword indexes from &array.
Returns
vector doubleword elements [0,1] loaded from sign extended words at array[vra[0,1]].

◆ vec_vglswso()

static vi64_t vec_vglswso ( signed int *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Gather-Load Signed Word from Scalar Offsets.

For each scalar offset[0|1], load the signed word (sign extended) from the effective address formed by *(char*)array+offset[0|1]. Merge resulting doubleword elements and return the resulting vector.

processor Latency Throughput
power8 7 1/cycle
power9 8 1/cycle
Parameters
arrayPointer to array of words.
offset0Scalar (64-bit) byte offsets from &array.
offset1Scalar (64-bit) byte offsets from &array.
Returns
vector doubleword elements [0,1] loaded from sign extended words at *(char*)array+offset[0,1].

◆ vec_vgluwdo()

static vui64_t vec_vgluwdo ( unsigned int *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Offsets.

For each doubleword element [0,1] of vra, load the zero extended word element at *(char*)array+vra[0,1]. Merge those doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 12 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of unsigned words.
vraVector of doubleword (64-bit) byte offsets from &array.
Returns
vector doubleword elements [0,1] loaded from zero extended words at *(char*)array+vra[0,1].

◆ vec_vgluwdsx()

static vui64_t vec_vgluwdsx ( unsigned int *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes.

For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1] << scale]. Merge doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of unsigned words.
vraVector of doubleword indexes from &array.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector doubleword elements [0,1] loaded from zero extended words at array[vra[0,1]<<scale].

◆ vec_vgluwdx()

static vui64_t vec_vgluwdx ( unsigned int *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Unsigned Words from Vector Doubleword Indexes.

For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1]]. Merge those doubleword elements [0,1] and return the resulting vector.

processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of unsigned words.
vraVector of doubleword indexes from &array.
Returns
Vector doubleword [0,1] loaded from zero extended words at array[vra[0,1]].

◆ vec_vgluwso()

static vui64_t vec_vgluwso ( unsigned int *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Gather-Load Unsigned Word from Scalar Offsets.

For each scalar offset[0,1], load the unsigned word (zero extended) from the effective address formed by *(char*)array+offset[0,1]. Merge resulting doubleword [0,1] elements and return the resulting vector.

processor Latency Throughput
power8 7 1/cycle
power9 8 1/cycle
Parameters
arrayPointer to array of words.
offset0Scalar (64-bit) byte offsets from &array.
offset1Scalar (64-bit) byte offsets from &array.
Returns
vector doubleword elements [0,1] loaded from zero extended words at *(char*)array+offset[0,1].

◆ vec_vlxsiwax()

static vi64_t vec_vlxsiwax ( const signed long long  ra,
const signed int *  rb 
)
inlinestatic

Vector Load Scalar Integer Word Algebraic Indexed.

Load the left most doubleword of vector xt as a scalar sign extended word from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vi64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note
The right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwax instruction combines load with sign extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.

Note
The lxsiwax instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lvewx.
processor Latency Throughput
power8 5 2/cycle
power9 5 2/cycle
Parameters
raconst doubleword index (offset/displacement).
rbconst word pointer to an array of integers.
Returns
The word stored at (ra + rb) is sign extended and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vlxsiwzx()

static vui64_t vec_vlxsiwzx ( const signed long long  ra,
const unsigned int *  rb 
)
inlinestatic

Vector Load Scalar Integer Word and Zero Indexed.

Load the left most doubleword of vector xt as a scalar unsigned word (zero extended to doubleword) from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).

Note
The right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwzx instruction combines load with zero extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.

Note
The lxsiwzx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use lvewx.
processor Latency Throughput
power8 5 2/cycle
power9 5 2/cycle
Parameters
raconst doubleword index (offset/displacement).
rbconst word pointer to an array of integers.
Returns
The word stored at (ra + rb) is zero extended and loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vmadd2euw()

static vui64_t vec_vmadd2euw ( vui32_t  a,
vui32_t  b,
vui32_t  c,
vui32_t  d 
)
inlinestatic

Vector Multiply-Add2 Even Unsigned Words.

Note
This implementation exists in vec_int64_ppc.h::vec_vmadd2euw() as it requires vec_addudm().

◆ vec_vmadd2ouw()

static vui64_t vec_vmadd2ouw ( vui32_t  a,
vui32_t  b,
vui32_t  c,
vui32_t  d 
)
inlinestatic

Vector Multiply-Add2 Odd Unsigned Words.

Note
This implementation exists in vec_int64_ppc.h::vec_vmadd2ouw() as it requires vec_addudm().

◆ vec_vmaddeuw()

static vui64_t vec_vmaddeuw ( vui32_t  a,
vui32_t  b,
vui32_t  c 
)
inlinestatic

Vector Multiply-Add Even Unsigned Words.

Note
This implementation exists in vec_int64_ppc.h::vec_vmaddeuw() as it requires vec_addudm().

◆ vec_vmaddouw()

static vui64_t vec_vmaddouw ( vui32_t  a,
vui32_t  b,
vui32_t  c 
)
inlinestatic

Vector Multiply-Add Odd Unsigned Words.

Note
This implementation exists in vec_int64_ppc.h::vec_vmaddouw() as it requires vec_addudm().

◆ vec_vmsumuwm()

static vui64_t vec_vmsumuwm ( vui32_t  a,
vui32_t  b,
vui64_t  c 
)
inlinestatic

Vector Multiply-Sum Unsigned Word Modulo.

Note
This implementation exists in vec_int64_ppc.h::vec_vmsumuwm() as it requires vec_addudm().

◆ vec_vmuleuw()

static vui64_t vec_vmuleuw ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Multiply Even Unsigned words.

Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.

For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

Note
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
Returns
vector unsigned long product of the even words of vra and vrb.

◆ vec_vmulouw()

static vui64_t vec_vmulouw ( vui32_t  vra,
vui32_t  vrb 
)
inlinestatic

Vector Multiply Odd Unsigned Words.

Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.

For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.

processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
Returns
vector unsigned long product of the odd words of vra and vrb.

◆ vec_vsst4wso()

static void vec_vsst4wso ( vui32_t  xs,
unsigned int *  array,
const long long  offset0,
const long long  offset1,
const long long  offset2,
const long long  offset3 
)
inlinestatic

Vector Scatter-Store 4 words to Scalar Offsets.

For each word element [i] of xs, store the element xs[i] at *(char*)array+offset[i].

processor Latency Throughput
power8 6 1/cycle
power9 4 2/cycle
Parameters
xsVector integer word elements to scatter store.
arrayPointer to array of integer words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.
offset2Scalar (64-bit) byte offset from &array.
offset3Scalar (64-bit) byte offset from &array.

◆ vec_vsst4wwo()

static void vec_vsst4wwo ( vui32_t  xs,
unsigned int *  array,
vi32_t  vra 
)
inlinestatic

Vector Scatter-Store 4 words to Vector Word Offsets.

For each word element [i] of xs, store the element xs[i] at *(char*)array+vra[i].

Note
Signed word offsets are expanded (unpacked) to doublewords before transfer to GPRs for effective address calculation.
processor Latency Throughput
power8 10 1/cycle
power9 12 2/cycle
Parameters
xsVector integer word elements to scatter store.
arrayPointer to array of integer words.
vraVector of signed word (32-bit) byte offsets from &array.

◆ vec_vsst4wwsx()

static void vec_vsst4wwsx ( vui32_t  xs,
unsigned int *  array,
vi32_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Scatter-Store 4 words to Vector Word Scaled Indexes.

For each word element [i] of xs, store the element xs[i] at array[vra[i]<<scale].

Note
Signed word indexes are expanded (unpacked) to doublewords, then shifted left (2+scale) bits, before transfer to GPRs for effective address calculation. This converts each index to a 64-bit offset.
processor Latency Throughput
power8 12-21 1/cycle
power9 15-24 2/cycle
Parameters
xsVector integer word elements to scatter store.
arrayPointer to array of integer words.
vraVector of signed word (32-bit) indexes from array.
scale8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsst4wwx()

static void vec_vsst4wwx ( vui32_t  xs,
unsigned int *  array,
vi32_t  vra 
)
inlinestatic

Vector Scatter-Store 4 words to Vector Word Indexes.

For each word element [i] of xs, store the element xs[i] at array[vra[i]].

Note
Signed word indexes are expanded (unpacked) to doublewords before shifting left 2 bits. This converts each index to a 64-bit offset for effective address calculation.
processor Latency Throughput
power8 12-21 1/cycle
power9 15-24 2/cycle
Parameters
xsVector integer word elements to scatter store.
arrayPointer to array of integer words.
vraVector of signed word (32-bit) indexes from array.

◆ vec_vsstwdo()

static void vec_vsstwdo ( vui64_t  xs,
unsigned int *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Words to Vector Doubleword Offsets.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at *(char*)array+vra[i].

processor Latency Throughput
power8 8 1/cycle
power9 9 2/cycle
Parameters
xsVector doubleword elements to scatter store low order words of each doubleword.
arrayPointer to array of integer words.
vraVector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstwdsx()

static void vec_vsstwdsx ( vui64_t  xs,
unsigned int *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Scatter-Store Words to Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at array[vra[i]<<scale].

processor Latency Throughput
power8 10-19 1/cycle
power9 10-19 1/cycle
Parameters
xsVector doubleword elements to scatter store low order words of each doubleword.
arrayPointer to array of integer words.
vraVector of doubleword (64-bit) indexes from &array.
scale8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstwdx()

static void vec_vsstwdx ( vui64_t  xs,
unsigned int *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Words to Vector Doubleword Indexes.

For each doubleword element [i] of vra, store the low order word of doubleword element xs[i] at array[vra[i]].

processor Latency Throughput
power8 10-19 1/cycle
power9 10-19 1/cycle
Parameters
xsVector doubleword elements to scatter store low order words of each doubleword.
arrayPointer to array of integer words.
vraVector of doubleword (64-bit) indexes from &array.

◆ vec_vsstwso()

static void vec_vsstwso ( vui64_t  xs,
unsigned int *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Scatter-Store Words to Scalar Offsets.

For each doubleword element [i] of xs, store the low order word of xs[i] at *(char*)array+offset[0|1].

processor Latency Throughput
power8 3 1/cycle
power9 3 2/cycle
Parameters
xsVector doubleword elements to scatter store low order words of each doubleword.
arrayPointer to array of integer words.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.

◆ vec_vstxsiwx()

static void vec_vstxsiwx ( vui32_t  xs,
const signed long long  ra,
unsigned int *  rb 
)
inlinestatic

Vector Store Scalar Integer Word Indexed.

Stores word element 1 of vector xs as a scalar word at the effective address formed by rb+ra. The operand rb is a pointer to an array of integer words. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be word aligned (integer multiple of 4).

This operation is an alternate form of vector store element (vec_ste), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the stxsiwx instruction can store directly from any of the 64 VSRs. Both simplify scatter operations.

Note
The stxsiwx instruction was introduced in PowerISA 2.07 (POWER8). Power7 and earlier will use stvewx.
processor Latency Throughput
power8 0 - 2 2/cycle
power9 0 - 2 4/cycle
Parameters
xsvector whose word element 1 (the low order word of doubleword element 0) is to be stored.
raconst doubleword index (offset/displacement).
rbpointer to an array of integer words.

◆ vec_vsum2sw()

static vi32_t vec_vsum2sw ( vi32_t  vra,
vi32_t  vrb 
)
inlinestatic

Vector Sum-across Half Signed Word Saturate.

Sum across adjacent signed words within doublewords from vra and word addends from vrb. This is effectively the vec_sum2s built-in operation (vsum2sws instruction) without the endian sensitive modifications mandated by the ABI.

This is useful for computing the final doubleword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 58:63 of the doubleword element (word elements 1 and 3).

For vec_sum2s and little endian the ABI mandates that the addend words from vrb be taken from little endian word elements 1 and 3 (vector element 0 and 2). The ABI also mandates that saturated word sum results are in little endian elements 1 and 3 (vector element 0 and 2). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsum2sws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.

This also leaves the sums in bits 26:31 of the doubleword element and out of position for doubleword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sum2s.

Note
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
vraVector signed int as adjacent words within doublewords.
vrbVector signed int where odd words are summed with adjacent words from vra.
Returns
Vector signed int with even words set to 0 and odd words containing the word sums within doublewords.

◆ vec_vsumsw()

static vi32_t vec_vsumsw ( vi32_t  vra,
vi32_t  vrb 
)
inlinestatic

Vector Sum-across Signed Word Saturate.

Sum across the 4 signed words from vra and word element 3 from vrb. This is effectively the vec_sums built-in operation (vsumsws instruction) without the endian sensitive modifications mandated by the ABI.

This is useful for computing the final quadword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 121:127 of the quadword (word element 3).

For vec_sums and little endian the ABI mandates that the addend word from vrb be taken from little endian word element 3 (vector element 0). The ABI also mandates that the saturated word sum result is in little endian element 3 (vector element 0). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsumsws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.

This also leaves the sums in bits 25:31 of the quadword and out of position for quadword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sums.

Note
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 7 2/cycle
power9 7 2/cycle
Parameters
vraVector signed int as words within quadword.
vrbVector signed int where word element 3 is summed with words from vra.
Returns
Vector signed int with words 0-2 set to 0 and word element 3 containing the word sums.

◆ vec_vupkhsw()

static vi64_t vec_vupkhsw ( vi32_t  vra)
inlinestatic

Vector Unpack High Signed Word.

From the word source in vra. For each integer word [i] from 0 to 1, sign extend to 64-bit and place in doubleword element [i] of the result vector.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
This operation is the equivalent of the generic vec_unpackh for type vector signed int. However vec_unpackh (for this type) is not available for _ARCH_PWR7 and earlier versions of GCC. This PVECLIB operation is available to both.
Use vec_vupkhsw naming but only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 4 x signed integers.
Returns
128-bit vector treated as 2 x signed long long integers.

◆ vec_vupkhuw()

static vui64_t vec_vupkhuw ( vui32_t  vra)
inlinestatic

Vector Unpack High Unsigned Word.

From the word source in vra. For each integer word [i] from 0 to 1, zero extend to 64-bit and place in doubleword element [i] of the result vector.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-4 2/cycle
Note
vec_vupkhuw does not exist in <altivec.h> nor as an instruction in the PowerISA. But it is easy to construct using vec_mergeh and a zero vector.
Parameters
vraa 128-bit vector treated as 4 x unsigned integers.
Returns
128-bit vector treated as 2 x unsigned long long integers.

◆ vec_vupklsw()

static vi64_t vec_vupklsw ( vi32_t  vra)
inlinestatic

Vector Unpack Low Signed Word.

From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), sign extend to 64-bit and place in doubleword element [i] of the result vector.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Use vec_vupklsw naming but only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 4 x signed integers.
Returns
128-bit vector treated as 2 x signed long long integers.

◆ vec_vupkluw()

static vui64_t vec_vupkluw ( vui32_t  vra)
inlinestatic

Vector Unpack Low Unsigned Word.

From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), zero extend to 64-bit and place in doubleword element [i] of the result vector.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-4 2/cycle
Note
vec_vupkluw does not exist in <altivec.h> nor as an instruction in the PowerISA. But it is easy to construct using vec_mergeh and a zero vector.
Parameters
vraa 128-bit vector treated as 4 x unsigned integers.
Returns
128-bit vector treated as 2 x unsigned long long integers.
vec_mrgahw
static vui32_t vec_mrgahw(vui64_t vra, vui64_t vrb)
Vector Merge Algebraic High Words.
Definition: vec_int32_ppc.h:653
vec_muleuw
static vui64_t vec_muleuw(vui32_t a, vui32_t b)
Vector multiply even unsigned words.
Definition: vec_int32_ppc.h:1007
CONST_VINT32_W
#define CONST_VINT32_W(__w0, __w1, __w2, __w3)
Arrange elements of word initializer in high->low order.
Definition: vec_common_ppc.h:306
vec_mulhuw
static vui32_t vec_mulhuw(vui32_t vra, vui32_t vrb)
Vector Multiply High Unsigned Word.
Definition: vec_int32_ppc.h:1103
vui64_t
__vector unsigned long long vui64_t
vector of 64-bit unsigned long long elements.
Definition: vec_common_ppc.h:208
vui8_t
__vector unsigned char vui8_t
vector of 8-bit unsigned char elements.
Definition: vec_common_ppc.h:202
vec_srwi
static vui32_t vec_srwi(vui32_t vra, const unsigned int shb)
Vector Shift Right Word Immediate.
Definition: vec_int32_ppc.h:1405
vec_muluwm
static vui32_t vec_muluwm(vui32_t a, vui32_t b)
Vector Multiply Unsigned Word Modulo.
Definition: vec_int32_ppc.h:1132
vec_mulouw
static vui64_t vec_mulouw(vui32_t a, vui32_t b)
Vector multiply odd unsigned words.
Definition: vec_int32_ppc.h:1043
vui32_t
__vector unsigned int vui32_t
vector of 32-bit unsigned int elements.
Definition: vec_common_ppc.h:206