POWER Vector Library Manual
1.0.4
|
Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements. More...
#include <pveclib/vec_int32_ppc.h>
Go to the source code of this file.
Functions | |
static vui32_t | vec_absduw (vui32_t vra, vui32_t vrb) |
Vector Absolute Difference Unsigned Word. More... | |
static vui32_t | vec_clzw (vui32_t vra) |
Vector Count Leading Zeros word. More... | |
static vui32_t | vec_ctzw (vui32_t vra) |
Vector Count Trailing Zeros word. More... | |
static vui32_t | vec_mrgahw (vui64_t vra, vui64_t vrb) |
Vector Merge Algebraic High Words. More... | |
static vui32_t | vec_mrgalw (vui64_t vra, vui64_t vrb) |
Vector merge Algebraic low words. More... | |
static vui32_t | vec_mrgew (vui32_t vra, vui32_t vrb) |
Vector Merge Even Words. More... | |
static vui32_t | vec_mrgow (vui32_t vra, vui32_t vrb) |
Vector Merge Odd Words. More... | |
static vi64_t | vec_mulesw (vi32_t a, vi32_t b) |
Vector multiply even signed words. More... | |
static vi64_t | vec_mulosw (vi32_t a, vi32_t b) |
Vector multiply odd signed words. More... | |
static vui64_t | vec_muleuw (vui32_t a, vui32_t b) |
Vector multiply even unsigned words. More... | |
static vui64_t | vec_mulouw (vui32_t a, vui32_t b) |
Vector multiply odd unsigned words. More... | |
static vi32_t | vec_mulhsw (vi32_t vra, vi32_t vrb) |
Vector Multiply High Signed Word. More... | |
static vui32_t | vec_mulhuw (vui32_t vra, vui32_t vrb) |
Vector Multiply High Unsigned Word. More... | |
static vui32_t | vec_muluwm (vui32_t a, vui32_t b) |
Vector Multiply Unsigned Word Modulo. More... | |
static vui32_t | vec_popcntw (vui32_t vra) |
Vector Population Count word. More... | |
static vui32_t | vec_revbw (vui32_t vra) |
byte reverse each word of a vector unsigned int. More... | |
static vb32_t | vec_setb_sw (vi32_t vra) |
Vector Set Bool from Signed Word. More... | |
static vui32_t | vec_slwi (vui32_t vra, const unsigned int shb) |
Vector Shift left Word Immediate. More... | |
static vi32_t | vec_srawi (vi32_t vra, const unsigned int shb) |
Vector Shift Right Algebraic Word Immediate. More... | |
static vui32_t | vec_srwi (vui32_t vra, const unsigned int shb) |
Vector Shift Right Word Immediate. More... | |
static vui32_t | vec_vgl4wso (unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3) |
Vector Gather-Load 4 Words from scalar Offsets. More... | |
static vui32_t | vec_vgl4wwo (unsigned int *array, vi32_t vra) |
Vector Gather-Load 4 Words from Vector Word Offsets. More... | |
static vui32_t | vec_vgl4wwsx (unsigned int *array, vi32_t vra, const unsigned char scale) |
Vector Gather-Load 4 Words from Vector Word Scaled Indexes. More... | |
static vui32_t | vec_vgl4wwx (unsigned int *array, vi32_t vra) |
Vector Gather-Load 4 Words from Vector Word Indexes. More... | |
static vi64_t | vec_vglswso (signed int *array, const long long offset0, const long long offset1) |
Vector Gather-Load Signed Word from Scalar Offsets. More... | |
static vi64_t | vec_vglswdo (signed int *array, vi64_t vra) |
Vector Gather-Load Signed Words from Vector Doubleword Offsets. More... | |
static vi64_t | vec_vglswdsx (signed int *array, vi64_t vra, const unsigned char scale) |
Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes. More... | |
static vi64_t | vec_vglswdx (signed int *array, vi64_t vra) |
Vector Gather-Load Signed Words from Vector Doubleword Indexes. More... | |
static vui64_t | vec_vgluwso (unsigned int *array, const long long offset0, const long long offset1) |
Vector Gather-Load Unsigned Word from Scalar Offsets. More... | |
static vui64_t | vec_vgluwdo (unsigned int *array, vi64_t vra) |
Vector Gather-Load Unsigned Words from Vector Doubleword Offsets. More... | |
static vui64_t | vec_vgluwdsx (unsigned int *array, vi64_t vra, const unsigned char scale) |
Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes. More... | |
static vui64_t | vec_vgluwdx (unsigned int *array, vi64_t vra) |
Vector Gather-Load Unsigned Words from Vector Doubleword Indexes. More... | |
static vi64_t | vec_vlxsiwax (const signed long long ra, const signed int *rb) |
Vector Load Scalar Integer Word Algebraic Indexed. More... | |
static vui64_t | vec_vlxsiwzx (const signed long long ra, const unsigned int *rb) |
Vector Load Scalar Integer Word and Zero Indexed. More... | |
static vui64_t | vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d) |
Vector Multiply-Add2 Even Unsigned Words. More... | |
static vui64_t | vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d) |
Vector Multiply-Add2 Odd Unsigned Words. More... | |
static vui64_t | vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c) |
Vector Multiply-Add Even Unsigned Words. More... | |
static vui64_t | vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c) |
Vector Multiply-Add Odd Unsigned Words. More... | |
static vui64_t | vec_vmsumuwm (vui32_t a, vui32_t b, vui64_t c) |
Vector Multiply-Sum Unsigned Word Modulo. More... | |
static vui64_t | vec_vmuleuw (vui32_t vra, vui32_t vrb) |
Vector Multiply Even Unsigned words. More... | |
static vui64_t | vec_vmulouw (vui32_t vra, vui32_t vrb) |
Vector Multiply Odd Unsigned Words. More... | |
static void | vec_vsst4wso (vui32_t xs, unsigned int *array, const long long offset0, const long long offset1, const long long offset2, const long long offset3) |
Vector Scatter-Store 4 words to Scalar Offsets. More... | |
static void | vec_vsst4wwo (vui32_t xs, unsigned int *array, vi32_t vra) |
Vector Scatter-Store 4 words to Vector Word Offsets. More... | |
static void | vec_vsst4wwsx (vui32_t xs, unsigned int *array, vi32_t vra, const unsigned char scale) |
Vector Scatter-Store 4 words to Vector Word Scaled Indexes. More... | |
static void | vec_vsst4wwx (vui32_t xs, unsigned int *array, vi32_t vra) |
Vector Scatter-Store 4 words to Vector Word Indexes. More... | |
static void | vec_vsstwdo (vui64_t xs, unsigned int *array, vi64_t vra) |
Vector Scatter-Store Words to Vector Doubleword Offsets. More... | |
static void | vec_vsstwdsx (vui64_t xs, unsigned int *array, vi64_t vra, const unsigned char scale) |
Vector Scatter-Store Words to Vector Doubleword Scaled Indexes. More... | |
static void | vec_vsstwdx (vui64_t xs, unsigned int *array, vi64_t vra) |
Vector Scatter-Store Words to Vector Doubleword Indexes. More... | |
static void | vec_vsstwso (vui64_t xs, unsigned int *array, const long long offset0, const long long offset1) |
Vector Scatter-Store Words to Scalar Offsets. More... | |
static void | vec_vstxsiwx (vui32_t xs, const signed long long ra, unsigned int *rb) |
Vector Store Scalar Integer Word Indexed. More... | |
static vi32_t | vec_vsum2sw (vi32_t vra, vi32_t vrb) |
Vector Sum-across Half Signed Word Saturate. More... | |
static vi32_t | vec_vsumsw (vi32_t vra, vi32_t vrb) |
Vector Sum-across Signed Word Saturate. More... | |
static vi64_t | vec_vupkhsw (vi32_t vra) |
Vector Unpack High Signed Word. More... | |
static vui64_t | vec_vupkhuw (vui32_t vra) |
Vector Unpack High Unsigned Word. More... | |
static vi64_t | vec_vupklsw (vi32_t vra) |
Vector Unpack Low Signed Word. More... | |
static vui64_t | vec_vupkluw (vui32_t vra) |
Vector Unpack Low Unsigned Word. More... | |
Header package containing a collection of 128-bit SIMD operations over 32-bit integer elements.
Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.
Most vector int (32-bit integer word) operations are implemented with PowerISA VMX instructions either defined by the original VMX (AKA Altivec) or added to later versions of the PowerISA. Vector word-wise merge, shift, and splat operations were added with VSX in PowerISA 2.06B (POWER7). PowerISA 2.07B (POWER8) added several useful word wise operations (multiply, merge even/odd, count leading zeros, population count) not included in the original VMX. PowerISA 3.0B (POWER9) adds several more (compare not equal, count trailing zeros, extend sign, extract/insert, and parity). Most of these intrinsic (compiler built-ins) operations are defined in <altivec.h> and described in the compiler documentation.
The newly introduced vector operations imply some useful composite operations. For example, we can make the vector multiply even/odd/modulo word operations available for older compilers. And provide implementations for older (POWER7 and earlier) processors using the original VMX operations.
This header covers operations that are either:
Added vec_vmaddeuw(), vec_vmaddouw(), vec_vmadd2euw(), and vec_vmadd2ouw() as an optimization for the vector multiply quadword implementations on POWER8.
It would be useful to provide a vector multiply high word (return the high order 32-bits of the 64-bit product) operation. This can be used for multiplicative inverse (effectively integer divide) operations. Neither integer multiply high nor divide are available as vector instructions. However the multiply high word operation can be composed from the existing multiply even/odd word operations followed by the vector merge even word instruction.
As a prerequisite we need to provide the merge even/odd word operations for older compilers and an implementation for older (POWER7) processors. Fortunately vector merge operations are just a special case of vector permute. So the POWER7 (and earlier) implementation can use vec_perm and appropriate selection vectors to provide these merge operations.
But this is complicated by little-endian (LE) support as specified in the OpenPOWER ABI and as implemented in the compilers. Little-endian changes the effective vector element numbering and the location of even and odd elements. This means that the vector built-ins provided by altivec.h may not generate the instructions you would expect.
The OpenPOWER ABI provides a helpful table of Endian Sensitive Operations. For vec_mergee (vmrgew) it specifies:
Swap inputs and use vmrgow, for LE.
Also for vec_mule (vmuleuw, vmulesw):
Replace with vmulouw and so on, for LE.
Also for vec_perm (vperm) it specifies:
For LE, Swap input arguments and complement the selection vector.
The above is just a sampling of a larger list of Endian Sensitive Operations.
So the obvious coding for Vector Multiply High Word:
Would produce the expected code and correct results when compiled for BE:
But it would produce the following, incorrect code when compiled for LE:
The compiler swapped the multiplies even for odd and odd for even. That is somewhat mitigated by swapping the input arguments in the merge. But changing the merge from even to odd actually returns the low order 32-bits of the product. This is not the correct result for multiply high.
This header provides implementations of vector merge even/odd word (vec_mrgew() and vec_mrgow()) that support older compilers and older (POWER7) processors. Similarly for the multiply even/odd unsigned/signed word instructions (vec_mulesw(), vec_mulosw(), vec_muleuw() and vec_mulouw()). These implementations include the mandated LE transforms.
This header also provides the higher level operations Vector Merge Algebraic High/low Word (vec_mrgahw() and vec_mrgalw()). These implementations generate the correct merge even/odd word instruction for the operation independent of endian.
The implementation is a bit complicated so that it can nullify the unwanted LE transformation of vec_vmrgew(), in addition to handling older compilers and processors.
Now we can implement Vector Multiply High Unsigned Word (vec_mulhuw()):
Again the implementation is more complicated than expected as we still have to nullify the LE transformation associated with multiply even/odd.
The good news is all this complexity is contained within pveclib and the generated code is still just 3 instructions.
Suppose we have a requirement to convert an array of 32-bit time-interval values to timespec format. For simplicity we will also assume that the array is nicely (Quadword) aligned and an integer multiple of 4 words.
The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume that the lower 32-bits of the TimeBase is sufficient to compute intervals (~8.38 seconds). TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.
The timespec format is a struct of unsigned int fields for seconds and microseconds. So the task is to convert the 512MHz TimeBase intervals to microseconds and then split the integer seconds and microseconds for the timespec.
First the TimeBase to microseconds conversion is simply (1000000 / 512000000) which reduces to (1 / 512) or divide by 512. The vector unit does not provide integer divide but luckily, 512 is a power of 2 and we can shift right. If we don't care for the niceties of rounding we can simply shift right 9 bits:
But if we decide that rounding is important we can leverage the Vector Average Unsigned Word (vavguw) instruction. Here we need to add 256 (512 / 2 = 256) to the timeBase interval before we shift right.
But we need to reverse engineer the vavguw operation to get the results we want. For each word, vavguw computes the sum of A and B plus 1, then shifts the 33-bit sum right 1 bit. We can effectively round by passing the rounding factor as the B operand to the vec_avg() built-in. But we get a +1 and 1 bit right shift for free. So in this case the rounding constant is 256-1 = 255. And we only need to shift an additional 8 bits to complete the conversion:
Next we need to separate TimeBase microseconds into the integer seconds and microseconds. Normally scalar codes would use integer divide/modulo by 1000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?
Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 32-bit fraction and we have a multiply high (vec_mulhuw()) operation. Multiplying a 32-bit unsigned integer by a 32-bit unsigned fraction generates a 64-bit product with 32-bits above (integer) and below (fraction) the radix point. The high 32-bits of the product is the integer quotient.
It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full range of inputs requires careful analysis, possible pre-scaling and post-shifting, and sometimes a corrective addition. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book (Hacker's Delight) has a whole chapter on this topic.
In that chapter (Chapter 10, Integer Division by Constants):
Figure 10-2 Computing the magic number for unsigned division.
provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, an "add" indicator, and the shift amount). For the divisor 1000000 this is { 1125899907, 0, 18 }:
Now we need to compute the remainder to get microseconds.
Finally we need to merge the vectors of seconds and useconds into vectors of timespec.
Here is the complete vectorized TimeBase to timespec conversion example:
High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.
Vector Absolute Difference Unsigned Word.
Compute the absolute difference for each word. For each unsigned word, subtract VRB[i] from VRA[i] and return the absolute value of the difference.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 1/cycle |
power9 | 3 | 2/cycle |
vra | vector of 4 x unsigned words |
vrb | vector of 4 x unsigned words |
Vector Count Leading Zeros word.
Count the number of leading '0' bits (0-32) within each word element of a 128-bit vector.
For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Word instruction vclzw. Otherwise use a sequence of pre 2.07 VMX instructions. SIMDized count leading zeros inspired by:
Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Figure 5-12.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as 4 x 32-bit integer (words) elements. |
Vector Count Trailing Zeros word.
Count the number of trailing '0' bits (0-32) within each word element of a 128-bit vector.
For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Word instruction vctzw. Otherwise use a sequence of pre ISA 3.0 VMX instructions. SIMDized count Trailing zeros inspired by:
Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.
processor | Latency | Throughput |
---|---|---|
power8 | 6-8 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as 4 x 32-bit integer (words) elements. |
Vector Merge Algebraic High Words.
Merge only the high words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Even Word operation that is not modified for endian.
For example merge the high 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply high unsigned word.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned long. |
vrb | 128-bit vector unsigned long. |
Vector merge Algebraic low words.
Merge only the low words from 4 x Algebraic doublewords across vectors vra and vrb. This is effectively the Vector Merge Odd Word operation that is not modified for endian.
For example merge the low 32-bits from 4 x 64-bit products as generated by vec_muleuw/vec_mulouw. This result is effectively a vector multiply low unsigned word (multiply unsigned word modulo).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned long. |
vrb | 128-bit vector unsigned long. |
Vector Merge Even Words.
Merge the even word elements from the concatenation of 2 x vectors (vra and vrb).
The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
Vector Merge Odd Words.
Merge the odd word elements from the concatenation of 2 x vectors (vra and vrb).
The element numbering changes between big and little-endian environments. So the compiler and this implementation adjust the generated code to reflect this.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
Vector multiply even signed words.
Multiply the even words of two vector signed int values and return the signed long product of the even words.
For POWER8 and later we can use the vmulesw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_muleuw().
Here we start with an unsigned vec_muleuw product, then correct the high 32-bits of the product to signed. Based on: Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
a | 128-bit vector signed int. |
b | 128-bit vector signed int. |
Vector multiply even unsigned words.
Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.
For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
Vector Multiply High Signed Word.
Multiply the corresponding word elements of two vector signed int values and return the high order 32-bits, for each 64-bit product element.
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 1/cycle |
power9 | 9 | 1/cycle |
vra | 128-bit vector signed int. |
vrb | 128-bit vector signed int. |
Vector Multiply High Unsigned Word.
Multiply the corresponding word elements of two vector unsigned int values and return the high order 32-bits, from each 64-bit product.
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 1/cycle |
power9 | 9 | 1/cycle |
Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
Vector multiply odd signed words.
Multiply the odd words of two vector signed int values and return the signed long product of the odd words.
For POWER8 and later we can use the vmulosw instruction. But for POWER7 and earlier we have to construct word multiplies from halfword multiplies. See vec_mulouw().
Here we start with an unsigned vec_mulouw product, then correct the high-order 32-bits of the product to signed. Based on: Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 8 Multiplication, Section 8-3 High-Order Product Signed from/to Unsigned.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
a | 128-bit vector signed int. |
b | 128-bit vector signed int. |
Vector multiply odd unsigned words.
Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.
For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
Vector Multiply Unsigned Word Modulo.
Multiply the corresponding word elements of two vector unsigned int values and return the low order 32-bits of the 64-bit product for each element.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
Vector Population Count word.
Count the number of '1' bits (0-32) within each word element of a 128-bit vector.
For POWER8 (PowerISA 2.07B) or later use the Vector Population Count Word instruction. Otherwise use the pveclib vec_popcntb to count each byte then sum across with Vector Sum across Quarter Unsigned Byte Saturate.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as 4 x 32-bit integer (words) elements. |
byte reverse each word of a vector unsigned int.
For each word of the input vector, reverse the order of bytes / octets within the word.
processor | Latency | Throughput |
---|---|---|
power8 | 2-11 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector unsigned int. |
Vector Set Bool from Signed Word.
For each word, propagate the sign bit to all 32-bits of that word. The result is vector bool int reflecting the sign bit of each 32-bit word.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | Vector signed int. |
Vector Shift left Word Immediate.
Shift left each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 4-11 | 2/cycle |
power9 | 5-11 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned int. |
shb | shift amount in the range 0-31. |
Vector Shift Right Algebraic Word Immediate.
Shift Right Algebraic each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return the sign bit propagated to each bit of each element.
processor | Latency | Throughput |
---|---|---|
power8 | 4-11 | 2/cycle |
power9 | 5-11 | 2/cycle |
vra | a 128-bit vector treated as a vector signed int. |
shb | shift amount in the range 0-31. |
Vector Shift Right Word Immediate.
Shift right each word element [0-3], 0-31 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-31. A shift count of 0 returns the original value of vra. Shift counts greater than 31 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 4-11 | 2/cycle |
power9 | 5-11 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned int. |
shb | shift amount in the range 0-31. |
|
inlinestatic |
Vector Gather-Load 4 Words from scalar Offsets.
For each scalar offset[0,1,2,3], load the word from the effective address formed by *(char*)array+offset[0-3]. Merge resulting word elements [0,1,2,3] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1/cycle |
power9 | 11 | 1/cycle |
array | Pointer to array of integer words. |
offset0 | Scalar (64-bit) byte offset from &array. |
offset1 | Scalar (64-bit) byte offset from &array. |
offset2 | Scalar (64-bit) byte offset from &array. |
offset3 | Scalar (64-bit) byte offset from &array. |
Vector Gather-Load 4 Words from Vector Word Offsets.
For each signed word element [i] of vra, load the word element at *(char*)array+vra[i]. Merge those word elements and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 14 | 1/cycle |
power9 | 15 | 1/cycle |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) byte offsets from &array. |
|
inlinestatic |
Vector Gather-Load 4 Words from Vector Word Scaled Indexes.
For each signed word element [i] of vra, load the word element at array[vra[i] << scale]. Merge those word elements and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 16-25 | 1/cycle |
power9 | 18-27 | 1/cycle |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) indexes. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Gather-Load 4 Words from Vector Word Indexes.
For word element [i] of vra, load the word element at array[vra[i]]. Merge those word elements and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 16-25 | 1/cycle |
power9 | 18-27 | 1/cycle |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) indexes. |
Vector Gather-Load Signed Words from Vector Doubleword Offsets.
For each doubleword element [i] of vra, load the sign extended word element at *(char*)array+vra[i]. Merge doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 12 | 1/cycle |
power9 | 11 | 1/cycle |
array | Pointer to array of signed words. |
vra | Vector of doubleword (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Gather-Load Signed Words from Vector Doubleword Scaled Indexes.
For each doubleword element [i] of vra, load the sign extended word element at array[vra[i] << scale]. Merge doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of signed words. |
vra | Vector of doubleword indexes from &array. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Gather-Load Signed Words from Vector Doubleword Indexes.
For each doubleword element [i] of vra, load the sign extended word element at array[vra[i]]. Merge doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of signed words. |
vra | Vector of doubleword indexes from &array. |
|
inlinestatic |
Vector Gather-Load Signed Word from Scalar Offsets.
For each scalar offset[0|1], load the signed word (sign extended) from the effective address formed by *(char*)array+offset[0|1]. Merge resulting doubleword elements and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 1/cycle |
power9 | 8 | 1/cycle |
array | Pointer to array of words. |
offset0 | Scalar (64-bit) byte offsets from &array. |
offset1 | Scalar (64-bit) byte offsets from &array. |
Vector Gather-Load Unsigned Words from Vector Doubleword Offsets.
For each doubleword element [0,1] of vra, load the zero extended word element at *(char*)array+vra[0,1]. Merge those doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 12 | 1/cycle |
power9 | 11 | 1/cycle |
array | Pointer to array of unsigned words. |
vra | Vector of doubleword (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Gather-Load Unsigned Words from Vector Doubleword Scaled Indexes.
For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1] << scale]. Merge doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of unsigned words. |
vra | Vector of doubleword indexes from &array. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Gather-Load Unsigned Words from Vector Doubleword Indexes.
For each doubleword element [0,1] of vra, load the zero extended word element at array[vra[0,1]]. Merge those doubleword elements [0,1] and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of unsigned words. |
vra | Vector of doubleword indexes from &array. |
|
inlinestatic |
Vector Gather-Load Unsigned Word from Scalar Offsets.
For each scalar offset[0,1], load the unsigned word (zero extended) from the effective address formed by *(char*)array+offset[0,1]. Merge resulting doubleword [0,1] elements and return the resulting vector.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 1/cycle |
power9 | 8 | 1/cycle |
array | Pointer to array of words. |
offset0 | Scalar (64-bit) byte offsets from &array. |
offset1 | Scalar (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Load Scalar Integer Word Algebraic Indexed.
Load the left most doubleword of vector xt as a scalar sign extended word from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vi64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).
This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwax instruction combines load with sign extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.
processor | Latency | Throughput |
---|---|---|
power8 | 5 | 2/cycle |
power9 | 5 | 2/cycle |
ra | const doubleword index (offset/displacement). |
rb | const word pointer to an array of integers. |
|
inlinestatic |
Vector Load Scalar Integer Word and Zero Indexed.
Load the left most doubleword of vector xt as a scalar unsigned word (zero extended to doubleword) from the effective address formed by rb+ra. The operand rb is a pointer to an array of words. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be word aligned (integer multiple of 4).
This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later, the lxsiwzx instruction combines load with zero extend word and can load directly into any of the 64 VSRs. Both simplify merging elements for gather operations.
processor | Latency | Throughput |
---|---|---|
power8 | 5 | 2/cycle |
power9 | 5 | 2/cycle |
ra | const doubleword index (offset/displacement). |
rb | const word pointer to an array of integers. |
Vector Multiply-Add2 Even Unsigned Words.
Vector Multiply-Add2 Odd Unsigned Words.
Vector Multiply-Add Even Unsigned Words.
Vector Multiply-Add Odd Unsigned Words.
Vector Multiply-Sum Unsigned Word Modulo.
Vector Multiply Even Unsigned words.
Multiply the even words of two vector unsigned int values and return the unsigned long product of the even words.
For POWER8 and later we can use the vmuleuw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
Vector Multiply Odd Unsigned Words.
Multiply the odd words of two vector unsigned int values and return the unsigned long product of the odd words.
For POWER8 and later we can use the vmulouw instruction. But for POWER7 and earlier we have to construct word multiplies from two halfword multiplies (vmuleuh and vmulouh). Then sum the partial products for the final doubleword results. This is complicated by the fact that vector add doubleword is not available for POWER7. So we need to construct the doubleword add from Vector Add Unsigned Word Modulo (vadduwm) and Vector Add and Write Carry-Out Unsigned Word (vaddcuw) with shift double quadword to reposition the low word carry and a final vadduwm to complete the carry propagation for the doubleword add.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
|
inlinestatic |
Vector Scatter-Store 4 words to Scalar Offsets.
For each word element [i] of xs, store the element xs[i] at *(char*)array+offset[i].
processor | Latency | Throughput |
---|---|---|
power8 | 6 | 1/cycle |
power9 | 4 | 2/cycle |
xs | Vector integer word elements to scatter store. |
array | Pointer to array of integer words. |
offset0 | Scalar (64-bit) byte offset from &array. |
offset1 | Scalar (64-bit) byte offset from &array. |
offset2 | Scalar (64-bit) byte offset from &array. |
offset3 | Scalar (64-bit) byte offset from &array. |
Vector Scatter-Store 4 words to Vector Word Offsets.
For each word element [i] of xs, store the element xs[i] at *(char*)array+vra[i].
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1/cycle |
power9 | 12 | 2/cycle |
xs | Vector integer word elements to scatter store. |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) byte offsets from &array. |
|
inlinestatic |
Vector Scatter-Store 4 words to Vector Word Indexes.
For each word element [i] of xs, store the element xs[i] at array[vra[i]<<scale].
processor | Latency | Throughput |
---|---|---|
power8 | 12-21 | 1/cycle |
power9 | 15-24 | 2/cycle |
xs | Vector integer word elements to scatter store. |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) indexes from array. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Scatter-Store 4 words to Vector Word Indexes.
For each word element [i] of xs, store the element xs[i] at array[vra[i]].
processor | Latency | Throughput |
---|---|---|
power8 | 12-21 | 1/cycle |
power9 | 15-24 | 2/cycle |
xs | Vector integer word elements to scatter store. |
array | Pointer to array of integer words. |
vra | Vector of signed word (32-bit) indexes from array. |
Vector Scatter-Store Words to Vector Doubleword Offsets.
For each doubleword element [i] of vra, store the low order word of doubleword element [i] of xs (word element xs[2*i+1]) at *(char*)array+vra[i].
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 1/cycle |
power9 | 9 | 2/cycle |
xs | Vector doubleword elements to scatter store low order words of each doubleword. |
array | Pointer to array of integer words. |
vra | Vector of doubleword (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Scatter-Store Words to Vector Doubleword Scaled Indexes.
For each doubleword element [i] of vra, store the low order word of doubleword element [i] of xs (word element xs[2*i+1]) at array[vra[i]<<scale].
processor | Latency | Throughput |
---|---|---|
power8 | 10-19 | 1/cycle |
power9 | 10-19 | 1/cycle |
xs | Vector doubleword elements to scatter store low order words of each doubleword. |
array | Pointer to array of integer words. |
vra | Vector of doubleword (64-bit) indexes from &array. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Scatter-Store Words to Vector Doubleword Indexes.
For each doubleword element [i] of vra, store the low order word of doubleword element [i] of xs (word element xs[2*i+1]) at array[vra[i]].
processor | Latency | Throughput |
---|---|---|
power8 | 10-19 | 1/cycle |
power9 | 10-19 | 1/cycle |
xs | Vector doubleword elements to scatter store low order words of each doubleword. |
array | Pointer to array of integer words. |
vra | Vector of doubleword (64-bit) indexes from &array. |
|
inlinestatic |
Vector Scatter-Store Words to Scalar Offsets.
For each doubleword element [i] of xs, store the low order word of that doubleword (word element xs[2*i+1]) at *(char*)array+offset[i], where offset[0|1] are the scalar operands offset0 and offset1.
processor | Latency | Throughput |
---|---|---|
power8 | 3 | 1/cycle |
power9 | 3 | 2/cycle |
xs | Vector doubleword elements to scatter store low order words of each doubleword. |
array | Pointer to array of integer words. |
offset0 | Scalar (64-bit) byte offset from &array. |
offset1 | Scalar (64-bit) byte offset from &array. |
|
inlinestatic |
Vector Store Scalar Integer Word Indexed.
Stores word element 1 of vector xs as a scalar word at the effective address formed by rb+ra. The operand rb is a pointer to an array of integer words. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be word aligned (integer multiple of 4).
This operation is an alternate form of vector store element (vec_ste), with the added simplification that data is always left justified in the vector. Another advantage for Power8 and later is that the stxsiwx instruction can store directly from any of the 64 VSRs. Both simplify scatter operations.
processor | Latency | Throughput |
---|---|---|
power8 | 0 - 2 | 2/cycle |
power9 | 0 - 2 | 4/cycle |
xs | Vector whose word element 1 (the low order word of doubleword element 0) is to be stored. |
ra | const doubleword index (offset/displacement). |
rb | const word pointer to an array of integers. |
Vector Sum-across Half Signed Word Saturate.
Sum across adjacent signed words within doublewords from vra and word addends from vrb. This is effectively the vec_sum2s built-in operation (vsum2sws instruction) without the endian sensitive modifications mandated by the ABI.
This is useful for computing the final doubleword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 58:63 of the doubleword element (word elements 1 and 3).
For vec_sum2s and little endian, the ABI mandates that the addend words from vrb be taken from little endian word elements 1 and 3 (vector elements 0 and 2). The ABI also mandates that the saturated word sum results are in little endian elements 1 and 3 (vector elements 0 and 2). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsum2sws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.
This also leaves the sums in bits 26:31 of the doubleword element and out of position for doubleword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sum2s.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
vra | Vector signed int as adjacent words within doublewords. |
vrb | Vector signed int where odd words are summed with adjacent words from vra. |
Vector Sum-across Signed Word Saturate.
Sum across the 4 signed words from vra and word element 3 from vrb. This is effectively the vec_sums built-in operation (vsumsws instruction) without the endian sensitive modifications mandated by the ABI.
This is useful for computing the final quadword counts for operations like population count and count leading/trailing zeros. These results are often used as inputs to shift operations that require shift counts in bits 121:127 of the quadword (word element 3).
For vec_sums and little endian, the ABI mandates that the addend word from vrb be taken from little endian word element 3 (vector element 0). The ABI also mandates that the saturated word sum result is in little endian element 3 (vector element 0). This requires a 3 instruction dependent sequence to precondition vrb and rotate the vsumsws result to match little endian element numbering. This adds 4 (6 for POWER9) cycles latency.
This also leaves the sums in bits 25:31 of the quadword and out of position for quadword shift/rotate. This in turn requires an additional corrective shift/rotate before using the sums. Or use this operation instead of vec_sums.
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 2/cycle |
power9 | 7 | 2/cycle |
vra | Vector signed int as words within quadword. |
vrb | Vector signed int where word element 3 is summed with words from vra. |
Vector Unpack High Signed Word.
From the word source in vra. For each integer word [i] from 0 to 1, sign extend to 64-bit and place in doubleword element [i] of the result vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 4 x signed integers. |
Vector Unpack High Unsigned Word.
From the word source in vra. For each integer word [i] from 0 to 1, zero extend to 64-bit and place in doubleword element [i] of the result vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-4 | 2/cycle |
vra | a 128-bit vector treated as 4 x unsigned integers. |
Vector Unpack Low Signed Word.
From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), sign extend to 64-bit and place in doubleword element [i] of the result vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 4 x signed integers. |
Vector Unpack Low Unsigned Word.
From the word source in vra. For each integer word [i+2] from 0 to 1 (words 2 and 3), zero extend to 64-bit and place in doubleword element [i] of the result vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-4 | 2/cycle |
vra | a 128-bit vector treated as 4 x unsigned integers. |