POWER Vector Library Manual
1.0.4
|
Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements. More...
#include <pveclib/vec_int32_ppc.h>
Go to the source code of this file.
Functions | |
static vui64_t | vec_absdud (vui64_t vra, vui64_t vrb) |
Vector Absolute Difference Unsigned Doubleword. More... | |
static vui64_t | vec_addudm (vui64_t a, vui64_t b) |
Vector Add Unsigned Doubleword Modulo. More... | |
static vui64_t | vec_clzd (vui64_t vra) |
Vector Count Leading Zeros Doubleword for unsigned long long elements. More... | |
static vui64_t | vec_ctzd (vui64_t vra) |
Vector Count Trailing Zeros Doubleword for unsigned long long elements. More... | |
static vb64_t | vec_cmpeqsd (vi64_t a, vi64_t b) |
Vector Compare Equal Signed Doubleword. More... | |
static vb64_t | vec_cmpequd (vui64_t a, vui64_t b) |
Vector Compare Equal Unsigned Doubleword. More... | |
static vb64_t | vec_cmpgesd (vi64_t a, vi64_t b) |
Vector Compare Greater Than or Equal Signed Doubleword. More... | |
static vb64_t | vec_cmpgeud (vui64_t a, vui64_t b) |
Vector Compare Greater Than or Equal Unsigned Doubleword. More... | |
static vb64_t | vec_cmpgtsd (vi64_t a, vi64_t b) |
Vector Compare Greater Than Signed Doubleword. More... | |
static vb64_t | vec_cmpgtud (vui64_t a, vui64_t b) |
Vector Compare Greater Than Unsigned Doubleword. More... | |
static vb64_t | vec_cmplesd (vi64_t a, vi64_t b) |
Vector Compare Less Than Equal Signed Doubleword. More... | |
static vb64_t | vec_cmpleud (vui64_t a, vui64_t b) |
Vector Compare Less Than Equal Unsigned Doubleword. More... | |
static vb64_t | vec_cmpltsd (vi64_t a, vi64_t b) |
Vector Compare less Than Signed Doubleword. More... | |
static vb64_t | vec_cmpltud (vui64_t a, vui64_t b) |
Vector Compare less Than Unsigned Doubleword. More... | |
static vb64_t | vec_cmpnesd (vi64_t a, vi64_t b) |
Vector Compare Not Equal Signed Doubleword. More... | |
static vb64_t | vec_cmpneud (vui64_t a, vui64_t b) |
Vector Compare Not Equal Unsigned Doubleword. More... | |
static int | vec_cmpsd_all_eq (vi64_t a, vi64_t b) |
Vector Compare all Equal Signed Doubleword. More... | |
static int | vec_cmpsd_all_ge (vi64_t a, vi64_t b) |
Vector Compare all Greater Than or Equal Signed Doubleword. More... | |
static int | vec_cmpsd_all_gt (vi64_t a, vi64_t b) |
Vector Compare all Greater Than Signed Doubleword. More... | |
static int | vec_cmpsd_all_le (vi64_t a, vi64_t b) |
Vector Compare all Less than equal Signed Doubleword. More... | |
static int | vec_cmpsd_all_lt (vi64_t a, vi64_t b) |
Vector Compare all Less than Signed Doubleword. More... | |
static int | vec_cmpsd_all_ne (vi64_t a, vi64_t b) |
Vector Compare all Not Equal Signed Doubleword. More... | |
static int | vec_cmpsd_any_eq (vi64_t a, vi64_t b) |
Vector Compare any Equal Signed Doubleword. More... | |
static int | vec_cmpsd_any_ge (vi64_t a, vi64_t b) |
Vector Compare any Greater Than or Equal Signed Doubleword. More... | |
static int | vec_cmpsd_any_gt (vi64_t a, vi64_t b) |
Vector Compare any Greater Than Signed Doubleword. More... | |
static int | vec_cmpsd_any_le (vi64_t a, vi64_t b) |
Vector Compare any Less than equal Signed Doubleword. More... | |
static int | vec_cmpsd_any_lt (vi64_t a, vi64_t b) |
Vector Compare any Less than Signed Doubleword. More... | |
static int | vec_cmpsd_any_ne (vi64_t a, vi64_t b) |
Vector Compare any Not Equal Signed Doubleword. More... | |
static int | vec_cmpud_all_eq (vui64_t a, vui64_t b) |
Vector Compare all Equal Unsigned Doubleword. More... | |
static int | vec_cmpud_all_ge (vui64_t a, vui64_t b) |
Vector Compare all Greater Than or Equal Unsigned Doubleword. More... | |
static int | vec_cmpud_all_gt (vui64_t a, vui64_t b) |
Vector Compare all Greater Than Unsigned Doubleword. More... | |
static int | vec_cmpud_all_le (vui64_t a, vui64_t b) |
Vector Compare all Less than equal Unsigned Doubleword. More... | |
static int | vec_cmpud_all_lt (vui64_t a, vui64_t b) |
Vector Compare all Less than Unsigned Doubleword. More... | |
static int | vec_cmpud_all_ne (vui64_t a, vui64_t b) |
Vector Compare all Not Equal Unsigned Doubleword. More... | |
static int | vec_cmpud_any_eq (vui64_t a, vui64_t b) |
Vector Compare any Equal Unsigned Doubleword. More... | |
static int | vec_cmpud_any_ge (vui64_t a, vui64_t b) |
Vector Compare any Greater Than or Equal Unsigned Doubleword. More... | |
static int | vec_cmpud_any_gt (vui64_t a, vui64_t b) |
Vector Compare any Greater Than Unsigned Doubleword. More... | |
static int | vec_cmpud_any_le (vui64_t a, vui64_t b) |
Vector Compare any Less than equal Unsigned Doubleword. More... | |
static int | vec_cmpud_any_lt (vui64_t a, vui64_t b) |
Vector Compare any Less than Unsigned Doubleword. More... | |
static int | vec_cmpud_any_ne (vui64_t a, vui64_t b) |
Vector Compare any Not Equal Unsigned Doubleword. More... | |
static vi64_t | vec_maxsd (vi64_t vra, vi64_t vrb) |
Vector Maximum Signed Doubleword. More... | |
static vui64_t | vec_maxud (vui64_t vra, vui64_t vrb) |
Vector Maximum Unsigned Doubleword. More... | |
static vi64_t | vec_minsd (vi64_t vra, vi64_t vrb) |
Vector Minimum Signed Doubleword. More... | |
static vui64_t | vec_minud (vui64_t vra, vui64_t vrb) |
Vector Minimum Unsigned Doubleword. More... | |
static vui64_t | vec_mrgahd (vui128_t vra, vui128_t vrb) |
Vector Merge Algebraic High Doublewords. More... | |
static vui64_t | vec_mrgald (vui128_t vra, vui128_t vrb) |
Vector Merge Algebraic Low Doublewords. More... | |
static vui64_t | vec_mrged (vui64_t __VA, vui64_t __VB) |
Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More... | |
static vui64_t | vec_mrghd (vui64_t __VA, vui64_t __VB) |
Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More... | |
static vui64_t | vec_mrgld (vui64_t __VA, vui64_t __VB) |
Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More... | |
static vui64_t | vec_mrgod (vui64_t __VA, vui64_t __VB) |
Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More... | |
static vui128_t | vec_msumudm (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Unsigned Doubleword Modulo. More... | |
static vui128_t | vec_muleud (vui64_t a, vui64_t b) |
Vector Multiply Even Unsigned Doublewords. More... | |
static vui64_t | vec_mulhud (vui64_t vra, vui64_t vrb) |
Vector Multiply High Unsigned Doubleword. More... | |
static vui128_t | vec_muloud (vui64_t a, vui64_t b) |
Vector Multiply Odd Unsigned Doublewords. More... | |
static vui64_t | vec_muludm (vui64_t vra, vui64_t vrb) |
Vector Multiply Unsigned Doubleword Modulo. More... | |
static vui64_t | vec_pasted (vui64_t __VH, vui64_t __VL) |
Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector. More... | |
static vui64_t | vec_permdi (vui64_t vra, vui64_t vrb, const int ctl) |
Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector. More... | |
static vui64_t | vec_popcntd (vui64_t vra) |
Vector Population Count doubleword. More... | |
static vui64_t | vec_revbd (vui64_t vra) |
byte reverse each doubleword for a vector unsigned long int. More... | |
static vui64_t | vec_vrld (vui64_t vra, vui64_t vrb) |
Vector Rotate Left Doubleword. More... | |
static vui64_t | vec_vsld (vui64_t vra, vui64_t vrb) |
Vector Shift Left Doubleword. More... | |
static vui64_t | vec_vsrd (vui64_t vra, vui64_t vrb) |
Vector Shift Right Doubleword. More... | |
static vi64_t | vec_vsrad (vi64_t vra, vui64_t vrb) |
Vector Shift Right Algebraic Doubleword. More... | |
static vb64_t | vec_setb_sd (vi64_t vra) |
Vector Set Bool from Signed Doubleword. More... | |
static vui64_t | vec_rldi (vui64_t vra, const unsigned int shb) |
Vector Rotate left Doubleword Immediate. More... | |
static vui64_t | vec_sldi (vui64_t vra, const unsigned int shb) |
Vector Shift left Doubleword Immediate. More... | |
static vi64_t | vec_selsd (vi64_t vra, vi64_t vrb, vb64_t vrc) |
Vector Select Signed Doubleword. More... | |
static vui64_t | vec_selud (vui64_t vra, vui64_t vrb, vb64_t vrc) |
Vector Select Unsigned Doubleword. More... | |
static vui64_t | vec_splatd (vui64_t vra, const int ctl) |
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian. More... | |
static vi64_t | vec_splat_s64 (const int sim) |
Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Signed (Byte | Halfword |Word). More... | |
static vui64_t | vec_splat_u64 (const int sim) |
Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Unsigned (Byte | Halfword |Word). More... | |
static vui64_t | vec_spltd (vui64_t vra, const int ctl) |
static vui64_t | vec_srdi (vui64_t vra, const unsigned int shb) |
Vector Shift Right Doubleword Immediate. More... | |
static vi64_t | vec_sradi (vi64_t vra, const unsigned int shb) |
Vector Shift Right Algebraic Doubleword Immediate. More... | |
static vui64_t | vec_subudm (vui64_t a, vui64_t b) |
Vector Subtract Unsigned Doubleword Modulo. More... | |
static vui64_t | vec_swapd (vui64_t vra) |
Vector doubleword swap. Exchange the high and low doubleword elements of a vector. More... | |
static vui64_t | vec_vgluddo (unsigned long long *array, vi64_t vra) |
Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets. More... | |
static vui64_t | vec_vgluddsx (unsigned long long *array, vi64_t vra, const unsigned char scale) |
Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes. More... | |
static vui64_t | vec_vgluddx (unsigned long long *array, vi64_t vra) |
Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes. More... | |
static vui64_t | vec_vgludso (unsigned long long *array, const long long offset0, const long long offset1) |
Vector Gather-Load Integer Doublewords from Scalar Offsets. More... | |
static vui64_t | vec_vlsidx (const signed long long ra, const unsigned long long *rb) |
Vector Load Scalar Integer Doubleword Indexed. More... | |
static vui128_t | vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d) |
Vector Multiply-Add2 Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c) |
Vector Multiply-Add Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d) |
Vector Multiply-Add2 Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c) |
Vector Multiply-Add Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmuleud (vui64_t a, vui64_t b) |
Vector Multiply Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmuloud (vui64_t a, vui64_t b) |
Vector Multiply Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Odd Unsigned Doublewords. More... | |
static vui32_t | vec_vpkudum (vui64_t vra, vui64_t vrb) |
Vector Pack Unsigned Doubleword Unsigned Modulo. More... | |
static void | vec_vsstuddo (vui64_t xs, unsigned long long *array, vi64_t vra) |
Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets. More... | |
static void | vec_vsstuddsx (vui64_t xs, unsigned long long *array, vi64_t vra, const unsigned char scale) |
Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes. More... | |
static void | vec_vsstuddx (vui64_t xs, unsigned long long *array, vi64_t vra) |
Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes. More... | |
static void | vec_vsstudso (vui64_t xs, unsigned long long *array, const long long offset0, const long long offset1) |
Vector Scatter-Store Integer Doublewords to Scalar Offsets. More... | |
static void | vec_vstsidx (vui64_t xs, const signed long long ra, unsigned long long *rb) |
Vector Store Scalar Integer Doubleword Indexed. More... | |
static vui64_t | vec_xxspltd (vui64_t vra, const int ctl) |
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. More... | |
static vui64_t | vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c) |
Vector Multiply-Add Even Unsigned Words. More... | |
static vui64_t | vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d) |
Vector Multiply-Add2 Even Unsigned Words. More... | |
static vui64_t | vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c) |
Vector Multiply-Add Odd Unsigned Words. More... | |
static vui64_t | vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d) |
Vector Multiply-Add2 Odd Unsigned Words. More... | |
static vui64_t | vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc) |
Vector Multiply-Sum Unsigned Word Modulo. More... | |
Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements.
Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.
The original VMX (AKA Altivec) did not define any doubleword element (long long integer or double float) operations. The VSX facility (introduced with POWER7) added vector double float but did not add any integer doubleword (64-bit) operations. However it did add a useful doubleword permute immediate and word wise; merge, shift, and splat immediate operations. Otherwise vector long int (64-bit elements) operations have to be implemented using VMX word and halfword element integer operations for POWER7.
POWER8 (PowerISA 2.07B) adds important doubleword integer (add, subtract, compare, shift, rotate, ...) VMX operations. POWER8 also added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend).
POWER9 (PowerISA 3.0B) adds the Vector Multiply-Sum Unsigned Doubleword Modulo instruction. This is not the expected multiply even/odd/modulo doubleword nor a full multiply modulo quadword. But with a few extra (permutes and splat zero) instructions you can get equivalent function.
Most of these intrinsic (compiler built-in) operations are defined in <altivec.h> and described in the compiler documentation. However it took several compiler releases for all the new POWER8 64-bit integer vector intrinsics to be added to altivec.h. This support started with GCC 4.9 but was not complete across function/type and bug free until GCC 6.0.
64-bit integer operations are commonly used in the implementation of optimized double float math library functions and this applies to the vector equivalents of math functions. So missing, incomplete or buggy support for vector long integer intrinsics can be an impediment to the implementation of optimized and portable vector double math libraries. This header is a prerequisite for vec_f64_ppc.h which together are intended to support the implementation of vector math libraries.
Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. So this header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.
This header covers operations that are any of the following:
The original VMX instruction set extension was limited to byte, halfword, and word size element operations. This limited vector arithmetic operations to char, short, int and float elements. This limitation persisted until PowerISA 2.06 (POWER7) added the Vector Scalar Extensions (VSX) facility. VSX combined/extended the FPRs and VRs into 64 by 128-bit Vector/Scalar Registers (VSRs).
VSX added a large number of scalar double-precision and vector single / double-precision floating-point operations. The double-precision scalar (xs prefix) instructions were largely duplicates of the existing Floating-Point Facility operations, extended to access the whole (64) VSX register set. Similarly the VSX vector single precision floating-point (xv prefix, sp suffix) instructions were added to give vectorized float code access to 64 VSX registers.
The addition of VSX vector double-precision (xv prefix) instructions was the most significant addition. This added vector doubleword floating-point operations and provided access to all 64 VSX registers. Alas, there are no doubleword (64-bit long) integer operations in the initial VSX. A few logical and permute class (xx prefix) operations on word/doubleword elements were tacked on. These apply equally to float and integer elements. But nothing for 64-bit integer arithmetic.
PowerISA 2.07 (POWER8) did add a significant number of doubleword (64-bit) integer operations. Including;
Also a number of new word (32-bit) integer operations;
And some new quadword (128-bit) integer operations;
And some specialized operations;
The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:
PowerISA 3.0 (POWER9) adds a few more doubleword (64-bit) integer operations. Including;
Also a number of new word (32-bit) integer operations;
And some new quadword (128-bit) integer operations;
The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:
An impressive list of operations that can be used for;
The challenge is that useful operations available for POWER9 will need equivalent implementations for POWER8 and POWER7. Similarly, operations introduced for POWER8 will need POWER7 implementations. Also there are some obvious missing operations;
The stated goals for pveclib are:
So the first step is to provide implementations for the key POWER8 doubleword integer operations for older compilers. For example, some of the generic doubleword integer operations were not defined until GCC 6.0. Here we define the specific Compare Equal Unsigned Doubleword implementation:
The implementation checks if the compile target is POWER8 then checks if the compiler is new enough to use the generic vector compare built-in. If the generic built-in is not defined in <altivec.h> then we provide the equivalent inline assembler.
For POWER7 targets we don't have any vector compare doubleword operations and we need to define the equivalent operation using PowerISA 2.06B (and earlier) instructions. For example:
Here we use Compare Equal Unsigned Word. If all words are equal, use the result as is. Otherwise, if any word elements are not equal, we do some extra work. For each doubleword, rotate the word compare result by 32-bits (here we use permute as we don't have rotate doubleword either). Then logical and the original word compare and rotated results to get the final doubleword compare results.
Similarly for all the doubleword compare variants. Similarly for doubleword; add, subtract, maximum, minimum, shift, rotate, count leading zeros, population count, and Byte reverse.
Now we can look at the case where vector doubleword operations of interest don't have an equivalent instruction. Here interesting operations include those that are supported for other element sizes and types.
The simplest example is absolute difference which was introduced in PowerISA 3.0 for byte, halfword and word elements. From the implementation of vec_absduw() we see how to implement the operation for POWER8 using subtract, maximum, and minimum. For example:
This works because pveclib provides implementations for min, max, and sub operations that work across GCC versions and provide processor specific implementations for POWER8/9 and POWER7.
Now we need to look at the multiply doubleword situation. We need implementations for vec_msumudm(), vec_muleud(), vec_mulhud(), vec_muloud(), and vec_muludm(). We saw in the implementations of vec_int32_ppc.h that multiply high and low/modulo can be implemented using multiply and merge even/odd of that element size. Multiply low can also be implemented using the multiply sum and multiply odd of the next smaller element size. Also multiply-sum can be implemented using multiply even/odd and a couple of adds. And multiply even/odd can be implemented using multiply sum by supplying zeros to appropriate inputs/elements.
The above discussion has many circular dependencies. Eventually we need to get down to an implementation on each processor using actual hardware instructions. So what multiply doubleword operations does the PowerISA actually have from the list above:
It seems the best implementation strategy uses;
We really care about performance and latency for POWER9/8. We need POWER7 to work correctly so we can test on and support legacy hardware. The rest is grade school math.
First we need to make sure we have implementations across the GCC versions 6, 7, and 8 for the instructions we need. For example:
While we are at it we can implement multiply-sum unsigned word modulo.
We will need this later.
Now we need to provide implementations of vec_muleud() and vec_muloud(). For example:
The implementation above is just handling the pesky little endian transforms. The real implementations are in vec_vmuleud() and vec_vmuloud() which implement the operation as if the PowerISA included such an instruction. These implementations are NOT endian sensitive and their function is stable across BE/LE implementations. For example:
The _ARCH_PWR9 implementation uses the multiply-sum doubleword operation but implements the multiply even behavior by forcing the contents of doubleword element 1 of [VRB] and the contents of [VRC] to 0.
The _ARCH_PWR8 implementation looks ugly but it works. It starts with some merges and splats to get input columns lined up for the multiply. Then we use (POWER8 instructions) Multiply Even/Odd Unsigned Word to generate doubleword partial products. Then more merges and a rotate to line up the partial products for summation as the final quadword product.
Individually vec_vmuleud() and vec_vmuloud() execute with a latency of 21-23 cycles on POWER8. Normally these operations are used and scheduled together as in the POWER8 implementation of vec_msumudm() or vec_mulhud(). Good scheduling by the compiler and pipelining keeps the POWER8 latency in the 28-32 cycle range. For example, the vec_mulhud() implementation:
Generates the following code for POWER8:
The POWER9 latencies for this operation range from 5-7 (for vmsumudm itself) to 11-16 (for vec_mulhud()). The additional latency reflects zero constant vector generation and merges required to condition the inputs and output. For these operations the vec_msumudm() vrc operand is always zero. Selecting the even/odd doubleword for input requires a merge low/high. And selecting the high doubleword for multiply high requires a final merge high.
vec_mulhud() generates the following code for POWER9:
Wrapping up the doubleword multiplies we should look at the multiply low (AKA Multiply Unsigned Doubleword Modulo). The POWER9 implementation is similar to vec_mulhud () and the generated code is similar to the example above.
Multiply low doubleword is a special case, as we are discarding the highest partial doubleword product. For POWER8 we can optimize for that case using multiply odd and multiply-sum word operations. Also as we are only generating doubleword partial products we only need add doubleword modulo operations to sum the results. This avoids the more expensive add quadword operation required for the general case. The fact that vec_vmsumuwm() is only a software construct is not an issue. It expands into hardware multiply even/odd word and add doubleword instructions that the compiler can schedule and optimize.
Here vec_mulouw() generates low order partial product. Then vec_vrld () and vec_vmsumuwm() generate doubleword sums of the two middle order partial products. Then vec_vsld() shifts the middle order partial sum left 32-bits (discarding the unneeded high order 32-bits). Finally sum the low and middle order partial doubleword products to produce the multiply-low doubleword result. For example, this POWER8 only implementation:
Which generates the following for POWER8:
And we can assume that the constant load of { 32, 32 } will be common-ed with other operations or hoisted out of loops. So the shift constant can be loaded early and vrld is not delayed. This keeps the POWER8 latency in the 19-28 cycle range.
Programming with vector doubleword integers will need doubleword constants for masking and arithmetic operations. Doubleword splat constants are common in vectorized long integer code for arithmetic, comparison, and mask operations. For example:
The endian sensitive macros from vec_common_ppc.h can be used to construct doubleword integer constants. For example:
In most cases the compiler will allocate these constant values to the read-only data (.rodata) section. When these constants are referenced in programming operations the compiler generates the appropriate vector loads. For example GCC V11 generates the following for the -mcpu=power8 target:
The addis/addi/lvx pattern is common to loading most vector constants for POWER8 and earlier.
For some odd reason the compiler might generate the sequence:
for -mcpu=power8 ppc64le targets.
The Load VSX Vector Dword*2 Indexed (lxvd2x) would be required if the compiler could not know that the data was quadword aligned. The lxvd2x instruction handles unaligned access but requires the little endian adjustment (xxswapd). However the compiler controls the allocation and alignment of vector constants in .rodata and already ensures quadword alignment.
For the -mcpu=power9 (and later) target GCC uses the Load VSX Vector (lxv) instruction:
The first sequence is expected for POWER8 as PowerISA 2.07B does not have any displacement form (D-Form) vector (VSX) load/store instructions. The compiler allocates constants to the .rodata sections and the linker collects .rodata from object files into a combined executable .rodata section. This section is placed near the Table of Contents (TOC) section. The ABI dedicates R2 as the base address .TOC. for the TOC and adjacent sections.
The Add Immediate Shifted (addis) Add Immediate (addi) sequence above computes a signed 32-bit .TOC. relative offset to a specific .rodata quadword. Two instructions are required as; addis provides the high adjusted (@ha) 16-bits shifted left 16-bits, while addi provides the low (@l) 16-bits. The sum of R2 and these immediate values is the 64-bit effective address of a .rodata constant value. A signed 32-bit offset is large enough to support most (-mcmodel=medium) program and library executables.
The load itself has a 5-cycle latency assuming a L1 cache hit. The three instruction sequence is sequentially dependent and requires 9-cycles latency (minimum) to execute. A L1 cache miss will increase the latency by 7-28 cycles, assuming the data resides in the L2/L3 caches.
However the compiler is not following the recommendations of PowerISA 2.07B, Book II, Chapter 2.1 Performance-Optimized Instruction Sequences. This chapter recommends a specific pattern for the addi/lvx sequence. For example:
In this case rx can be any GPR (including r0) while RA must be a valid base (r1 <-> r31) register.
The POWER8 implementation allows for Instruction Fusion, combining information from two adjacent instructions into one (internal) instruction so that it executes faster than the non-fused case. Effectively the addi/lvx combination above becomes a D-Form load vector instruction.
There are additional restrictions on the definition of adjacent:
This can reduce the latency from 9 to 7-cycles. This would be true even without Instruction Fusion as the addis/addi instructions are now independent and can execute in parallel.
The sequence generated for POWER9 is even more disappointing. The lxv is a D-Form (DQ) instruction and the displacement operand could be used to replace the addi instruction. For example: -mcpu=power9 target:
This provides the equivalent 32-bit TOC relative displacement with one less instruction and reduced latency of 7-cycles.
This is all a little cumbersome and it seems like there should be a better/faster way. Any instruction sequence that loads quadword integer constants in:
is a good deal.
The base (Altivec) vector ISA included Vector Splat Immediate Signed Byte/Halfword/Word instructions. These are fast (2-cycle latency) and convenient for small integer constants in the range -16 to 15. So far the ISA has not added doubleword or quadword forms of splat immediate.
POWER9 added a VSX Vector Splat Immediate Byte (xxspltib) instruction. This expands the immediate range to -128 to 127 but does not include larger element sizes. POWER9 does provide Vector Extend Sign Byte To Word/Doubleword (vextsb2w/vextsb2d) instructions. For example the two instruction sequence:
can generate a doubleword splat immediate for integers in the range -128 to 127 with a latency of 5 cycles.
The GCC compiler does recognize some vector constants as special cases. For example:
will generate:
As we will see the all zero/ones constants are common building blocks. So the compiler should treat these as common sub expressions across all operations using those constants.
So the compiler can do clever things with vector constants. But so far these are the only examples I have found. Other cases that you might expect to be a special case are not. For example:
both generate the 3 instruction (9-cycle) load from .rodata sequence. Also constants using the vector long long or __int128 types may fail to compile on older versions of the compiler.
We can generate small constants in the range -16 <-> 15 using the following pattern:
Which should generate:
Here we use the vec_splat_s32(15) intrinsic to generate Vector Splat Immediate Signed Word (vspltisw) to splat the value 15 across word elements of vwi. Then vec_unpackl (vwi) to generate Vector Unpack Low Signed Word vupklsw which sign extends the 2 low words of vwi to signed doubleword elements. This sequence is only 2 instructions and will execute with 4-cycle latency.
Putting this all together we can create a static inline function to generate small doubleword constants (in the range -16 to 15). For example:
This version uses only <altivec.h> intrinsics supported by POWER8 and earlier. For constants in the range (-16 to 15) the range is divided into two groups:
Values outside this range use the vec_splats() intrinsic which will generate the appropriate quadword constant in .rodata and the load sequence to retrieve that value.
For POWER9 and later we can use the vec_splats() intrinsic which (so far) generates the xxspltib/vextsb2d sequence for the constant range -128 to 127.
From the examples above we see that the construction of higher precision multiplies requires significant massaging of input and output elements. Here merge even/odd, merge high/low, swap, and splat doubleword element operations are commonly used.
PowerISA 2.06 VSX (POWER7) added the general purpose Vector Permute Doubleword Immediate (xxpermdi). The compiler generates some form of xxpermdi for the doubleword (double float, long int, bool long) merge/splat/swap operations. As xxpermdi's element selection is an immediate field, most operations require only a single instruction. All the merge/splat/swap doubleword variants are just a specific select mask value and the inputs to xxpermdi.
Which is very useful indeed for assembling, disassembling, merging, splatting, swapping, and pasting doubleword elements.
Of course it took several compiler releases to implement all the generic merge/splat/swap operations for the supported types. GCC 4.8 was the first to support vec_xxpermdi as a built-in. GCC 4.8 also supported the generic built-ins vec_mergeh, vec_mergel, and vec_splat for the vector signed/unsigned/bool long type. But endian sensitive vec_mergeh, vec_mergel, and vec_splat were not supported until GCC 7. And the generic vec_mergee, vec_mergeo built-ins were not supported until GCC 8.
But as we have explained in General Endian Issues and Endian problems with word operations the little endian transforms applied by the compiler can cause problems for developers of multi-precision libraries. The doubleword forms of the generic merge/splat operations etc. are no exception. This is especially annoying when the endian sensitive transforms are applied between releases of the compiler.
So we need a strategy to provide endian invariant merge/splat/swap operations to be used in multi-precision arithmetic. And another set of endian sensitive operations that are mandated by the OpenPOWER ABI.
First we need a safely endian invariant version of xxpermdi to use in building other variants:
Then build the core set of endian invariant permute doubleword operations using vec_permdi():
We use the merge algebraic high/low doubleword operations in the implementation of vec_mulhud(), vec_mulhud(), vec_vmuleud(), and vec_vmuloud(). We use the vec_xxspltd operation in the implementation of vec_vrld(), vec_vmuleud(), and vec_vmuloud(). We use the paste doubleword (vec_pasted()) operation in the implementation of vec_vsrad(), vec_vmuleud(), and vec_vmuloud(). We use the swap doubleword operation in the implementation of vec_cmpequq(), vec_cmpneuq(), vec_muludq(), and vec_mulluq().
Then use the compiler's __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ conditional to invert the vec_permdi() select control for endian sensitive merge/splat doubleword operations:
Suppose we have a requirement to convert an array of 64-bit time-interval values to timespec format. For simplicity we will also assume that the array is nicely (Quadword) aligned and an integer multiple of 2 doublewords or 4 words.
The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume we are dealing with longer intervals (greater than ~8.38 seconds) so the full 64-bit TimeBase is required. TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.
The timespec format is a struct of unsigned int fields for seconds and nanoseconds. So the task is to convert the 512MHz 64-bit TimeBase intervals to seconds and remaining clock ticks. Then convert the remaining (subsecond) clock ticks from 512MHz to nanoseconds. The separate seconds and nanoseconds are combined in the timespec structure.
First we need to separate the raw TimeBase into the integer seconds and (subsecond) clock-ticks. Normally scalar codes would use integer divide/modulo by 512000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?
Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 64-bit fraction and we have a multiply high (vec_mulhud()) operation. Multiplying a 64-bit unsigned integer by a 64-bit unsigned fraction generates a 128-bit product with 64-bits above (integer) and below (fraction) the radix point. The high 64-bits of the product is the integer quotient.
It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full range may require pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.
In the chapter above;
Figure 10-2 Computing the magic number for unsigned division.
provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, the "add" indicator, and the shift amount).
For the divisor 512000000 this is { 4835703278458516699, 0 , 27 }:
Next we need to convert the subseconds from TimeBase clock-ticks to nanoseconds. The subsecond remainder is now small enough (compared to a doubleword) that we can perform the conversion in place. The nanosecond conversion is ((tb_clocks * 1000000000) / 512000000). And we can reduce this to ((tb_clocks * 1000) / 512). We still have a small multiply but the divide can be converted to shift right of 9-bits.
Finally we need to merge the vectors of seconds and nanoseconds into vectors of timespec. So far we have been working with 64-bit integers but the timespec is a struct of 32-bit (word) integers. Here 32-bit seconds and nanosecond provided sufficient range and precision. So the final step packs a pair of 64-bit timespec values into a vector of two 32-bit timespec values, each containing 2 32-bit (second, nanosecond) values.
Here is the complete vectorized 64-bit TimeBase to timespec conversion example:
High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.
Vector Absolute Difference Unsigned Doubleword.
Compute the absolute difference for each doubleword. For each unsigned doubleword, subtract VRB[i] from VRA[i] and return the absolute value of the difference.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 1/cycle |
power9 | 5 | 1/cycle |
vra | vector of 2 x unsigned doublewords |
vrb | vector of 2 x unsigned doublewords |
Vector Add Unsigned Doubleword Modulo.
Add two vector long int values and return modulo 64-bits result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
a | 128-bit vector long int. |
b | 128-bit vector long int. |
Vector Count Leading Zeros Doubleword for unsigned long long elements.
Count the number of leading '0' bits (0-64) within each doubleword element of a 128-bit vector.
For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Doubleword instruction vclzd. Otherwise use sequence of pre 2.07 VMX instructions.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x 64-bit unsigned long long (doubleword) elements. |
Vector Compare Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare Greater Than or Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd, then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare Greater Than or Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud, then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare Greater Than Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Signed DoubleWord (vcmpgtsd) instruction. Otherwise use boolean logic using word compares.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare Greater Than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Unsigned DoubleWord (vcmpgtud) instruction. Otherwise use boolean logic using word compares.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare Less Than Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] <= b[i], otherwise all '0's. Use vec_cmpgtsd, then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare Less Than Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] <= b[i], otherwise all '0's. Use vec_cmpgtud, then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare less Than Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare less Than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare Not Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare Not Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/cycle |
power9 | 5 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a and b are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Greater Than or Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a >= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Greater Than Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a > b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Less than equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a <= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Less than Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a < b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Not Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if all elements of a and b are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a and b are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Greater Than or Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a >= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Greater Than Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a > b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Less than equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a <= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Less than Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a < b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare any Not Equal Signed Doubleword.
Compare each signed long (64-bit) integer and return true if any elements of a and b are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit signed long integer (dword) elements. |
Vector Compare all Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a and b are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Greater Than or Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a >= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Greater Than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a > b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Less than equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a <= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Less than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a < b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare all Not Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if all elements of a and b are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a and b are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Greater Than or Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a >= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Greater Than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a > b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Less than equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a <= b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Less than Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a < b.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Compare any Not Equal Unsigned Doubleword.
Compare each unsigned long (64-bit) integer and return true if any elements of a and b are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
b | 128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements. |
Vector Count Trailing Zeros Doubleword for unsigned long long elements.
Count the number of trailing '0' bits (0-64) within each doubleword element of a 128-bit vector.
For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Doubleword instruction vctzd. Otherwise use a sequence of pre ISA 3.0 VMX instructions leveraging the PVECLIB popcntd operation. SIMDized count Trailing zeros inspired by:
Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.
processor | Latency | Throughput |
---|---|---|
power8 | 8-10 | 2/2 cycles |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as 2 x 64-bit integer (doublewords) elements. |
Vector Maximum Signed Doubleword.
For each doubleword element [0|1] of vra and vrb compare as signed integers and return the larger value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector long int. |
vrb | 128-bit vector long int. |
Vector Maximum Unsigned Doubleword.
For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the larger value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector unsigned long int. |
vrb | 128-bit vector unsigned long int. |
Vector Minimum Signed Doubleword.
For each doubleword element [0|1] of vra and vrb compare as signed integers and return the smaller value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector long int. |
vrb | 128-bit vector long int. |
Vector Minimum Unsigned Doubleword.
For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the smaller value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector unsigned long int. |
vrb | 128-bit vector unsigned long int. |
Vector Merge Algebraic High Doublewords.
Merge only the high doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Even Doubleword operation that is not modified for endian.
For example, merge the high 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply high unsigned doubleword.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned __int128. |
vrb | 128-bit vector unsigned __int128. |
Vector Merge Algebraic Low Doublewords.
Merge only the low doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Odd Doubleword operation that is not modified for endian.
For example, merge the low 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply low unsigned doubleword.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | 128-bit vector unsigned __int128. |
vrb | 128-bit vector unsigned __int128. |
Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
__VA | a 128-bit vector as the source of the results even doubleword. |
__VB | a 128-bit vector as the source of the results odd doubleword. |
Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
__VA | a 128-bit vector as the source of the results even doubleword. |
__VB | a 128-bit vector as the source of the results odd doubleword. |
Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
__VA | a 128-bit vector as the source of the results even doubleword. |
__VB | a 128-bit vector as the source of the results odd doubleword. |
Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
__VA | a 128-bit vector as the source of the results even doubleword. |
__VB | a 128-bit vector as the source of the results odd doubleword. |
Vector Multiply-Sum Unsigned Doubleword Modulo.
Vector Multiply Even Unsigned Doublewords.
Vector Multiply High Unsigned Doubleword.
Vector Multiply Odd Unsigned Doublewords.
Vector Multiply Unsigned Doubleword Modulo.
Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
__VH | a 128-bit vector as the source of the high order doubleword. |
__VL | a 128-bit vector as the source of the low order doubleword. |
Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector.
The 2-bit control operand (ctl) selects which doubleword from the 1st and 2nd vector operands are transferred to the result vector. Control table:
ctl | vrt[0:63] | vrt[64:127] |
---|---|---|
0 | vra[0:63] | vrb[0:63] |
1 | vra[0:63] | vrb[64:127] |
2 | vra[64:127] | vrb[0:63] |
3 | vra[64:127] | vrb[64:127] |
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector as the source of the high order doubleword of the result. |
vrb | a 128-bit vector as the source of the low order doubleword of the result. |
ctl | const integer where the low order 2 bits control the selection of doublewords from input vector vra and vrb. |
Vector Population Count doubleword.
Count the number of '1' bits (0-64) within each doubleword element of a 128-bit vector.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
For POWER8 (PowerISA 2.07B) or later use the Vector Population Count DoubleWord (vpopcntd) instruction. Otherwise use the pveclib vec_popcntw to count each word then sum across with Vector Sum across Half Signed Word Saturate (vsum2sws).
vra | 128-bit vector treated as 2 x 64-bit integer (dwords) elements. |
byte reverse each doubleword for a vector unsigned long int.
For each doubleword of the input vector, reverse the order of bytes / octets within the doubleword.
processor | Latency | Throughput |
---|---|---|
power8 | 2-11 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector unsigned long int. |
Vector Rotate left Doubleword Immediate.
Rotate left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The rotate amount is a const unsigned int in the range 0-63. A rotate count of 0 returns the original value of vra. Rotate counts greater than 63 bits are handled modulo 64.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned long int. |
shb | rotate amount in the range 0-63. |
Vector Select Signed Doubleword.
Return the value, (vra & ~vrc) | (vrb & vrc).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as a vector signed long long int. |
vrb | a 128-bit vector treated as a vector signed long long int. |
vrc | a 128-bit vector treated as vector bool long long int. |
Vector Select Unsigned Doubleword.
Return the value, (vra & ~vrc) | (vrb & vrc).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned long long int. |
vrb | a 128-bit vector treated as a vector unsigned long long int. |
vrc | a 128-bit vector treated as vector bool long long int. |
Vector Set Bool from Signed Doubleword.
For each doubleword, propagate the sign bit to all 64-bits of that doubleword. The result is vector bool long long reflecting the sign bit of each 64-bit doubleword.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | Vector signed long long. |
Vector Shift left Doubleword Immediate.
Shift left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned long int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned long int. |
shb | shift amount in the range 0-63. |
|
inlinestatic |
Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent of Vector Splat Immediate Signed (Byte | Halfword | Word).
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 9 | 2/cycle |
power9 | 5 | 2/cycle |
sim | a small signed integer const. |
|
inlinestatic |
Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent of Vector Splat Immediate Unsigned (Byte | Halfword | Word).
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 9 | 2/cycle |
power9 | 5 | 2/cycle |
sim | a small unsigned integer const. |
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:
ctl | vrt[0] | vrt[1] |
---|---|---|
0 | vra[0] | vra[0] |
1 | vra[1] | vra[1] |
vra | a 128-bit vector. |
ctl | a const integer encoding the source doubleword. |
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:
ctl | vrt[0:63] | vrt[64:127] |
---|---|---|
0 | vra[0:63] | vra[0:63] |
1 | vra[64:127] | vra[64:127] |
vra | a 128-bit vector. |
ctl | a const integer encoding the source doubleword. |
Vector Shift Right Algebraic Doubleword Immediate.
Shift Right Algebraic each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return the sign bit propagated to each bit of each element.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | a 128-bit vector treated as a vector signed long int. |
shb | shift amount in the range 0-63. |
Vector Shift Right Doubleword Immediate.
Shift Right each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 2-4 | 2/cycle |
power9 | 2-5 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned long int. |
shb | shift amount in the range 0-63. |
Vector Subtract Unsigned Doubleword Modulo.
For each unsigned long (64-bit) integer element c[i] = a[i] + NOT(b[i]) + 1.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
For POWER8 (PowerISA 2.07B) or later use the Vector Subtract Unsigned Doubleword Modulo (vsubudm) instruction. Otherwise use vector add word modulo forms and propagate the carry bits.
a | 128-bit vector treated as 2 X unsigned long int. |
b | 128-bit vector treated as 2 X unsigned long int. |
Vector doubleword swap. Exchange the high and low doubleword elements of a vector.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector. |
Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets.
For each doubleword element [i] of vra, load the doubleword element at *(char*)array+vra[i]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).
processor | Latency | Throughput |
---|---|---|
power8 | 12 | 1/cycle |
power9 | 11 | 1/cycle |
array | Pointer to array of integer doublewords. |
vra | Vector of doubleword (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes.
For each doubleword element [i] of vra, load the doubleword element array[vra[i] * (1 << scale)]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * 2^scale), which is effected by shifting left (3+scale) bits.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of integer doublewords. |
vra | Vector of signed doubleword indexes. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes.
For each doubleword element [i] of vra, load the doubleword element from array[vra[i]]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by sizeof (array element), which is effected by shifting left 3 bits.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 13-22 | 1/cycle |
array | Pointer to array of integer doublewords. |
vra | Vector of signed doubleword indexes. |
|
inlinestatic |
Vector Gather-Load Integer Doublewords from Scalar Offsets.
For each scalar offset[0|1], load the doubleword element at *(char*)array+offset[0|1]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets should be doubleword aligned (integer multiple of 8).
processor | Latency | Throughput |
---|---|---|
power8 | 7 | 1/cycle |
power9 | 8 | 1/cycle |
array | Pointer to array of integer doublewords. |
offset0 | Scalar (64-bit) byte offsets from &array. |
offset1 | Scalar (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Load Scalar Integer Doubleword Indexed.
Load the left most doubleword of vector xt as a scalar doubleword from the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be doubleword aligned (integer multiple of 8).
This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. This simplifies merging elements for gather operations.
processor | Latency | Throughput |
---|---|---|
power8 | 5 | 2/cycle |
power9 | 5 | 2/cycle |
ra | const signed doubleword index (offset/displacement). |
rb | const doubleword pointer to an array of doublewords. |
Vector Multiply-Add2 Even Unsigned Doublewords.
Vector Multiply-Add2 Even Unsigned Words.
Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c and d: (a_even * b_even) + EXTZ(c_even) + EXTZ(d_even).
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 1/cycle |
power9 | 9 | 1/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
c | 128-bit vector unsigned int. |
d | 128-bit vector unsigned int. |
Vector Multiply-Add2 Odd Unsigned Doublewords.
Vector Multiply-Add2 Odd Unsigned Words.
Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c and d: (a_odd * b_odd) + EXTZ(c_odd) + EXTZ(d_odd).
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 1/cycle |
power9 | 9 | 1/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
c | 128-bit vector unsigned int. |
d | 128-bit vector unsigned int. |
Vector Multiply-Add Even Unsigned Doublewords.
Vector Multiply-Add Even Unsigned Words.
Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c: (a_even * b_even) + EXTZ(c_even).
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 2/cycle |
power9 | 9 | 2/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
c | 128-bit vector unsigned int. |
Vector Multiply-Add Odd Unsigned Doublewords.
Vector Multiply-Add Odd Unsigned Words.
Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c: (a_odd * b_odd) + EXTZ(c_odd).
processor | Latency | Throughput |
---|---|---|
power8 | 9 | 2/cycle |
power9 | 9 | 2/cycle |
a | 128-bit vector unsigned int. |
b | 128-bit vector unsigned int. |
c | 128-bit vector unsigned int. |
Vector Multiply-Sum Even Unsigned Doublewords.
Vector Multiply-Sum Odd Unsigned Doublewords.
Vector Multiply-Sum Unsigned Word Modulo.
Multiply the unsigned word elements of vra and vrb, internally generating doubleword products. Then generate three-way sum of adjacent doubleword product pairs, plus the doubleword elements from vrc. The final summation is modulo 64-bits.
processor | Latency | Throughput |
---|---|---|
power8 | 11 | 1/cycle |
power9 | 11 | 1/cycle |
vra | 128-bit vector unsigned int. |
vrb | 128-bit vector unsigned int. |
vrc | 128-bit vector unsigned long. |
Vector Multiply Even Unsigned Doublewords.
Vector Multiply Odd Unsigned Doublewords.
Vector Pack Unsigned Doubleword Unsigned Modulo.
The doubleword source is the concatenation of vra and vrb. For each integer word from 0 to 3, of the result vector, do the following: place the contents of bits 32:63 of the corresponding doubleword source element [i] into word element [i] of the result.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x unsigned long integers. |
vrb | a 128-bit vector treated as 2 x unsigned long integers. |
Vector Rotate Left Doubleword.
Vector Rotate Left Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x unsigned long integers. |
vrb | shift amount in bits 58:63 and 122:127. |
Vector Shift Left Doubleword.
Vector Shift Left Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x unsigned long integers. |
vrb | shift amount in bits 58:63 and 122:127. |
Vector Shift Right Algebraic Doubleword.
Vector Shift Right Algebraic Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x signed long integers. |
vrb | shift amount in bits 58:63 and 122:127. |
Vector Shift Right Doubleword.
Vector Shift Right Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 2 | 2/cycle |
vra | a 128-bit vector treated as 2 x unsigned long integers. |
vrb | shift amount in bits 58:63 and 122:127. |
Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets.
For each doubleword element [i] of vra, store the doubleword element xs[i] at the address *(char*)array+vra[i]. For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).
processor | Latency | Throughput |
---|---|---|
power8 | 12 | 1/cycle |
power9 | 8 | 1/cycle |
xs | Vector of integer doubleword elements to scatter store. |
array | Pointer to array of integer doublewords. |
vra | Vector of doubleword (64-bit) byte offsets from &array. |
|
inlinestatic |
Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes.
For each doubleword element [i] of vra, store the doubleword element xs[i] at array[(vra[i] << scale)]. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * 2^scale), which is effected by shifting left (3+scale) bits.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 10-19 | 1/cycle |
xs | Vector of integer doubleword elements to scatter store. |
array | Pointer to array of integer doublewords. |
vra | Vector of signed doubleword indexes. |
scale | 8-bit integer. Indexes are multiplied by 2^scale. |
Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes.
For each doubleword element [i] of vra, store the doubleword element xs[i] at array[vra[i]]. Indexes are converted to byte offsets from *array by shifting each doubleword of vra left 3 bits.
processor | Latency | Throughput |
---|---|---|
power8 | 14-23 | 1/cycle |
power9 | 10-19 | 1/cycle |
xs | Vector of integer doubleword elements to scatter store. |
array | Pointer to array of integer doublewords. |
vra | Vector of signed doubleword indexes. |
|
inlinestatic |
Vector Scatter-Store Integer Doublewords to Scalar Offsets.
For each doubleword element [i] of xs, store the doubleword element xs[i] at *(char*)array+offset[0|1]. For best performance, &array and doubleword offsets should be doubleword aligned (integer multiple of 8).
processor | Latency | Throughput |
---|---|---|
power8 | 12 | 1/cycle |
power9 | 8 | 1/cycle |
xs | Vector of integer doubleword elements to scatter store. |
array | Pointer to array of integer doublewords. |
offset0 | Scalar (64-bit) byte offset from &array. |
offset1 | Scalar (64-bit) byte offset from &array. |
|
inlinestatic |
Vector Store Scalar Integer Doubleword Indexed.
Stores the left most doubleword of vector xs as a scalar doubleword at the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be doubleword aligned (integer multiple of 8).
This operation is an alternate form of vector store element, with the added simplification that data is always left justified in the vector. This simplifies scatter operations.
processor | Latency | Throughput |
---|---|---|
power8 | 0 - 2 | 2/cycle |
power9 | 0 - 2 | 4/cycle |
xs | vector doubleword element 0 to be stored. |
ra | const signed long long index (offset/displacement). |
rb | const doubleword pointer to an array of doublewords. |
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result.
The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:
ctl | vrt[0:63] | vrt[64:127] |
---|---|---|
0 | vra[0:63] | vra[0:63] |
1 | vra[64:127] | vra[64:127] |
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector. |
ctl | a const integer encoding the source doubleword. |