POWER Vector Library Manual
1.0.4
|
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions. More...
Go to the source code of this file.
Macros | |
#define | CONST_VUINT128_QxW(__q0, __q1, __q2, __q3) |
Generate a vector unsigned __int128 constant from words. More... | |
#define | CONST_VUINT128_QxD(__q0, __q1) |
Generate a vector unsigned __int128 constant from doublewords. More... | |
#define | CONST_VUINT128_Qx19d(__q0, __q1) |
Generate a vector unsigned __int128 constant from doublewords. More... | |
#define | CONST_VUINT128_Qx18d(__q0, __q1) |
Generate a vector unsigned __int128 constant from doublewords. More... | |
#define | CONST_VUINT128_Qx16d(__q0, __q1) |
Generate a vector unsigned __int128 constant from doublewords. More... | |
Functions | |
static vui128_t | vec_absduq (vui128_t vra, vui128_t vrb) |
Vector Absolute Difference Unsigned Quadword. More... | |
static vi128_t | vec_abssq (vi128_t vra) |
Vector Absolute Value Signed Quadword. More... | |
static vui128_t | vec_avguq (vui128_t vra, vui128_t vrb) |
Vector Average Unsigned Quadword. More... | |
static vui128_t | vec_addcuq (vui128_t a, vui128_t b) |
Vector Add & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_addecuq (vui128_t a, vui128_t b, vui128_t ci) |
Vector Add Extended & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci) |
Vector Add Extended Unsigned Quadword Modulo. More... | |
static vui128_t | vec_adduqm (vui128_t a, vui128_t b) |
Vector Add Unsigned Quadword Modulo. More... | |
static vui128_t | vec_addcq (vui128_t *cout, vui128_t a, vui128_t b) |
Vector Add with carry Unsigned Quadword. More... | |
static vui128_t | vec_addeq (vui128_t *cout, vui128_t a, vui128_t b, vui128_t ci) |
Vector Add Extend with carry Unsigned Quadword. More... | |
static vui128_t | vec_clzq (vui128_t vra) |
Vector Count Leading Zeros Quadword for unsigned __int128 elements. More... | |
static vui128_t | vec_ctzq (vui128_t vra) |
Vector Count Trailing Zeros Quadword for unsigned __int128 elements. More... | |
static vb128_t | vec_cmpeqsq (vi128_t vra, vi128_t vrb) |
Vector Compare Equal Signed Quadword. More... | |
static vb128_t | vec_cmpequq (vui128_t vra, vui128_t vrb) |
Vector Compare Equal Unsigned Quadword. More... | |
static vb128_t | vec_cmpgesq (vi128_t vra, vi128_t vrb) |
Vector Compare Greater Than or Equal Signed Quadword. More... | |
static vb128_t | vec_cmpgeuq (vui128_t vra, vui128_t vrb) |
Vector Compare Greater Than or Equal Unsigned Quadword. More... | |
static vb128_t | vec_cmpgtsq (vi128_t vra, vi128_t vrb) |
Vector Compare Greater Than Signed Quadword. More... | |
static vb128_t | vec_cmpgtuq (vui128_t vra, vui128_t vrb) |
Vector Compare Greater Than Unsigned Quadword. More... | |
static vb128_t | vec_cmplesq (vi128_t vra, vi128_t vrb) |
Vector Compare Less Than or Equal Signed Quadword. More... | |
static vb128_t | vec_cmpleuq (vui128_t vra, vui128_t vrb) |
Vector Compare Less Than or Equal Unsigned Quadword. More... | |
static vb128_t | vec_cmpltsq (vi128_t vra, vi128_t vrb) |
Vector Compare Less Than Signed Quadword. More... | |
static vb128_t | vec_cmpltuq (vui128_t vra, vui128_t vrb) |
Vector Compare Less Than Unsigned Quadword. More... | |
static vb128_t | vec_cmpnesq (vi128_t vra, vi128_t vrb) |
Vector Compare Not Equal Signed Quadword. More... | |
static vb128_t | vec_cmpneuq (vui128_t vra, vui128_t vrb) |
Vector Compare Not Equal Unsigned Quadword. More... | |
static int | vec_cmpsq_all_eq (vi128_t vra, vi128_t vrb) |
Vector Compare all Equal Signed Quadword. More... | |
static int | vec_cmpsq_all_ge (vi128_t vra, vi128_t vrb) |
Vector Compare all Greater Than or Equal Signed Quadword. More... | |
static int | vec_cmpsq_all_gt (vi128_t vra, vi128_t vrb) |
Vector Compare all Greater Than Signed Quadword. More... | |
static int | vec_cmpsq_all_le (vi128_t vra, vi128_t vrb) |
Vector Compare all Less Than or Equal Signed Quadword. More... | |
static int | vec_cmpsq_all_lt (vi128_t vra, vi128_t vrb) |
Vector Compare all Less Than Signed Quadword. More... | |
static int | vec_cmpsq_all_ne (vi128_t vra, vi128_t vrb) |
Vector Compare all Not Equal Signed Quadword. More... | |
static int | vec_cmpuq_all_eq (vui128_t vra, vui128_t vrb) |
Vector Compare all Equal Unsigned Quadword. More... | |
static int | vec_cmpuq_all_ge (vui128_t vra, vui128_t vrb) |
Vector Compare all Greater Than or Equal Unsigned Quadword. More... | |
static int | vec_cmpuq_all_gt (vui128_t vra, vui128_t vrb) |
Vector Compare all Greater Than Unsigned Quadword. More... | |
static int | vec_cmpuq_all_le (vui128_t vra, vui128_t vrb) |
Vector Compare all Less Than or Equal Unsigned Quadword. More... | |
static int | vec_cmpuq_all_lt (vui128_t vra, vui128_t vrb) |
Vector Compare all Less Than Unsigned Quadword. More... | |
static int | vec_cmpuq_all_ne (vui128_t vra, vui128_t vrb) |
Vector Compare all Not Equal Unsigned Quadword. More... | |
static vui128_t | vec_cmul10ecuq (vui128_t *cout, vui128_t a, vui128_t cin) |
Vector combined Multiply by 10 Extended & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_cmul10cuq (vui128_t *cout, vui128_t a) |
Vector combined Multiply by 10 & write Carry Unsigned Quadword. More... | |
static vi128_t | vec_divsq_10e31 (vi128_t vra) |
Vector Divide by const 10e31 Signed Quadword. More... | |
static vui128_t | vec_divudq_10e31 (vui128_t *qh, vui128_t vra, vui128_t vrb) |
Vector Divide Unsigned Double Quadword by const 10e31. More... | |
static vui128_t | vec_divudq_10e32 (vui128_t *qh, vui128_t vra, vui128_t vrb) |
Vector Divide Unsigned Double Quadword by const 10e32. More... | |
static vui128_t | vec_divuq_10e31 (vui128_t vra) |
Vector Divide by const 10e31 Unsigned Quadword. More... | |
static vui128_t | vec_divuq_10e32 (vui128_t vra) |
Vector Divide by const 10e32 Unsigned Quadword. More... | |
static vi128_t | vec_maxsq (vi128_t vra, vi128_t vrb) |
Vector Maximum Signed Quadword. More... | |
static vui128_t | vec_maxuq (vui128_t vra, vui128_t vrb) |
Vector Maximum Unsigned Quadword. More... | |
static vi128_t | vec_minsq (vi128_t vra, vi128_t vrb) |
Vector Minimum Signed Quadword. More... | |
static vui128_t | vec_minuq (vui128_t vra, vui128_t vrb) |
Vector Minimum Unsigned Quadword. More... | |
static vi128_t | vec_modsq_10e31 (vi128_t vra, vi128_t q) |
Vector Modulo by const 10e31 Signed Quadword. More... | |
static vui128_t | vec_modudq_10e31 (vui128_t vra, vui128_t vrb, vui128_t *ql) |
Vector Modulo Unsigned Double Quadword by const 10e31. More... | |
static vui128_t | vec_modudq_10e32 (vui128_t vra, vui128_t vrb, vui128_t *ql) |
Vector Modulo Unsigned Double Quadword by const 10e32. More... | |
static vui128_t | vec_moduq_10e31 (vui128_t vra, vui128_t q) |
Vector Modulo by const 10e31 Unsigned Quadword. More... | |
static vui128_t | vec_moduq_10e32 (vui128_t vra, vui128_t q) |
Vector Modulo by const 10e32 Unsigned Quadword. More... | |
static vui128_t | vec_mul10cuq (vui128_t a) |
Vector Multiply by 10 & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_mul10ecuq (vui128_t a, vui128_t cin) |
Vector Multiply by 10 Extended & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_mul10euq (vui128_t a, vui128_t cin) |
Vector Multiply by 10 Extended Unsigned Quadword. More... | |
static vui128_t | vec_mul10uq (vui128_t a) |
Vector Multiply by 10 Unsigned Quadword. More... | |
static vui128_t | vec_cmul100cuq (vui128_t *cout, vui128_t a) |
Vector combined Multiply by 100 & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_cmul100ecuq (vui128_t *cout, vui128_t a, vui128_t cin) |
Vector combined Multiply by 100 Extended & write Carry Unsigned Quadword. More... | |
static vui128_t | vec_msumcud (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum and Write Carryout Unsigned Doubleword. More... | |
static vui128_t | vec_msumudm (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Unsigned Doubleword Modulo. More... | |
static vui128_t | vec_muleud (vui64_t a, vui64_t b) |
Vector Multiply Even Unsigned Doublewords. More... | |
static vui64_t | vec_mulhud (vui64_t vra, vui64_t vrb) |
Vector Multiply High Unsigned Doubleword. More... | |
static vui128_t | vec_muloud (vui64_t a, vui64_t b) |
Vector Multiply Odd Unsigned Doublewords. More... | |
static vui64_t | vec_muludm (vui64_t vra, vui64_t vrb) |
Vector Multiply Unsigned Doubleword Modulo. More... | |
static vui128_t | vec_mulhuq (vui128_t a, vui128_t b) |
Vector Multiply High Unsigned Quadword. More... | |
static vui128_t | vec_mulluq (vui128_t a, vui128_t b) |
Vector Multiply Low Unsigned Quadword. More... | |
static vui128_t | vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b) |
Vector Multiply Unsigned Double Quadword. More... | |
static vui128_t | vec_madduq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c) |
Vector Multiply-Add Unsigned Quadword. More... | |
static vui128_t | vec_madd2uq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c1, vui128_t c2) |
Vector Multiply-Add2 Unsigned Quadword. More... | |
static vi128_t | vec_negsq (vi128_t int128) |
Vector Negate Signed Quadword. More... | |
static vui128_t | vec_neguq (vui128_t int128) |
Vector Negate Unsigned Quadword. More... | |
static vui128_t | vec_popcntq (vui128_t vra) |
Vector Population Count Quadword for unsigned __int128 elements. More... | |
static vui128_t | vec_revbq (vui128_t vra) |
Vector Byte Reverse Quadword. More... | |
static vui128_t | vec_rlq (vui128_t vra, vui128_t vrb) |
Vector Rotate Left Quadword. More... | |
static vui128_t | vec_rlqi (vui128_t vra, const unsigned int shb) |
Vector Rotate Left Quadword Immediate. More... | |
static vi128_t | vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc) |
Vector Select Signed Quadword. More... | |
static vui128_t | vec_seluq (vui128_t vra, vui128_t vrb, vb128_t vrc) |
Vector Select Unsigned Quadword. More... | |
static vb128_t | vec_setb_cyq (vui128_t vcy) |
Vector Set Bool from Quadword Carry. More... | |
static vb128_t | vec_setb_ncq (vui128_t vcy) |
Vector Set Bool from Quadword not Carry. More... | |
static vb128_t | vec_setb_sq (vi128_t vra) |
Vector Set Bool from Signed Quadword. More... | |
static vui128_t | vec_sldq (vui128_t vrw, vui128_t vrx, vui128_t vrb) |
Vector Shift Left Double Quadword. More... | |
static vui128_t | vec_sldqi (vui128_t vrw, vui128_t vrx, const unsigned int shb) |
Vector Shift Left Double Quadword Immediate. More... | |
static vui128_t | vec_slq (vui128_t vra, vui128_t vrb) |
Vector Shift Left Quadword. More... | |
static vui128_t | vec_slqi (vui128_t vra, const unsigned int shb) |
Vector Shift Left Quadword Immediate. More... | |
static vi128_t | vec_splat_s128 (const int sim) |
Vector Splat Immediate Signed Quadword. Extend a signed integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Signed (Byte | Halfword |Word). More... | |
static vui128_t | vec_splat_u128 (const int sim) |
Vector Splat Immediate Unsigned Quadword. Extend an unsigned integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Unsigned (Byte | Halfword |Word). More... | |
static vi128_t | vec_sraq (vi128_t vra, vui128_t vrb) |
Vector Shift Right Algebraic Quadword. More... | |
static vi128_t | vec_sraqi (vi128_t vra, const unsigned int shb) |
Vector Shift Right Algebraic Quadword Immediate. More... | |
static vui128_t | vec_srq (vui128_t vra, vui128_t vrb) |
Vector Shift Right Quadword. More... | |
static vui128_t | vec_srqi (vui128_t vra, const unsigned int shb) |
Vector Shift Right Quadword Immediate. More... | |
static vui128_t | vec_slq4 (vui128_t vra) |
static vui128_t | vec_slq5 (vui128_t vra) |
static vui128_t | vec_srq4 (vui128_t vra) |
static vui128_t | vec_srq5 (vui128_t vra) |
static vui128_t | vec_subcuq (vui128_t vra, vui128_t vrb) |
Vector Subtract and Write Carry Unsigned Quadword. More... | |
static vui128_t | vec_subecuq (vui128_t vra, vui128_t vrb, vui128_t vrc) |
Vector Subtract Extended and Write Carry Unsigned Quadword. More... | |
static vui128_t | vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc) |
Vector Subtract Extended Unsigned Quadword Modulo. More... | |
static vui128_t | vec_subuqm (vui128_t vra, vui128_t vrb) |
Vector Subtract Unsigned Quadword Modulo. More... | |
static vui128_t | vec_vmuleud (vui64_t a, vui64_t b) |
Vector Multiply Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c) |
Vector Multiply-Add Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d) |
Vector Multiply-Add2 Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmuloud (vui64_t a, vui64_t b) |
Vector Multiply Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c) |
Vector Multiply-Add Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d) |
Vector Multiply-Add2 Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Even Unsigned Doublewords. More... | |
static vui128_t | vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c) |
Vector Multiply-Sum Odd Unsigned Doublewords. More... | |
static vui128_t | vec_vsldbi (vui128_t vra, vui128_t vrb, const unsigned int shb) |
Vector Shift Left Double Quadword by Bit Immediate. More... | |
static vui128_t | vec_vsrdbi (vui128_t vra, vui128_t vrb, const unsigned int shb) |
Vector Shift Right Double Quadword by Bit Immediate. More... | |
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions.
Some of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins. Other operations do not exist as instructions on any current processor but are useful and should be provided. This header serves to provide these operations as inline functions using existing vector built-ins or other pveclib operations.
The original VMX (AKA Altivec) only defined a few instructions that operated on the 128-bit vector as a whole. This included the vector shift left/right (bit), vector shift left/right by octet (byte), vector shift left double by octet (select a contiguous 16-bytes from 2 concatenated vectors (256-bit)), and generalized vector permute (select any 16-bytes from 2 concatenated vectors). Use of these instructions can be complicated when;
These instructions can be used in combination to provide generalized vector __int128 shift/rotate operations. Pveclib uses these operations to provide vector __int128 shift / rotate left, shift right and shift algebraic right operations. These operations require pre-conditions to avoid multiple instructions or require a combination of (bit and octet shift) instructions to get the quadword result. The compiler <altivec.h> built-ins only support individual instructions. So using these operations quickly inspires a need for a header (like this) to contain implementations of the common operations.
The VSX facility (introduced with POWER7) did not add any integer doubleword (64-bit) or quadword (128-bit) operations. However it did add a useful doubleword permute immediate and word wise; merge, shift, and splat immediate operations. Otherwise vector __int128 (128-bit elements) operations have to be implemented using VMX word and halfword element integer operations for POWER7.
POWER8 added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend). The add quadword is useful to sum the partial products for a full 128 x 128-bit multiply. The add quadword write carry and extend forms, simplify extending arithmetic to 256-bits and beyond.
While POWER8 provided quadword integer add and subtract operations, it did not provide quadword Signed/Unsigned integer compare operations. It is possible to implement quadword compare operations using existing word / doubleword compares and the new quadword subtract write-carry operation. The trick is to convert the carry into a vector bool __int128 via the vec_setb_ncq () operation. This header provides easy to use quadword compare operations.
POWER9 (PowerISA 3.0B) adds the Vector Multiply-Sum unsigned Doubleword Modulo instruction. Aspects of this instruction mean it needs to be used carefully as part of larger quadword multiply. It performs only two of the four required doubleword multiplies. The final quadword modulo sum will discard any overflow/carry from the potential 130-bit result. With careful pre-conditioning of doubleword inputs the results can not overflow from 128-bits. Then separate add quadword add/write carry operations can be used to complete the sum of partial products. These techniques are used in the POWER9 specific implementations of vec_muleud, vec_muloud, vec_mulluq, and vec_muludq.
PowerISA 3.0B also defined additional: Binary Coded Decimal (BCD) and Zoned character format conversions. String processing operations. Vector Parity operations. Integer Extend Sign Operations. Integer Absolute Difference Operations. All of these seem to be useful additions to pveclib for older (POWER7/8) processors and across element sizes (including quadword elements).
Most of these intrinsic (compiler built-in) operations are defined in <altivec.h> and described in the compiler documentation. However it took several compiler releases for all the new POWER8 64-bit and 128-bit integer vector intrinsics to be added to altivec.h. This support started with GCC 4.9 but was not complete across function/type and bug free until GCC 6.0.
Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. So this header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.
This header covers operations that are either:
See Returning extended quadword results. for more background on extended quadword computation.
Technically operations on quadword elements should not require any endian specific transformation. There is only one element so there can be no confusion about element numbering or order. However some of the more complex quadword operations are constructed from operations on smaller elements. And those operations as provided by <altivec.h> are required by the OpenPOWER ABI to be endian sensitive. See Endian problems with doubleword operations for a more detailed discussion.
In any case the arithmetic (high to low) order of bits in a quadword is defined in the PowerISA (See vec_adduqm() and vec_subuqm()). So pveclib implementations will need to either:
The compilers may not support 128-bit integers for constants and printf (integer to ascii). For example GCC provides ANSI mandated constant and runtime support for integers up to long long which for PowerPC is only 64-bit.
The __int128 type is an extension that provides basic arithmetic operations but does not compile 128-bit constants or support printf formatting for integers larger than long long. The following section provides examples and workarounds for these restrictions.
The GCC compiler allows integer constants to be assigned/cast to __int128 types. The support also allows __int128 constants to be assigned/cast to vector __int128 types. So the following are allowed:
It gets more complicated when the constant exceeds the range of a long long value. For example the magic numbers for the multiplicative inverse described in Printing Vector __int128 values. The decimal integer constant we need for the quadword multiplier is "76624777043294442917917351357515459181" or the equivalent hexadecimal value "0x39a5652fb1137856d30baf9a1e626a6d". GCC does not allow constants this large to be expressed directly.
GCC supports aggregate initializer lists for the elements of vectors. For example:
So it is possible to compose a quadword constant by initializing a vector of word or doubleword elements then casting the result to a quadword type. For example:
or
There is one small problem with this as element order is endian dependent, while a vector quadword integer is always big endian. So we would need to adjust the element order for endian. For example:
or
Remembering to add the endian correction for constants used in quadword operations is an issue and manually reversing the element order can be error prone. There should be an easier way.
The vec_common_ppc.h header provides some helper macros for when quadword operations need big endian element order on little endian platforms. These macros accept 2, 4, 8, or 16 element constants to form an aggregate initializer for a vector of the corresponding element type. The elements are always arranged left to right, high to low order. These macros are endian sensitive and either effectively pass-through for big endian or reverse the element order for little endian.
For example:
or
These macros internally cast to a vector unsigned integer type for the aggregate initializer. This type corresponds to the size and number of elements to fit in a 128-bit vector. This tells the compiler how many elements to expect and the allowed value range for the initializer. A final explicit cast is required to the vector type needed (usually a signed or unsigned __int128). (See: CONST_VINT128_DW(), CONST_VINT128_W(), CONST_VINT128_H(), CONST_VINT128_B() ). Other macros require the programmer to provide a cast to match the element count and size. (See: CONST_VINT64_DW(), CONST_VINT32_W(), CONST_VINT16_H(), CONST_VINT8_B() )
The methods above are effectively forming multi-digit constants where each digit is itself a large (word or doubleword) binary coded integer value. Because the digits are radix 2**N it is normal to convert large decimal constants to hexadecimal. This makes it easier to split the large constants into word or doubleword elements for the initializer.
Most compilers support compile time computation on constants. This is an optimization where only the final computed constant result is used in the generated code. Compile time constant computation supports the usual arithmetic operations on the usual types. Some compilers (including GCC) support constant computation on extended types including __int128.
For example:
produces the quadword integer value for the decimal constant 99999999999999999999999999999999.
With this technique we can split large decimal constants into 16, 18, or 19 digit blocks and then compute the effective 32, 36, or 38 digit constant. (see CONST_VUINT128_Qx16d(), CONST_VUINT128_Qx18d(), and CONST_VUINT128_Qx19d()). For example:
Programming with quadword integers will need quadword constants for masking and arithmetic operations. In the sections above we provide means to define large and complex constants. But often there is need for small integer constants for use in boolean logic, masking/select operations, and simple arithmetic.
The technique above can be used for small integer constants as well. For example:
In most cases the compiler will allocate these constant values to the read-only data (.rodata) section. When these constants are referenced in programming operations the compiler generates the appropriate vector loads. For example GCC V11 generates the following for the -mcpu=power8 target:
And the following for the -mcpu=power9 target:
This is expected for POWER8 as PowerISA 2.07B does not have any displacement form (D-Form) vector (VSX) load/store instructions. The compiler allocates constants to the .rodata sections and the linker collects .rodata from object files into a combined executable .rodata section. This is placed near the Table of Contents (TOC) section. The ABI dedicates R2 as the base address .TOC. for the TOC and adjacent sections.
The Add Immediate Shifted (addis) Add Immediate (addi) sequence above computes a signed 32-bit .TOC. relative offset to a specific .rodata quadword. Two instructions are required as; addis provides the high adjusted 16-bits shifted left 16-bits, while addi provides the low 16-bits. The sum of R2 and these immediate values is the 64-bit effective address of a .rodata constant value. A signed 32-bit offset is large enough to support most program and library executables.
The load itself has a 5-cycle latency assuming an L1 cache hit. The three instruction sequence is sequentially dependent and requires 9-cycles latency (minimum) to execute. An L1 cache miss will increase the latency by 7-28 cycles, assuming the data resides in the L2/L3 caches.
However the compiler is not following the recommendations of
PowerISA 2.07B, Book II, Chapter 2.1 Performance-Optimized Instruction Sequences. This chapter recommends a specific pattern for the addi/lvx sequence. For example:
In this case rx can be any GPR (including r0) while RA must be a valid base (r1 <-> r31) register.
The POWER8 implementation allows for Instruction Fusion combining information from two adjacent instructions into one (internal) instruction so that it executes faster than the non-fused case. Effectively the addi/lvx combination above becomes a D-Form load vector instruction.
There are additional restrictions on the definition of adjacent:
This can reduce the latency from 9 to 7-cycles. This would be true even without Instruction Fusion as the addis/addi instructions are now independent and can execute in parallel.
The sequence generated for POWER9 is even more disappointing. The lxv is a D-Form (DQ) instruction and the displacement operand could be used to replace the addi instruction. For example: -mcpu=power9 target:
This provides the equivalent 32-bit TOC relative displacement with one less instruction and a reduced latency of 7-cycles.
This is all a little cumbersome and it seems like there should be a better/faster way. Any instruction sequence that loads a quadword integer constant in:
is a good deal.
The base (Altivec) vector ISA included Vector Splat Immediate Signed Byte/Halfword/Word instructions. These are fast (2-cycle latency) and convenient for small integer constants in the range -16 to 15. So far the ISA has not added doubleword or quadword forms for these.
POWER9 added a VSX Vector Splat Immediate Byte (xxspltib) instruction. This expands the immediate range to -128 to 127 but does not include larger element sizes. POWER9 does provide Vector Extend Sign Byte To Word/Doubleword (vextsb2w/vextsb2d) instructions. For example the two instruction sequence:
can generate a doubleword splat immediate for integers in the range -128 to 127 with a latency of 5-cycles. So far there is no extend sign byte/halfword/word to quadword. POWER10 does add Vector Extend Sign Doubleword To Quadword (vextsd2q).
The GCC compiler does recognize some vector constants as special cases. For example:
will generate:
Another interesting example is the quadword sign mask. For example:
will generate:
The first 2 instructions generate vector constants of all zeros and all ones (same as above). The third instruction uses vector shift left word (vslw) to convert the word elements from 0xffffffff to 0x80000000.
The clever bit is shifting elements of the all ones (0xffffffff or -1) vector, left by 31-bits (0b11111), which is the value of the low order 5-bits of the all ones element. Fortunately the vsl[bhw] instructions ignore all but the lower order bits needed for the element shift count.
To convert a word sign mask to a quadword sign mask we need the all zeros vector and one additional instruction. The Vector Shift Left Double by Octet Immediate (vsldoi) rotates the low-order signmask word element to the high order word with 3 words of '0' concatenated on the right.
The equivalent C language with <altivec.h> intrinsics implementation is:
This sequence is a little bigger (4 instructions) than we would like but should execute in 6-cycles. The first two instructions are independent and should execute in parallel. Also (as we will see) the all zero/ones constants are common building blocks. So the compiler should treat these as common subexpressions across all operations using those constants.
So the compiler can do clever things with vector constants. But so far these are the only examples I have found. Other cases that you might expect to be a special case are not. For example:
and
generate the 3 instruction (9-cycle) load from .rodata sequence. Also constants using the vector long long or __int128 types may fail to compile on older versions of the compiler.
We can generate small constants in the range 1-15 using the following pattern:
Which generates:
Here we use the vec_splat_s32() intrinsic to generate the vspltisw instruction for the value 15.
This sequence is only 3 instructions, which should execute in 4-cycles. The first two instructions are independent and should execute in parallel. Also the q_zero constant is commonly used and the compiler should treat it as a common subexpression.
For small (-16 to -1) negative constants we need to make one small change. We use the q_ones constant to propagate the sign across the quadword.
The generated sequence is also 3 instructions and should execute in 4-cycles.
Putting this all together we can create a static inline function to generate small quadword constants (in the range -16 to 15). For example:
This version uses only <altivec.h> intrinsics supported by POWER8 and earlier. For constants in the range (-16 to 15) the range is divided into three groups:
Values outside this range use the vec_splats() intrinsic which will generate the appropriate quadword constant in .rodata and the load sequence to retrieve that value.
For POWER9 and later we can use the VSX Vector Splat Immediate Byte (xxspltib) instruction and support the extended constant range of -128 to 127.
Here we use the vec_splats() intrinsic to generate the xxspltib instruction. The rest follows the pattern we used for POWER8 but shift left is adjusted for the byte (vs word) element splat to be 1 octet.
The transition from grade school math to computer programming requires the realization that computers handle numbers in fixed sized chunks. For the PowerISA these chunks are byte, halfword, word, doubleword, and quadword. While computer languages like "C" have integer types like char, short, int, long int, and __int128.
Happily these chunks are large enough to hold the equivalent of several decimal digits and handle most of the grotty details of multiply, divide, add, and subtract. But sometimes the chunk (used) is not large enough to hold all the digits you need. Sums may overflow and multiplies may be truncated (modulo the chunk size).
Sometimes we can simply switch to the next larger size (int to long, word to doubleword) and avoid the problem (overflow of sums or truncation of multiply). But sometimes the largest chunk the compiler or hardware supports is still not large enough for the numbers we are dealing with. This requires multiple precision arithmetic which works a lot like grade school arithmetic but with larger digits represented by the most convenient computer sized chunk.
Most programmers would prefer to use an existing multiple precision arithmetic library and move on. Existing libraries are implemented with scalar instructions and loops over storage arrays. But here we need to provide vector quadword multiply and extended quadword add/subtract operations. Any transfers between the library's multi-precision storage arrays and vector registers are likely to exceed the timing for a direct vector implementation.
We also want to provide the basis for general multiple quadword precision arithmetic operations (see vec_int512_ppc.h). And for security implementations requiring large multiply products we are motivated to leverage the PowerISA large vector register set to avoid exposing these results (and partial products) to memory/cache side channel attacks.
First multiplying a M-digits by N-digits number requires up to (M+N)-digits to store the result. This is true independent of the size of your digit, including decimal, hexadecimal, and computer words/doublewords/quadwords. This explains why a 32-bit (word) by 32-bit integer multiply product is either:
The hardware has to do one or the other.
Let's look at some examples of multiplying two maximal 4-digit numbers:
And to drive home the point, let's look at the case of multiplying two maximal (32-bit word) 4-digit numbers:
This is also a (128-bit quadword) digit multiply with a (256-bit) 2 quadword digit result.
Adding an asymmetric example; 4-digit by 1-digit multiply:
This pattern repeats across all digit bases/sizes and values of M, N.
Note that the product is not the maximum value for the product width. It seems the product leaves room to add another digit or two without overflowing the double wide product. Let's try some 4 digit examples by adding a maximal 4 digit value to the product.
Looks like there is still room in the double wide product to add another maximal 4 digit value.
But any more than that would cause an overflow.
Now we should look at addends to an asymmetric multiply. For example 4-digit by 1-digit multiply:
Note that when M is not equal to N the addends are restricted to size M and/or size N. Two addends of the larger multiplier size can overflow. This pattern repeats across all digit bases/sizes and values of M, N. For the binary fixed point multiply-add of bit sizes M/N we can write the equation:
$(2^{(M+N)} - 1) = ((2^{M} - 1) \times (2^{N} - 1)) + (2^{M} - 1) + (2^{N} - 1)$
Or in terms of fixed sized "words" of W-bits and M by N words.
$(2^{(W \times (M+N))} - 1) = ((2^{(W \times M)} - 1) \times (2^{(W \times N)} - 1)) + (2^{(W \times M)} - 1) + (2^{(W \times N)} - 1)$
Because with modern hardware the actual multiply operations are faster and have less impact while the summation across the partial products becomes the major bottleneck. For recent POWER processors fixed-point multiplies are 5-7 cycles latency and dual issue (2/cycle). These multiplies are only dependent on the inputs (multiplicands). This allows the compiler (and super-scalar processor) to schedule the multiply operations early to prepare for summation. In many cases the 3rd and 4th multiplies are complete before the summation of the first two multiplies completes.
The add operations involved in partial product summation are dependent on the current column multiply and the high order word of summation of the previous stage. While add operations are nominally faster (2-3 cycles) than multiplies, they can generate carries that have to be propagated.
The Fixed-Point Unit has a dedicated carry-bit (CA) which becomes the critical resource. This dependency on the carry (in addition to the column multiply and previous summation) limits the compiler's (and hardware's) ability to parallelize stages of the summation. The Vector unit (PowerISA 2.07+) has quadword (vs Fixed point doubleword) binary add/subtract with carry/extend. The Vector Unit requires separate write Carry instructions to detect and return the carry to VRs. The write Carry instructions are paired with Unsigned Quadword Modulo instructions that generates the (modulo) 128-bit result.
So knowing how to avoid overflows and carries in the summation of partial products can be useful. To illustrate we can examine the POWER8 implementation of vec_muludq(). POWER8 (PowerISA 2.07) does support add quadword but the largest vector fixed-point multiply is 32-bit Vector Multiply Even/Odd Unsigned Words (vec_muleuw() and vec_mulouw()). The implementation generates four quadword by word (160-bit) partial products that are summed in four stages to generate the final 256-bit product.
Code for the first stage looks like this:
Note in this case we can assume that the sum of aligned even/odd quadwords will not generate a carry. For example with maximum values for multiplicands a,b:
The high order 128-bits of the sum did not overflow.
The next three stages are more complex.
Here we need a 3-way sum of the previous partial product, and the odd, even products from this stage. In this case the high 128-bits of the previous partial product needs to align with the lower 128-bits of this stage's 160-bit product for the first quadword add. This can produce an overflow, so we need to capture the carry and concatenate it with the odd sum before shifting right 32-bits. Again we can assume that the sum of aligned even/odd quadwords will not generate a carry. For example stage 2 with maximum values for multiplicands a,b:
For POWER8 this 3-way sum and the required write-carry adds significant latency to stages 2, 3, and 4 of this multiply.
In POWER8 the vector quadword add/subtract instructions are cracked into 2 dependent simple fixed-point (XS) IOPs. So the effective instruction latency is (2+2=4) cycles. Also cracked instructions must be first in group, so back-to-back vaddcuq/vadduqm sequences will be dispatched separately. There is no possibility of executing the pair concurrently, so the latency for the pair is 5-6 cycles.
So there is value in finding an alternative summation that avoids/reduces the number of write-carry operations. From above (Some useful arithmetic facts (you may of forgotten)) we know it is possible to add one or two unsigned words to each of the doubleword products generated by vmuleuw/vmulouw.
We need to align the words of the quadword addend (zero extended on the left to doublewords) with the corresponding doublewords of the products. We can use Vector Merge Even/Odd Word operations to split and pad the addend to align with the products. Then we use Vector Add Doubleword for the even/odd product-sums. Finally we use shift and add quadword to produce the 160-bit stage 2 sum.
This sequence replaces two instructions (vaddcuq/vadduqm) with four instructions (vmrgew/vmrgow/vaddudm/vaddudm), all of which;
We expect a latency of 4 cycles over the whole sequence. And splitting the first add into even/odd add blocks allows the compiler (and out-of-order hardware) more flexibility for instruction scheduling.
Multiply-add seems to be a useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation that we can use here. For example:
Which generates the following instruction sequence:
The vspltisw loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function. The vmrgew has a latency of 2 cycles and should execute concurrently with vmuleuw. Similarly for vec_vmaddouw().
These operations (vec_vmaddeuw() and vec_vmaddouw()) are included in vec_int64_ppc.h as they require vec_addudm() and produce doubleword results. With this addition we can improve and simplify the code for stages 2-4 of the _ARCH_PWR8 implementation of vec_muludq(). For example:
From the description above (Some useful arithmetic facts (you may of forgotten)) we know we can add two unsigned words to the doubleword product without overflow. This is another useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation. For example:
Which generates the following instruction sequence:
The vspltisw loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function. The vmrgew/vmrgew/vaddudm sequence has a latency of 4-6 cycles and should execute concurrently with vmuleuw. Similarly for vec_vmadd2ouw().
The PowerISA has a number of Multiply-Sum instructions that look a lot like the Multiply-Add described above? Well not exactly:
First we should look at the arithmetic of Multiply-Sum using maximal unsigned integer values.
Note the sum overflows the word twice and high order bits of the sum will be lost.
For POWER9 we can simulate Vector Multiply Even/Odd Unsigned Doubleword by setting the Odd/Even doubleword of VRB to zero and the whole quadword addend VRC to zero. For example the even doubleword multiply.
And similarly for the odd doubleword multiply.
And review the arithmetic for vec_vmuleud() using maximal quadword values for a and b.
And for vec_vmuloud().
We can also simulate Vector Multiply-Add Even/Odd Unsigned Doubleword by setting the odd/even doubleword of VRB to zero and the whole quadword addend to the even/odd double word of VRC. For example the even doubleword multiply-add.
And similarly for the odd doubleword multiply-add.
And review the arithmetic for vec_vmaddeud() using maximal quadword values for a and b. The even/odd doublewords of c have slightly different values for illustrative purposes.
And for vec_vmaddoud().
This multiply-add even/odd doubleword form only adds one additional (xxmrghd AKA xxpermdi) instruction over that required for the base multiply even/odd doubleword operation.
The xxspltib loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function.
For POWER9 instruction timing is different and there are some unique trade-offs. The implementations above are small and appropriate for single instances of multiply doubleword or implementations of multiply quadword. However using the vmsumudm (operand VRC) addend creates a serial dependency within the multiply quadword implementation. When multiply quadword and multiply-add quadword are used in the implementation of wider multiplies (see vec_int512_ppc.h) these serial dependencies actually slow down the implementation.
So let's look at some examples using the vmsumudm (VRC) addend and the alternative passing zero as the VRC addend and using an explicit add quadword. First a 128x128-bit unsigned multiply using vmsumudm and exploiting the VRC addend where appropriate.
Next a 128x128-bit unsigned multiply using vmsumudm but only passing const zero to the VRC addend.
The second example (using explicit add quadword);
We can use multiply-add operations for wider word sizes (quadword and multiple precision quadword). The simplest quadword implementation would create a vec_madduq() operation based on vec_muludq() and add a quadword parameter "c" for the addend. Then modify the first stage of the platform specific multiplies to replace vector multiply even/odd with vector multiply-add even/odd, passing the addend as the third parameter.
This works well for the POWER8 implementation because the additional vector add doublewords can be scheduled independently of the vector multiply even/odd words. But for POWER9 we need to avoid the serial dependences explained above in Why not Vector Multiply-Sum.
For the POWER9 implementation we use an explicit add quadword (and write-Carry) to sum the addend parameter to the first stage Multiply odd doubleword. For example:
The generated code is the same size as the serially dependent version
This is just another example where the shortest instruction sequence or using the most powerful instructions, may not be the fastest implementation. The key point is that avoiding serial dependencies in the code and allowing the compiler to schedule high latency instructions early, allows better performance. This effect is amplified when quadword multiplies (vec_muludq(), vec_madduq(), and vec_madd2uq()) are used to compose wider multiply operations (see vec_int512_ppc.h).
The PowerISA Vector facilities provide logical and integer arithmetic quadword (128-bit) operations. Some operations are direct PowerISA instructions and other operations are composed of short instruction sequences. The Power Vector Library provides a higher level and comprehensive API of quadword integer arithmetic and support for extended arithmetic to multiple quadwords.
The GCC compiler supports the (vector) __int128 type but the runtime does not support printf() formatting for __int128 types. However if we can use divide/modulo operations to split vector __int128 values into modulo 10^16 long int (doubleword) chunks, we can use printf() to convert and concatenate the decimal values into a complete number.
For example, from the __int128 value (39 decimal digits):
We can use signed compare to detect the sign and set a char value to print a ' ' or '+' prefix. If the value is negative we want the absolute value before we do the divide/modulo steps. For example:
Here we use the pveclib operation vec_cmpsq_all_ge() because the ABI and compilers do not define compare built-in operations for the vector __int128 type. For the negative case we use the pveclib operation vec_subuqm() instead of vec_abs. Again the ABI and compilers do not define vec_abs built-ins for the vector __int128 type. Using pveclib operations has the additional benefit of supporting older compilers and platform specific implementations for POWER7 and POWER8.
Now we have the absolute value in val128 we can factor it into (3) chunks of 16 digits each. Normally scalar codes would use integer divide/modulo by 10000000000000000. And we are reminded that the PowerISA vector unit does not support integer divide operations and definitely not for quadword integers.
Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 128-bit fraction and we have a multiply high (vec_mulhuq()) operation. Multiplying a 128-bit unsigned integer by a 128-bit unsigned fraction generates a 256-bit product with 128-bits above (integer) and below (fraction) the radix point. The high 128-bits of the product is the integer quotient and we can discard the low order 128-bits.
It turns out that generating the multiplicative inverse can be tricky. To produce correct results over the full range requires possible pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.
In the chapter above;
Figure 10-2 Computing the magic number for unsigned division.
provides a sample C function for generating the magic number (actually a struct containing: the magic multiplicative inverse, "add" indicator, and the shift amount). For quadword and the divisor 10000000000000000, this is { 76624777043294442917917351357515459181, 0 , 51 }:
All the operations used above are defined and implemented by pveclib. Most of these operations are not defined as single instructions in the PowerISA or as built-ins in the ABI, or require alternative implementations for older processors.
Now we have three vector unsigned __int128 values (t_low, t_mid, t_high) in the range 0-9999999999999999. Fixed point values in that range fit into the low order doubleword of each quadword. We can access these doublewords with array notation ([VEC_DW_L]) and the compiler will transfer them to fixed point (long int) GPRs. Then use normal char and long int printf() formatting. For example:
Here is the complete vector __int128 printf example:
POWER8 and POWER9 added a number of Binary Coded Decimal (BCD) and Zoned Decimal operations that should be helpful for radix conversion and even faster large integer formatting for print.
The issue remains that __int128 values can represent up to 39 decimal digits while Signed BCD supports only 31 digits. POWER9 provides a Decimal Convert From Signed Quadword instruction with the following restriction:
It would be useful to check for this and if required, factor the __int128 value into the high order 8 digits and the low order 31 digits. This allows for the safe and correct use of vec_bcdcfsq() and, with some decimal shifts/truncates, vec_bcdctz(). This also enables conversion to multiple precision Vector BCD to represent 39 digits and more for radix conversions.
We first address the factoring by providing Vector Divide by const 10e31 Unsigned Quadword and Vector Modulo by const 10e31 Unsigned Quadword operations. This requires the multiplicative inverse using the vec_mulhuq() operation.
As the vec_mulhuq() operation is relatively expensive and we expect most __int128 values to be 31-digits or less, using a compare to bypass the multiplication and return the 0 quotient seems a prudent optimization.
So far we only have the quotient (the high order 8 digits) and still need to extract the remainder (the low order 31 digits). This is simply the quotient from above multiplied by 10e31 and subtracted from the original input. To avoid the multiple return value issue we define a modulo operation to take the original value and the quotient from vec_divuq_10e31().
Again as the vec_mulluq() operation is relatively expensive and we expect most __int128 values to be 31-digits or less, using a compare to bypass the multiplication and return the input value as the remainder seems a prudent optimization.
We expect these operations to be used together as in this example.
We also expect the compiler to common the various constant loads across the two operations as the code is in-lined. This header also provides variants for factoring by 10e32 (to use with the Zoned conversion) and signed variants of the 10e31 operation for direct conversion to extended precision signed BCD.
Some algorithms require even higher integer precision than __int128 provides. This includes:
The POWER8 provides instructions for extending add and subtract to 128-bit integer and beyond with carry/extend operations (see vec_addcuq(), vec_addecuq(), vec_addeuqm(), vec_adduqm(), vec_subcuq(), vec_subecuq(), vec_subeuqm(), vec_subuqm()). POWER9 adds instructions to improve decimal / binary conversion to/from 128-bit integer and beyond with carry/extend operations. And while the PowerISA does not yet provide full 128 x 128 bit integer multiply instructions, it has provided wider integer multiply instructions, beginning in POWER8 (see vec_mulesw(), vec_mulosw(), vec_muleuw(), vec_mulouw()) and again in POWER9 (see vec_msumudm()).
This all allows the pveclib to improve (reduce the latency of) the implementation of multiply quadword operations. This includes operations that generate the full 256-bit multiply product (see vec_muludq(), vec_mulhuq(), vec_mulluq()). And this in combination with add/subtract with carry extend quadword allows the coding of even wider (multiple quadword) multiply operations.
The following example performs a 256x256 bit unsigned integer multiply generating a 512-bit product:
This example generates some additional questions:
The detailed rationale for this is documented in section Returning extended quadword results. In this specific case (quadword integer operations that generate two vector values) pveclib provides both alternatives:
Either method should provide the same results. For example:
is equivalent to
and
is equivalent to
So is there any advantage to separate versus combined operations?
Functionally it is useful to have separate operations for the cases where only one quadword part is needed. For example if you know that a add/subtract operation can not overflow, why generate the carry? Alternatively the quadword greater/less-than compares are based solely on the carry from the subtract quadword, why generate lower 128-bit (modulo) difference? For multiplication the modulo (multiply low) operation is the expected semantic or is known to be sufficient. Alternatively the multiplicative inverse only uses the high order (multiply high) quadword of the product.
From the performance (instruction latency and throughput) perspective, if the algorithm requires the extended result or full product, the combined operation is usually the better choice. Otherwise use the specific single return operation needed. At best, the separate operations may generate the same instruction sequence as the combined operation, but this depends on the target platform and specific optimizations implemented by the compiler.
In the section Converting Vector __int128 values to BCD above we used multiplicative inverse to factor a binary quadword value into two (high quotient and low remainder) parts. Here we divide by a large power of 10 ($10^{31}$ or $10^{32}$) of a size where the quotient and remainder allow direct conversion to BCD (see vec_bcdcfsq(), vec_bcdcfuq()). After conversion, the BCD parts can be concatenated to form the larger (39 digit) decimal radix value equivalent of the 128-bit binary value.
We can extend this technique to larger (multiple quadword) binary values but this requires long division. This is the version of the long division you learned in grade school, where a multi-digit value is divided in stages by a single digit. But the digits we are using are really big ($10^{31}-1$ or $10^{32}-1$).
The first step is relatively easy. Start by dividing the left-most digit of the dividend by the divisor, generating the integer quotient and remainder. We already have operations to implement that.
The array d contains the quadwords of the extended precision integer dividend. The array q will contain the quadwords of the extended precision integer quotient. Here we have generated the first quadword q[0] digit of the quotient. The remainder rh will be used in the next step of the long division.
The process repeats except after the first step we have an intermediate dividend formed from:
So for each additional step we need to divide two quadwords (256-bits) by the quadword divisor. Actually this dividend should be less than a full 256-bits because we know the remainder is less than the divisor. So the intermediate dividend is less than $((divisor - 1) \times 2^{128})$. So we know the quotient can not exceed $(2^{128}-1)$ or one quadword.
Now we need an operation that will divide this double quadword value and provide quotient and remainder that are correct (or close enough). Remember your grade school long division where you would:
So we don't need to be perfect, but close enough. As long as we can detect any problems and (if needed) correct the results, we can implement long division to any size.
We already have an operation for dividing a quadword by $10^{31}$ using the magic numbers for multiplicative inverse. This can easily be extended to multiply double quadword high. For example:
Here we generate a 256-bit multiply high using the vec_mulhuq() for the low dividend (vrb) and vec_muludq() for high dividend (vra). Then sum the partial products ([t||q1] + [0||q]) to get initial 256-bit product [q1||q]. Then apply the corrective add ([q1||q] + [vra||vrb]). This may generate a carry which needs to be included in the final shift.
Technically we only expect a 128-bit quotient after the shift, but we have 3 quadwords (2 quadwords and a carry) going into the shift right. Also our (estimated) quotient may be off by 1 and generate a 129-bit result. This is due to using the magic numbers for 128-bit multiplicative inverse and not regenerating magic numbers for 256-bits. We can't do anything about that now and so return a 256-bit double quadword quotient.
The 256-bits we want are spanning multiple quadwords so we replace a simple quadword shift right with two Shift Left Double Quadword Immediate operations and complement the shift count (128 - shift_ten31). This gives a 256-bit quotient which we expect to have zero in the high quadword.
As this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the quotient is zero. This also helps when the long division dividend may have leading quadword zeros that need to be skipped over. The full implementation looks like:
To complete the long division operation we need to perform double quadword modulo operations. Here the dividend is two quadwords and the low quadword of the quotient from the divide double quadword operation above. We use multiply double quadword to compute the remainder $([vra||vrb] - (q \times 10^{31}))$. Generating the 256-bit product and difference ensures we can detect the case where the quotient is off-by-1 on the high side.
In this case we need to correct both remainder and the (estimated) quotient. This is a bit tricky as the quotient is normally passed by value, but for this operation we need to pass by reference, which allows the corrected quotient to be passed on to the next step.
Again as this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the remainder is simply the dividend.
Now we have all the operations needed to complete the implementation of long division by the decimal constant ($10^{31}$).
The result of each call to example_longdiv_10e31() is the output array q of quadwords containing the extended quotient, and the remainder as the return value. The input array d and output array q should not overlap in storage. The remainder is in the range 0-9999999999999999999999999999999 and is suitable for conversion to BCD or decimal characters (see vec_bcdcfsq()). Repeated calls passing the quotient from the previous call as the dividend, reduces the quotient by 31 digits and returns another 31 digits in the remainder for conversion. This continues until the quotient is less than $10^{31}$ which provides the highest order digits of the decimal result.
High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.
#define CONST_VUINT128_Qx16d | ( | __q0, | |
__q1 | |||
) |
Generate a vector unsigned __int128 constant from doublewords.
Combine 2 x 16 decimal digit long long constants into a single 32 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.
For example
#define CONST_VUINT128_Qx18d | ( | __q0, | |
__q1 | |||
) |
Generate a vector unsigned __int128 constant from doublewords.
Combine 2 x 18 decimal digit long long constants into a single 36 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.
For example
#define CONST_VUINT128_Qx19d | ( | __q0, | |
__q1 | |||
) |
Generate a vector unsigned __int128 constant from doublewords.
Combine 2 x 19 decimal digit long long constants into a single 38 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.
For example
#define CONST_VUINT128_QxD | ( | __q0, | |
__q1 | |||
) |
Generate a vector unsigned __int128 constant from doublewords.
Combine 2 x 64-bit long long constants into a single __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.
For example
#define CONST_VUINT128_QxW | ( | __q0, | |
__q1, | |||
__q2, | |||
__q3 | |||
) |
Generate a vector unsigned __int128 constant from words.
Combine 4 x 32-bit int constants into a single __int128 constant. The 4 parameters are integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.
The effect is to compute an unsigned __int128 constant from 4 x 32-bit unsigned int constants.
For example
Vector Absolute Difference Unsigned Quadword.
Compute the absolute difference of the quadwords. For each unsigned quadword, subtract VRB from VRA and return the absolute value of the difference.
processor | Latency | Throughput |
---|---|---|
power8 | 14 | 1/cycle |
power9 | 11 | 1/cycle |
vra | vector of unsigned __int128 |
vrb | vector of unsigned __int128 |
Vector Absolute Value Signed Quadword.
Compute the absolute value of a signed quadword.
processor | Latency | Throughput |
---|---|---|
power8 | 6-8 | 1/cycle |
power9 | 9-12 | 1/cycle |
vra | vector of signed __int128 |
Vector Add with carry Unsigned Quadword.
Add two vector __int128 values and return sum and the carry out.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 1/2 cycles |
power9 | 6 | 2/cycle |
*cout | carry out from the sum of a and b. |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
Vector Add & write Carry Unsigned Quadword.
Add two vector __int128 values and return the carry out.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
Vector Add Extended & write Carry Unsigned Quadword.
Add two vector __int128 values plus a carry-in (0|1) and return the carry out bit.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
ci | Carry-in from vector bit[127]. |
Vector Add Extend with carry Unsigned Quadword.
Add two vector __int128 values plus a carry-in (0|1) and return sum and the carry out.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 1/2 cycles |
power9 | 6 | 2/cycle |
*cout | carry out from the sum of a and b. |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
ci | Carry-in from vector bit[127]. |
Vector Add Extended Unsigned Quadword Modulo.
Add two vector __int128 values plus a carry (0|1) and return the modulo 128-bit result.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
ci | Carry-in from vector bit[127]. |
Vector Add Unsigned Quadword Modulo.
Add two vector __int128 values and return result modulo 128-bits.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
a | 128-bit vector treated as a __int128. |
b | 128-bit vector treated as a __int128. |
Vector Average Unsigned Quadword.
Compute the average of two unsigned quadwords as (VRA + VRB + 1) / 2.
processor | Latency | Throughput |
---|---|---|
power8 | 14 | 1/cycle |
power9 | 11 | 1/cycle |
vra | vector unsigned quadwords |
vrb | vector unsigned quadwords |
Vector Count Leading Zeros Quadword for unsigned __int128 elements.
Count leading zeros for a vector __int128 and return the count in a vector suitable for use with vector shift (left|right) and vector shift (left|right) by octet instructions.
processor | Latency | Throughput |
---|---|---|
power8 | 8-10 | 1/cycle |
power9 | 10-12 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
Vector Compare Equal Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra == vrb, otherwise all '0's. We use vec_cmpequq as it works for both signed and unsigned compares.
processor | Latency | Throughput |
---|---|---|
power8 | 6 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Equal Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra == vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. To get the correct quadword result, the doubleword element equal truth values are swapped, then anded with the original compare results. Otherwise use vector word compare and additional boolean logic to ensure all word elements are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 6 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare Greater Than or Equal Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra >= vrb, otherwise all '0's.
Flip the operand sign bits and use vec_cmpgeuq for signed compare.
processor | Latency | Throughput |
---|---|---|
power8 | 10-16 | 1/ 2cycles |
power9 | 8-14 | 1/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Greater Than or Equal Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra >= vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction. This generates a carry for greater than or equal and NOT carry for less than. Then use vec_setb_cyq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_cyq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare Greater Than Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra > vrb, otherwise all '0's.
Flip the operand sign bits and use vec_cmpgtuq for signed compare.
processor | Latency | Throughput |
---|---|---|
power8 | 10-16 | 1/ 2cycles |
power9 | 8-14 | 1/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Greater Than Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra > vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction with the parameters reversed. This generates a carry for less than or equal and NOT carry for greater than. Then use vec_setb_ncq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_ncq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare Less Than or Equal Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra <= vrb, otherwise all '0's.
Flip the operand sign bits and use vec_cmpleuq for signed compare.
processor | Latency | Throughput |
---|---|---|
power8 | 10-16 | 1/ 2cycles |
power9 | 8-14 | 1/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Less Than or Equal Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra <= vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction with the parameters reversed. This generates a carry for less than or equal and NOT carry for greater than. Then use vec_setb_cyq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_cyq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare Less Than Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra < vrb, otherwise all '0's.
Flip the operand sign bits and use vec_cmpltuq for signed compare.
processor | Latency | Throughput |
---|---|---|
power8 | 10-16 | 1/ 2cycles |
power9 | 8-14 | 1/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Less Than Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra < vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction. This generates a carry for greater than or equal and NOT carry for less than. Then use vec_setb_ncq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_ncq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare Not Equal Signed Quadword.
Compare signed __int128 (128-bit) integers and return all '1's, if vra != vrb, otherwise all '0's. We use vec_cmpneuq as it works for both signed and unsigned compares.
processor | Latency | Throughput |
---|---|---|
power8 | 6 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector treated as a signed __int128. |
vrb | 128-bit vector treated as a signed __int128. |
Vector Compare Not Equal Unsigned Quadword.
Compare unsigned __int128 (128-bit) integers and return all '1's, if vra != vrb, otherwise all '0's.
For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. To get the correct quadword result, the doubleword element equal truth values are swapped, then NANDed (not-ANDed) with the original compare results. Otherwise use vector word compare and additional boolean logic to ensure all word elements are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 6 | 2/cycle |
power9 | 7 | 2/cycle |
vra | 128-bit vector treated as an unsigned __int128. |
vrb | 128-bit vector treated as an unsigned __int128. |
Vector Compare all Equal Signed Quadword.
Compare vector signed __int128 values and return true if vra and vrb are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare any Greater Than or Equal Signed Quadword.
Compare vector signed __int128 values and return true if vra >= vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10-15 | 1/ 2cycles |
power9 | 8 | 1/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare any Greater Than Signed Quadword.
Compare vector signed __int128 values and return true if vra > vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10-15 | 1/ 2cycles |
power9 | 8 | 1/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare any Less Than or Equal Signed Quadword.
Compare vector signed __int128 values and return true if vra <= vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10-15 | 1/ 2cycles |
power9 | 8 | 1/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare any Less Than Signed Quadword.
Compare vector signed __int128 values and return true if vra < vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10-15 | 1/ 2cycles |
power9 | 8 | 1/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare all Not Equal Signed Quadword.
Compare vector signed __int128 values and return true if vra and vrb are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a vector signed __int128 (qword) element. |
vrb | 128-bit vector treated as a vector signed __int128 (qword) element. |
Vector Compare all Equal Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra and vrb are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector Compare any Greater Than or Equal Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra >= vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 8-13 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector Compare any Greater Than Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra > vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 8-13 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector Compare any Less Than or Equal Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra <= vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 8-13 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector Compare any Less Than Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra < vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 8-13 | 2/ 2cycles |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector Compare all Not Equal Unsigned Quadword.
Compare vector unsigned __int128 values and return true if vra and vrb are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 4-9 | 2/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
vrb | 128-bit vector treated as a vector unsigned __int128 (qword) element. |
Vector combined Multiply by 100 & write Carry Unsigned Quadword.
Compute the product of a 128-bit value a * 100. Only the low order 128 bits of the product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 6 | 1/cycle |
*cout | pointer to upper 128-bits of the product. |
a | 128-bit vector treated as unsigned __int128. |
Vector combined Multiply by 100 Extended & write Carry Unsigned Quadword.
Compute the product of a 128-bit value a * 100 + digit(cin). The function returns the low order 128 bits of the extended product. The first parameter (*cout) is the address of the vector to receive the generated carry out in the range 0-99.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 9 | 1/cycle |
*cout | pointer to upper 128-bits of the product. |
a | 128-bit vector treated as unsigned __int128. |
cin | values 0-99 in bits 120:127 of a vector. |
Vector combined Multiply by 10 & write Carry Unsigned Quadword.
Compute the product of a 128-bit value a * 10. Only the low order 128 bits of the product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 3 | 1/ 2cycles |
*cout | pointer to upper 128-bits of the product. |
a | 128-bit vector treated as a unsigned __int128. |
Vector combined Multiply by 10 Extended & write Carry Unsigned Quadword.
Compute the product of a 128 bit value a * 10 + digit(cin). Only the low order 128 bits of the extended product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 3 | 1/ 2cycles |
*cout | pointer to upper 128-bits of the product. |
a | 128-bit vector treated as a unsigned __int128. |
cin | values 0-9 in bits 124:127 of a vector. |
Vector Count Trailing Zeros Quadword for unsigned __int128 elements.
Count trailing zeros for a vector __int128 and return the count in a vector suitable for use with vector shift (left|right) and vector shift (left|right) by octet instructions.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 13-16 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
Vector Divide by const 10e31 Signed Quadword.
Compute the quotient of a 128-bit value vra / 10e31.
processor | Latency | Throughput |
---|---|---|
power8 | 18-60 | 1/cycle |
power9 | 20-45 | 1/cycle |
vra | the dividend as a vector treated as a signed __int128. |
Vector Divide Unsigned Double Quadword by const 10e31.
Compute the quotient of 256 bit value vra||vrb / 10e31.
processor | Latency | Throughput |
---|---|---|
power8 | 12-192 | 1/cycle |
power9 | 9-127 | 1/cycle |
*qh | the high quotient as a vector unsigned __int128. |
vra | the high dividend as a vector unsigned __int128. |
vrb | the low dividend as a vector unsigned __int128. |
Vector Divide Unsigned Double Quadword by const 10e32.
Compute the quotient of 256 bit value vra||vrb / 10e32.
processor | Latency | Throughput |
---|---|---|
power8 | 12-192 | 1/cycle |
power9 | 9-127 | 1/cycle |
*qh | the high quotient as a vector unsigned __int128. |
vra | the high dividend as a vector unsigned __int128. |
vrb | the low dividend as a vector unsigned __int128. |
Vector Divide by const 10e31 Unsigned Quadword.
Compute the quotient of a 128-bit value vra / 10e31.
processor | Latency | Throughput |
---|---|---|
power8 | 8-48 | 1/cycle |
power9 | 9-31 | 1/cycle |
vra | the dividend as a vector treated as a unsigned __int128. |
Vector Divide by const 10e32 Unsigned Quadword.
Compute the quotient of a 128-bit value vra / 10e32.
processor | Latency | Throughput |
---|---|---|
power8 | 8-48 | 1/cycle |
power9 | 9-31 | 1/cycle |
vra | the dividend as a vector treated as a unsigned __int128. |
|
inlinestatic |
Vector Multiply-Add2 Unsigned Quadword.
Compute the sum of the 256 bit product of two 128 bit values a, b plus the sum of 128 bit values c1 and c2. The low order 128 bits of the sum are returned, while the high order 128-bits are "stored" via the mulu pointer.
processor | Latency | Throughput |
---|---|---|
power8 | 60-66 | 1/cycle |
power9 | 30-36 | 1/cycle |
*mulu | pointer to vector unsigned __int128 to receive the upper 128-bits of the 256 bit sum ((a * b) + c1 + c2). |
a | 128-bit vector treated as unsigned __int128. |
b | 128-bit vector treated as unsigned __int128. |
c1 | 128-bit vector treated as unsigned __int128. |
c2 | 128-bit vector treated as unsigned __int128. |
Vector Multiply-Add Unsigned Quadword.
Compute the sum of the 256 bit product of two 128 bit values a, b plus the 128 bit value c. The low order 128 bits of the sum are returned, while the high order 128-bits are "stored" via the mulu pointer.
processor | Latency | Throughput |
---|---|---|
power8 | 56-62 | 1/cycle |
power9 | 27-33 | 1/cycle |
*mulu | pointer to vector unsigned __int128 to receive the upper 128-bits of the 256 bit sum ((a * b) + c). |
a | 128-bit vector treated as unsigned __int128. |
b | 128-bit vector treated as unsigned __int128. |
c | 128-bit vector treated as unsigned __int128. |
Vector Maximum Signed Quadword.
Compare Quadwords vra and vrb as signed integers and return the larger value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 12-18 | 2/cycle |
power9 | 10-18 | 2/cycle |
vra | 128-bit vector __int128. |
vrb | 128-bit vector __int128. |
Vector Maximum Unsigned Quadword.
Compare Quadwords vra and vrb as unsigned integers and return the larger value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 2/cycle |
power9 | 8 | 2/cycle |
vra | 128-bit vector unsigned __int128. |
vrb | 128-bit vector unsigned __int128. |
Vector Minimum Signed Quadword.
Compare Quadwords vra and vrb as signed integers and return the smaller value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 12-18 | 2/cycle |
power9 | 10-18 | 2/cycle |
vra | 128-bit vector __int128. |
vrb | 128-bit vector __int128. |
Vector Minimum Unsigned Quadword.
Compare Quadwords vra and vrb as unsigned integers and return the smaller value in the result.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 2/cycle |
power9 | 8 | 2/cycle |
vra | 128-bit vector unsigned __int128. |
vrb | 128-bit vector unsigned __int128. |
Vector Modulo by const 10e31 Signed Quadword.
Compute the remainder of a 128-bit value vra % 10e31.
processor | Latency | Throughput |
---|---|---|
power8 | 8-52 | 1/cycle |
power9 | 9-23 | 2/cycle |
vra | the dividend as a vector treated as a signed __int128. |
q | 128-bit signed __int128 containing the quotient from vec_divsq_10e31(). |
Vector Modulo Unsigned Double Quadword by const 10e31.
Compute the remainder (vra||vrb) - (ql * 10e31).
processor | Latency | Throughput |
---|---|---|
power8 | 12-124 | 1/cycle |
power9 | 12-75 | 1/cycle |
vra | the high dividend as a vector unsigned __int128. |
vrb | the low dividend as a vector unsigned __int128. |
*ql | 128-bit unsigned __int128 containing the quotient from vec_divudq_10e31(). |
Vector Modulo Unsigned Double Quadword by const 10e32.
Compute the remainder (vra||vrb) - (ql * 10e32).
processor | Latency | Throughput |
---|---|---|
power8 | 12-124 | 1/cycle |
power9 | 12-75 | 1/cycle |
vra | the high dividend as a vector unsigned __int128. |
vrb | the low dividend as a vector unsigned __int128. |
*ql | 128-bit unsigned __int128 containing the quotient from vec_divudq_10e32(). |
Vector Modulo by const 10e31 Unsigned Quadword.
Compute the remainder of a 128-bit value vra % 10e31.
processor | Latency | Throughput |
---|---|---|
power8 | 8-52 | 1/cycle |
power9 | 9-23 | 2/cycle |
vra | the dividend as a vector treated as a unsigned __int128. |
q | 128-bit unsigned __int128 containing the quotient from vec_divuq_10e31(). |
Vector Modulo by const 10e32 Unsigned Quadword.
Compute the remainder of a 128-bit value vra % 10e32.
processor | Latency | Throughput |
---|---|---|
power8 | 8-52 | 1/cycle |
power9 | 9-23 | 2/cycle |
vra | the dividend as a vector treated as a unsigned __int128. |
q | 128-bit unsigned __int128 containing the quotient from vec_divuq_10e32(). |
Vector Multiply-Sum and Write Carryout Unsigned Doubleword.
Compute the even and odd 128-bit products of doubleword 64-bit element values from a, b. Then compute the carry-out of the low order 128-bits of the sum of (aeven * beven) + (aodd * bodd) + c. Only the high order 2 bits of the 130-bit Multiply-Sum are returned and the low order 128-bits of the sum are ignored/lost. Results are in the range 0-2.
processor | Latency | Throughput |
---|---|---|
power8 | 30-32 | 1/cycle |
power9 | 5-7 | 2/cycle |
a | 128-bit __vector unsigned long long. |
b | 128-bit __vector unsigned long long. |
c | 128-bit __vector unsigned __int128. |
Vector Multiply-Sum Unsigned Doubleword Modulo.
compute the even and odd 128-bit products of doubleword 64-bit element values from a, b. Then compute the 128-bit sum (aeven * beven) + (aodd * bodd) + c. Only the low order 128 bits of the Multiply-Sum are returned and any overflow/carry-out is ignored/lost.
processor | Latency | Throughput |
---|---|---|
power8 | 30-32 | 1/cycle |
power9 | 5-7 | 2/cycle |
a | 128-bit __vector unsigned long int. |
b | 128-bit __vector unsigned long int. |
c | 128-bit __vector unsigned __int128. |
Vector Multiply by 10 & write Carry Unsigned Quadword.
Compute the product of a 128-bit value a * 10. Only the high order 128 bits of the product are returned. This will be a binary coded decimal value 0-9 in bits 124-127; bits 0-123 will be '0'.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 3 | 1/cycle |
a | 128-bit vector treated as a unsigned __int128. |
Vector Multiply by 10 Extended & write Carry Unsigned Quadword.
Compute the product of a 128-bit value a * 10 + digit(cin). Only the high order 128 bits of the extended product (the carry-out digit) are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 3 | 1/cycle |
a | 128-bit vector treated as unsigned __int128. |
cin | values 0-9 in bits 124:127 of a vector. |
Vector Multiply by 10 Extended Unsigned Quadword.
compute the product of a 128 bit value a * 10 + digit(cin). Only the low order 128 bits of the extended product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 3 | 1/cycle |
a | 128-bit vector treated as unsigned __int128. |
cin | values 0-9 in bits 124:127 of a vector. |
Vector Multiply by 10 Unsigned Quadword.
compute the product of a 128 bit value a * 10. Only the low order 128 bits of the product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 13-15 | 1/cycle |
power9 | 3 | 1/cycle |
a | 128-bit vector treated as unsigned __int128. |
Vector Multiply Even Unsigned Doublewords.
Multiply the even 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the even doublewords.
processor | Latency | Throughput |
---|---|---|
power8 | 21-23 | 1/cycle |
power9 | 8-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
Vector Multiply High Unsigned Doubleword.
Multiply the corresponding doubleword elements of two vector unsigned long values and return the high order 64-bits, from each 128-bit product.
processor | Latency | Throughput |
---|---|---|
power8 | 28-32 | 1/cycle |
power9 | 11-16 | 1/cycle |
Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.
vra | 128-bit vector unsigned long int. |
vrb | 128-bit vector unsigned long int. |
Vector Multiply High Unsigned Quadword.
compute the 256 bit product of two 128 bit values a, b. The high order 128 bits of the product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 56-64 | 1/cycle |
power9 | 33-39 | 1/cycle |
a | 128-bit vector treated as unsigned __int128. |
b | 128-bit vector treated as unsigned __int128. |
Vector Multiply Low Unsigned Quadword.
compute the 256 bit product of two 128 bit values a, b. Only the low order 128 bits of the product are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 42-48 | 1/cycle |
power9 | 16-20 | 2/cycle |
a | 128-bit vector treated as unsigned __int128. |
b | 128-bit vector treated as unsigned __int128. |
Vector Multiply Odd Unsigned Doublewords.
Multiply the odd 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the odd doublewords.
processor | Latency | Throughput |
---|---|---|
power8 | 21-23 | 1/cycle |
power9 | 8-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
Vector Multiply Unsigned Doubleword Modulo.
Multiply the corresponding doubleword elements of two vector unsigned long values and return the low order 64-bits of the 128-bit product for each element.
processor | Latency | Throughput |
---|---|---|
power8 | 19-28 | 1/cycle |
power9 | 11-16 | 1/cycle |
vra | 128-bit vector unsigned long long. |
vrb | 128-bit vector unsigned long long. |
Vector Multiply Unsigned Double Quadword.
compute the 256 bit product of two 128 bit values a, b. The low order 128 bits of the product are returned, while the high order 128-bits are "stored" via the mulu pointer.
processor | Latency | Throughput |
---|---|---|
power8 | 52-56 | 1/cycle |
power9 | 24-30 | 1/cycle |
*mulu | pointer to vector unsigned __int128 to receive the upper 128-bits of the product. |
a | 128-bit vector treated as unsigned __int128. |
b | 128-bit vector treated as unsigned __int128. |
Vector Negate Signed Quadword.
Negate (0 - int128) the quadword.
processor | Latency | Throughput |
---|---|---|
power8 | 6-8 | 2/cycle |
power9 | 9-12 | 2/cycle |
int128 | a 128-bit vector treated as signed __int128. |
Vector Negate Unsigned Quadword.
Negate (0 - int128) the quadword.
processor | Latency | Throughput |
---|---|---|
power8 | 6-8 | 2/cycle |
power9 | 9-12 | 2/cycle |
int128 | a 128-bit vector treated as unsigned __int128. |
Vector Population Count Quadword for unsigned __int128 elements.
Count the number of '1' bits within a vector unsigned __int128 and return the count (0-128) in a vector unsigned __int128.
processor | Latency | Throughput |
---|---|---|
power8 | 9-11 | 2/cycle |
power9 | 9-12 | 2/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
Vector Byte Reverse Quadword.
Return the bytes / octets of a 128-bit vector in reverse order.
processor | Latency | Throughput |
---|---|---|
power8 | 2-13 | 2 cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
Vector Rotate Left Quadword.
Vector Rotate Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1 cycle |
power9 | 14 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
vrb | Shift amount in bits 121:127. |
Vector Rotate Left Quadword Immediate.
Vector Rotate Left Quadword 0-127 bits. The shift amount is the immediate value shb, in the range 0-127.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1 cycle |
power9 | 14 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
shb | Shift amount in the range 0-127. |
Vector Select Signed Quadword.
Return the value, (vra & ~vrc) | (vrb & vrc).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as signed __int128. |
vrb | a 128-bit vector treated as signed __int128. |
vrc | a 128-bit vector treated as bool __int128. |
Vector Select Unsigned Quadword.
Return the value, (vra & ~vrc) | (vrb & vrc).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
vrb | a 128-bit vector treated as unsigned __int128. |
vrc | a 128-bit vector treated as bool __int128. |
Vector Set Bool from Quadword Carry.
If the vector quadword carry bit (vcy.bit[127]) is '1' then return a vector bool __int128 that is all '1's. Otherwise return all '0's.
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 6 | 2/cycle |
power9 | 3 - 5 | 2/cycle |
Vector quadword carries are normally the result of a write-Carry operation. For example; vec_addcuq(), vec_addecuq(), vec_subcuq(), vec_subecuq(), vec_addcq(), vec_addeq().
vcy | a 128-bit vector generated from a write-Carry operation. |
Vector Set Bool from Quadword not Carry.
If the vector quadword carry bit (vcy.bit[127]) is '1' then return a vector bool __int128 that is all '0's. Otherwise return all '1's.
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 6 | 2/cycle |
power9 | 3 - 5 | 2/cycle |
Vector quadword carries are normally the result of a write-Carry operation. For example; vec_addcuq(), vec_addecuq(), vec_subcuq(), vec_subecuq(), vec_addcq(), vec_addeq().
vcy | a 128-bit vector generated from a write-Carry operation. |
Vector Set Bool from Signed Quadword.
If the quadword's sign bit is '1' then return a vector bool __int128 that is all '1's. Otherwise return all '0's.
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 6 | 2/cycle |
power9 | 5 - 8 | 2/cycle |
vra | a 128-bit vector treated as signed __int128. |
Vector Shift Left Double Quadword.
Vector Shift Left double Quadword 0-127 bits. Return a vector __int128 that is the left most 128-bits after shifting left 0-127-bits of the 256-bit double vector (vrw||vrx). The shift amount is from bits 121:127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1 cycle |
power9 | 14 | 1/cycle |
vrw | upper 128-bits of the 256-bit double vector. |
vrx | lower 128-bits of the 256-bit double vector. |
vrb | Shift amount in bits 121:127. |
Vector Shift Left Double Quadword Immediate.
Vector Shift Left double Quadword 0-127 bits. Return a vector __int128 that is the left most 128-bits after shifting left 0-127-bits of the 256-bit double vector (vrw||vrx). The shift amount is the immediate value shb, in the range 0-127.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1 cycle |
power9 | 14 | 1/cycle |
vrw | upper 128-bits of the 256-bit double vector. |
vrx | lower 128-bits of the 256-bit double vector. |
shb | Shift amount in the range 0-127. |
Vector Shift Left Quadword.
Vector Shift Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 1/cycle |
power9 | 6 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
vrb | Shift amount in bits 121:127. |
Vector Shift Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
vra | a 128-bit vector treated a __int128. |
Vector Shift Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
vra | a 128-bit vector treated as a __int128. |
Returns the left shifted vector.
Vector Shift Left Quadword Immediate.
Shift left Quadword 0-127 bits. The shift amount is a const unsigned int in the range 0-127. A shift count of 0 returns the original value of vra. Shift counts greater than 127 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 2-13 | 2 cycle |
power9 | 3-15 | 2/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
shb | Shift amount in the range 0-127. |
|
inline static |
Vector Splat Immediate Signed Quadword. Extend a signed integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Signed (Byte | Halfword | Word).
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 9 | 1/cycle |
power9 | 5 - 9 | 2/cycle |
sim | a small signed integer const. |
|
inline static |
Vector Splat Immediate Unsigned Quadword. Extend an unsigned integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Unsigned (Byte | Halfword | Word).
processor | Latency | Throughput |
---|---|---|
power8 | 4 - 9 | 1/cycle |
power9 | 5 - 9 | 2/cycle |
sim | a small unsigned integer const. |
Vector Shift Right Algebraic Quadword.
Vector Shift Right Algebraic Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 10 | 1 cycle |
power9 | 14 | 1/cycle |
vra | a 128-bit vector treated as signed __int128. |
vrb | Shift amount in bits 121:127. |
Vector Shift Right Algebraic Quadword Immediate.
Vector Shift Right Algebraic Quadword 0-127 bits. The shift amount is the constant immediate value shb, in the range 0-127.
processor | Latency | Throughput |
---|---|---|
power8 | 6-15 | 1 cycle |
power9 | 9-18 | 1/cycle |
vra | a 128-bit vector treated as signed __int128. |
shb | Shift amount in the range 0-127. |
Vector Shift Right Quadword.
Vector Shift Right Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 1/cycle |
power9 | 6 | 1/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
vrb | Shift amount in bits 121:127. |
Vector Shift Right Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
vra | a 128-bit vector treated as a __int128. |
Vector Shift Right Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.
vra | a 128-bit vector treated as a __int128. |
Vector Shift Right Quadword Immediate.
Shift right Quadword 0-127 bits. The shift amount is a const unsigned int in the range 0-127. A shift count of 0 returns the original value of vra. Shift counts greater than 127 bits return zero.
processor | Latency | Throughput |
---|---|---|
power8 | 2-13 | 2 cycle |
power9 | 3-15 | 2/cycle |
vra | a 128-bit vector treated as unsigned __int128. |
shb | Shift amount in the range 0-127. |
Vector Subtract and Write Carry Unsigned Quadword.
Generate the carry-out of the sum (vra + NOT(vrb) + 1).
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as unsigned __int128. |
vrb | 128-bit vector treated as unsigned __int128. |
Vector Subtract Extended and Write Carry Unsigned Quadword.
Generate the carry-out of the sum (vra + NOT(vrb) + vrc.bit[127]).
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as unsigned __int128. |
vrb | 128-bit vector treated as unsigned __int128. |
vrc | 128-bit vector carry-in from bit 127. |
Vector Subtract Extended Unsigned Quadword Modulo.
Subtract two vector __int128 values with carry-in and return the result modulo 128-bits (vra + NOT(vrb) + vrc.bit[127]).
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as unsigned __int128. |
vrb | 128-bit vector treated as unsigned __int128. |
vrc | 128-bit vector carry-in from bit 127. |
Vector Subtract Unsigned Quadword Modulo.
Subtract two vector __int128 values and return result modulo 128-bits.
processor | Latency | Throughput |
---|---|---|
power8 | 4 | 2/2 cycles |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as unsigned __int128. |
vrb | 128-bit vector treated as unsigned __int128. |
Vector Multiply-Add2 Even Unsigned Doublewords.
Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the even doublewords of c and d ((a_even * b_even) + c_even + d_even).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 13-18 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned long int. |
d | 128-bit vector unsigned long int. |
Vector Multiply-Add2 Odd Unsigned Doublewords.
Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the odd doublewords of c and d ((a_odd * b_odd) + c_odd + d_odd).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 13-18 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned long int. |
d | 128-bit vector unsigned long int. |
Vector Multiply-Add Even Unsigned Doublewords.
Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the even doubleword of c ((a_even * b_even) + c_even).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 10-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned long int. |
Vector Multiply-Add Odd Unsigned Doublewords.
Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the odd doubleword of c ((a_odd * b_odd) + c_odd).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 10-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned long int. |
Vector Multiply-Sum Even Unsigned Doublewords.
Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and c ((a_even * b_even) + c).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 10-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned __int128. |
Vector Multiply-Sum Odd Unsigned Doublewords.
Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and variable c ((a_odd * b_odd) + c).
processor | Latency | Throughput |
---|---|---|
power8 | 25-28 | 1/cycle |
power9 | 10-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
c | 128-bit vector unsigned __int128. |
Vector Multiply Even Unsigned Doublewords.
Multiply the even 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the even doublewords.
processor | Latency | Throughput |
---|---|---|
power8 | 21-23 | 1/cycle |
power9 | 8-11 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
Vector Multiply Odd Unsigned Doublewords.
Multiply the odd 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the odd doublewords.
processor | Latency | Throughput |
---|---|---|
power8 | 21-23 | 1/cycle |
power9 | 8-13 | 2/cycle |
a | 128-bit vector unsigned long int. |
b | 128-bit vector unsigned long int. |
Vector Shift Left Double Quadword by Bit Immediate.
Return a vector __int128 that is bits shb:shb+127 from the (256-bit) double quadword (vra || vrb). The shift amount is a constant immediate value in the range 0-7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 1 cycle |
power9 | 11 | 1/cycle |
vra | upper 128-bits of the 256-bit double quadword vector. |
vrb | lower 128-bits of the 256-bit double quadword vector. |
shb | Shift amount in the range 0-7. |
Vector Shift Right Double Quadword by Bit Immediate.
Return a vector __int128 that is bits 128-shb:255-shb from the (256-bit) double quadword (vra || vrb). The shift amount is a constant immediate value in the range 0-7.
processor | Latency | Throughput |
---|---|---|
power8 | 8 | 1 cycle |
power9 | 11 | 1/cycle |
vra | upper 128-bits of the 256-bit double quadword vector. |
vrb | lower 128-bits of the 256-bit double quadword vector. |
shb | Shift amount in the range 0-7. |