Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions. More...

#include <pveclib/vec_common_ppc.h>
#include <pveclib/vec_int64_ppc.h>

Macros
#define	CONST_VUINT128_QxW(__q0, __q1, __q2, __q3)
	Generate a vector unsigned __int128 constant from words. More...

#define	CONST_VUINT128_QxD(__q0, __q1)
	Generate a vector unsigned __int128 constant from doublewords. More...

#define	CONST_VUINT128_Qx19d(__q0, __q1)
	Generate a vector unsigned __int128 constant from doublewords. More...

#define	CONST_VUINT128_Qx18d(__q0, __q1)
	Generate a vector unsigned __int128 constant from doublewords. More...

#define	CONST_VUINT128_Qx16d(__q0, __q1)
	Generate a vector unsigned __int128 constant from doublewords. More...

Functions
static vui128_t	vec_absduq (vui128_t vra, vui128_t vrb)
	Vector Absolute Difference Unsigned Quadword. More...

static vi128_t	vec_abssq (vi128_t vra)
	Vector Absolute Value Signed Quadword. More...

static vui128_t	vec_avguq (vui128_t vra, vui128_t vrb)
	Vector Average Unsigned Quadword. More...

static vui128_t	vec_addcuq (vui128_t a, vui128_t b)
	Vector Add & write Carry Unsigned Quadword. More...

static vui128_t	vec_addecuq (vui128_t a, vui128_t b, vui128_t ci)
	Vector Add Extended & write Carry Unsigned Quadword. More...

static vui128_t	vec_addeuqm (vui128_t a, vui128_t b, vui128_t ci)
	Vector Add Extended Unsigned Quadword Modulo. More...

static vui128_t	vec_adduqm (vui128_t a, vui128_t b)
	Vector Add Unsigned Quadword Modulo. More...

static vui128_t	vec_addcq (vui128_t *cout, vui128_t a, vui128_t b)
	Vector Add with carry Unsigned Quadword. More...

static vui128_t	vec_addeq (vui128_t *cout, vui128_t a, vui128_t b, vui128_t ci)
	Vector Add Extend with carry Unsigned Quadword. More...

static vui128_t	vec_clzq (vui128_t vra)
	Vector Count Leading Zeros Quadword for unsigned __int128 elements. More...

static vui128_t	vec_ctzq (vui128_t vra)
	Vector Count Trailing Zeros Quadword for unsigned __int128 elements. More...

static vb128_t	vec_cmpeqsq (vi128_t vra, vi128_t vrb)
	Vector Compare Equal Signed Quadword. More...

static vb128_t	vec_cmpequq (vui128_t vra, vui128_t vrb)
	Vector Compare Equal Unsigned Quadword. More...

static vb128_t	vec_cmpgesq (vi128_t vra, vi128_t vrb)
	Vector Compare Greater Than or Equal Signed Quadword. More...

static vb128_t	vec_cmpgeuq (vui128_t vra, vui128_t vrb)
	Vector Compare Greater Than or Equal Unsigned Quadword. More...

static vb128_t	vec_cmpgtsq (vi128_t vra, vi128_t vrb)
	Vector Compare Greater Than Signed Quadword. More...

static vb128_t	vec_cmpgtuq (vui128_t vra, vui128_t vrb)
	Vector Compare Greater Than Unsigned Quadword. More...

static vb128_t	vec_cmplesq (vi128_t vra, vi128_t vrb)
	Vector Compare Less Than or Equal Signed Quadword. More...

static vb128_t	vec_cmpleuq (vui128_t vra, vui128_t vrb)
	Vector Compare Less Than or Equal Unsigned Quadword. More...

static vb128_t	vec_cmpltsq (vi128_t vra, vi128_t vrb)
	Vector Compare Less Than Signed Quadword. More...

static vb128_t	vec_cmpltuq (vui128_t vra, vui128_t vrb)
	Vector Compare Less Than Unsigned Quadword. More...

static vb128_t	vec_cmpnesq (vi128_t vra, vi128_t vrb)
	Vector Compare Not Equal Signed Quadword. More...

static vb128_t	vec_cmpneuq (vui128_t vra, vui128_t vrb)
	Vector Compare Not Equal Unsigned Quadword. More...

static int	vec_cmpsq_all_eq (vi128_t vra, vi128_t vrb)
	Vector Compare all Equal Signed Quadword. More...

static int	vec_cmpsq_all_ge (vi128_t vra, vi128_t vrb)
	Vector Compare all Greater Than or Equal Signed Quadword. More...

static int	vec_cmpsq_all_gt (vi128_t vra, vi128_t vrb)
	Vector Compare all Greater Than Signed Quadword. More...

static int	vec_cmpsq_all_le (vi128_t vra, vi128_t vrb)
	Vector Compare all Less Than or Equal Signed Quadword. More...

static int	vec_cmpsq_all_lt (vi128_t vra, vi128_t vrb)
	Vector Compare all Less Than Signed Quadword. More...

static int	vec_cmpsq_all_ne (vi128_t vra, vi128_t vrb)
	Vector Compare all Not Equal Signed Quadword. More...

static int	vec_cmpuq_all_eq (vui128_t vra, vui128_t vrb)
	Vector Compare all Equal Unsigned Quadword. More...

static int	vec_cmpuq_all_ge (vui128_t vra, vui128_t vrb)
	Vector Compare all Greater Than or Equal Unsigned Quadword. More...

static int	vec_cmpuq_all_gt (vui128_t vra, vui128_t vrb)
	Vector Compare all Greater Than Unsigned Quadword. More...

static int	vec_cmpuq_all_le (vui128_t vra, vui128_t vrb)
	Vector Compare all Less Than or Equal Unsigned Quadword. More...

static int	vec_cmpuq_all_lt (vui128_t vra, vui128_t vrb)
	Vector Compare all Less Than Unsigned Quadword. More...

static int	vec_cmpuq_all_ne (vui128_t vra, vui128_t vrb)
	Vector Compare all Not Equal Unsigned Quadword. More...

static vui128_t	vec_cmul10ecuq (vui128_t *cout, vui128_t a, vui128_t cin)
	Vector combined Multiply by 10 Extended & write Carry Unsigned Quadword. More...

static vui128_t	vec_cmul10cuq (vui128_t *cout, vui128_t a)
	Vector combined Multiply by 10 & write Carry Unsigned Quadword. More...

static vi128_t	vec_divsq_10e31 (vi128_t vra)
	Vector Divide by const 10e31 Signed Quadword. More...

static vui128_t	vec_divudq_10e31 (vui128_t *qh, vui128_t vra, vui128_t vrb)
	Vector Divide Unsigned Double Quadword by const 10e31. More...

static vui128_t	vec_divudq_10e32 (vui128_t *qh, vui128_t vra, vui128_t vrb)
	Vector Divide Unsigned Double Quadword by const 10e32. More...

static vui128_t	vec_divuq_10e31 (vui128_t vra)
	Vector Divide by const 10e31 Unsigned Quadword. More...

static vui128_t	vec_divuq_10e32 (vui128_t vra)
	Vector Divide by const 10e32 Unsigned Quadword. More...

static vi128_t	vec_maxsq (vi128_t vra, vi128_t vrb)
	Vector Maximum Signed Quadword. More...

static vui128_t	vec_maxuq (vui128_t vra, vui128_t vrb)
	Vector Maximum Unsigned Quadword. More...

static vi128_t	vec_minsq (vi128_t vra, vi128_t vrb)
	Vector Minimum Signed Quadword. More...

static vui128_t	vec_minuq (vui128_t vra, vui128_t vrb)
	Vector Minimum Unsigned Quadword. More...

static vi128_t	vec_modsq_10e31 (vi128_t vra, vi128_t q)
	Vector Modulo by const 10e31 Signed Quadword. More...

static vui128_t	vec_modudq_10e31 (vui128_t vra, vui128_t vrb, vui128_t *ql)
	Vector Modulo Unsigned Double Quadword by const 10e31. More...

static vui128_t	vec_modudq_10e32 (vui128_t vra, vui128_t vrb, vui128_t *ql)
	Vector Modulo Unsigned Double Quadword by const 10e32. More...

static vui128_t	vec_moduq_10e31 (vui128_t vra, vui128_t q)
	Vector Modulo by const 10e31 Unsigned Quadword. More...

static vui128_t	vec_moduq_10e32 (vui128_t vra, vui128_t q)
	Vector Modulo by const 10e32 Unsigned Quadword. More...

static vui128_t	vec_mul10cuq (vui128_t a)
	Vector Multiply by 10 & write Carry Unsigned Quadword. More...

static vui128_t	vec_mul10ecuq (vui128_t a, vui128_t cin)
	Vector Multiply by 10 Extended & write Carry Unsigned Quadword. More...

static vui128_t	vec_mul10euq (vui128_t a, vui128_t cin)
	Vector Multiply by 10 Extended Unsigned Quadword. More...

static vui128_t	vec_mul10uq (vui128_t a)
	Vector Multiply by 10 Unsigned Quadword. More...

static vui128_t	vec_cmul100cuq (vui128_t *cout, vui128_t a)
	Vector combined Multiply by 100 & write Carry Unsigned Quadword. More...

static vui128_t	vec_cmul100ecuq (vui128_t *cout, vui128_t a, vui128_t cin)
	Vector combined Multiply by 100 Extended & write Carry Unsigned Quadword. More...

static vui128_t	vec_msumcud (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum and Write Carryout Unsigned Doubleword. More...

static vui128_t	vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Unsigned Doubleword Modulo. More...

static vui128_t	vec_muleud (vui64_t a, vui64_t b)
	Vector Multiply Even Unsigned Doublewords. More...

static vui64_t	vec_mulhud (vui64_t vra, vui64_t vrb)
	Vector Multiply High Unsigned Doubleword. More...

static vui128_t	vec_muloud (vui64_t a, vui64_t b)
	Vector Multiply Odd Unsigned Doublewords. More...

static vui64_t	vec_muludm (vui64_t vra, vui64_t vrb)
	Vector Multiply Unsigned Doubleword Modulo. More...

static vui128_t	vec_mulhuq (vui128_t a, vui128_t b)
	Vector Multiply High Unsigned Quadword. More...

static vui128_t	vec_mulluq (vui128_t a, vui128_t b)
	Vector Multiply Low Unsigned Quadword. More...

static vui128_t	vec_muludq (vui128_t *mulu, vui128_t a, vui128_t b)
	Vector Multiply Unsigned Double Quadword. More...

static vui128_t	vec_madduq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c)
	Vector Multiply-Add Unsigned Quadword. More...

static vui128_t	vec_madd2uq (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c1, vui128_t c2)
	Vector Multiply-Add2 Unsigned Quadword. More...

static vi128_t	vec_negsq (vi128_t int128)
	Vector Negate Signed Quadword. More...

static vui128_t	vec_neguq (vui128_t int128)
	Vector Negate Unsigned Quadword. More...

static vui128_t	vec_popcntq (vui128_t vra)
	Vector Population Count Quadword for unsigned __int128 elements. More...

static vui128_t	vec_revbq (vui128_t vra)
	Vector Byte Reverse Quadword. More...

static vui128_t	vec_rlq (vui128_t vra, vui128_t vrb)
	Vector Rotate Left Quadword. More...

static vui128_t	vec_rlqi (vui128_t vra, const unsigned int shb)
	Vector Rotate Left Quadword Immediate. More...

static vi128_t	vec_selsq (vi128_t vra, vi128_t vrb, vb128_t vrc)
	Vector Select Signed Quadword. More...

static vui128_t	vec_seluq (vui128_t vra, vui128_t vrb, vb128_t vrc)
	Vector Select Unsigned Quadword. More...

static vb128_t	vec_setb_cyq (vui128_t vcy)
	Vector Set Bool from Quadword Carry. More...

static vb128_t	vec_setb_ncq (vui128_t vcy)
	Vector Set Bool from Quadword not Carry. More...

static vb128_t	vec_setb_sq (vi128_t vra)
	Vector Set Bool from Signed Quadword. More...

static vui128_t	vec_sldq (vui128_t vrw, vui128_t vrx, vui128_t vrb)
	Vector Shift Left Double Quadword. More...

static vui128_t	vec_sldqi (vui128_t vrw, vui128_t vrx, const unsigned int shb)
	Vector Shift Left Double Quadword Immediate. More...

static vui128_t	vec_slq (vui128_t vra, vui128_t vrb)
	Vector Shift Left Quadword. More...

static vui128_t	vec_slqi (vui128_t vra, const unsigned int shb)
	Vector Shift Left Quadword Immediate. More...

static vi128_t	vec_splat_s128 (const int sim)
	Vector Splat Immediate Signed Quadword. Extend a signed integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Signed (Byte \| Halfword \|Word). More...

static vui128_t	vec_splat_u128 (const int sim)
	Vector Splat Immediate Unsigned Quadword. Extend an unsigned integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Unsigned (Byte \| Halfword \|Word). More...

static vi128_t	vec_sraq (vi128_t vra, vui128_t vrb)
	Vector Shift Right Algebraic Quadword. More...

static vi128_t	vec_sraqi (vi128_t vra, const unsigned int shb)
	Vector Shift Right Algebraic Quadword Immediate. More...

static vui128_t	vec_srq (vui128_t vra, vui128_t vrb)
	Vector Shift Right Quadword. More...

static vui128_t	vec_srqi (vui128_t vra, const unsigned int shb)
	Vector Shift Right Quadword Immediate. More...

static vui128_t	vec_slq4 (vui128_t vra)

static vui128_t	vec_slq5 (vui128_t vra)

static vui128_t	vec_srq4 (vui128_t vra)

static vui128_t	vec_srq5 (vui128_t vra)

static vui128_t	vec_subcuq (vui128_t vra, vui128_t vrb)
	Vector Subtract and Write Carry Unsigned Quadword. More...

static vui128_t	vec_subecuq (vui128_t vra, vui128_t vrb, vui128_t vrc)
	Vector Subtract Extended and Write Carry Unsigned Quadword. More...

static vui128_t	vec_subeuqm (vui128_t vra, vui128_t vrb, vui128_t vrc)
	Vector Subtract Extended Unsigned Quadword Modulo. More...

static vui128_t	vec_subuqm (vui128_t vra, vui128_t vrb)
	Vector Subtract Unsigned Quadword Modulo. More...

static vui128_t	vec_vmuleud (vui64_t a, vui64_t b)
	Vector Multiply Even Unsigned Doublewords. More...

static vui128_t	vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)
	Vector Multiply-Add Even Unsigned Doublewords. More...

static vui128_t	vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
	Vector Multiply-Add2 Even Unsigned Doublewords. More...

static vui128_t	vec_vmuloud (vui64_t a, vui64_t b)
	Vector Multiply Odd Unsigned Doublewords. More...

static vui128_t	vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)
	Vector Multiply-Add Odd Unsigned Doublewords. More...

static vui128_t	vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
	Vector Multiply-Add2 Odd Unsigned Doublewords. More...

static vui128_t	vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Even Unsigned Doublewords. More...

static vui128_t	vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Odd Unsigned Doublewords. More...

static vui128_t	vec_vsldbi (vui128_t vra, vui128_t vrb, const unsigned int shb)
	Vector Shift Left Double Quadword by Bit Immediate. More...

static vui128_t	vec_vsrdbi (vui128_t vra, vui128_t vrb, const unsigned int shb)
	Vector Shift Right Double Quadword by Bit Immediate. More...

Detailed Description

Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions.

Some of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins. Other operations do not exist as instructions on any current processor but are useful and should be provided. This header serves to provide these operations as inline functions using existing vector built-ins or other pveclib operations.

The original VMX (AKA Altivec) only defined a few instructions that operated on the 128-bit vector as a whole. This included the vector shift left/right (bit), vector shift left/right by octet (byte), vector shift left double by octet (select a contiguous 16-bytes from the 256-bit concatenation of 2 vectors), and generalized vector permute (select any 16-bytes from 2 concatenated vectors). Use of these instructions can be complicated when:

the shift amount is more than 8 bits,
the shift amount is not a multiple of 8-bits (octet),
the shift amount is a constant and needs to be generated/loaded before use.

These instructions can be used in combination to provide generalized vector __int128 shift/rotate operations. Pveclib uses these operations to provide vector __int128 shift / rotate left, shift right and shift algebraic right operations. These operations require pre-conditions to avoid multiple instructions or require a combination of (bit and octet shift) instructions to get the quadword result. The compiler <altivec.h> built-ins only support individual instructions. So using these operations quickly inspires a need for a header (like this) to contain implementations of the common operations.

The VSX facility (introduced with POWER7) did not add any integer doubleword (64-bit) or quadword (128-bit) operations. However it did add a useful doubleword permute immediate and word wise; merge, shift, and splat immediate operations. Otherwise vector __int128 (128-bit elements) operations have to be implemented using VMX word and halfword element integer operations for POWER7.

POWER8 added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend). The add quadword is useful to sum the partial products for a full 128 x 128-bit multiply. The add quadword write carry and extend forms, simplify extending arithmetic to 256-bits and beyond.

While POWER8 provided quadword integer add and subtract operations, it did not provide quadword Signed/Unsigned integer compare operations. It is possible to implement quadword compare operations using existing word / doubleword compares and the new quadword subtract write-carry operation. The trick is to convert the carry into a vector bool __int128 via the vec_setb_ncq () operation. This header provides easy to use quadword compare operations.

POWER9 (PowerISA 3.0B) adds the Vector Multiply-Sum unsigned Doubleword Modulo instruction. Aspects of this instruction mean it needs to be used carefully as part of a larger quadword multiply. It performs only two of the four required doubleword multiplies. The final quadword modulo sum will discard any overflow/carry from the potential 130-bit result. With careful pre-conditioning of doubleword inputs the results can not overflow from 128-bits. Then separate add quadword add/write carry operations can be used to complete the sum of partial products. These techniques are used in the POWER9 specific implementations of vec_muleud, vec_muloud, vec_mulluq, and vec_muludq.

PowerISA 3.0B also defined additional: Binary Coded Decimal (BCD) and Zoned character format conversions. String processing operations. Vector Parity operations. Integer Extend Sign Operations. Integer Absolute Difference Operations. All of these seem to be useful additions to pveclib for older (POWER7/8) processors and across element sizes (including quadword elements).

Most of these intrinsic (compiler built-in) operations are defined in <altivec.h> and described in the compiler documentation. However it took several compiler releases for all the new POWER8 64-bit and 128-bit integer vector intrinsics to be added to altivec.h. This support started with GCC 4.9 but was not complete across function/type and bug free until GCC 6.0.

Note: The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example, if you compile with -mcpu=power7, vec_vadduqm and vec_vsubudm will not be defined. But vec_adduqm() and vec_subudm() will always be defined in this header, will generate the minimum code appropriate for the target, and produce correct results.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. So this header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

This header covers operations that are either:

Operations implemented in hardware instructions for later processors and useful to programmers, on slightly older processors, even if the equivalent function requires more instructions. Examples include quadword byte reverse, add and subtract.
Defined in the OpenPOWER ABI but not yet defined in <altivec.h> provided by available compilers in common use. Examples include quadword byte reverse, add and subtract.
Are commonly used operations, not covered by the ABI or <altivec.h>, and require multiple instructions or are not obvious. Examples include quadword; Signed and Unsigned compare, shift immediate, multiply, multiply by 10 immediate, count leading zeros and population count.

Note: The Multiply sum/even/odd doubleword operations are currently implemented here (in <vec_int128_ppc.h>) which resolves a dependency on Add Quadword. These functions (vec_msumudm, vec_muleud, vec_muloud) all produce a quadword results and may use the vec_adduqm implementation to sum partial products.

See Returning extended quadword results. for more background on extended quadword computation.

Endian problems with quadword implementations

Technically operations on quadword elements should not require any endian specific transformation. There is only one element so there can be no confusion about element numbering or order. However some of the more complex quadword operations are constructed from operations on smaller elements. And those operations as provided by <altivec.h> are required by the OpenPOWER ABI to be endian sensitive. See Endian problems with doubleword operations for a more detailed discussion.

In any case the arithmetic (high to low) order of bits in a quadword is defined in the PowerISA (See vec_adduqm() and vec_subuqm()). So pveclib implementations will need to either:

Nullify little endian transforms of <altivec.h> operations. The <altivec.h> built-ins vec_muleuw(), vec_mulouw(), vec_mergel(), and vec_mergeh() are endian sensitive and often require nullification that restores the original operation.
Use new operations that are specifically defined to be stable across BE/LE implementations. The pveclib operations; vec_vmuleud(), vec_vmuloud(), vec_mrgahd(), vec_mrgald(), and vec_permdi() are defined to be endian stable.

Quadword Integer Constants

The compilers may not support 128-bit integers for constants and printf (integer to ascii). For example GCC provides ANSI mandated constant and runtime support for integers up to long long which for PowerPC is only 64-bit.

The __int128 type is an extension that provides basic arithmetic operations but does not compile 128-bit constants or support printf formatting for integers larger than long long. The following section provides examples and workarounds for these restrictions.

The GCC compiler allows integer constants to be assigned/cast to __int128 types. The support also allows __int128 constants to be assigned/cast to vector __int128 types. So the following are allowed:

const vui128_t vec128_zeros = {(vui128_t) ((unsigned __int128) 0)};
const vui128_t vec128_10 = {(vui128_t) ((unsigned __int128) 10)};
const vui128_t vec128_10to16 = {(vui128_t) ((unsigned __int128)
                                10000000000000000UL)};
const vui128_t vec128_maxLong = {(vui128_t) ((unsigned __int128)
                                __INT64_MAX__)};
const vui128_t vec128_max_Long = {(vui128_t) ((unsigned __int128)
                                0x7fffffffffffffffL)};
// -1 signed extended to __int128 is 0xFFFF...FFFF
const vui128_t vec128_foxes = {(vui128_t) ((__int128) -1L)};

It gets more complicated when the constant exceeds the range of a long long value. For example the magic numbers for the multiplicative inverse described in Printing Vector __int128 values. The decimal integer constant we need for the quadword multiplier is "76624777043294442917917351357515459181" or the equivalent hexadecimal value "0x39a5652fb1137856d30baf9a1e626a6d". GCC does not allow constants this large to be expressed directly.

GCC supports aggregate initializer lists for the elements of vectors. For example:

vui32_t xyzw = (vector int) { 1, 2, 3, 4 };

So it is possible to compose a quadword constant by initializing a vector of word or doubleword elements then casting the result to a quadword type. For example:

const vui128_t invmul = (vui128_t) (vector unsigned long long)

{ 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL };

or

const vui128_t invmul = (vui128_t) (vector unsigned int)

{ 0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d };

There is one small problem with this as element order is endian dependent, while a vector quadword integer is always big endian. So we would need to adjust the element order for endian. For example:

   const vui128_t invmul = (vui128_t) (vector unsigned long long)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                     { 0xd30baf9a1e626a6dUL, 0x39a5652fb1137856UL };
#else
                     { 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL };
#endif

or

   const vui128_t invmul = (vui128_t) (vector unsigned int)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                     { 0x1e626a6d, 0xd30baf9a, 0xb1137856, 0x39a5652f };
#else
                     { 0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d };
#endif

Remembering to add the endian correction for constants used in quadword operations is an issue and manually reversing the element order can be error prone. There should be an easier way.

Support for Quadword Integer Constants

The vec_common_ppc.h header provides some helper macros for when quadword operations need big endian element order on little endian platforms. These macros accept 2, 4, 8, or 16 element constants to form an aggregate initializer for a vector of the corresponding element type. The elements are always arranged left to right, high to low order. These macros are endian sensitive and either effectively pass-through for big endian or reverse the element order for little endian.

For example:

const vui128_t mul_invs_ten16 = (vui128_t) CONST_VINT128_DW(

0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);

or

const vui128_t mul_invs_ten16 = (vui128_t) CONST_VINT128_W(

0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d);

These macros internally cast to a vector unsigned integer type for the aggregate initializer. This type corresponds to the size and number of elements to fit in a 128-bit vector. This tells the compiler how many elements to expect and the allowed value range for the initializer. A final explicit cast is required to the vector type needed (usually a signed or unsigned __int128). (See: CONST_VINT128_DW(), CONST_VINT128_W(), CONST_VINT128_H(), CONST_VINT128_B() ). Other macros require the programmer to provide a cast to match the element count and size. (See: CONST_VINT64_DW(), CONST_VINT32_W(), CONST_VINT16_H(), CONST_VINT8_B() )

The methods above are effectively forming multi-digit constants where each digit is itself a large (word or doubleword) binary coded integer value. Because the digits are radix 2**N it is normal to convert large decimal constants to hexadecimal. This makes it easier to split the large constants into word or doubleword elements for the initializer.

Most compilers support compile time computation on constants. This is an optimization where only the final computed constant result is used in the generated code. Compile time constant computation supports the usual arithmetic operations on the usual types. Some compilers (including GCC) support constant computation on extended types including __int128.

For example:

const vui128_t ten32_minus1 = (vui128_t)
      (((unsigned __int128) 9999999999999999UL) * 10000000000000000UL)
     + ((unsigned __int128) 9999999999999999UL);

produces the quadword integer value for the decimal constant 99999999999999999999999999999999.

Note: we must cast any int or long long constants to [unsigned] __int128 so the compiler will use 128-bit arithmetic to compute the final constant.

With this technique we can split large decimal constants into 16, 18, or 19 digit blocks and then compute effective 32, 36, or 38 digit constants. (see CONST_VUINT128_Qx16d(), CONST_VUINT128_Qx18d(), and CONST_VUINT128_Qx19d()). For example:

  const vui128_t ten32_minus1 = CONST_VUINT128_Qx16d
        ( 9999999999999999UL, 9999999999999999UL );
// The quadword multiplicative inverse to divide by 10**16
// is 76624777043294442917917351357515459181.
// Which is 38 digits, so we split into 2 consts of 19 digits each.
  const vui128_t mul_invs_ten16 = CONST_VUINT128_Qx19d(
     7662477704329444291UL, 7917351357515459181UL);

Loading small Quadword constants

Programming with quadword integers will need quadword constants for masking and arithmetic operations. In the sections above we provide means to define large and complex constants. But often there is need for small integer constants for use in boolean logic, masking/select operations, and simple arithmetic.

The technique above can be used for small integer constants as well. For example:

const vui128_t qw_one = CONST_VINT128_DW(0, 1);
const vui128_t qw_ten = CONST_VINT128_DW(0, 10);
const vui128_t qw_digit_mask = CONST_VINT128_DW(0, 0xf);

In most cases the compiler will allocate these constant values to the read-only data (.rodata) section. When these constants are referenced in programming operations the compiler generates the appropriate vector loads. For example GCC V11 generates the following for the -mcpu=power8 target:

addis   r9,r2,.rodata.cst16+0x30@toc@ha
addi    r9,r9,.rodata.cst16+0x30@toc@l
lvx     v2,0,r9

And the following for the -mcpu=power9 target:

addis   r9,r2,.rodata.cst16+0x30@toc@ha
addi    r9,r9,.rodata.cst16+0x30@toc@l
lxv     v2,0(r9)

This is expected for POWER8 as PowerISA 2.07B does not have any displacement form (D-Form) vector (VSX) load/store instructions. The compiler allocates constants to the .rodata sections and the linker collects .rodata from object files into a combined executable .rodata section. This is placed near the Table of Contents (TOC) section. The ABI dedicates R2 as the base address .TOC. for the TOC and adjacent sections.

The Add Immediate Shifted (addis) Add Immediate (addi) sequence above computes a signed 32-bit .TOC. relative offset to a specific .rodata quadword. Two instructions are required as; addis provides the high adjusted 16-bits shifted left 16-bits, while addi provides the low 16-bits. The sum of R2 and these immediate values is the 64-bit effective address of a .rodata constant value. A signed 32-bit offset is large enough to support most program and library executables.

The load itself has a 5-cycle latency assuming a L1 cache hit. The three instruction sequence is sequentially dependent and requires 9-cycles latency (minimum) to execute. A L1 cache miss will increase the latency by 7-28 cycles, assuming the data resides in the L2/L3 caches.

However the compiler is not following the recommendations of PowerISA 2.07B, Book II, Chapter 2.1 Performance-Optimized Instruction Sequences. This chapter recommends a specific pattern for the addi/lvx sequence. For example:

addis   rA,r2,.rodata.cst16+0x30@toc@ha
addi    rx,0,.rodata.cst16+0x30@toc@l
lvx     v2,rA,rx

In this case rx can be any GPR (including r0) while rA must be a valid base (r1 <-> r31) register.

The POWER8 implementation allows for Instruction Fusion combining information from two adjacent instructions into one (internal) instruction so that it executes faster than the non-fused case. Effectively the addi/lvx combination above becomes a D-Form load vector instruction.

There are additional restrictions on the definition of adjacent:

The instructions must be in the same dispatch group.
- In single-threaded mode, up to six non-branch and up to two branch instructions (6/2 groups).
- In multi-threaded mode, up to three non-branch and up to one branch instructions (3/1 groups).
Without any intervening branch instructions.
Instructions may span an I-cache line, but with both fetched and residing in the i-buffer.

This can reduce the latency from 9 to 7-cycles. This would be true even without Instruction Fusion as the addis/addi instructions are now independent and can execute in parallel.

The sequence generated for POWER9 is even more disappointing. The lxv is a D-Form (DQ) instruction and the displacement operand could be used to replace the addi instruction. For example: -mcpu=power9 target:

addis r9,r2,.rodata.cst16+0x30@toc@ha

lxv v2,.rodata.cst16+0x30@toc@l(r9)

This provides the equivalent 32-bit TOC relative displacement with one less instruction and a reduced latency of 7-cycles.

Alternatives to loading from .rodata

This is all a little cumbersome and it seems like there should be a better/faster way. Any instruction sequence that loads a quadword integer constant in:

three instructions or less,
latency of 6 cycles or less,
and avoids cache misses

is a good deal.

The base (Altivec) vector ISA included Vector Splat Immediate Signed Byte/Halfword/Word instructions. These are fast (2-cycle latency) and convenient for small integer constants in the range -16 to 15. So far the ISA has not added doubleword or quadword forms for these.

POWER9 added a VSX Vector Splat Immediate Byte (xxspltib) instruction. This expands the immediate range to -128 to 127 but does not include larger element sizes. POWER9 does provide Vector Extend Sign Byte To Word/Doubleword (vextsb2w/vextsb2d) instructions. For example the two instruction sequence:

xxspltib vs34,127

vextsb2d v2,v2

can generate a doubleword splat immediate for integers in the range -128 to 127 with a latency of 5 cycles. So far there is no extend sign byte/halfword/word to quadword. POWER10 does add Vector Extend Sign Doubleword To Quadword (vextsd2q).

Note: POWER10 does add the interesting VSX Vector Splat Immediate Double-Precision instruction. This is a 64-bit instruction with a 32-bit single precision immediate operand. Interesting but not helpful for quadword integer.

Some special quadword constants

The GCC compiler does recognize some vector constants as special cases. For example:

vi128_t
__test_splatisq_n1_V0 (void)
{
  const vui32_t q_ones = {-1, -1, -1, -1};
  return (vi128_t) q_ones;
}
 
vi128_t
__test_splatisq_0_V0 (void)
{
  const vui32_t q_zero = {0, 0, 0, 0};
  return (vi128_t) q_zero;
}

will generate:

0000000000000080 <__test_splatisq_n1_V0>:
     vspltisw v2,-1
     blr
00000000000000a0 <__test_splatisq_0_V0>:
     vspltisw v2,0
     blr

Another interesting example is the quadword sign mask. For example:

vui32_t
__test_splatisq_signmask_V0 (void)
{
  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0, 0);
  return signmask;
}

will generate:

00000000000000c0 <__test_splatisq_signmask_V0>:
     vspltisw v0,-1
     vspltisw v2,0
     vslw    v0,v0,v0
     vsldoi  v2,v0,v2,12
     blr

The first 2 instructions generate vector constants of all zeros and all ones (same as above). The third instruction uses vector shift left word (vslw) to convert the word elements from 0xffffffff to 0x80000000.

The clever bit is shifting elements of the all ones (0xffffffff or -1) vector, left by 31-bits (0b11111), which is the value of the low order 5-bits of the all ones element. Fortunately the vsl[bhw] instructions ignore all but the low order bits needed for the element shift count.

Note: This applies for element sizes byte, halfword and word. It also applies to doubleword elements on POWER8/9 using vsld but the compiler does not recognize this case. And with POWER10 this technique applies to quadwords using vslq.

To convert a word sign mask to a quadword sign mask we need the all zeros vector and one additional instruction. The Vector Shift Left Double by Octet Immediate (vsldoi) rotates the low-order signmask word element to the high order word with 3 words of '0' concatenated on the right.

The equivalent C language with <altivec.h> intrinsics implementation is:

// Build the quadword sign mask (0x80000000 in the high order word,
// zeros elsewhere) from immediate constants, without a load.
static inline vui32_t
vec_mask128_f128sign (void)
{
  const vui32_t q_zero = {0, 0, 0, 0};
  const vui32_t q_ones = {-1, -1, -1, -1};
  vui32_t signmask;
  // Shift each all-ones word left by its own low order 5 bits (31),
  // leaving 0x80000000 in every word element.
  signmask = vec_sl (q_ones, q_ones);
  // Shift the signmask||zero pair left 12 bytes so the low order
  // signmask word lands in the high order word, followed by 3 zero words.
  return vec_sld (signmask, q_zero, 12);
}

This sequence is a little bigger (4 instructions) than we would like but should execute in 6-cycles. The first two instructions are independent and should execute in parallel. Also (as we will see) the all zero/ones constants are common building blocks. So the compiler should treat these as common subexpressions across all operations using those constants.

Defining our own vec_splat_s128

So the compiler can do clever things with vector constants. But so far these are the only examples I have found. Other cases that you might expect to be a special case are not. For example:

// Compile test: quadword constant 15 written as a __int128 vector
// initializer. Not treated as a special case by the compiler.
vi128_t
__test_splatisq_15_V1 (void)
{
  const vui128_t qw_15 = {15};
  return (vi128_t) qw_15;
}

and

// Compile test: quadword constant 15 written as four words
// (15 in the low order word). Not treated as a special case by the compiler.
vi128_t
__test_splatisq_15_V0 (void)
{
  const vui32_t qw_15 = CONST_VINT128_W(0, 0, 0, 15);
  return (vi128_t) qw_15;
}

generate the 3 instruction (9-cycle) load from .rodata sequence. Also, constants using the vector long long or __int128 types may fail to compile on older versions of the compiler.

Note: PVECLIB has found it best to consistently use vector unsigned int (vui32_t) internally for these operations. First, older compilers may fail to compile specific combinations of vector long long or __int128 types and <altivec.h> intrinsics. Second, the compiler may consider the vector long long constants as not quadword aligned and generate lxvd2x/xxswapd instead of lvx.

We can generate small constants in the range 1-15 using the following pattern:

// Compile test: build the quadword constant 15 from immediates
// (splat word + shift left double) instead of loading it.
vi128_t
__test_splatisq_15_V2 (void)
{
  // Target value, for reference:
  //  const vui32_t qw_15 = CONST_VINT128_W(0, 0, 0, 15);
  const vui32_t q_zero = CONST_VINT128_W (0, 0, 0, 0);
  // Splat 15 into all four words.
  vui32_t qw_15 = (vui32_t) vec_splat_s32(15);
  // Shift q_zero||qw_15 left 4 bytes: three zero words followed by one
  // word holding 15.
  return (vi128_t) vec_sld (q_zero, qw_15, 4);
}

Which generates:

00000000000000e0 <__test_splatisq_15_V2>:
     vspltisw v0,0
     vspltisw v2,15
     vsldoi  v2,v0,v2,4
     blr

Here we use the vec_splat_s32() intrinsic to generate the vspltisw instruction for the value 15.

This sequence is only 3 instructions, which should execute in 4-cycles. The first two instructions are independent and should execute in parallel. Also the q_zero constant is commonly used and the compiler should treat it as a common subexpression.

For small (-16 to -1) negative constants we need to make one small change. We use the q_ones constant to propagate the sign across the quadword.

// Compile test: build the quadword constant -16 from immediates.
// Same pattern as the positive case, but all-ones supplies the sign bits.
vi128_t
__test_splatisq_n16_V2 (void)
{
  // Target value, for reference:
  //  const vui32_t qw_16 = CONST_VINT128_W(-1, -1, -1, -16);
  const vui32_t q_ones = {-1, -1, -1, -1};
  // Splat -16 into all four words.
  vui32_t qw_16 = (vui32_t) vec_splat_s32(-16);
  // Shift q_ones||qw_16 left 4 bytes: three words of -1 followed by one
  // word holding -16.
  return (vi128_t) vec_sld (q_ones, qw_16, 4);
}

The generated sequence is also 3 instructions and should execute in 4-cycles.

Putting this all together we can create a static inline function to generate small quadword constants (in the range -16 to 15). For example:

// Generate a signed quadword constant from the integer sim.
// For compile-time constants in the range -16 to 15 the value is built
// from immediates (word splat plus at most one shift left double).
// Any other value falls back to vec_splats() of a signed __int128.
static inline vi128_t
vec_splat_s128_PWR8 (const int sim)
{
  vi128_t result;
  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))
    {
      // Splat sim into all four word elements.
      vui32_t vwi = (vui32_t) vec_splat_s32(sim);

      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
        {
          // Special case for -1 and 0. Skip vec_sld().
          // The word splat already equals the quadword value.
          result = (vi128_t) vwi;
        }
      else
        {
          if (__builtin_constant_p (sim) && (sim > 0))
            {
              // Positive: fill the upper three words with zeros.
              const vui32_t q_zero = {0, 0, 0, 0};
              result = (vi128_t) vec_sld (q_zero, vwi, 4);
            }
          else
            {
              // Negative: fill the upper three words with ones to
              // propagate the sign across the quadword.
              const vui32_t q_ones = {-1, -1, -1, -1};
              result = (vi128_t) vec_sld (q_ones, vwi, 4);
            }
        }
    }
  else
    // Out of range (or not a compile-time constant): let vec_splats()
    // produce the quadword value.
    result = vec_splats ((signed __int128) sim);

  return (result);
}

This version uses only <altivec.h> intrinsics supported by POWER8 and earlier. For constants in the range (-16 to 15) the range is divided into three groups:

Special values -1 and 0 that can be generated in a single instruction.
Values 1 to 15 that require the q_zero constant to sign extend.
Values -16 to -2 that require the q_ones constant to sign extend.

Values outside this range use the vec_splats() intrinsic which will generate the appropriate quadword constant in .rodata and the load sequence to retrieve that value.

For POWER9 and later we can use the VSX Vector Splat Immediate Byte (xxspltib) instruction and support the extended constant range of -128 to 127.

// Generate a signed quadword constant from the integer sim.
// For compile-time constants in the range -128 to 127 the value is built
// from a byte splat plus at most one shift left double by 1 byte.
// Any other value falls back to vec_splats() of a signed __int128.
static inline vi128_t
vec_splat_s128_PWR9 (const int sim)
{
  vi128_t result;
  if (__builtin_constant_p (sim) && ((sim >= -128) && (sim < 128)))
    {
      // Expect the compiler to generate a single xxspltib for this.
      vi8_t vbi = vec_splats ((signed char) sim);

      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
        {
          // Special case for -1 and 0. Skip vec_sld().
          // The byte splat already equals the quadword value.
          result = (vi128_t) vbi;
        }
      else
        {
          if (__builtin_constant_p (sim) && (sim > 0))
            {
              // Positive: fill the upper 15 bytes with zeros.
              const vui32_t q_zero = {0, 0, 0, 0};
              result = (vi128_t) vec_sld ((vi8_t) q_zero, vbi, 1);
            }
          else
            {
              // Negative: fill the upper 15 bytes with ones to
              // propagate the sign across the quadword.
              const vui32_t q_ones = {-1, -1, -1, -1};
              result = (vi128_t) vec_sld ((vi8_t) q_ones, vbi, 1);
            }
        }
    }
  else
    // Out of range (or not a compile-time constant): let vec_splats()
    // produce the quadword value.
    result = vec_splats ((signed __int128) sim);

  return (result);
}

Here we use the vec_splats() intrinsic to generate the xxspltib instruction. The rest follows the pattern we used for POWER8 but shift left is adjusted for the byte (vs word) element splat to be 1 octet.

Some facts about fixed precision integers

The transition from grade school math to computer programming requires the realization that computers handle numbers in fixed sized chunks. For the PowerISA these chunks are byte, halfword, word, doubleword, and quadword. While computer languages like "C" have integer types like char, short, int, long int, and __int128.

Happily these chunks are large enough to hold the equivalent of several decimal digits and handle most of the grotty details of multiply, divide, add, and subtract. But sometimes the chunk (used) is not large enough to hold all the digits you need. Sums may overflow and multiplies may be truncated (modulo the chunk size).

Sometimes we can simply switch to the next larger size (int to long, word to doubleword) and avoid the problem (overflow of sums or truncation of multiply). But sometimes the largest chunk the compiler or hardware supports is still not large enough for the numbers we are dealing with. This requires multiple precision arithmetic which works a lot like grade school arithmetic but with larger digits represented by the most convenient computer sized chunk.

Most programmers would prefer to use an existing multiple precision arithmetic library and move on. Existing libraries are implemented with scalar instructions and loops over storage arrays. But here we need to provide vector quadword multiply and extended quadword add/subtract operations. Any transfers between the library's multi-precision storage arrays and vector registers are likely to exceed the timing for a direct vector implementation.

Note: The PowerISA 2.07 provides direct vector quadword integer add/subtract with carry/extend. PowerISA 3.0 provides unsigned doubleword multiply with quadword product. This exceeds the capability of the PowerISA 64-bit (doubleword) Fixed Point unit which requires multiple instructions to generate quadword results.

We also want to provide the basis for general multiple quadword precision arithmetic operations (see vec_int512_ppc.h). And for security implementations requiring large multiply products we are motivated to leverage the PowerISA large vector register set to avoid exposing these results (and partial products) to memory/cache side channel attacks.

Some useful arithmetic facts (you may of forgotten)

First multiplying a M-digits by N-digits number requires up to (M+N)-digits to store the result. This is true independent of the size of your digit, including decimal, hexadecimal, and computer words/doublewords/quadwords. This explains why a 32-bit (word) by 32-bit integer multiply product is either:

Truncated (modulo) to 32-bits, potentially losing the high order precision.
Expanded to the next larger (double) size (in this case 64-bit doubleword).

The hardware has to do one or the other.

Let's look at some examples of multiplying two maximal 4-digit numbers:

Decimal: 9999 x 9999 = 99980001

Hexadecimal: FFFF x FFFF = FFFE0001

And to drive home the point, let's look at the case of multiplying two maximal (32-bit word) 4-digit numbers:

quadword:      FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
             x FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
             = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
               00000000 00000000 00000000 00000001

This is also a (128-bit quadword) digit multiply with a (256-bit) 2 quadword digit result.

Adding an asymmetric example; 4-digit by 1 digit multiply:

Decimal:       9999 x 9 = 89991
Hexadecimal:   FFFF x F = EFFF1
quadword:      FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
             x FFFFFFFF
             = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF 00000001

This pattern repeats across all digit bases/sizes and values of M, N.

Note that the product is not the maximum value for the product width. It seems the product leaves room to add another digit or two without overflowing the double wide product. Let's try some 4 digit examples by adding a maximal 4 digit value to the product.

Decimal:       9999 x 9999 = 99980001
                           +     9999
                           = 99990000
 
Hexadecimal:   FFFF x FFFF = FFFE0001
                           +     FFFF
                           = FFFF0000

Looks like there is still room in the double wide product to add another maximal 4 digit value.

Decimal:       9999 x 9999 = 99980001
                           +     9999
                           +     9999
                           = 99999999
 
Hexadecimal:   FFFF x FFFF = FFFE0001
                           +     FFFF
                           +     FFFF
                           = FFFFFFFF

But any more than that would cause an overflow.

Now we should look at addends to an asymmetric multiply. For example 4-digit by 1 digit multiply:

Decimal:       9999 x 9 = 89991
                        +  9999
                        +     9
                        = 99999
Hexadecimal:   FFFF x F = EFFF1
                        +  FFFF
                        +     F
                        = FFFFF
quadword:      FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
             x FFFFFFFF
             = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF 00000001
             +          FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
             +                                     FFFFFFFF
             = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF

Note that when M is not equal to N then the addends are restricted to size M and/or size N. Two addends of the larger multiplier size can overflow. This pattern repeats across all digit bases/sizes and values of M, N. For the binary fixed-point multiply-add of bit sizes M/N we can write the equation:

(2^(M+N) - 1) = ((2^M - 1) * (2^N - 1)) + (2^M - 1) + (2^N - 1)

Or in terms of fixed sized "words" of W-bits and M by N words.

(2^(W*(M+N)) - 1) = ((2^(W*M) - 1) * (2^(W*N) - 1)) + (2^(W*M) - 1) + (2^(W*N) - 1)

Why does this matter?

Because with modern hardware the actual multiply operations are faster and have less impact while the summation across the partial products becomes the major bottleneck. For recent POWER processors fixed-point multiplies are 5-7 cycles latency and dual issue (2/cycle). These multiplies are only dependent on the inputs (multiplicands). This allows the compiler (and super-scalar processor) to schedule the multiply operations early to prepare for summation. In many cases the 3rd and 4th multiplies are complete before the summation of the first two multiplies completes.

The add operations involved in partial product summation are dependent on the current column multiply and the high order word of summation of the previous stage. While add operations are nominally faster (2-3 cycles) than multiplies, they can generate carries that have to be propagated.

The Fixed-Point Unit has a dedicated carry-bit (CA) which becomes the critical resource. This dependency on the carry (in addition to the column multiply and previous summation) limits the compiler's (and hardware's) ability to parallelize stages of the summation. The Vector unit (PowerISA 2.07+) has quadword (vs Fixed point doubleword) binary add/subtract with carry/extend. The Vector Unit requires separate write Carry instructions to detect and return the carry to VRs. The write Carry instructions are paired with Unsigned Quadword Modulo instructions that generates the (modulo) 128-bit result.

Note: PowerISA 3.0B has a new add extended (addex) instruction that can use the overflow-bit (OF) as a second carry (independent of CA). However the OF must be explicitly cleared (using subfo) before use as a carry flag. The Vector Unit has the effective use of up to 32 carry bits. The down-side is it requires an extra instruction and a whole 128-bit VR to generate and hold each carry bit.

So knowing how to avoid overflows and carries in the summation of partial products can be useful. To illustrate we can examine the POWER8 implementation of vec_muludq(). POWER8 (PowerISA 2.07) does support add quadword but the largest vector fixed-point multiply is 32-bit Vector Multiply Even/Odd Unsigned Words (vec_muleuw() and vec_mulouw()). The implementation generates four quadword by word (160-bit) partial products that are summed in four stages to generate the final 256-bit product.

Code for the first stage looks like this:

// Splat the lowest order word of b to tsw for word multiply
tsw = vec_splat ((vui32_t) b, VEC_WE_3);
// Multiply quadword a by lowest order word of b
t_even = (vui32_t)vec_vmuleuw((vui32_t)a, tsw);
t_odd = (vui32_t)vec_vmulouw((vui32_t)a, tsw);
// Rotate the low 32-bits (right) into tmq. This is actually
// implemented as 96-bit (12-byte) shift left.
tmq = vec_sld (t_odd, z, 12);
// shift the low 128 bits of partial product right 32-bits
t_odd = vec_sld (z, t_odd, 12);
// add the high 128 bits of even / odd partial products
t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

Note in this case we can assume that the sum of aligned even/odd quadwords will not generate a carry. For example with maximum values for multiplicands a,b:

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b   x FFFFFFFF[3]
t_even       = FFFFFFFE 00000001 FFFFFFFE 00000001
t_odd >> 32  + 00000000 FFFFFFFE 00000001 FFFFFFFE
t            = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF
tmq          = 00000001

The high order 128-bits of the sum did not overflow.

The next three stages are more complex.

// Splat the next word of b to tsw for word multiply
tsw = vec_splat ((vui32_t) b, VEC_WE_2);
// Multiply quadword a by next word of b
t_even = (vui32_t)vec_vmuleuw((vui32_t)a, tsw);
t_odd = (vui32_t)vec_vmulouw((vui32_t)a, tsw);
// Add with carry the odd multiply with previous partial product
tc = (vui32_t) vec_addcuq ((vui128_t) t_odd, (vui128_t) t);
t_odd = (vui32_t) vec_adduqm ((vui128_t) t_odd, (vui128_t) t);
// Rotate the low 32-bits (right) into tmq.
tmq = vec_sld (t_odd, tmq, 12);
// shift the low 128 bits (with carry) right 32-bits
t_odd = vec_sld (tc, t_odd, 12);
// add the high 128 bits of even / odd partial products
t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

Here we need a 3-way sum of the previous partial product, and the odd, even products from this stage. In this case the high 128-bits of the previous partial product needs to align with the lower 128-bits of this stage's 160-bit product for the first quadword add. This can produce an overflow, so we need to capture the carry and concatenate it with the odd sum before shifting right 32-bits. Again we can assume that the sum of aligned even/odd quadwords will not generate a carry. For example stage 2 with maximum values for multiplicands a,b:

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b   x FFFFFFFF[2]
 
t_odd          FFFFFFFE 00000001 FFFFFFFE 00000001
t            + FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF
t_odd        = FFFFFFFD 00000001 FFFFFFFE 00000000
tc           = 00000000 00000000 00000000 00000001
 
tc|t_odd>>32 = 00000001 FFFFFFFD 00000001 FFFFFFFE
t_odd|tmq    = 00000000 00000001
 
t_even       = FFFFFFFE 00000001 FFFFFFFE 00000001
tc|t_odd>>32 + 00000001 FFFFFFFD 00000001 FFFFFFFE
t            = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF

For POWER8 this 3-way sum and the required write-carry adds significant latency to stages 2, 3, and 4 of this multiply.

In POWER8 the vector quadword add/subtract instructions are cracked into 2 dependent simple fixed-point (XS) IOPs. So the effective instruction latency is (2+2=4) cycles. Also cracked instructions must be first in group, so back-to-back vaddcuq/vadduqm sequences will be dispatched separately. There is no possibility of executing the pair concurrently, so the latency for the pair is 5-6 cycles.

So there is value in finding an alternative summation that avoids/reduces the number of write-carry operations. From above (Some useful arithmetic facts (you may of forgotten)) we know it is possible to add one or two unsigned words to each of the doubleword products generated by vmuleuw/vmulouw.

We need to align the words of the quadword addend (zero extended on the left to doublewords) with the corresponding doublewords of the products. We can use Vector Merge Even/Odd Word operations to split and pad the addend to align with the products. Then we use Vector Add Doubleword for the even/odd product-sums. Finally we use shift and add quadword to produce the 160-bit stage 2 sum.

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b   x FFFFFFFF[2]
quadword t:    FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF
 
t_even       = FFFFFFFE 00000001 FFFFFFFE 00000001
mrgew(z,t)   + 00000000 FFFFFFFE 00000000 FFFFFFFF
             = FFFFFFFE FFFFFFFF FFFFFFFF 00000000
 
t_odd        = FFFFFFFE 00000001 FFFFFFFE 00000001
mrgow(z,t)   + 00000000 FFFFFFFF 00000000 FFFFFFFF
             = FFFFFFFF 00000000 FFFFFFFF 00000000
 
t_odd>>32    = 00000000 FFFFFFFF 00000000 FFFFFFFF
t_odd|tmq>>32= 00000000 00000001
 
t_even       = FFFFFFFE FFFFFFFF FFFFFFFF 00000000
t_odd>>32    + 00000000 FFFFFFFF 00000000 FFFFFFFF
t            = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF
t_odd|tmq    = 00000000 00000001

This sequence replaces two instructions (vaddcuq/vadduqm) with four instructions (vmrgew/vmrgow/vaddudm/vaddudm), all of which:

have 2 cycle latency
are dual issue
without dispatch restrictions

We expect a latency of 4 cycles over the whole sequence. And splitting the first add into even/odd add blocks allows the compiler (and out-of-order hardware) more flexibility for instruction scheduling.

Vector Multiply-Add

Multiply-add seems to be a useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation that we can use here. For example:

// Vector Multiply-Add Even Unsigned Words.
// Multiplies the even words of a and b and adds the even words of c
// (zero extended to doublewords) to the doubleword products.
static inline vui64_t
vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
{
  const vui32_t zero = { 0, 0, 0, 0 };
  vui64_t res;
  // Pair each even word of c with a zero word on its left so it lines
  // up with the corresponding doubleword product.
  vui32_t c_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) c);
  res = vec_vmuleuw (a, b);
  // Product plus one word-sized addend fits in the doubleword.
  return vec_addudm (res, (vui64_t) c_euw);
}

Which generates the following instruction sequence:

<__vec_vmaddeuw_PWR8>:
     d70:       vmuleuw v2,v2,v3
     d74:       vspltisw v0,0
     d78:       vmrgew  v4,v0,v4
     d7c:       vaddudm v2,v2,v4

The vspltisw loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function. The vmrgew has a latency of 2 cycles and should execute concurrently with vmuleuw. Similarly for vec_vmaddouw().

These operations (vec_vmaddeuw() and vec_vmaddouw()) are included in vec_int64_ppc.h as they require vec_addudm() and produce doubleword results. With this addition we can improve and simplify the code for stages 2-4 of the _ARCH_PWR8 implementation of vec_muludq(). For example:

// Splat the next word of b to tsw for word multiply
tsw = vec_splat ((vui32_t) b, VEC_WE_2);
// Multiply quadword a by next word of b and add previous partial
// product using multiply-add even/odd
t_even = (vui32_t)vec_vmaddeuw((vui32_t)a, tsw, t);
t_odd = (vui32_t)vec_vmaddouw((vui32_t)a, tsw, t);
// Rotate the low 32-bits (right) into tmq.
tmq = vec_sld (t_odd, tmq, 12);
// shift the low 128 bits right 32-bits (zero fill, no carry to insert)
t_odd = vec_sld (z, t_odd, 12);
// add the high 128 bits of even / odd partial products
t = (vui32_t) vec_adduqm ((vui128_t) t_even, (vui128_t) t_odd);

And Vector Multiply-Add2

From the description above (Some useful arithmetic facts (you may of forgotten)) we know we can add two unsigned words to the doubleword product without overflow. This is another useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation. For example:

// Vector Multiply-Add2 Even Unsigned Words.
// Multiplies the even words of a and b and adds the even words of both
// c and d (each zero extended to doublewords) to the doubleword products.
static inline vui64_t
vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
{
  const vui32_t zero = { 0, 0, 0, 0 };
  vui64_t res, sum;
  // Pair each even word of c and d with a zero word on its left so
  // they line up with the corresponding doubleword products.
  vui32_t c_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) c);
  vui32_t d_euw = vec_mrgahw ((vui64_t) zero, (vui64_t) d);
  res = vec_vmuleuw (a, b);
  // Sum the two addends first; this add does not depend on the multiply.
  sum = vec_addudm ( (vui64_t) c_euw, (vui64_t) d_euw);
  // Product plus two word-sized addends still fits in the doubleword.
  return vec_addudm (res, sum);
}

Which generates the following instruction sequence:

<__vec_vmadd2euw_PWR8>:
    db0:       vmuleuw v2,v2,v3
    db4:       vspltisw v0,0
    db8:       vmrgew  v4,v0,v4
    dbc:       vmrgew  v5,v0,v5
    dc0:       vaddudm v5,v4,v5
    dc4:       vaddudm v2,v2,v5

The vspltisw loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function. The vmrgew/vmrgew/vaddudm sequence has a latency of 4-6 cycles and should execute concurrently with vmuleuw. Similarly for vec_vmadd2ouw().

Why not Vector Multiply-Sum

The PowerISA has a number of Multiply-Sum instructions that look a lot like the Multiply-Add described above. Well, not exactly:

The behavior of Multiply-Sum allows overflow without any architected way to detect/capture and propagate the carry.
- Each of the two (even/odd) halves of each "word" element of VRA and VRB: Multiply the even halves of each "word" element. Then multiply the odd halves of each "word" element. This generates two unsigned integer "word" products for each "word" element.
- The sum of these two integer "word" products is added to the corresponding integer "word" element in VRC.
- This 3-way sum can overflow without notification.
Multiply-Sum instructions can be used to emulate Multiply Even/Odd and Multiply-Add Even/Odd by constraining the inputs.
- Using Multiply-Sum to add prior partial-sums creates a serial dependency that limits instruction scheduling and slows execution.
The PowerISA does not have Multiply-Sum Word instructions.
The PowerISA 3.0 has a Multiply-Sum Unsigned Doubleword instruction but it does not exist in POWER8.
The base Altivec has Multiply-Sum Halfword/Byte instructions. But using POWER8's Multiply Even/Odd Unsigned Word is better for implementing quadword multiply on POWER8.

First we should look at the arithmetic of Multiply-Sum using maximal unsigned integer values.

VRA:           FFFF x FFFF
VRB:           FFFF x FFFF
VRC:           FFFF   FFFF
 
Even half:     FFFF x FFFF ->    FFFE0001
odd half:      FFFF x FFFF -> +  FFFE0001
Word addend                -> +  FFFFFFFF
                           =   2 FFFC0001

Note the sum overflows the word twice and high order bits of the sum will be lost.

For POWER9 we can simulate Vector Multiply Even/Odd Unsigned Doubleword by setting the Odd/Even doubleword of VRB to zero and the whole quadword addend VRC to zero. For example the even doubleword multiply.

// Vector Multiply Even Unsigned Doubleword, simulated with Multiply-Sum.
// Returns the quadword product of the even doublewords of a and b.
static inline vui128_t
vec_vmuleud (vui64_t a, vui64_t b)
{
  const vui64_t zero = { 0, 0 };
  // Keep the even doubleword of b and zero the odd doubleword, so the
  // odd product contributes nothing to the sum.
  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
  // The addend operand is a quadword; cast explicitly (as vec_vmuloud()
  // does) rather than rely on lax vector conversions.
  return vec_msumudm(a, b_eud, (vui128_t) zero);
}

And similarly for the odd doubleword multiply.

// Vector Multiply Odd Unsigned Doubleword, simulated with Multiply-Sum.
// Returns the quadword product of the odd doublewords of a and b.
static inline vui128_t
vec_vmuloud (vui64_t a, vui64_t b)
{
  const vui64_t zero = { 0, 0 };
  // Keep the odd doubleword of b and zero the even doubleword, so the
  // even product contributes nothing to the sum.
  vui64_t b_oud = vec_mrgald ((vui128_t) zero, (vui128_t) b);
  // Quadword addend is zero: the result is just the odd product.
  return vec_msumudm(a, b_oud, (vui128_t) zero);
}

And review the arithmetic for vec_vmuleud() using maximal quadword values for a and b.

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword z:    00000000 00000000 00000000 00000000
 
mrged(b,z)   = FFFFFFFF FFFFFFFF 00000000 00000000
 
Even prod:     FFFFFFFF FFFFFFFE 00000000 00000001
odd prod     + 00000000 00000000 00000000 00000000
Word addend  + 00000000 00000000 00000000 00000000
msumudm      = FFFFFFFF FFFFFFFE 00000000 00000001

And for vec_vmuloud().

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword z:    00000000 00000000 00000000 00000000
 
mrgod(z,b)   = 00000000 00000000 FFFFFFFF FFFFFFFF
 
Even prod:     00000000 00000000 00000000 00000000
odd prod     + FFFFFFFF FFFFFFFE 00000000 00000001
Word addend  + 00000000 00000000 00000000 00000000
msumudm      = FFFFFFFF FFFFFFFE 00000000 00000001

We can also simulate Vector Multiply-Add Even/Odd Unsigned Doubleword by setting the odd/even doubleword of VRB to zero and the whole quadword addend to the even/odd double word of VRC. For example the even doubleword multiply-add.

// Vector Multiply-Add Even Unsigned Doubleword, simulated with Multiply-Sum.
// Returns the quadword product of the even doublewords of a and b plus
// the even doubleword of c.
static inline vui128_t
vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)
{
  const vui64_t zero = { 0, 0 };
  // Keep the even doubleword of b and zero the odd doubleword, so the
  // odd product contributes nothing to the sum.
  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
  // Zero extend the even doubleword of c to a quadword addend.
  vui64_t c_eud = vec_mrgahd ((vui128_t) zero, (vui128_t) c);
  return vec_msumudm(a, b_eud, (vui128_t) c_eud);
}

And similarly for the odd doubleword multiply-add.

static inline vui128_t
vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)
{
  const vui64_t vzero = { 0, 0 };
  // Addend: the odd doubleword of c, zero extended to a quadword.
  const vui64_t addend = vec_mrgald ((vui128_t) vzero, (vui128_t) c);
  // Multiplier: the odd doubleword of b with the even doubleword zeroed.
  const vui64_t b_odd = vec_mrgald ((vui128_t) vzero, (vui128_t) b);

  return vec_msumudm (a, b_odd, (vui128_t) addend);
}

And review the arithmetic for vec_vmaddeud() using maximal quadword values for a and b. The even/odd doublewords of c have slightly different values for illustrative purposes.

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword c:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
 
mrged(b,z)   = FFFFFFFF FFFFFFFF 00000000 00000000
mrged(z,c)   = 00000000 00000000 FFFFFFFF FFFFFFFF
 
Even prod:     FFFFFFFF FFFFFFFE 00000000 00000001
odd prod     + 00000000 00000000 00000000 00000000
Word addend  + 00000000 00000000 FFFFFFFF FFFFFFFF
msumudm      = FFFFFFFF FFFFFFFF 00000000 00000000

And for vec_vmaddoud().

quadword a:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword b:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
quadword c:    FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
 
mrgod(z,b)   = 00000000 00000000 FFFFFFFF FFFFFFFF
mrgod(z,c)   = 00000000 00000000 FFFFFFFF FFFFFFFE
 
Even prod:     00000000 00000000 00000000 00000000
odd prod     + FFFFFFFF FFFFFFFE 00000000 00000001
Word addend  + 00000000 00000000 FFFFFFFF FFFFFFFE
msumudm      = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF

This multiply-add even/odd doubleword form only adds one additional (xxmrghd AKA xxpermdi) instruction over that required for the base multiply even/odd doubleword operation.

<__vmuleud_PWR9>:
     120:       xxspltib v0,0
     124:       xxmrghd v3,v3,v0
     128:       vmsumudm v2,v2,v3,v0
 
<__vmaddeud_PWR9>:
     1a0:       xxspltib v0,0
     1a4:       xxmrghd v3,v3,v0
     1a8:       xxmrghd v4,v0,v4
     1ac:       vmsumudm v2,v2,v3,v4

The xxspltib loads (immediate) the zero vector and the compiler should common this across operations and schedule this instruction once, early in the function.

For POWER9 instruction timing is different and there are some unique trade-offs. The implementations above are small and appropriate for single instances of multiply doubleword or implementations of multiply quadword. However using the vmsumudm (operand VRC) addend creates a serial dependency within the multiply quadword implementation. When multiply quadword and multiply-add quadword are used in the implementation of wider multiplies (see vec_int512_ppc.h) these serial dependencies actually slow down the implementation.

A full 128 x 128-bit multiply only requires two stages of even/odd doubleword multiplies. This allows some simplification.
- Alignment shifts can be replaced with permute doubleword immediate (xxmrgld/xxmrghd/xxpermdi) operations.
- Careful rearrangement of the operations and operands allow the compiler to optimize (as common subexpressions) some of the doubleword masking operations.
The multiply even/odd doubleword operations require explicit masking of the even/odd multiplicands.
- Doubleword masking can be done with xxmrgld/xxmrghd/xxpermdi instructions which are dual issue with a 3 cycle latency.
- The multiplies (vmsumudm) are serially dependent on these masking instructions.
- In the POWER8 implementation (using vmuleuw/vmulouw) the multiplicand masking is implicit to the instruction.
The vmsumudm with the VRC addend can be used to combine the multiply-add of the partial products from the previous stage.
- This also requires explicit doubleword masking to avoid overflowing the quadword sum.
- This can make the masking operation and the multiply itself, serially dependent on the partial product sum from the previous stage.
The add (modulo/write-carry/extend) quadword instructions are dual issue with a 3 cycle latency. So the cost of quadword sums and generating/propagating carries is of less concern (than on POWER8).
- It can be better to use explicit add quadword and avoid the serial dependency on the vmsumudm (VRC) addend.
- This allows the compiler (and out-of-order hardware) more flexibility for instruction scheduling.

So let's look at some examples using the vmsumudm (VRC) addend and the alternative using vmsumudm with the VRC addend set to zero and an explicit add quadword. First a 128x128-bit unsigned multiply using vmsumudm and exploiting the VRC addend where appropriate.

vui128_t
__test_muludq_y_PWR9 (vui128_t *mulu, vui128_t a, vui128_t b)
{
  vui32_t t, tmq;
  // compute the 256 bit product of two 128 bit values a, b.
  // The high 128 bits are accumulated in t and the low 128-bits
  // in tmq. The high 128-bits of the product are returned to the
  // address of the 1st parm. The low 128-bits are the return
  // value.
  vui64_t a_swap = vec_swapd ((vui64_t) a);
  vui128_t tab, tba, tc1;
  // multiply the low 64-bits of a and b.  For PWR9 this is just
  // vmsumudm with conditioned inputs.
  tmq = (vui32_t) vec_vmuloud ((vui64_t)a, (vui64_t)b);
  // compute the 2 middle partial products.  Use vmaddeud to add the
  // high 64-bits of the low product to one of the middle products.
  // This can not overflow.
  tab = vec_vmuloud (a_swap, (vui64_t) b);
  tba = vec_vmaddeud (a_swap, (vui64_t) b, (vui64_t) tmq);
  // sum the two middle products (plus the high 64-bits of the low
  // product).  This will generate a carry that we need to capture.
  t   = (vui32_t) vec_adduqm (tab, tba);
  tc1 = vec_addcuq (tab, tba);
  // result = t[l] || tmq[l].
  tmq = (vui32_t) vec_mrgald ((vui128_t) t, (vui128_t) tmq);
  // we can use multiply sum here because the high product plus the
  // high sum of middle partial products can't overflow.
  t   = (vui32_t) vec_permdi ((vui64_t) tc1, (vui64_t) t, 2);
  // This is equivalent to vec_vmadd2eud(a, b, tab, tba)
  // where (tab_even + tba_even) was pre-computed including the carry,
  // so no masking is required.
  t   = (vui32_t) vec_vmsumeud ((vui64_t) a, (vui64_t) b, (vui128_t) t);

  *mulu = (vui128_t) t;
  return ((vui128_t) tmq);
}

<__test_muludq_y_PWR9>:
    370:       xxspltib v1,0
    374:       xxswapd v12,v2
    378:       xxlor   v13,v2,v2
    37c:       xxmrgld v0,v1,v3
    380:       xxmrghd v3,v3,v1
    384:       vmsumudm v2,v2,v0,v1
    388:       vmsumudm v0,v12,v0,v1
    38c:       xxmrghd v1,v1,v2
    390:       vmsumudm v1,v12,v3,v1
    394:       vadduqm v12,v1,v0
    398:       vaddcuq v0,v0,v1
    39c:       xxmrgld v2,v12,v2
    3a0:       xxpermdi v0,v0,v12,2
    3a4:       vmsumudm v13,v13,v3,v0
    3a8:       stxv    v13,0(r3)
    3ac:       blr

Note: that first vmsumudm instruction is only dependent on the parameters a, masked b_odd, and const zero. The second vmsumudm instruction is only dependent on the parameters a_swap, masked b_odd, and const zero. The swap/mask operations require 3-4 cycles and 7 cycles to complete first two vmsumudm's. The third vmsumudm instruction is dependent on the parameters a_swap, masked b_even, and masked tmq_even. The masked tmq_even is dependent on the xxmrghd of the results of the first vmsumudm. This adds another 10 cycles. The fourth and final vmsumudm instruction is dependent on the parameters a, masked b_even, and the shifted sum (with carry) of (tab + tba). This is in turn dependent on the results from the second and third vmsumudm instructions. This adds another (6+7= 13) cycles for a total of 34 cycles. When this operation is expanded in-line the stxv and xxspltib will be optimized and can be ignored for this analysis.

Next a 128x128-bit unsigned multiply using vmsumudm but only passing const zero to the VRC addend.

vui128_t
__test_muludq_x_PWR9 (vui128_t *mulu, vui128_t a, vui128_t b)
{
  // compute the 256 bit product of two 128 bit values a, b.
  // The high 128 bits are accumulated in thq and the low 128-bits
  // in tlq. The high 128-bits of the product are returned to the
  // address of the 1st parm. The low 128-bits are the return
  // value.
  const vui64_t zero = { 0, 0 };
  vui64_t a_swap = vec_swapd ((vui64_t) a);
  vui128_t thq, tlq, tx;
  vui128_t t0l, tc1;
  vui128_t thh, thl, tlh, tll;
  // multiply the four combinations of a/a_swap by b_odd/b_even.
  // None of these depends on another's result, only on the
  // conditioned inputs.
  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);
  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);
  thl = vec_vmuloud (a_swap, (vui64_t)b);
  tlh = vec_vmuleud (a_swap, (vui64_t)b);
  // sum the two middle products (plus the high 64-bits of the low
  // product).  This will generate a carry that we need to capture.
  // t0l = 0 || high doubleword of tll.
  t0l   = (vui128_t) vec_mrgahd ( (vui128_t) zero, tll);
  tc1 = vec_addcuq (thl, tlh);
  tx   = vec_adduqm (thl, tlh);
  tx   = vec_adduqm (tx, t0l);
  // result = tx[l] || tll[l].
  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);
  // Sum the high product plus the high sum (with carry) of middle
  // partial products.  This can't overflow.
  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);
  thq = vec_adduqm ( thh, thq);
 
  *mulu = (vui128_t) thq;
  return ((vui128_t) tlq);
}

<__test_muludq_x_PWR9>:
      xxspltib v0,0
      xxswapd v12,v2
      xxmrgld v13,v0,v3
    32c:       xxmrghd v3,v3,v0
      vmsumudm v1,v12,v13,v0
      vmsumudm v13,v2,v13,v0
      vmsumudm v12,v12,v3,v0
    33c:       xxmrghd v10,v0,v13
      vadduqm v11,v12,v1
      vmsumudm v3,v2,v3,v0
      vaddcuq v1,v1,v12
    34c:       vadduqm v2,v11,v10
      xxpermdi v1,v1,v2,2
      xxmrgld v2,v2,v13
      vadduqm v3,v3,v1
    35c:       stxv    v3,0(r3)
      blr

Note: that the vmsumudm instructions only depend on the parameters a/a_swap, masked b_odd/b_even, and const zero. After the parameters are conditioned (swapped/masked) the independent vmsumudm's can be scheduled early. The swap/mask operations requires 3-4 cycles and 8 cycles to complete four independent vmsumudm's. The partial product alignment and sums require another 12 cycles, for a total of 24 cycles. When this operation is expanded in-line the stxv and xxspltib will be optimized and can be ignored for this analysis.

The second example (using explicit add quadword);

only adds 1 instruction over the first example,
and executes 10 cycles faster.

Vector Multiply-Add Quadword

We can use multiply-add operation for wider word sizes (quadword and multiple precision quadword). The simplest quadword implementation would create a vec_madduq() operation based on vec_muludq() and add a quadword parameter "c" for the addend. Then modify the first stage of the platform specific multiplies to replace vector multiply even/odd with vector multiply-add even/odd, passing the addend as the third parameter.

This works well for the POWER8 implementation because the additional vector add doublewords can be scheduled independently of the vector multiply even/odd words. But for POWER9 we need to avoid the serial dependences explained above in Why not Vector Multiply-Sum.

For the POWER9 implementation we use an explicit add quadword (and write-Carry) to sum the addend parameter to the first stage Multiply odd doubleword. For example:

 vui128_t
__test_madduq_y_PWR9 (vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c)
{
  // compute the 256 bit sum of product of two 128 bit values a, b
  // plus the quadword addend c.
  // The high 128-bits of the result are stored to the address of the
  // 1st parm. The low 128-bits are the return value.
  vui64_t a_swap = vec_swapd ((vui64_t) a);
  vui128_t thq, tlq, tx;
  vui128_t t0l, tc1, tcl;
  vui128_t thh, thl, tlh, tll;
  // multiply the four combinations of a_odd/a_even by b_odd/b_even.
  tll = vec_vmuloud ((vui64_t)a, (vui64_t)b);
  thh = vec_vmuleud ((vui64_t)a, (vui64_t)b);
  thl = vec_vmuloud (a_swap, (vui64_t)b);
  tlh = vec_vmuleud (a_swap, (vui64_t)b);
  // Add c to lower 128-bits of the partial product.
  // The addend is applied with explicit add quadword (and write-carry)
  // rather than through the vmsumudm addend.
  tcl = vec_addcuq (tll, c);
  tll = vec_adduqm (tll, c);
  t0l = (vui128_t) vec_permdi ((vui64_t) tcl, (vui64_t) tll, 2);
  // sum the two middle products (plus the high 65-bits of the low
  // product-sum).
  tc1 = vec_addcuq (thl, tlh);
  tx  = vec_adduqm (thl, tlh);
  tx  = vec_adduqm (tx, t0l);
  // result = tx[l]_odd || tll[l]_odd.
  tlq = (vui128_t) vec_mrgald ((vui128_t) tx, (vui128_t) tll);
  // Sum the high product plus the high sum (with carry) of middle
  // partial products.  This can't overflow.
  thq = (vui128_t) vec_permdi ((vui64_t) tc1, (vui64_t) tx, 2);
  thq = vec_adduqm ( thh, thq);
 
  *mulu = (vui128_t) thq;
  return ((vui128_t) tlq);
}

The generated code is the same size as the serially dependent version.

This is just another example where the shortest instruction sequence or using the most powerful instructions, may not be the fastest implementation. The key point is that avoiding serial dependencies in the code and allowing the compiler to schedule high latency instructions early, allows better performance. This effect is amplified when quadword multiplies (vec_muludq(), vec_madduq(), and vec_madd2uq()) are used to compose wider multiply operations (see vec_int512_ppc.h).

Vector Quadword Examples

The PowerISA Vector facilities provide logical and integer arithmetic quadword (128-bit) operations. Some operations are provided as direct PowerISA instructions and other operations are composed of short instruction sequences. The Power Vector Library provides a higher level and comprehensive API of quadword integer arithmetic and support for extended arithmetic to multiple quadwords.

Printing Vector __int128 values

The GCC compiler supports the (vector) __int128 type but the runtime does not support printf() formatting for __int128 types. However if we can use divide/modulo operations to split vector __int128 values into modulo 10^16 long int (doubleword) chunks, we can use printf() to convert and concatenate the decimal values into a complete number.

For example, from the __int128 value (39 decimal digits):

Detect the sign and set a char to ' ' or '-'
Then from the absolute value, divide/modulo by 10000000000000000. Producing:
- The highest 7 digits (t_high)
- The middle 16 digits (t_mid)
- The lowest 16 digits (t_low)

We can use signed compare to detect the sign and set a char value to print a ' ' or '-' prefix. If the value is negative we want the absolute value before we do the divide/modulo steps. For example:

if (vec_cmpsq_all_ge (value, zero128))
  {
    sign = ' ';
    val128 = (vui128_t) value;
  }
else
  {
    sign = '-';
    val128 = vec_subuqm ((vui128_t) zero128, (vui128_t) value);
  }

Here we use the pveclib operation vec_cmpsq_all_ge() because the ABI and compilers do not define compare built-ins operations for the vector __int128 type. For the negative case we use the pveclib operation vec_subuqm() instead of vec_abs. Again the ABI and compilers do not define vec_abs built-ins for the vector __int128 type. Using pveclib operations have the additional benefit of supporting older compilers and platform specific implementations for POWER7 and POWER8.

Now we have the absolute value in val128 we can factor it into (3) chunks of 16 digits each. Normally scalar codes would use integer divide/modulo by 10000000000000000. And we are reminded that the PowerISA vector unit does not support integer divide operations and definitely not for quadword integers.

Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 128-bit fraction and we have a multiply high (vec_mulhuq()) operation. Multiplying a 128-bit unsigned integer by a 128-bit unsigned fraction generates a 256-bit product with 128-bits above (integer) and below (fraction) the radix point. The high 128-bits of the product is the integer quotient and we can discard the low order 128-bits.

It turns out that generating the multiplicative inverse can be tricky. To produce correct results over the full range requires, possible pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.

See also: "Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

In the chapter above;

Figure 10-2 Computing the magic number for unsigned division.

provides a sample C function for generating the magic number (actually a struct containing; the magic multiplicative inverse, "add" indicator, and the shift amount.). For quadword and the divisor 10000000000000000,this is { 76624777043294442917917351357515459181, 0 , 51 }:

the multiplier is 76624777043294442917917351357515459181.
no corrective add is required.
the final shift is 51-bits right.

const vui128_t mul_ten16 = (vui128_t) CONST_VINT128_DW(
    0UL, 10000000000000000UL);
// Magic numbers for multiplicative inverse to divide by 10**16
// are 76624777043294442917917351357515459181, no corrective add,
// and shift right 51 bits.
const vui128_t mul_invs_ten16 = (vui128_t) CONST_VINT128_DW(
    0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);
const int shift_ten16 = 51;
...
 
// first divide/modulo the 39 digits __int128 by 10**16.
// This separates the high/middle 23 digits (tmpq) and low 16 digits.
tmpq = vec_mulhuq (val128, mul_invs_ten16);
tmpq = vec_srqi (tmpq, shift_ten16);
// Compute remainder of val128 / 10**16
// t_low = val128 - (tmpq * 10**16)
// Here we know tmpq and mul_ten16 are less than 64-bits
// so can use vec_vmuloud instead of vec_mulluq
// NOTE(review): a 23 digit tmpq can exceed 64-bits; the low doubleword
// of t_low presumably stays correct because the remainder is below
// 10**16 -- confirm.
tmp = vec_vmuloud ((vui64_t) tmpq, (vui64_t) mul_ten16);
t_low = (vui64_t) vec_subuqm (val128, tmp);
 
// Next divide/modulo the high/middle digits by 10**16.
// This separates the high 7 and middle 16 digits.
val128 = tmpq;
tmpq = vec_mulhuq (tmpq, mul_invs_ten16);
t_high = (vui64_t) vec_srqi (tmpq, shift_ten16);
tmp = vec_vmuloud (t_high, (vui64_t) mul_ten16);
t_mid = (vui64_t) vec_subuqm (val128, tmp);

All the operations used above are defined and implemented by pveclib. Most of these operations are not defined as single instructions in the PowerISA or as built-ins in the ABI, or require alternative implementations for older processors.

Now we have three vector unsigned __int128 values (t_low, t_mid, t_high) in the range 0-9999999999999999. Fixed point values in that range fit into the low order doubleword of each quadword. We can access these doublewords with array notation ([VEC_DW_L]) and the compiler will transfer them to fixed point (long int) GPRs. Then use normal char and long int printf() formatting. For example:

printf ("%c%07lld%016lld%016lld", sign,

t_high[VEC_DW_L], t_mid[VEC_DW_L], t_low[VEC_DW_L]);

Here is the complete vector __int128 printf example:

 void
example_print_vint128 (vi128_t value)
{
  // Print a signed vector __int128 in decimal: a sign character
  // (' ' or '-') followed by 7 + 16 + 16 zero padded digits.
  const vi128_t zero128 = (vi128_t) CONST_VINT128_DW(
      0x0L, 0UL);
  const vui128_t mul_ten16 = (vui128_t) CONST_VINT128_DW(
      0UL, 10000000000000000UL);
  // Magic numbers for multiplicative inverse to divide by 10**16
  // are 76624777043294442917917351357515459181, no corrective add,
  // and shift right 51 bits.
  const vui128_t mul_invs_ten16 = (vui128_t) CONST_VINT128_DW(
      0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);
  const int shift_ten16 = 51;

  vui128_t tmpq, tmp;
  vui64_t t_low, t_mid, t_high;
  vui128_t val128;
  char sign;

  if (vec_cmpsq_all_ge (value, zero128))
    {
      sign = ' ';
      val128 = (vui128_t) value;
    }
  else
    {
      sign = '-';
      // Absolute value as (0 - value).
      val128 = vec_subuqm ((vui128_t) zero128, (vui128_t) value);
    }
  // Convert the absolute (unsigned) value to Decimal and
  // prefix the sign.

  // first divide/modulo the 39 digits __int128 by 10**16.
  // This separates the high/middle 23 digits (tmpq) and low 16 digits.
  tmpq = vec_mulhuq (val128, mul_invs_ten16);
  tmpq = vec_srqi (tmpq, shift_ten16);
  // Compute remainder of val128 / 10**16
  // t_low = val128 - (tmpq * 10**16)
  // Here we know tmpq and mul_ten16 are less than 64-bits
  // so can use vec_vmuloud instead of vec_mulluq
  // NOTE(review): a 23 digit tmpq can exceed 64-bits; the low doubleword
  // of t_low presumably stays correct because the remainder is below
  // 10**16 -- confirm.
  tmp = vec_vmuloud ((vui64_t) tmpq, (vui64_t) mul_ten16);
  t_low = (vui64_t) vec_subuqm (val128, tmp);

  // Next divide/modulo the high/middle digits by 10**16.
  // This separates the high 7 and middle 16 digits.
  val128 = tmpq;
  tmpq = vec_mulhuq (tmpq, mul_invs_ten16);
  t_high = (vui64_t) vec_srqi (tmpq, shift_ten16);
  tmp = vec_vmuloud (t_high, (vui64_t) mul_ten16);
  t_mid = (vui64_t) vec_subuqm (val128, tmp);

  printf ("%c%07lld%016lld%016lld", sign, t_high[VEC_DW_L],
          t_mid[VEC_DW_L], t_low[VEC_DW_L]);
}

Converting Vector __int128 values to BCD

POWER8 and POWER9 added a number of Binary Code Decimal (BCD) and Zoned Decimal operations that should be helpful for radix conversion and even faster large integer formatting for print.

See also: vec_bcd_ppc.h

The issue remains that __int128 values can represent up to 39 decimal digits while Signed BCD supports only 31 digits. POWER9 provides a Decimal Convert From Signed Quadword instruction with the following restriction:

Note: If the signed value of vrb is less than -(10**31-1) or greater than 10**31-1 the result is too large for the BCD format and the result is undefined.

It would be useful to check for this and if required, factor the __int128 value into the high order 8 digits and the low order 31 digits. This allows for the safe and correct use of the vec_bcdcfsq() and with some decimal shifts/truncates vec_bcdctz(). This also enables conversion to multiple precision Vector BCD to represent 39 digits and more for radix conversions.

We first address the factoring by providing Vector Divide by const 10e31 Unsigned Quadword and Vector Modulo by const 10e31 Unsigned Quadword operation. This requires the multiplicative inverse using the vec_mulhuq() operation.

static inline vui128_t
vec_divuq_10e31 (vui128_t vra)
{
  // Returns the quotient of vra / 10**31, using a multiplicative
  // inverse and vec_mulhuq() in place of a quadword divide.
  // ten31  = +10000000000000000000000000000000UQ
  const vui128_t ten31 = (vui128_t)
          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
  // Magic numbers for multiplicative inverse to divide by 10**31
  // are 4804950418589725908363185682083061167, corrective add,
  // and shift right 103 bits.
  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(
      0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);
  const int shift_ten31 = 103;
  vui128_t result, t, q;

  // Values below 10**31 have a zero quotient; skip the multiply.
  if (vec_cmpuq_all_ge (vra, ten31))
    {
      q = vec_mulhuq (vra, mul_invs_ten31);
      // Need corrective add but want to avoid carry & double quad shift
      // The following avoids the carry and less instructions
      t = vec_subuqm (vra, q);
      t = vec_srqi (t, 1);
      t = vec_adduqm (t, q);
      // One bit was already shifted out above.
      result = vec_srqi (t, (shift_ten31 - 1));
    }
  else
    result = (vui128_t) { (__int128) 0 };

  return result;
}

As the vec_mulhuq() operation is relatively expensive and we expect most __int128 values to be 31 digits or less, using a compare to bypass the multiplication and return the 0 quotient seems a prudent optimization.

So far we only have the quotient (the high order 8 digits) and still need to extract the remainder (the low order 31 digits). This is simply the quotient from above multiplied by 10e31 and subtracted from the original input. To avoid the multiple return value issue we define a modulo operation to take the original value and the quotient from vec_divuq_10e31().

static inline vui128_t
vec_moduq_10e31 (vui128_t vra, vui128_t q)
{
  // Returns the remainder of vra / 10**31, given the quotient q
  // for the same vra.
  // ten31 = 10**31, formed as 10**15 * 10**16.
  const vui128_t ten31 = (vui128_t)
          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };

  // Values below 10**31 are their own remainder; skip the multiply.
  if (!vec_cmpuq_all_ge (vra, ten31))
    return vra;

  // remainder = vra - (q * 10**31)
  vui128_t q_ten31 = vec_mulluq (q, ten31);
  return vec_subuqm (vra, q_ten31);
}

Again as the vec_mulluq() operation is relatively expensive and we expect most __int128 values to be 31 digits or less, using a compare to bypass the multiplication and return the input value as the remainder seems a prudent optimization.

We expect these operations to be used together as in this example.

q = vec_divuq_10e31 (a);

r = vec_moduq_10e31 (a, q);

We also expect the compiler to common the various constant loads across the two operations as the code is in-lined. This header also provides variants for factoring by 10e32 (to use with the Zone conversion) and signed variants of the 10e31 operation for direct conversion to extend precision signed BCD.

See also: vec_divuq_10e32(), vec_moduq_10e32(), vec_divsq_10e31, vec_modsq_10e31.

Extending integer operations beyond Quadword

Some algorithms require even higher integer precision than __int128 provides. This includes:

POSIX compliant conversion between __float128 and _Decimal128 types
POSIX compliant conversion from double and __float128 to decimal for print.
Cryptographic operations for Public-key cryptography and Elliptic Curves

The POWER8 provides instructions for extending add and subtract to 128-bit integer and beyond with carry/extend operations (see vec_addcuq(), vec_addecuq(), vec_addeuqm(), vec_adduqm(), vec_subcuq(), vec_subecuq(), vec_subeuqm(), vec_subuqm()). POWER9 adds instructions to improve decimal / binary conversion to/from 128-bit integer and beyond with carry/extend operations. And while the PowerISA does not yet provide full 128 x 128 bit integer multiply instructions, it has provided wider integer multiply instructions, beginning in POWER8 (see vec_mulesw(), vec_mulosw(), vec_muleuw(), vec_mulouw()) and again in POWER9 (see vec_msumudm()).

This all allows the pveclib to improve (reduce the latency of) the implementation of multiply quadword operations. This includes operations that generate the full 256-bit multiply product (see vec_muludq(), vec_mulhuq(), vec_mulluq()). And this in combination with add/subtract with carry extend quadword allows the coding of even wider (multiple quadword) multiply operations.

Extended Quadword multiply

The following example performs a 256x256 bit unsigned integer multiply generating a 512-bit product:

void
test_mul4uq (vui128_t *__restrict__ mulu, vui128_t m1h, vui128_t m1l,
             vui128_t m2h, vui128_t m2l)
{
  // 256x256-bit unsigned multiply: [m1h||m1l] * [m2h||m2l].
  // The 512-bit product is stored as four quadwords,
  // mulu[0] (lowest) through mulu[3] (highest).
  vui128_t mc, mp, mq, mqhl;
  vui128_t mphh, mphl, mplh, mpll;
  // low * low: mpll is final, mplh holds the high half.
  mpll = vec_muludq (&mplh, m1l, m2l);
  // first middle product (m1h * m2l): add its low half into mplh and
  // propagate the carry into its high half (mphl).
  mp = vec_muludq (&mphl, m1h, m2l);
  mplh = vec_addcq (&mc, mplh, mp);
  mphl = vec_adduqm (mphl, mc);
  // second middle product (m2h * m1l): add its low half into mplh
  // (carry in mq), then add its high half plus that carry into mphl,
  // keeping the carry out (mc) for the top quadword.
  mp = vec_muludq (&mqhl, m2h, m1l);
  mplh = vec_addcq (&mq, mplh, mp);
  mphl = vec_addeq (&mc, mphl, mqhl, mq);
  // high * high: add its low half into mphl (carry in mq), then add
  // both outstanding carries into the top quadword.
  mp = vec_muludq (&mphh, m2h, m1h);
  mphl = vec_addcq (&mq, mphl, mp);
  mphh = vec_addeuqm (mphh, mq, mc);
 
  mulu[0] = mpll;
  mulu[1] = mplh;
  mulu[2] = mphl;
  mulu[3] = mphh;
}

This example generates some additional questions:

Why use vec_muludq() instead of pairing vec_mulhuq() and vec_mulluq()?
Why use vec_addcq() instead of pairing vec_addcuq() and vec_adduqm()?
Why return the 512-bit product via a pointer instead of returning a struct or array of 4 x vui128_t (homogeneous aggregates)?

The detailed rationale for this is documented in section Returning extended quadword results. In this specific case (quadword integer operations that generate two vector values) pveclib provides both alternatives:

separate operations each returning a single (high or low order) vector.
combined operations providing:
- the lower order vector as the function return value.
- the high order (carry or high product) vector via a pointer reference parameter.

Either method should provide the same results. For example:

mplh = vec_addcq (&mc, mplh, mp);

is equivalent to

mc = vec_addcuq (mplh, mp);

mplh = vec_adduqm (mplh, mp);

and

mpll = vec_muludq (&mplh, m1l, m2l);

is equivalent to

mpll = vec_mulluq (m1l, m2l);

mplh = vec_mulhuq (m1l, m2l);

So is there any advantage to separate versus combined operations?

Functionally it is useful to have separate operations for the cases where only one quadword part is needed. For example if you know that a add/subtract operation can not overflow, why generate the carry? Alternatively the quadword greater/less-than compares are based solely on the carry from the subtract quadword, why generate lower 128-bit (modulo) difference? For multiplication the modulo (multiply low) operation is the expected semantic or is known to be sufficient. Alternatively the multiplicative inverse only uses the high order (multiply high) quadword of the product.

From the performance (instruction latency and throughput) perspective, if the algorithm requires the extended result or full product, the combined operation is usually the better choice. Otherwise use the specific single return operation needed. At best, the separate operations may generate the same instruction sequence as the combined operation, but this depends on the target platform and specific optimizations implemented by the compiler.

Note: For inlined operations the pointer reference in the combined form is usually optimized to a simple register assignment by the compiler. For platform targets where the separate operations each generate a single instruction, we expect the compiler to generate the same instructions as the combined operation. But this is only likely for add/sub quadword on the POWER8 and multiply by 10 quadword on POWER9.

Quadword Long Division

In the section Converting Vector __int128 values to BCD above we used multiplicative inverse to factor a binary quadword value in two (high quotient and low remainder) parts. Here we divide by a large power of 10 (10³¹ or 10³²) of a size where the quotient and remainder allow direct conversion to BCD (see vec_bcdcfsq(), vec_bcdcfuq()). After conversion, the BCD parts can be concatenated to form the larger (39 digit) decimal radix value equivalent of the 128-bit binary value.

We can extend this technique to larger (multiple quadword) binary values but this requires long division. This is the version of the long division you learned in grade school, where a multi-digit value is divided in stages by a single digit. But the digits we are using are really big (10³¹-1 or 10³²-1).

The first step is relatively easy. Start by dividing the left-most digit of the dividend by the divisor, generating the integer quotient and remainder. We already have operations to implement that.

// initial step for the top digits
dn = d[0];
qh = vec_divuq_10e31 (dn);
rh = vec_moduq_10e31 (dn, qh);
q[0] = qh;

The array d contains the quadwords of the extended precision integer dividend. The array q will contain the quadwords of the extended precision integer quotient. Here we have generated the first quadword q[0] digit of the quotient. The remainder rh will be used in the next step of the long division.

The process repeats except after the first step we have an intermediate dividend formed from:

The remainder from the previous step
Concatenated with the next digit of the extended precision quadword dividend.

So for each additional step we need to divide two quadwords (256-bits) by the quadword divisor. Actually this dividend should be less than a full 256-bits because we know the remainder is less than the divisor. So the intermediate dividend is less than (divisor * 2¹²⁸). So we know the quotient can not exceed (2¹²⁸-1) or one quadword.

Now we need an operation that will divide this double quadword value and provide quotient and remainder that are correct (or close enough). Remember your grade school long division where you would:

estimate the quotient
multiply the quotient by the divisor
subtract this product from the current 2 digit dividend
check that the remainder is less than the divisor.
- if the remainder is greater than the divisor; the estimated quotient is too small
- if the remainder is negative (the product was greater than the dividend); the estimated quotient is too large.
correct the quotient and remainder if needed before doing the next step.

So we don't need to be perfect, but close enough. As long as we can detect any problems and (if needed) correct the results, we can implement long division to any size.

We already have an operation for dividing a quadword by 10³¹ using the magic numbers for multiplicative inverse. This can easily be extended to multiply double quadword high. For example:

// Multiply high [vra||vrb] * mul_invs_ten31
q = vec_mulhuq (vrb, mul_invs_ten31);
q1 = vec_muludq (&t, vra, mul_invs_ten31);
c = vec_addcuq (q1, q);
q = vec_adduqm (q1, q);
q1 = vec_adduqm (t, c);
// corrective add [q2||q1||q] = [q1||q] + [vra||vrb]
c = vec_addcuq (vrb, q);
q = vec_adduqm (vrb, q);
// q2 is the carry-out from the corrective add
q2 = vec_addecuq (q1, vra, c);
q1 = vec_addeuqm (q1, vra, c);
// shift 384-bits (including the carry) right 103 bits
// Using shift left double quadword shift by (128-103)-bits
r2 = vec_sldqi (q2, q1, (128 - shift_ten31));
result = vec_sldqi (q1, q, (128 - shift_ten31));

Here we generate a 256-bit multiply high using the vec_mulhuq() for the low dividend (vrb) and vec_muludq() for high dividend (vra). Then sum the partial products ([t||q1] + [0||q]) to get initial 256-bit product [q1||q]. Then apply the corrective add ([q1||q] + [vra||vrb]). This may generate a carry which needs to be included in the final shift.

Technically we only expect a 128-bit quotient after the shift, but we have 3 quadwords (2 quadwords and a carry) going into the shift right. Also our (estimated) quotient may be off by 1 and generate a 129-bit result. This is due to using the magic numbers for 128-bit multiplicative inverse and not regenerating magic numbers for 256-bits. We can't do anything about that now and so return a 256-bit double quadword quotient.

Note: This is where only needing to be "close enough", works in our favor. We will check and correct the quotient in the modulo operation.

The 256-bits we want are spanning multiple quadwords so we replace a simple quadword shift right with two Shift Left Double Quadword Immediate operations and complement the shift count (128 - shift_ten31). This gives a 256-bit quotient which we expect to have zero in the high quadword.

As this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the quotient is zero. This also helps when the long division dividend may have leading quadword zeros that need to be skipped over. The full implementation looks like:

// Divide the double quadword [vra||vrb] by 10**31 using the 128-bit
// multiplicative inverse (multiply high, corrective add, shift right).
// Returns the low quadword of the 256-bit quotient and stores the high
// quadword at *qh. The quotient is an estimate; vec_modudq_10e31()
// checks and corrects it.
static inline vui128_t
vec_divudq_10e31 (vui128_t *qh, vui128_t vra, vui128_t vrb)
{
  // Divisor 10**31, formed as 10**15 * 10**16.
  const vui128_t ten31 = (vui128_t)
          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
  const vui128_t zero = (vui128_t) { (__int128) 0UL };
  // Magic numbers for the multiplicative inverse of 10**31:
  // multiplier 4804950418589725908363185682083061167, a corrective add,
  // and a shift right of 103 bits.
  const vui128_t mul_invs_ten31 = (vui128_t) CONST_VINT128_DW(
      0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);
  const int shift_ten31 = 103;
  vui128_t mh_lo, mh_mid, mh_hi, prod_lo, prod_hi;
  vui128_t sum_lo, sum_hi, sum_cy, cy;

  // Short-circuit: high quadword is zero and low quadword is below
  // the divisor, so the quotient is zero.
  if (!(vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31)))
    {
      *qh = zero;
      return zero;
    }

  // Multiply high [vra||vrb] * mul_invs_ten31:
  // partial products are [mh_hi||mh_mid] and [0||mh_lo].
  mh_lo = vec_mulhuq (vrb, mul_invs_ten31);
  mh_mid = vec_muludq (&mh_hi, vra, mul_invs_ten31);
  // [prod_hi||prod_lo] = [mh_hi||mh_mid] + [0||mh_lo]
  cy = vec_addcuq (mh_mid, mh_lo);
  prod_lo = vec_adduqm (mh_mid, mh_lo);
  prod_hi = vec_adduqm (mh_hi, cy);

  // Corrective add:
  // [sum_cy||sum_hi||sum_lo] = [prod_hi||prod_lo] + [vra||vrb]
  cy = vec_addcuq (vrb, prod_lo);
  sum_lo = vec_adduqm (vrb, prod_lo);
  // sum_cy is the carry-out from the corrective add
  sum_cy = vec_addecuq (prod_hi, vra, cy);
  sum_hi = vec_addeuqm (prod_hi, vra, cy);

  // Shift the 384-bits (including the carry) right 103 bits, done as
  // shift left double quadword by (128-103)-bits on adjacent pairs.
  *qh = vec_sldqi (sum_cy, sum_hi, (128 - shift_ten31));
  return vec_sldqi (sum_hi, sum_lo, (128 - shift_ten31));
}

To complete the long division operation we need to perform double quadword modulo operations. Here the inputs are the two quadword dividend and the low quadword of the quotient from the divide double quadword operation above. We use multiply double quadword to compute the remainder ([vra||vrb] - (q * 10³¹)). Generating the 256-bit product and difference ensures we can detect the case where the quotient is off-by-1 on the high side.

t = vec_muludq (&th, *ql, ten31);
c = vec_subcuq (vrb, t);
t = vec_subuqm (vrb, t);
th = vec_subeuqm (vra, th, c);
// The remainder should be less than the divisor
if (vec_cmpuq_all_ne (th, zero) && vec_cmpuq_all_ge (t, ten31))
  {
    // Otherwise the estimated quotient is off by 1
     *ql = vec_adduqm (*ql, minus_one);
    // And the remainder is negative, so add the divisor
    t = vec_adduqm (t, ten31);
  }
result = t;

In this case we need to correct both remainder and the (estimated) quotient. This is a bit tricky as the quotient is normally passed by value, but for this operation we need to pass by reference, which allows the corrected quotient to be passed on to the next step.

Again as this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the remainder is simply the dividend.

// Remainder of the double quadword [vra||vrb] divided by 10**31.
// *ql is the (estimated) low quadword quotient; if the estimate turns
// out to be one too large, *ql is decremented in place so the caller
// sees the corrected quotient. Returns the remainder quadword.
static inline vui128_t
vec_modudq_10e31 (vui128_t vra, vui128_t vrb, vui128_t *ql)
{
  // ten31  = +100000000000000000000000000000000UQ
  const vui128_t ten31 = (vui128_t)
          { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };
  const vui128_t zero = (vui128_t) { (__int128) 0UL };
  const vui128_t minus_one = (vui128_t) { (__int128) -1L };
  vui128_t prod_lo, prod_hi, rem_lo, rem_hi, cy;

  // Short-circuit: the dividend is already below the divisor, so the
  // remainder is the low quadword of the dividend.
  if (!(vec_cmpuq_all_ne (vra, zero) || vec_cmpuq_all_ge (vrb, ten31)))
    return vrb;

  // [prod_hi||prod_lo] = *ql * 10**31
  prod_lo = vec_muludq (&prod_hi, *ql, ten31);
  // [rem_hi||rem_lo] = [vra||vrb] - [prod_hi||prod_lo]
  cy = vec_subcuq (vrb, prod_lo);
  rem_lo = vec_subuqm (vrb, prod_lo);
  rem_hi = vec_subeuqm (vra, prod_hi, cy);

  // The remainder should be less than the divisor
  if (vec_cmpuq_all_ne (rem_hi, zero) && vec_cmpuq_all_ge (rem_lo, ten31))
    {
      // If not, the estimated quotient is off by 1
      *ql = vec_adduqm (*ql, minus_one);
      // And the remainder is negative, so add the divisor
      rem_lo = vec_adduqm (rem_lo, ten31);
    }

  return rem_lo;
}

Now we have all the operations needed to complete the implementation of long division by the decimal constant (10³¹).

// Long division of an extended precision integer by 10**31.
//   q   output array receiving _N quadwords of quotient, high to low.
//   d   input array of _N quadwords of dividend, high to low.
//   _N  number of quadwords in d and q.
// Returns the final remainder. When _N is less than 1 there are no
// quadwords to process; nothing is read or written and zero is returned.
vui128_t
example_longdiv_10e31 (vui128_t *q, vui128_t *d, long int _N)
{
  const vui128_t zero = (vui128_t) { (__int128) 0UL };
  vui128_t dn, qh, ql, rh;
  long int i;

  // Guard the unconditional d[0]/q[0] access below.
  if (_N < 1)
    return zero;

  // initial step for the top digits
  dn = d[0];
  qh = vec_divuq_10e31 (dn);
  rh = vec_moduq_10e31 (dn, qh);
  q[0] = qh;

  // now we know the remainder is less than the divisor.
  for (i=1; i<_N; i++)
    {
      // Intermediate dividend is [rh||dn]: the previous remainder
      // concatenated with the next quadword of the dividend.
      dn = d[i];
      ql = vec_divudq_10e31 (&qh, rh, dn);
      // The modulo step also corrects ql if the estimate was off by 1.
      rh = vec_modudq_10e31 (rh, dn, &ql);
      q[i] = ql;
    }
  // return the final remainder
  return rh;
}

The result of each call to example_longdiv_10e31() is the output array q of quadwords containing the extended quotient, and the remainder as the return value. The input array d and output array q should not overlap in storage. The remainder is in the range 0-9999999999999999999999999999999 and is suitable for conversion to BCD or decimal characters. (see vec_bcdcfsq()). Repeated calls passing the quotient from the previous call as the dividend, reduces the quotient by 31 digits and returns another 31 digits in the remainder for conversion. This continues until the quotient is less than 10³¹ which provides the highest order digits of the decimal result.

Note: Similarly for long division in support of unsigned 32-digit BCD conversion using operations; vec_divuq_10e32(), vec_moduq_10e32(), vec_divudq_10e32(), and vec_modudq_10e32(). Long division for other constant divisors or multiple quadword divisors is an exercise for the student.

Todo: The implementation above gives correct results for all the cases tested for divide by constants (10³¹ and 10³²). This is not a mathematical proof of correctness, just an observation. Anyone who finds a counter example or offers a mathematical proof should submit a bug report.

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Macro Definition Documentation

◆ CONST_VUINT128_Qx16d

#define CONST_VUINT128_Qx16d	(	__q0,
		__q1
	)

Value:

    ( (vui128_t) \
    (((unsigned __int128) __q0) * 10000000000000000UL) \
    + ((unsigned __int128) __q1) )

Generate a vector unsigned __int128 constant from doublewords.

Combine 2 x 16 decimal digit long long constants into a single 32 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.

For example

const vui128_t ten32 = CONST_VUINT128_Qx16d (10000000000000000UL, 0UL);

◆ CONST_VUINT128_Qx18d

#define CONST_VUINT128_Qx18d	(	__q0,
		__q1
	)

Value:

    ( (vui128_t) \
    (((unsigned __int128) __q0) * 1000000000000000000UL) \
    + ((unsigned __int128) __q1) )

Generate a vector unsigned __int128 constant from doublewords.

Combine 2 x 18 decimal digit long long constants into a single 36 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.

For example

vui128_t ten36m1 = CONST_VUINT128_Qx18d (999999999999999999UL, 999999999999999999UL);

◆ CONST_VUINT128_Qx19d

#define CONST_VUINT128_Qx19d	(	__q0,
		__q1
	)

Value:

    ( (vui128_t) \
    (((unsigned __int128) __q0) * 10000000000000000000UL) \
    + ((unsigned __int128) __q1) )

Generate a vector unsigned __int128 constant from doublewords.

Combine 2 x 19 decimal digit long long constants into a single 38 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.

For example

const vui128_t mul_invs_ten16 = CONST_VUINT128_Qx19d(

7662477704329444291UL, 7917351357515459181UL);

◆ CONST_VUINT128_QxD

#define CONST_VUINT128_QxD	(	__q0,
		__q1
	)

Value:

    ( (vui128_t) \
    (((unsigned __int128) __q0) << 64) \
    + ((unsigned __int128) __q1) )

Generate a vector unsigned __int128 constant from doublewords.

Combine 2 x 64-bit long long constants into a single __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.

For example

vui128_t ten32 = CONST_VUINT128_QxD (0x000004ee2d6d415bUL, 0x85acef8100000000UL);

◆ CONST_VUINT128_QxW

#define CONST_VUINT128_QxW	(	__q0,
		__q1,
		__q2,
		__q3
	)

Value:

      ( (vui128_t) \
      (((unsigned __int128) __q0) << 96) \
    + (((unsigned __int128) __q1) << 64) \
    + (((unsigned __int128) __q2) << 32) \
    +  ((unsigned __int128) __q3) )

Generate a vector unsigned __int128 constant from words.

Combine 4 x 32-bit int constants into a single __int128 constant. The 4 parameters are integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.

The effect is to compute an unsigned __int128 constant from 4 x 32-bit unsigned int constants.

int128 = (__q0 << 96) + (__q1 << 64) + (__q2 << 32) + __q3

For example

// const for 100000000000000000000000000000000 (AKA 10**32)
vui128_t ten32 = CONST_VUINT128_QxW (0x000004ee, 0x2d6d415b,
                                     0x85acef81, 0x00000000);

Function Documentation

◆ vec_absduq()

static vui128_t vec_absduq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Absolute Difference Unsigned Quadword.

Compute the absolute difference of the quadwords. For each unsigned quadword, subtract VRB from VRA and return the absolute value of the difference.

processor	Latency	Throughput
power8	14	1/cycle
power9	11	1/cycle

Parameters

vra	vector of unsigned __int128
vrb	vector of unsigned __int128

Returns: vector of the absolute difference.

◆ vec_abssq()

static vi128_t vec_abssq ( vi128_t vra )

inlinestatic

Vector Absolute Value Signed Quadword.

Compute the absolute value of a signed quadword.

processor	Latency	Throughput
power8	6-8	1/cycle
power9	9-12	1/cycle

Parameters

vra	vector of signed __int128

Returns: vector of the absolute value of vra.

◆ vec_addcq()

static vui128_t vec_addcq	(	vui128_t *	cout,
		vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Add with carry Unsigned Quadword.

Add two vector __int128 values and return sum and the carry out.

processor	Latency	Throughput
power8	8	1/2 cycles
power9	6	2/cycle

Parameters

*cout	carry out from the sum of a and b.
a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.

Returns: __int128 (lower 128-bits) sum of a and b.

◆ vec_addcuq()

static vui128_t vec_addcuq	(	vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Add & write Carry Unsigned Quadword.

Add two vector __int128 values and return the carry out.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.

Returns: __int128 carry of the sum of a and b.

◆ vec_addecuq()

static vui128_t vec_addecuq	(	vui128_t	a,
		vui128_t	b,
		vui128_t	ci
	)

inlinestatic

Vector Add Extended & write Carry Unsigned Quadword.

Add two vector __int128 values plus a carry-in (0|1) and return the carry out bit.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.
ci	Carry-in from vector bit[127].

Returns: carry-out in bit[127] of the sum of a + b + ci.

◆ vec_addeq()

static vui128_t vec_addeq	(	vui128_t *	cout,
		vui128_t	a,
		vui128_t	b,
		vui128_t	ci
	)

inlinestatic

Vector Add Extend with carry Unsigned Quadword.

Add two vector __int128 values plus a carry-in (0|1) and return sum and the carry out.

processor	Latency	Throughput
power8	8	1/2 cycles
power9	6	2/cycle

Parameters

*cout	carry out from the sum of a, b and ci.
a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.
ci	Carry-in from vector bit[127].

Returns: __int128 (lower 128-bits) sum of a + b + ci.

◆ vec_addeuqm()

static vui128_t vec_addeuqm	(	vui128_t	a,
		vui128_t	b,
		vui128_t	ci
	)

inlinestatic

Vector Add Extended Unsigned Quadword Modulo.

Add two vector __int128 values plus a carry (0|1) and return the modulo 128-bit result.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.
ci	Carry-in from vector bit[127].

Returns: __int128 sum of a + b + ci, modulo 128-bits.

◆ vec_adduqm()

static vui128_t vec_adduqm	(	vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Add Unsigned Quadword Modulo.

Add two vector __int128 values and return result modulo 128-bits.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

a	128-bit vector treated as a __int128.
b	128-bit vector treated as a __int128.

Returns: __int128 sum of a and b.

◆ vec_avguq()

static vui128_t vec_avguq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Average Unsigned Quadword.

Compute the average of two unsigned quadwords as (VRA + VRB + 1) / 2.

processor	Latency	Throughput
power8	14	1/cycle
power9	11	1/cycle

Parameters

vra	vector unsigned quadwords
vrb	vector unsigned quadwords

Returns: vector of the average of vra and vrb.

◆ vec_clzq()

static vui128_t vec_clzq ( vui128_t vra )

inlinestatic

Vector Count Leading Zeros Quadword for unsigned __int128 elements.

Count leading zeros for a vector __int128 and return the count in a vector suitable for use with vector shift (left|right) and vector shift (left|right) by octet instructions.

processor	Latency	Throughput
power8	8-10	1/cycle
power9	10-12	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.

Returns: a 128-bit vector with bits 121:127 containing the count of leading zeros.

◆ vec_cmpeqsq()

static vb128_t vec_cmpeqsq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Equal Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra == vrb, otherwise all '0's. We use vec_cmpequq as it works for both signed and unsigned compares.

processor	Latency	Throughput
power8	6	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare equal.

◆ vec_cmpequq()

static vb128_t vec_cmpequq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Equal Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra == vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. To get the correct quadword result, the doubleword element equal truth values are swapped, then anded with the original compare results. Otherwise use vector word compare and additional boolean logic to insure all word elements are equal.

processor	Latency	Throughput
power8	6	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare equal.

◆ vec_cmpgesq()

static vb128_t vec_cmpgesq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Greater Than or Equal Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra >= vrb, otherwise all '0's.

Flip the operand sign bits and use vec_cmpgeuq for signed compare.

processor	Latency	Throughput
power8	10-16	1/ 2cycles
power9	8-14	1/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare greater than or equal.

◆ vec_cmpgeuq()

static vb128_t vec_cmpgeuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Greater Than or Equal Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra >= vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction. This generates a carry for greater than or equal and NOT carry for less than. Then use vec_setb_cyq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_cyq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.

processor	Latency	Throughput
power8	8	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare greater than or equal.

◆ vec_cmpgtsq()

static vb128_t vec_cmpgtsq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Greater Than Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra > vrb, otherwise all '0's.

Flip the operand sign bits and use vec_cmpgtuq for signed compare.

processor	Latency	Throughput
power8	10-16	1/ 2cycles
power9	8-14	1/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare greater than.

◆ vec_cmpgtuq()

static vb128_t vec_cmpgtuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Greater Than Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra > vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction with the parameters reversed. This generates a carry for less than or equal and NOT carry for greater than. Then use vec_setb_ncq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_ncq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.

processor	Latency	Throughput
power8	8	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare greater than.

◆ vec_cmplesq()

static vb128_t vec_cmplesq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Less Than or Equal Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra <= vrb, otherwise all '0's.

Flip the operand sign bits and use vec_cmpleuq for signed compare.

processor	Latency	Throughput
power8	10-16	1/ 2cycles
power9	8-14	1/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare less than or equal.

◆ vec_cmpleuq()

static vb128_t vec_cmpleuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Less Than or Equal Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra <= vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction. This generates a carry for greater than or equal and NOT carry for less than. Then use vec_setb_ncq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_cyq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.

processor	Latency	Throughput
power8	8	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare less than or equal.

◆ vec_cmpltsq()

static vb128_t vec_cmpltsq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Less Than Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra < vrb, otherwise all '0's.

Flip the operand sign bits and use vec_cmpltuq for signed compare.

processor	Latency	Throughput
power8	10-16	1/ 2cycles
power9	8-14	1/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare less than.

◆ vec_cmpltuq()

static vb128_t vec_cmpltuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Less Than Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra < vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Subtract & write Carry QuadWord (vsubcuq) instruction. This generates a carry for greater than or equal and NOT carry for less than. Then use vec_setb_ncq to convert the carry into a vector bool. Here we use the pveclib implementations (vec_subcuq() and vec_setb_ncq()), instead of <altivec.h> intrinsics, to address older compilers and POWER7.

processor	Latency	Throughput
power8	8	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare less than.

◆ vec_cmpnesq()

static vb128_t vec_cmpnesq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare Not Equal Signed Quadword.

Compare signed __int128 (128-bit) integers and return all '1's, if vra != vrb, otherwise all '0's. We use vec_cmpequq as it works for both signed and unsigned compares.

processor	Latency	Throughput
power8	6	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector treated as an signed __int128.
vrb	128-bit vector treated as an signed __int128.

Returns: 128-bit vector boolean reflecting vector signed __int128 compare not equal.

◆ vec_cmpneuq()

static vb128_t vec_cmpneuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare Not Equal Unsigned Quadword.

Compare unsigned __int128 (128-bit) integers and return all '1's, if vra != vrb, otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. To get the correct quadword result, the doubleword element equal truth values are swapped, then not anded with the original compare results. Otherwise use vector word compare and additional boolean logic to insure all word elements are equal.

processor	Latency	Throughput
power8	6	2/cycle
power9	7	2/cycle

Parameters

vra	128-bit vector treated as an unsigned __int128.
vrb	128-bit vector treated as an unsigned __int128.

Returns: 128-bit vector boolean reflecting vector unsigned __int128 compare not equal.

◆ vec_cmpsq_all_eq()

static int vec_cmpsq_all_eq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Equal Signed Quadword.

Compare vector signed __int128 values and return true if vra and vrb are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if equal, false otherwise.

◆ vec_cmpsq_all_ge()

static int vec_cmpsq_all_ge	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Greater Than or Equal Signed Quadword.

Compare vector signed __int128 values and return true if vra >= vrb.

processor	Latency	Throughput
power8	10-15	1/ 2cycles
power9	8	1/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Greater Than or Equal, false otherwise.

◆ vec_cmpsq_all_gt()

static int vec_cmpsq_all_gt	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Greater Than Signed Quadword.

Compare vector signed __int128 values and return true if vra > vrb.

processor	Latency	Throughput
power8	10-15	1/ 2cycles
power9	8	1/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Greater Than, false otherwise.

◆ vec_cmpsq_all_le()

static int vec_cmpsq_all_le	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Less Than or Equal Signed Quadword.

Compare vector signed __int128 values and return true if vra <= vrb.

processor	Latency	Throughput
power8	10-15	1/ 2cycles
power9	8	1/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Less Than or Equal, false otherwise.

◆ vec_cmpsq_all_lt()

static int vec_cmpsq_all_lt	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Less Than Signed Quadword.

Compare vector signed __int128 values and return true if vra < vrb.

processor	Latency	Throughput
power8	10-15	1/ 2cycles
power9	8	1/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Less Than, false otherwise.

◆ vec_cmpsq_all_ne()

static int vec_cmpsq_all_ne	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Compare all Not Equal Signed Quadword.

Compare vector signed __int128 values and return true if vra and vrb are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as an vector signed __int128 (qword) element.
vrb	128-bit vector treated as an vector signed __int128 (qword) element.

Returns: boolean int for all 128-bits, true if not equal, false otherwise.

◆ vec_cmpuq_all_eq()

static int vec_cmpuq_all_eq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Equal Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra and vrb are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if equal, false otherwise.

◆ vec_cmpuq_all_ge()

static int vec_cmpuq_all_ge	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Greater Than or Equal Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra >= vrb.

processor	Latency	Throughput
power8	8-13	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Greater Than or Equal, false otherwise.

◆ vec_cmpuq_all_gt()

static int vec_cmpuq_all_gt	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Greater Than Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra > vrb.

processor	Latency	Throughput
power8	8-13	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Greater Than, false otherwise.

◆ vec_cmpuq_all_le()

static int vec_cmpuq_all_le	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Less Than or Equal Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra <= vrb.

processor	Latency	Throughput
power8	8-13	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Less Than or Equal, false otherwise.

◆ vec_cmpuq_all_lt()

static int vec_cmpuq_all_lt	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Less Than Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra < vrb.

processor	Latency	Throughput
power8	8-13	2/ 2cycles
power9	6	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if Less Than, false otherwise.

◆ vec_cmpuq_all_ne()

static int vec_cmpuq_all_ne	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Compare all Not Equal Unsigned Quadword.

Compare vector unsigned __int128 values and return true if vra and vrb are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector treated as an vector unsigned __int128 (qword) element.
vrb	128-bit vector treated as an vector unsigned __int128 (qword) element.

Returns: boolean int for all 128-bits, true if not equal, false otherwise.

◆ vec_cmul100cuq()

static vui128_t vec_cmul100cuq	(	vui128_t *	cout,
		vui128_t	a
	)

inlinestatic

Vector combined Multiply by 100 & write Carry Unsigned Quadword.

Compute the product of a 128 bit value a * 100. Only the low order 128 bits of the product are returned.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	6	1/cycle

Parameters

*cout	pointer to upper 128-bits of the product.
a	128-bit vector treated as unsigned __int128.

Returns: vector __int128 (lower 128-bits of the 256-bit product) a * 100.

◆ vec_cmul100ecuq()

static vui128_t vec_cmul100ecuq	(	vui128_t *	cout,
		vui128_t	a,
		vui128_t	cin
	)

inlinestatic

Vector combined Multiply by 100 Extended & write Carry Unsigned Quadword.

Compute the product of a 128 bit value a * 100 + digit(cin). The function returns the low order 128 bits of the extended product. The first parameter (*cout) is the address of the vector to receive the generated carry out in the range 0-99.

processor	Latency	Throughput
power8	15-17	1/cycle
power9	9	1/cycle

Parameters

*cout	pointer to upper 128-bits of the product.
a	128-bit vector treated as unsigned __int128.
cin	values 0-99 in bits 120:127 of a vector.

Returns: vector __int128 (lower 128-bits of the 256-bit product) a * 100 + cin.

◆ vec_cmul10cuq()

static vui128_t vec_cmul10cuq	(	vui128_t *	cout,
		vui128_t	a
	)

inlinestatic

Vector combined Multiply by 10 & write Carry Unsigned Quadword.

Compute the product of a 128 bit value a * 10. Only the low order 128 bits of the product are returned.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	3	1/ 2cycles

Parameters

*cout	pointer to upper 128-bits of the product.
a	128-bit vector treated as a unsigned __int128.

Returns: vector __int128 (lower 128-bits of the 256-bit product) a * 10.

◆ vec_cmul10ecuq()

static vui128_t vec_cmul10ecuq	(	vui128_t *	cout,
		vui128_t	a,
		vui128_t	cin
	)

inlinestatic

Vector combined Multiply by 10 Extended & write Carry Unsigned Quadword.

Compute the product of a 128 bit value a * 10 + digit(cin). Only the low order 128 bits of the extended product are returned.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	3	1/ 2cycles

Parameters

*cout	pointer to upper 128-bits of the product.
a	128-bit vector treated as a unsigned __int128.
cin	values 0-9 in bits 124:127 of a vector.

Returns: vector __int128 (lower 128-bits of the 256-bit product) a * 10 + cin.

◆ vec_ctzq()

static vui128_t vec_ctzq ( vui128_t vra )

inlinestatic

Vector Count Trailing Zeros Quadword for unsigned __int128 elements.

Count trailing zeros for a vector __int128 and return the count in a vector suitable for use with vector shift (left|right) and vector shift (left|right) by octet instructions.

processor	Latency	Throughput
power8	15-17	1/cycle
power9	13-16	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.

Returns: a 128-bit vector with bits 121:127 containing the count of trailing zeros.

◆ vec_divsq_10e31()

static vi128_t vec_divsq_10e31 ( vi128_t vra )

inlinestatic

Vector Divide by const 10e31 Signed Quadword.

Compute the quotient of a 128 bit values vra / 10e31.

Note: vec_divsq_10e31() and vec_modsq_10e31() can be used to prepare for Decimal Convert From Signed Quadword (See vec_bcdcfsq()). This guarantees that the conversion to Vector BCD does not overflow and the 39-digit extended result is obtained.

processor	Latency	Throughput
power8	18-60	1/cycle
power9	20-45	1/cycle

Parameters

vra	the dividend as a vector treated as a signed __int128.

Returns: the quotient as vector signed __int128.

◆ vec_divudq_10e31()

static vui128_t vec_divudq_10e31	(	vui128_t *	qh,
		vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Divide Unsigned Double Quadword by const 10e31.

Compute the quotient of 256 bit value vra||vrb / 10e31.

Note: vec_divudq_10e31() and vec_modudq_10e31() can be used to perform long division of a multi-quadword binary value by the constant 10e31. The final remainder can be passed to Decimal Convert From Signed Quadword (See vec_bcdcfsq()). Long division is repeated on the resulting multi-quadword quotient to extract 31-digits for each step. This continues until the multi-quadword quotient is less than 10e31 which provides the highest order 31-digits of the multiple precision binary to BCD conversion.

processor	Latency	Throughput
power8	12-192	1/cycle
power9	9-127	1/cycle

Parameters

*qh	the high quotient as a vector unsigned __int128.
vra	the high dividend as a vector unsigned __int128.
vrb	the low dividend as a vector unsigned __int128.

Returns: the low quotient as vector unsigned __int128.

◆ vec_divudq_10e32()

static vui128_t vec_divudq_10e32	(	vui128_t *	qh,
		vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Divide Unsigned Double Quadword by const 10e32.

Compute the quotient of 256 bit value vra||vrb / 10e32.

Note: vec_divudq_10e32() and vec_modudq_10e32() can be used to perform long division of a multi-quadword binary value by the constant 10e32. The final remainder can be passed to Decimal Convert From Unsigned Quadword (See vec_bcdcfuq()). Long division is repeated on the resulting multi-quadword quotient to extract 32-digits for each step. This continues until the multi-quadword quotient result is less than 10e32 which provides the highest order 32-digits of the multiple precision binary to BCD conversion.

processor	Latency	Throughput
power8	12-192	1/cycle
power9	9-127	1/cycle

Parameters

*qh	the high quotient as a vector unsigned __int128.
vra	the high dividend as a vector unsigned __int128.
vrb	the low dividend as a vector unsigned __int128.

Returns: the low quotient as vector unsigned __int128.

◆ vec_divuq_10e31()

static vui128_t vec_divuq_10e31 ( vui128_t vra )

inlinestatic

Vector Divide by const 10e31 Unsigned Quadword.

Compute the quotient of a 128 bit values vra / 10e31.

Note: vec_divuq_10e31() and vec_moduq_10e31() can be used to prepare for Decimal Convert From Signed Quadword (See vec_bcdcfsq()). This guarantees that the conversion to Vector BCD does not overflow and the 39-digit extended result is obtained.

processor	Latency	Throughput
power8	8-48	1/cycle
power9	9-31	1/cycle

Parameters

vra	the dividend as a vector treated as a unsigned __int128.

Returns: the quotient as vector unsigned __int128.

◆ vec_divuq_10e32()

static vui128_t vec_divuq_10e32 ( vui128_t vra )

inlinestatic

Vector Divide by const 10e32 Unsigned Quadword.

Compute the quotient of a 128 bit values vra / 10e32.

Note: vec_divuq_10e32() and vec_moduq_10e32() can be used to prepare for Decimal Convert From Unsigned Quadword (See vec_bcdcfuq()). This guarantees that the conversion to Vector BCD does not overflow and the 39-digit extended result is obtained.

processor	Latency	Throughput
power8	8-48	1/cycle
power9	9-31	1/cycle

Parameters

vra	the dividend as a vector treated as a unsigned __int128.

Returns: the quotient as vector unsigned __int128.

◆ vec_madd2uq()

static vui128_t vec_madd2uq	(	vui128_t *	mulu,
		vui128_t	a,
		vui128_t	b,
		vui128_t	c1,
		vui128_t	c2
	)

inlinestatic

Vector Multiply-Add2 Unsigned Quadword.

Compute the sum of the 256 bit product of two 128 bit values a, b plus the sum of 128 bit values c1 and c2. The low order 128 bits of the sum are returned, while the high order 128-bits are "stored" via the mulu pointer.

Note: The advantage of this form (versus Multiply-Sum) is that the final 256 bit sum can not overflow.

processor	Latency	Throughput
power8	60-66	1/cycle
power9	30-36	1/cycle

Parameters

*mulu	pointer to vector unsigned __int128 to receive the upper 128-bits of the 256 bit sum ((a * b) + c1 + c2).
a	128-bit vector treated as unsigned __int128.
b	128-bit vector treated as unsigned __int128.
c1	128-bit vector treated as unsigned __int128.
c2	128-bit vector treated as unsigned __int128.

Returns: vector unsigned __int128 (lower 128-bits) of ((a * b) + c1 + c2).

◆ vec_madduq()

static vui128_t vec_madduq	(	vui128_t *	mulu,
		vui128_t	a,
		vui128_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Add Unsigned Quadword.

Compute the sum of the 256 bit product of two 128 bit values a, b plus the 128 bit value c. The low order 128 bits of the sum are returned, while the high order 128-bits are "stored" via the mulu pointer.

Note: The advantage of this form (versus Multiply-Sum) is that the final 256 bit sum can not overflow.

processor	Latency	Throughput
power8	56-62	1/cycle
power9	27-33	1/cycle

Parameters

*mulu	pointer to vector unsigned __int128 to receive the upper 128-bits of the 256 bit sum ((a * b) + c).
a	128-bit vector treated as unsigned __int128.
b	128-bit vector treated as unsigned __int128.
c	128-bit vector treated as unsigned __int128.

Returns: vector unsigned __int128 (lower 128-bits) of ((a * b) + c).

◆ vec_maxsq()

static vi128_t vec_maxsq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Maximum Signed Quadword.

Compare Quadwords vra and vrb as signed integers and return the larger value in the result.

processor	Latency	Throughput
power8	12-18	2/cycle
power9	10-18	2/cycle

Parameters

vra	128-bit vector __int128.
vrb	128-bit vector __int128.

Returns: vector __int128 maximum of vra and vrb.

◆ vec_maxuq()

static vui128_t vec_maxuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Maximum Unsigned Quadword.

Compare Quadwords vra and vrb as unsigned integers and return the larger value in the result.

processor	Latency	Throughput
power8	10	2/cycle
power9	8	2/cycle

Parameters

vra	128-bit vector unsigned __int128.
vrb	128-bit vector unsigned __int128.

Returns: vector unsigned __int128 maximum of vra and vrb.

◆ vec_minsq()

static vi128_t vec_minsq	(	vi128_t	vra,
		vi128_t	vrb
	)

inlinestatic

Vector Minimum Signed Quadword.

Compare Quadwords vra and vrb as signed integers and return the smaller value in the result.

processor	Latency	Throughput
power8	12-18	2/cycle
power9	10-18	2/cycle

Parameters

vra	128-bit vector __int128.
vrb	128-bit vector __int128.

Returns: vector __int128 minimum of vra and vrb.

◆ vec_minuq()

static vui128_t vec_minuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Minimum Unsigned Quadword.

Compare Quadwords vra and vrb as unsigned integers and return the smaller value in the result.

processor	Latency	Throughput
power8	10	2/cycle
power9	8	2/cycle

Parameters

vra	128-bit vector unsigned __int128.
vrb	128-bit vector unsigned __int128.

Returns: vector unsigned __int128 minimum of vra and vrb.

◆ vec_modsq_10e31()

static vi128_t vec_modsq_10e31	(	vi128_t	vra,
		vi128_t	q
	)

inlinestatic

Vector Modulo by const 10e31 Signed Quadword.

Compute the remainder of a 128 bit values vra % 10e31.

processor	Latency	Throughput
power8	8-52	1/cycle
power9	9-23	2/cycle

Parameters

vra	the dividend as a vector treated as a signed __int128.
q	128-bit signed __int128 containing the quotient from vec_divsq_10e31().

Returns: the remainder as vector signed __int128.

◆ vec_modudq_10e31()

static vui128_t vec_modudq_10e31	(	vui128_t	vra,
		vui128_t	vrb,
		vui128_t *	ql
	)

inlinestatic

Vector Modulo Unsigned Double Quadword by const 10e31.

Compute the remainder (vra||vrb) - (ql * 10e31).

Note: Because we are using a 128-bit multiplicative inverse for a 128-bit integer in a 256-bit divide, the quotient may not be exact (one bit off). So we check here if the remainder is too high (greater than 10e31) and correct both the remainder and quotient if needed.

processor	Latency	Throughput
power8	12-124	1/cycle
power9	12-75	1/cycle

Parameters

vra	the high dividend as a vector unsigned __int128.
vrb	the low dividend as a vector unsigned __int128.
*ql	128-bit unsigned __int128 containing the quotient from vec_divudq_10e31().

Returns: the remainder as vector unsigned __int128.

◆ vec_modudq_10e32()

static vui128_t vec_modudq_10e32	(	vui128_t	vra,
		vui128_t	vrb,
		vui128_t *	ql
	)

inlinestatic

Vector Modulo Unsigned Double Quadword by const 10e32.

Compute the remainder (vra||vrb) - (ql * 10e32).

Note: Because we are using a 128-bit multiplicative inverse for a 128-bit integer in a 256-bit divide, the quotient may not be exact (one bit off). So we check here if the remainder is too high (greater than 10e32) and correct both the remainder and quotient if needed.

processor	Latency	Throughput
power8	12-124	1/cycle
power9	12-75	1/cycle

Parameters

vra	the high dividend as a vector unsigned __int128.
vrb	the low dividend as a vector unsigned __int128.
*ql	128-bit unsigned __int128 containing the quotient from vec_divudq_10e32().

Returns: the remainder as vector unsigned __int128.

◆ vec_moduq_10e31()

static vui128_t vec_moduq_10e31	(	vui128_t	vra,
		vui128_t	q
	)

inlinestatic

Vector Modulo by const 10e31 Unsigned Quadword.

Compute the remainder of a 128 bit values vra % 10e31.

processor	Latency	Throughput
power8	8-52	1/cycle
power9	9-23	2/cycle

Parameters

vra	the dividend as a vector treated as a unsigned __int128.
q	128-bit unsigned __int128 containing the quotient from vec_divuq_10e31().

Returns: the remainder as vector unsigned __int128.

◆ vec_moduq_10e32()

static vui128_t vec_moduq_10e32	(	vui128_t	vra,
		vui128_t	q
	)

inlinestatic

Vector Modulo by const 10e32 Unsigned Quadword.

Compute the remainder of a 128 bit values vra % 10e32.

processor	Latency	Throughput
power8	8-52	1/cycle
power9	9-23	2/cycle

Parameters

vra	the dividend as a vector treated as a unsigned __int128.
q	128-bit unsigned __int128 containing the quotient from vec_divuq_10e32().

Returns: the remainder as vector unsigned __int128.

◆ vec_msumcud()

static vui128_t vec_msumcud	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum and Write Carryout Unsigned Doubleword.

Compute the even and odd 128-bit products of doubleword 64-bit element values from a, b. Then compute the carry-out of the low order 128-bits of the sum of (a_even * b_even) + (a_odd * b_odd) + c. Only the high order 2 bits of the 130-bit Multiply-Sum are returned and the low order 128-bits of the sum are ignored/lost. Results are in the range 0-2.

processor	Latency	Throughput
power8	30-32	1/cycle
power9	5-7	2/cycle

Parameters

a	128-bit __vector unsigned long long.
b	128-bit __vector unsigned long long.
c	128-bit __vector unsigned __int128.

Returns: The Carryout of the __vector unsigned Multiply-Sum.

◆ vec_msumudm()

static vui128_t vec_msumudm	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Unsigned Doubleword Modulo.

compute the even and odd 128-bit products of doubleword 64-bit element values from a, b. Then compute the 128-bit sum (a_even * b_even) + (a_odd * b_odd) + c. Only the low order 128 bits of the Multiply-Sum are returned and any overflow/carry-out is ignored/lost.

processor	Latency	Throughput
power8	30-32	1/cycle
power9	5-7	2/cycle

Parameters

a	128-bit __vector unsigned long int.
b	128-bit __vector unsigned long int.
c	128-bit __vector unsigned __int128.

Returns: __vector unsigned Modulo Sum of the 128-bit even / odd products of operands a and b plus the unsigned __int128 operand c.

◆ vec_mul10cuq()

static vui128_t vec_mul10cuq ( vui128_t a )

inlinestatic

Vector Multiply by 10 & write Carry Unsigned Quadword.

compute the product of a 128 bit value a * 10. Only the high order 128 bits of the product are returned. This will be binary coded decimal value 0-9 in bits 124-127, Bits 0-123 will be '0'.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	3	1/cycle

Parameters

a	128-bit vector treated as a unsigned __int128.

Returns: __int128 (upper 128-bits of the 256-bit product) a * 10 >> 128.

◆ vec_mul10ecuq()

static vui128_t vec_mul10ecuq	(	vui128_t	a,
		vui128_t	cin
	)

inlinestatic

Vector Multiply by 10 Extended & write Carry Unsigned Quadword.

Compute the product of a 128 bit value a * 10 + digit(cin). Only the high order 128 bits of the extended product are returned.

processor	Latency	Throughput
power8	15-17	1/cycle
power9	3	1/cycle

Parameters

a	128-bit vector treated as unsigned __int128.
cin	values 0-9 in bits 124:127 of a vector.

Returns: __int128 (upper 128-bits of the 256-bit product) (a * 10 + cin) >> 128.

◆ vec_mul10euq()

static vui128_t vec_mul10euq	(	vui128_t	a,
		vui128_t	cin
	)

inlinestatic

Vector Multiply by 10 Extended Unsigned Quadword.

compute the product of a 128 bit value a * 10 + digit(cin). Only the low order 128 bits of the extended product are returned.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	3	1/cycle

Parameters

a	128-bit vector treated as unsigned __int128.
cin	values 0-9 in bits 124:127 of a vector.

Returns: __int128 (lower 128-bits) a * 10 + cin.

◆ vec_mul10uq()

static vui128_t vec_mul10uq ( vui128_t a )

inlinestatic

Vector Multiply by 10 Unsigned Quadword.

compute the product of a 128 bit value a * 10. Only the low order 128 bits of the product are returned.

processor	Latency	Throughput
power8	13-15	1/cycle
power9	3	1/cycle

Parameters

a	128-bit vector treated as unsigned __int128.

Returns: __int128 (lower 128-bits) a * 10.

◆ vec_muleud()

static vui128_t vec_muleud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Even Unsigned Doublewords.

Multiply the even 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the even doublewords.

Note: The element numbering changes between big and little-endian. So the compiler and this implementation adjusts the generated code to reflect this.

processor	Latency	Throughput
power8	21-23	1/cycle
power9	8-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.

Returns: vector unsigned __int128 product of the even double words of a and b.

◆ vec_mulhud()

static vui64_t vec_mulhud	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Multiply High Unsigned Doubleword.

Multiply the corresponding doubleword elements of two vector unsigned long values and return the high order 64-bits, from each 128-bit product.

processor	Latency	Throughput
power8	28-32	1/cycle
power9	11-16	1/cycle

Note: This operation can be used to effectively perform a divide by multiplying by the scaled multiplicative inverse (reciprocal).

Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

Parameters

vra	128-bit vector unsigned long int.
vrb	128-bit vector unsigned long int.

Returns: vector unsigned long int of the high order 64-bits of the unsigned 128-bit product of the doubleword elements from vra and vrb.

◆ vec_mulhuq()

static vui128_t vec_mulhuq	(	vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Multiply High Unsigned Quadword.

compute the 256 bit product of two 128 bit values a, b. The high order 128 bits of the product are returned.

processor	Latency	Throughput
power8	56-64	1/cycle
power9	33-39	1/cycle

Parameters

a	128-bit vector treated as unsigned __int128.
b	128-bit vector treated as unsigned __int128.

Returns: vector unsigned __int128 (upper 128-bits) of a * b.

◆ vec_mulluq()

static vui128_t vec_mulluq	(	vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Multiply Low Unsigned Quadword.

compute the 256 bit product of two 128 bit values a, b. Only the low order 128 bits of the product are returned.

processor	Latency	Throughput
power8	42-48	1/cycle
power9	16-20	2/cycle

Parameters

a	128-bit vector treated as unsigned __int128.
b	128-bit vector treated as unsigned __int128.

Returns: vector unsigned __int128 (lower 128-bits) a * b.

◆ vec_muloud()

static vui128_t vec_muloud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Multiply the odd 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the odd doublewords.

Note: The element numbering changes between big and little-endian. So the compiler and this implementation adjusts the generated code to reflect this.

processor	Latency	Throughput
power8	21-23	1/cycle
power9	8-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.

Returns: vector unsigned __int128 product of the odd double words of a and b.

◆ vec_muludm()

static vui64_t vec_muludm	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Multiply Unsigned Doubleword Modulo.

Multiply the corresponding doubleword elements of two vector unsigned long values and return the low order 64-bits of the 128-bit product for each element.

Note: vec_muludm can be used for unsigned or signed integers. It is the vector equivalent of Multiply Low Doubleword.

processor	Latency	Throughput
power8	19-28	1/cycle
power9	11-16	1/cycle

Parameters

vra	128-bit vector unsigned long long.
vrb	128-bit vector unsigned long long.

Returns: vector unsigned long long of the low order 64-bits of the unsigned 128-bit product of the doubleword elements from vra and vrb.

◆ vec_muludq()

static vui128_t vec_muludq	(	vui128_t *	mulu,
		vui128_t	a,
		vui128_t	b
	)

inlinestatic

Vector Multiply Unsigned Double Quadword.

compute the 256 bit product of two 128 bit values a, b. The low order 128 bits of the product are returned, while the high order 128-bits are "stored" via the mulu pointer.

processor	Latency	Throughput
power8	52-56	1/cycle
power9	24-30	1/cycle

Parameters

*mulu	pointer to vector unsigned __int128 to receive the upper 128-bits of the product.
a	128-bit vector treated as unsigned __int128.
b	128-bit vector treated as unsigned __int128.

Returns: vector unsigned __int128 (lower 128-bits) of a * b.

◆ vec_negsq()

static vi128_t vec_negsq ( vi128_t int128 )

inlinestatic

Vector Negate Signed Quadword.

Negate (0 - int128) the quadword.

processor	Latency	Throughput
power8	6-8	2/cycle
power9	9-12	2/cycle

Parameters

int128 a 128-bit vector treated as signed __int128.

Returns: The negative of int128.

◆ vec_neguq()

static vui128_t vec_neguq ( vui128_t int128 )

inlinestatic

Vector Negate Unsigned Quadword.

Negate (0 - int128) the quadword.

processor	Latency	Throughput
power8	6-8	2/cycle
power9	9-12	2/cycle

Parameters

int128 a 128-bit vector treated as unsigned __int128.

Returns: The negative of int128.

◆ vec_popcntq()

static vui128_t vec_popcntq ( vui128_t vra )

inlinestatic

Vector Population Count Quadword for unsigned __int128 elements.

Count the number of '1' bits within a vector unsigned __int128 and return the count (0-128) in a vector unsigned __int128.

processor	Latency	Throughput
power8	9-11	2/cycle
power9	9-12	2/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.

Returns: a 128-bit vector with bits 121:127 containing the population count.

◆ vec_revbq()

static vui128_t vec_revbq ( vui128_t vra )

inlinestatic

Vector Byte Reverse Quadword.

Return the bytes / octets of a 128-bit vector in reverse order.

processor	Latency	Throughput
power8	2-13	2 cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.

Returns: a 128-bit vector with the bytes in reverse order.

◆ vec_rlq()

static vui128_t vec_rlq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Rotate Left Quadword.

Vector Rotate Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.

processor	Latency	Throughput
power8	10	1 cycle
power9	14	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
vrb	Shift amount in bits 121:127.

Returns: Left rotated vector.

◆ vec_rlqi()

static vui128_t vec_rlqi	(	vui128_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Rotate Left Quadword Immediate.

Vector Rotate Left Quadword 0-127 bits. The rotate amount is the const unsigned int shb, in the range 0-127.

processor	Latency	Throughput
power8	10	1 cycle
power9	14	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
shb	Shift amount in the range 0-127.

Returns: Left rotated vector.

◆ vec_selsq()

static vi128_t vec_selsq	(	vi128_t	vra,
		vi128_t	vrb,
		vb128_t	vrc
	)

inlinestatic

Vector Select Signed Quadword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector treated as signed __int128.
vrb	a 128-bit vector treated as signed __int128.
vrc	a 128-bit vector treated as bool __int128.

Returns: The selected bits from vra and vrb.

◆ vec_seluq()

static vui128_t vec_seluq	(	vui128_t	vra,
		vui128_t	vrb,
		vb128_t	vrc
	)

inlinestatic

Vector Select Unsigned Quadword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
vrb	a 128-bit vector treated as unsigned __int128.
vrc	a 128-bit vector treated as bool __int128.

Returns: The selected bits from vra and vrb.

◆ vec_setb_cyq()

static vb128_t vec_setb_cyq ( vui128_t vcy )

inlinestatic

Vector Set Bool from Quadword Carry.

If the vector quadword carry bit (vcy.bit[127]) is '1' then return a vector bool __int128 that is all '1's. Otherwise return all '0's.

processor	Latency	Throughput
power8	4 - 6	2/cycle
power9	3 - 5	2/cycle

Vector quadword carries are normally the result of a write-Carry operation. For example; vec_addcuq(), vec_addecuq(), vec_subcuq(), vec_subecuq(), vec_addcq(), vec_addeq().

Parameters

vcy	a 128-bit vector generated from a write-Carry operation.

Returns: a 128-bit vector bool of all '1's if the carry bit is '1'. Otherwise all '0's.

◆ vec_setb_ncq()

static vb128_t vec_setb_ncq ( vui128_t vcy )

inlinestatic

Vector Set Bool from Quadword not Carry.

If the vector quadword carry bit (vcy.bit[127]) is '1' then return a vector bool __int128 that is all '0's. Otherwise return all '1's.

processor	Latency	Throughput
power8	4 - 6	2/cycle
power9	3 - 5	2/cycle

Vector quadword carries are normally the result of a write-Carry operation. For example; vec_addcuq(), vec_addecuq(), vec_subcuq(), vec_subecuq(), vec_addcq(), vec_addeq().

Parameters

vcy	a 128-bit vector generated from a write-Carry operation.

Returns: a 128-bit vector bool of all '1's if the carry bit is '0'. Otherwise all '0's.

◆ vec_setb_sq()

static vb128_t vec_setb_sq ( vi128_t vra )

inlinestatic

Vector Set Bool from Signed Quadword.

If the quadword's sign bit is '1' then return a vector bool __int128 that is all '1's. Otherwise return all '0's.

processor	Latency	Throughput
power8	4 - 6	2/cycle
power9	5 - 8	2/cycle

Parameters

vra	a 128-bit vector treated as signed __int128.

Returns: a 128-bit vector bool of all '1's if the sign bit is '1'. Otherwise all '0's.

◆ vec_sldq()

static vui128_t vec_sldq	(	vui128_t	vrw,
		vui128_t	vrx,
		vui128_t	vrb
	)

inlinestatic

Vector Shift Left Double Quadword.

Vector Shift Left double Quadword 0-127 bits. Return a vector __int128 that is the left most 128-bits after shifting left 0-127-bits of the 256-bit double vector (vrw||vrx). The shift amount is from bits 121:127 of vrb.

processor	Latency	Throughput
power8	10	1 cycle
power9	14	1/cycle

Parameters

vrw	upper 128-bits of the 256-bit double vector.
vrx	lower 128-bits of the 256-bit double vector.
vrb	Shift amount in bits 121:127.

Returns: high 128-bits of left shifted double vector.

◆ vec_sldqi()

static vui128_t vec_sldqi	(	vui128_t	vrw,
		vui128_t	vrx,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Left Double Quadword Immediate.

Vector Shift Left double Quadword 0-127 bits. Return a vector __int128 that is the left most 128-bits after shifting left 0-127-bits of the 256-bit double vector (vrw||vrx). The shift amount is the const unsigned int shb, in the range 0-127.

processor	Latency	Throughput
power8	10	1 cycle
power9	14	1/cycle

Parameters

vrw	upper 128-bits of the 256-bit double vector.
vrx	lower 128-bits of the 256-bit double vector.
shb	Shift amount in the range 0-127.

Returns: high 128-bits of left shifted double vector.

◆ vec_slq()

static vui128_t vec_slq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Shift Left Quadword.

Vector Shift Left Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.

processor	Latency	Throughput
power8	4	1/cycle
power9	6	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
vrb	Shift amount in bits 121:127.

Returns: Left shifted vector.

◆ vec_slq4()

static vui128_t vec_slq4 ( vui128_t vra )

inlinestatic

Deprecated: Vector Shift Left 4-bits Quadword. Replaced by vec_slqi with shb param = 4.

Vector Shift Left Quadword by a fixed 4 bits.

Parameters

vra	a 128-bit vector treated as a __int128.

Returns: Left shifted vector.

◆ vec_slq5()

static vui128_t vec_slq5 ( vui128_t vra )

inlinestatic

Deprecated: Vector Shift Left 5-bits Quadword. Replaced by vec_slqi with shb param = 5.

Vector Shift Left Quadword by a fixed 5 bits.

Parameters

vra	a 128-bit vector treated as a __int128.

Returns: Left shifted vector.

◆ vec_slqi()

static vui128_t vec_slqi	(	vui128_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Left Quadword Immediate.

Shift left Quadword 0-127 bits. The shift amount is a const unsigned int in the range 0-127. A shift count of 0 returns the original value of vra. Shift counts greater than 127 bits return zero.

processor	Latency	Throughput
power8	2-13	2 cycle
power9	3-15	2/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
shb	Shift amount in the range 0-127.

Returns: 128-bit vector shifted left shb bits.

◆ vec_splat_s128()

static vi128_t vec_splat_s128 ( const int sim )

inlinestatic

Vector Splat Immediate Signed Quadword. Extend a signed integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Signed (Byte | Halfword |Word).

Note: POWER9/10 will generate the instruction xxspltib for byte values -128 to 127. But the ISA does not have vextsb2q instructions (so far). So we need to sign extend the byte value using a const quadword (0/-1 depending on the sign) and vsldoi. POWER8 (and earlier) does not have xxspltib but does have vspltisw. For a smaller range (-16 -> 15) POWER8 can use the sequence vec_splat_s8(sim)/vec_splat_s8(0/-1)/vec_sld. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section.

processor	Latency	Throughput
power8	4 - 9	1/cycle
power9	5 - 9	2/cycle

Parameters

sim	a small signed integer const.

Returns: Vector with sim value extended to quadword.

◆ vec_splat_u128()

static vui128_t vec_splat_u128 ( const int sim )

inlinestatic

Vector Splat Immediate Unsigned Quadword. Extend a unsigned integer constant across the quadword element of the result. This is the quadword equivalent of Vector Splat Immediate Unsigned (Byte | Halfword |Word).

Note: POWER9/10 will generate the instruction xxspltib for byte values 0 to 255. Then we need to zero extend the byte value using a const quadword 0 and vsldoi. POWER8 (and earlier) does not have xxspltib but does have vspltisw. For a smaller range (0 -> 15) POWER8 can use the sequence vec_splat_s8(sim)/vec_splat_s8(0)/vec_sld. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section.

processor	Latency	Throughput
power8	4 - 9	1/cycle
power9	5 - 9	2/cycle

Parameters

sim	a small unsigned integer const.

Returns: Vector with sim value extended to quadword.

◆ vec_sraq()

static vi128_t vec_sraq	(	vi128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Shift Right Algebraic Quadword.

Vector Shift Right Algebraic Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.

processor	Latency	Throughput
power8	10	1 cycle
power9	14	1/cycle

Parameters

vra	a 128-bit vector treated as signed __int128.
vrb	Shift amount in bits 121:127.

Returns: Right algebraic shifted vector.

◆ vec_sraqi()

static vi128_t vec_sraqi	(	vi128_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Algebraic Quadword Immediate.

Vector Shift Right Algebraic Quadword 0-127 bits. The shift amount is the const unsigned int shb, in the range 0-127.

processor	Latency	Throughput
power8	6-15	1 cycle
power9	9-18	1/cycle

Note: vec_sraqi optimizes for some special cases. For shift by octet (multiple of 8 bits) use vec_setb_sq () to extend sign then vector shift left double by octet immediate by (16 - (shb / 8)) to effect the right octet shift. For _ARCH_PWR8 and shifts less than 64 bits, use both vec_srqi () and vector shift right algebraic doubleword. Then use vec_pasted () to combine the high 64-bits from vec_sradi () and the low 64-bits from vec_srqi ().

Parameters

vra	a 128-bit vector treated as signed __int128.
shb	Shift amount in the range 0-127.

Returns: Right algebraic shifted vector.

◆ vec_srq()

static vui128_t vec_srq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Shift Right Quadword.

Vector Shift Right Quadword 0-127 bits. The shift amount is from bits 121-127 of vrb.

processor	Latency	Throughput
power8	4	1/cycle
power9	6	1/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
vrb	Shift amount in bits 121:127.

Returns: Right shifted vector.

◆ vec_srq4()

static vui128_t vec_srq4 ( vui128_t vra )

inlinestatic

Deprecated: Vector Shift right 4-bits Quadword. Replaced by vec_srqi with shb param = 4.

Vector Shift Right Quadword by a fixed 4 bits. The shift amount is implied by the function name; there is no shift-count operand.

Parameters

vra	a 128-bit vector treated as a __int128.

Returns: Right shifted vector.

◆ vec_srq5()

static vui128_t vec_srq5 ( vui128_t vra )

inlinestatic

Deprecated: Vector Shift right 5-bits Quadword. Replaced by vec_srqi with shb param = 5.

Vector Shift Right Quadword by a fixed 5 bits. The shift amount is implied by the function name; there is no shift-count operand.

Parameters

vra	a 128-bit vector treated as a __int128.

Returns: Right shifted vector.

◆ vec_srqi()

static vui128_t vec_srqi	(	vui128_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Quadword Immediate.

Shift right Quadword 0-127 bits. The shift amount is a const unsigned int in the range 0-127. A shift count of 0 returns the original value of vra. Shift counts greater than 127 bits return zero.

processor	Latency	Throughput
power8	2-13	2 cycle
power9	3-15	2/cycle

Parameters

vra	a 128-bit vector treated as unsigned __int128.
shb	Shift amount in the range 0-127.

Returns: 128-bit vector shifted right shb bits.

◆ vec_subcuq()

static vui128_t vec_subcuq	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Subtract and Write Carry Unsigned Quadword.

Generate the carry-out of the sum (vra + NOT(vrb) + 1).

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

vra	128-bit vector treated as unsigned __int128.
vrb	128-bit vector treated as unsigned __int128.

Returns: __int128 carry from the unsigned difference vra - vrb.

◆ vec_subecuq()

static vui128_t vec_subecuq	(	vui128_t	vra,
		vui128_t	vrb,
		vui128_t	vrc
	)

inlinestatic

Vector Subtract Extended and Write Carry Unsigned Quadword.

Generate the carry-out of the sum (vra + NOT(vrb) + vrc.bit[127]).

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

vra	128-bit vector treated as unsigned __int128.
vrb	128-bit vector treated as unsigned __int128.
vrc	128-bit vector carry-in from bit 127.

Returns: __int128 carry from the extended __int128 difference.

◆ vec_subeuqm()

static vui128_t vec_subeuqm	(	vui128_t	vra,
		vui128_t	vrb,
		vui128_t	vrc
	)

inlinestatic

Vector Subtract Extended Unsigned Quadword Modulo.

Subtract two vector __int128 values and return result modulo 128-bits.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

vra	128-bit vector treated as unsigned __int128.
vrb	128-bit vector treated as unsigned __int128.
vrc	128-bit vector carry-in from bit 127.

Returns: __int128 unsigned difference of vra minus vrb.

◆ vec_subuqm()

static vui128_t vec_subuqm	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Subtract Unsigned Quadword Modulo.

Subtract two vector __int128 values and return result modulo 128-bits.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

Parameters

vra	128-bit vector treated as unsigned __int128.
vrb	128-bit vector treated as unsigned __int128.

Returns: __int128 unsigned difference of vra minus vrb.

◆ vec_vmadd2eud()

static vui128_t vec_vmadd2eud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c,
		vui64_t	d
	)

inlinestatic

Vector Multiply-Add2 Even Unsigned Doublewords.

Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the even doublewords of c and d ((a_even * b_even) + c_even + d_even).

Note: The advantage of this form (versus Multiply-Sum) is that the final 128 bit sum cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	13-18	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned long int.
d	128-bit vector unsigned long int.

Returns: vector unsigned __int128 sum (a_even * b_even) + c_even + d_even.

◆ vec_vmadd2oud()

static vui128_t vec_vmadd2oud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c,
		vui64_t	d
	)

inlinestatic

Vector Multiply-Add2 Odd Unsigned Doublewords.

Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the odd doublewords of c and d ((a_odd * b_odd) + c_odd + d_odd).

Note: The advantage of this form (versus Multiply-Sum) is that the final 128 bit sum cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	13-18	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned long int.
d	128-bit vector unsigned long int.

Returns: vector unsigned __int128 sum (a_odd * b_odd) + c_odd + d_odd.

◆ vec_vmaddeud()

static vui128_t vec_vmaddeud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c
	)

inlinestatic

Vector Multiply-Add Even Unsigned Doublewords.

Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the even doubleword of c ((a_even * b_even) + c_even).

Note: The advantage of this form (versus Multiply-Sum) is that the final 128 bit sum cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	10-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned long int.

Returns: vector unsigned __int128 sum (a_even * b_even) + c_even.

◆ vec_vmaddoud()

static vui128_t vec_vmaddoud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c
	)

inlinestatic

Vector Multiply-Add Odd Unsigned Doublewords.

Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and the odd doubleword of c (a_odd * b_odd) + c_odd.

Note: The advantage of this form (versus Multiply-Sum) is that the final 128 bit sum cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	10-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned long int.

Returns: vector unsigned __int128 sum (a_odd * b_odd) + c_odd.

◆ vec_vmsumeud()

static vui128_t vec_vmsumeud	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Even Unsigned Doublewords.

Multiply the even 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and c ((a_even * b_even) + c).

Note: This form (Multiply-Sum) can overflow the final 128 bit sum, unless the addend (c) is restricted to (INT64_MAX * 2) or less. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	10-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned __int128.

Returns: vector unsigned __int128 sum (a_even * b_even) + c.

◆ vec_vmsumoud()

static vui128_t vec_vmsumoud	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Odd Unsigned Doublewords.

Multiply the odd 64-bit doublewords of two vector unsigned long values (a * b) and return the sum of the unsigned __int128 product and c ((a_odd * b_odd) + c).

Note: This form (Multiply-Sum) can overflow the final 128 bit sum, unless the addend (c) is restricted to (INT64_MAX * 2) or less. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	25-28	1/cycle
power9	10-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.
c	128-bit vector unsigned __int128.

Returns: vector unsigned __int128 sum (a_odd * b_odd) + c.

◆ vec_vmuleud()

static vui128_t vec_vmuleud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Even Unsigned Doublewords.

Multiply the even 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the even doublewords.

Note: This function implements the operation of a Vector Multiply Even Doubleword instruction, as if the PowerISA included such an instruction. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	21-23	1/cycle
power9	8-11	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.

Returns: vector unsigned __int128 product of the even double words of a and b.

◆ vec_vmuloud()

static vui128_t vec_vmuloud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Multiply the odd 64-bit doublewords of two vector unsigned long values and return the unsigned __int128 product of the odd doublewords.

Note: This function implements the operation of a Vector Multiply Odd Doubleword instruction, as if the PowerISA included such an instruction. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	21-23	1/cycle
power9	8-13	2/cycle

Parameters

a	128-bit vector unsigned long int.
b	128-bit vector unsigned long int.

Returns: vector unsigned __int128 product of the odd double words of a and b.

◆ vec_vsldbi()

static vui128_t vec_vsldbi	(	vui128_t	vra,
		vui128_t	vrb,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Left Double Quadword by Bit Immediate.

Return a vector __int128 that is bits shb:shb+127 from the (256-bit) double quadword (vra || vrb). The shift amount is a constant immediate value in the range 0-7.

processor	Latency	Throughput
power8	8	1 cycle
power9	11	1/cycle

Parameters

vra	upper 128-bits of the 256-bit double quadword vector.
vrb	lower 128-bits of the 256-bit double quadword vector.
shb	Shift amount in the range 0-7.

Returns: 128-bits from bits shb:shb+127.

◆ vec_vsrdbi()

static vui128_t vec_vsrdbi	(	vui128_t	vra,
		vui128_t	vrb,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Double Quadword by Bit Immediate.

Return a vector __int128 that is bits 128-shb:255-shb from the (256-bit) double quadword (vra || vrb). The shift amount is a constant immediate value in the range 0-7.

processor	Latency	Throughput
power8	8	1 cycle
power9	11	1/cycle

Parameters

vra	upper 128-bits of the 256-bit double quadword vector.
vrb	lower 128-bits of the 256-bit double quadword vector.
shb	Shift amount in the range 0-7.

Returns: 128-bits from bits 128-shb:255-shb.

Macros

Functions

Detailed Description

Endian problems with quadword implementations

Quadword Integer Constants

Support for Quadword Integer Constants

Loading small Quadword constants

Alternatives to loading from .rodata

Some special quadword constants

Defining our own vec_splat_s128

Some facts about fixed precision integers

Some useful arithmetic facts (you may have forgotten)

Why does this matter?

Vector Multiply-Add

And Vector Multiply-Add2

Why not Vector Multiply-Sum

Vector Multiply-Add Quadword

Vector Quadword Examples

Printing Vector __int128 values

Converting Vector __int128 values to BCD

Extending integer operations beyond Quadword

Extended Quadword multiply

Quadword Long Division

Performance data.

Macro Definition Documentation

◆ CONST_VUINT128_Qx16d

◆ CONST_VUINT128_Qx18d

◆ CONST_VUINT128_Qx19d

◆ CONST_VUINT128_QxD

◆ CONST_VUINT128_QxW

Function Documentation

◆ vec_absduq()

◆ vec_abssq()

◆ vec_addcq()

◆ vec_addcuq()

◆ vec_addecuq()

◆ vec_addeq()

◆ vec_addeuqm()

◆ vec_adduqm()

◆ vec_avguq()

◆ vec_clzq()

◆ vec_cmpeqsq()

◆ vec_cmpequq()

◆ vec_cmpgesq()

◆ vec_cmpgeuq()

◆ vec_cmpgtsq()

◆ vec_cmpgtuq()

◆ vec_cmplesq()

◆ vec_cmpleuq()

◆ vec_cmpltsq()

◆ vec_cmpltuq()

◆ vec_cmpnesq()

◆ vec_cmpneuq()

◆ vec_cmpsq_all_eq()

◆ vec_cmpsq_all_ge()

◆ vec_cmpsq_all_gt()

◆ vec_cmpsq_all_le()

◆ vec_cmpsq_all_lt()

◆ vec_cmpsq_all_ne()

◆ vec_cmpuq_all_eq()

◆ vec_cmpuq_all_ge()

◆ vec_cmpuq_all_gt()

◆ vec_cmpuq_all_le()

◆ vec_cmpuq_all_lt()

◆ vec_cmpuq_all_ne()

◆ vec_cmul100cuq()

◆ vec_cmul100ecuq()

◆ vec_cmul10cuq()

◆ vec_cmul10ecuq()

◆ vec_ctzq()

◆ vec_divsq_10e31()

◆ vec_divudq_10e31()

◆ vec_divudq_10e32()

◆ vec_divuq_10e31()

◆ vec_divuq_10e32()

◆ vec_madd2uq()

◆ vec_madduq()

◆ vec_maxsq()

◆ vec_maxuq()

◆ vec_minsq()