POWER Vector Library Manual  1.0.4
Functions
vec_int64_ppc.h File Reference

Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements. More...

#include <pveclib/vec_int32_ppc.h>

Go to the source code of this file.

Functions

static vui64_t vec_absdud (vui64_t vra, vui64_t vrb)
 Vector Absolute Difference Unsigned Doubleword. More...
 
static vui64_t vec_addudm (vui64_t a, vui64_t b)
 Vector Add Unsigned Doubleword Modulo. More...
 
static vui64_t vec_clzd (vui64_t vra)
 Vector Count Leading Zeros Doubleword for unsigned long long elements. More...
 
static vui64_t vec_ctzd (vui64_t vra)
 Vector Count Trailing Zeros Doubleword for unsigned long long elements. More...
 
static vb64_t vec_cmpeqsd (vi64_t a, vi64_t b)
 Vector Compare Equal Signed Doubleword. More...
 
static vb64_t vec_cmpequd (vui64_t a, vui64_t b)
 Vector Compare Equal Unsigned Doubleword. More...
 
static vb64_t vec_cmpgesd (vi64_t a, vi64_t b)
 Vector Compare Greater Than or Equal Signed Doubleword. More...
 
static vb64_t vec_cmpgeud (vui64_t a, vui64_t b)
 Vector Compare Greater Than or Equal Unsigned Doubleword. More...
 
static vb64_t vec_cmpgtsd (vi64_t a, vi64_t b)
 Vector Compare Greater Than Signed Doubleword. More...
 
static vb64_t vec_cmpgtud (vui64_t a, vui64_t b)
 Vector Compare Greater Than Unsigned Doubleword. More...
 
static vb64_t vec_cmplesd (vi64_t a, vi64_t b)
 Vector Compare Less Than Equal Signed Doubleword. More...
 
static vb64_t vec_cmpleud (vui64_t a, vui64_t b)
 Vector Compare Less Than Equal Unsigned Doubleword. More...
 
static vb64_t vec_cmpltsd (vi64_t a, vi64_t b)
 Vector Compare less Than Signed Doubleword. More...
 
static vb64_t vec_cmpltud (vui64_t a, vui64_t b)
 Vector Compare less Than Unsigned Doubleword. More...
 
static vb64_t vec_cmpnesd (vi64_t a, vi64_t b)
 Vector Compare Not Equal Signed Doubleword. More...
 
static vb64_t vec_cmpneud (vui64_t a, vui64_t b)
 Vector Compare Not Equal Unsigned Doubleword. More...
 
static int vec_cmpsd_all_eq (vi64_t a, vi64_t b)
 Vector Compare all Equal Signed Doubleword. More...
 
static int vec_cmpsd_all_ge (vi64_t a, vi64_t b)
 Vector Compare all Greater Than or Equal Signed Doubleword. More...
 
static int vec_cmpsd_all_gt (vi64_t a, vi64_t b)
 Vector Compare all Greater Than Signed Doubleword. More...
 
static int vec_cmpsd_all_le (vi64_t a, vi64_t b)
 Vector Compare all Less than equal Signed Doubleword. More...
 
static int vec_cmpsd_all_lt (vi64_t a, vi64_t b)
 Vector Compare all Less than Signed Doubleword. More...
 
static int vec_cmpsd_all_ne (vi64_t a, vi64_t b)
 Vector Compare all Not Equal Signed Doubleword. More...
 
static int vec_cmpsd_any_eq (vi64_t a, vi64_t b)
 Vector Compare any Equal Signed Doubleword. More...
 
static int vec_cmpsd_any_ge (vi64_t a, vi64_t b)
 Vector Compare any Greater Than or Equal Signed Doubleword. More...
 
static int vec_cmpsd_any_gt (vi64_t a, vi64_t b)
 Vector Compare any Greater Than Signed Doubleword. More...
 
static int vec_cmpsd_any_le (vi64_t a, vi64_t b)
 Vector Compare any Less than equal Signed Doubleword. More...
 
static int vec_cmpsd_any_lt (vi64_t a, vi64_t b)
 Vector Compare any Less than Signed Doubleword. More...
 
static int vec_cmpsd_any_ne (vi64_t a, vi64_t b)
 Vector Compare any Not Equal Signed Doubleword. More...
 
static int vec_cmpud_all_eq (vui64_t a, vui64_t b)
 Vector Compare all Equal Unsigned Doubleword. More...
 
static int vec_cmpud_all_ge (vui64_t a, vui64_t b)
 Vector Compare all Greater Than or Equal Unsigned Doubleword. More...
 
static int vec_cmpud_all_gt (vui64_t a, vui64_t b)
 Vector Compare all Greater Than Unsigned Doubleword. More...
 
static int vec_cmpud_all_le (vui64_t a, vui64_t b)
 Vector Compare all Less than equal Unsigned Doubleword. More...
 
static int vec_cmpud_all_lt (vui64_t a, vui64_t b)
 Vector Compare all Less than Unsigned Doubleword. More...
 
static int vec_cmpud_all_ne (vui64_t a, vui64_t b)
 Vector Compare all Not Equal Unsigned Doubleword. More...
 
static int vec_cmpud_any_eq (vui64_t a, vui64_t b)
 Vector Compare any Equal Unsigned Doubleword. More...
 
static int vec_cmpud_any_ge (vui64_t a, vui64_t b)
 Vector Compare any Greater Than or Equal Unsigned Doubleword. More...
 
static int vec_cmpud_any_gt (vui64_t a, vui64_t b)
 Vector Compare any Greater Than Unsigned Doubleword. More...
 
static int vec_cmpud_any_le (vui64_t a, vui64_t b)
 Vector Compare any Less than equal Unsigned Doubleword. More...
 
static int vec_cmpud_any_lt (vui64_t a, vui64_t b)
 Vector Compare any Less than Unsigned Doubleword. More...
 
static int vec_cmpud_any_ne (vui64_t a, vui64_t b)
 Vector Compare any Not Equal Unsigned Doubleword. More...
 
static vi64_t vec_maxsd (vi64_t vra, vi64_t vrb)
 Vector Maximum Signed Doubleword. More...
 
static vui64_t vec_maxud (vui64_t vra, vui64_t vrb)
 Vector Maximum Unsigned Doubleword. More...
 
static vi64_t vec_minsd (vi64_t vra, vi64_t vrb)
 Vector Minimum Signed Doubleword. More...
 
static vui64_t vec_minud (vui64_t vra, vui64_t vrb)
 Vector Minimum Unsigned Doubleword. More...
 
static vui64_t vec_mrgahd (vui128_t vra, vui128_t vrb)
 Vector Merge Algebraic High Doublewords. More...
 
static vui64_t vec_mrgald (vui128_t vra, vui128_t vrb)
 Vector Merge Algebraic Low Doublewords. More...
 
static vui64_t vec_mrged (vui64_t __VA, vui64_t __VB)
 Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...
 
static vui64_t vec_mrghd (vui64_t __VA, vui64_t __VB)
 Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...
 
static vui64_t vec_mrgld (vui64_t __VA, vui64_t __VB)
 Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...
 
static vui64_t vec_mrgod (vui64_t __VA, vui64_t __VB)
 Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...
 
static vui128_t vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
 Vector Multiply-Sum Unsigned Doubleword Modulo. More...
 
static vui128_t vec_muleud (vui64_t a, vui64_t b)
 Vector Multiply Even Unsigned Doublewords. More...
 
static vui64_t vec_mulhud (vui64_t vra, vui64_t vrb)
 Vector Multiply High Unsigned Doubleword. More...
 
static vui128_t vec_muloud (vui64_t a, vui64_t b)
 Vector Multiply Odd Unsigned Doublewords. More...
 
static vui64_t vec_muludm (vui64_t vra, vui64_t vrb)
 Vector Multiply Unsigned Doubleword Modulo. More...
 
static vui64_t vec_pasted (vui64_t __VH, vui64_t __VL)
 Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector. More...
 
static vui64_t vec_permdi (vui64_t vra, vui64_t vrb, const int ctl)
 Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector. More...
 
static vui64_t vec_popcntd (vui64_t vra)
 Vector Population Count doubleword. More...
 
static vui64_t vec_revbd (vui64_t vra)
 byte reverse each doubleword for a vector unsigned long int. More...
 
static vui64_t vec_vrld (vui64_t vra, vui64_t vrb)
 Vector Rotate Left Doubleword. More...
 
static vui64_t vec_vsld (vui64_t vra, vui64_t vrb)
 Vector Shift Left Doubleword. More...
 
static vui64_t vec_vsrd (vui64_t vra, vui64_t vrb)
 Vector Shift Right Doubleword. More...
 
static vi64_t vec_vsrad (vi64_t vra, vui64_t vrb)
 Vector Shift Right Algebraic Doubleword. More...
 
static vb64_t vec_setb_sd (vi64_t vra)
 Vector Set Bool from Signed Doubleword. More...
 
static vui64_t vec_rldi (vui64_t vra, const unsigned int shb)
 Vector Rotate left Doubleword Immediate. More...
 
static vui64_t vec_sldi (vui64_t vra, const unsigned int shb)
 Vector Shift left Doubleword Immediate. More...
 
static vi64_t vec_selsd (vi64_t vra, vi64_t vrb, vb64_t vrc)
 Vector Select Signed Doubleword. More...
 
static vui64_t vec_selud (vui64_t vra, vui64_t vrb, vb64_t vrc)
 Vector Select Unsigned Doubleword. More...
 
static vui64_t vec_splatd (vui64_t vra, const int ctl)
 Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian. More...
 
static vi64_t vec_splat_s64 (const int sim)
 Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Signed (Byte | Halfword |Word). More...
 
static vui64_t vec_splat_u64 (const int sim)
 Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Unsigned (Byte | Halfword |Word). More...
 
static vui64_t vec_spltd (vui64_t vra, const int ctl)
 
static vui64_t vec_srdi (vui64_t vra, const unsigned int shb)
 Vector Shift Right Doubleword Immediate. More...
 
static vi64_t vec_sradi (vi64_t vra, const unsigned int shb)
 Vector Shift Right Algebraic Doubleword Immediate. More...
 
static vui64_t vec_subudm (vui64_t a, vui64_t b)
 Vector Subtract Unsigned Doubleword Modulo. More...
 
static vui64_t vec_swapd (vui64_t vra)
 Vector doubleword swap. Exchange the high and low doubleword elements of a vector. More...
 
static vui64_t vec_vgluddo (unsigned long long *array, vi64_t vra)
 Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets. More...
 
static vui64_t vec_vgluddsx (unsigned long long *array, vi64_t vra, const unsigned char scale)
 Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes. More...
 
static vui64_t vec_vgluddx (unsigned long long *array, vi64_t vra)
 Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes. More...
 
static vui64_t vec_vgludso (unsigned long long *array, const long long offset0, const long long offset1)
 Vector Gather-Load Integer Doublewords from Scalar Offsets. More...
 
static vui64_t vec_vlsidx (const signed long long ra, const unsigned long long *rb)
 Vector Load Scalar Integer Doubleword Indexed. More...
 
static vui128_t vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
 Vector Multiply-Add2 Even Unsigned Doublewords. More...
 
static vui128_t vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)
 Vector Multiply-Add Even Unsigned Doublewords. More...
 
static vui128_t vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
 Vector Multiply-Add2 Odd Unsigned Doublewords. More...
 
static vui128_t vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)
 Vector Multiply-Add Odd Unsigned Doublewords. More...
 
static vui128_t vec_vmuleud (vui64_t a, vui64_t b)
 Vector Multiply Even Unsigned Doublewords. More...
 
static vui128_t vec_vmuloud (vui64_t a, vui64_t b)
 Vector Multiply Odd Unsigned Doublewords. More...
 
static vui128_t vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c)
 Vector Multiply-Sum Even Unsigned Doublewords. More...
 
static vui128_t vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c)
 Vector Multiply-Sum Odd Unsigned Doublewords. More...
 
static vui32_t vec_vpkudum (vui64_t vra, vui64_t vrb)
 Vector Pack Unsigned Doubleword Unsigned Modulo. More...
 
static void vec_vsstuddo (vui64_t xs, unsigned long long *array, vi64_t vra)
 Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets. More...
 
static void vec_vsstuddsx (vui64_t xs, unsigned long long *array, vi64_t vra, const unsigned char scale)
 Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes. More...
 
static void vec_vsstuddx (vui64_t xs, unsigned long long *array, vi64_t vra)
 Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes. More...
 
static void vec_vsstudso (vui64_t xs, unsigned long long *array, const long long offset0, const long long offset1)
 Vector Scatter-Store Integer Doublewords to Scalar Offsets. More...
 
static void vec_vstsidx (vui64_t xs, const signed long long ra, unsigned long long *rb)
 Vector Store Scalar Integer Doubleword Indexed. More...
 
static vui64_t vec_xxspltd (vui64_t vra, const int ctl)
 Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. More...
 
static vui64_t vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
 Vector Multiply-Add Even Unsigned Words. More...
 
static vui64_t vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
 Vector Multiply-Add2 Even Unsigned Words. More...
 
static vui64_t vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c)
 Vector Multiply-Add Odd Unsigned Words. More...
 
static vui64_t vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
 Vector Multiply-Add2 Odd Unsigned Words. More...
 
static vui64_t vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
 Vector Multiply-Sum Unsigned Word Modulo. More...
 

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

The original VMX (AKA Altivec) did not define any doubleword element (long long integer or double float) operations. The VSX facility (introduced with POWER7) added vector double float but did not add any integer doubleword (64-bit) operations. However, it did add a useful doubleword permute immediate and word-wise merge, shift, and splat immediate operations. Otherwise, vector long int (64-bit element) operations have to be implemented using VMX word and halfword element integer operations for POWER7.

POWER8 (PowerISA 2.07B) adds important doubleword integer (add, subtract, compare, shift, rotate, ...) VMX operations. POWER8 also added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend).

POWER9 (PowerISA 3.0B) adds the Vector Multiply-Sum Unsigned Doubleword Modulo instruction. This is not the expected multiply even/odd/modulo doubleword nor a full multiply modulo quadword. But with a few extra (permute and splat zero) instructions you can get equivalent functionality.

Note
The doubleword integer multiply implementations are included in vec_int128_ppc.h. This resolves a circular dependency as 64-bit by 64-bit integer multiplies require 128-bit integer addition (vec_adduqm()) to produce the full product.
See also
vec_msumudm, vec_muleud, vec_mulhud, vec_muloud, vec_muludm, vec_vmuleud, and vec_vmuloud

Most of these intrinsic (compiler built-in) operations are defined in <altivec.h> and described in the compiler documentation. However, it took several compiler releases for all the new POWER8 64-bit integer vector intrinsics to be added to altivec.h. This support started with GCC 4.9 but was not complete across functions/types, and bug free, until GCC 6.0.

Note
The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example, if you compile with -mcpu=power7, vec_vclz and vec_vclzd will not be defined. But vec_clzd is always defined in this header, will generate the minimum code, appropriate for the target, and produce correct results.
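
For example, a minimal usage sketch (the wrapper name __test_clzd is hypothetical):

vui64_t
__test_clzd (vui64_t vra)
{
  // Compiles to a single vclzd for -mcpu=power8/power9 targets,
  // or to an equivalent VMX word sequence for -mcpu=power7.
  return vec_clzd (vra);
}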

64-bit integer operations are commonly used in the implementation of optimized double float math library functions and this applies to the vector equivalents of math functions. So missing, incomplete, or buggy support for vector long integer intrinsics can be an impediment to the implementation of optimized and portable vector double math libraries. This header is a prerequisite for vec_f64_ppc.h, which together are intended to support the implementation of vector math libraries.

This header covers operations that are any of the following:

Some missing doubleword operations

The original VMX instruction set extension was limited to byte, halfword, and word size element operations. This limited vector arithmetic operations to char, short, int and float elements. This limitation persisted until PowerISA 2.06 (POWER7) added the Vector Scalar Extensions (VSX) facility. VSX combined/extended the FPRs and VRs into 64 by 128-bit Vector/Scalar Registers (VSRs).

VSX added a large number of scalar double-precision and vector single / double-precision floating-point operations. The double-precision scalar (xs prefix) instructions were largely duplicates of the existing Floating-Point Facility operations, extended to access the whole (64) VSX register set. Similarly the VSX vector single-precision floating-point (xv prefix, sp suffix) instructions were added to give vectorized float code access to 64 VSX registers.

The addition of VSX vector double-precision (xv prefix) instructions was the most significant addition. This added vector doubleword floating-point operations and provided access to all 64 VSX registers. Alas, there are no doubleword (64-bit long) integer operations in the initial VSX. A few logical and permute class (xx prefix) operations on word/doubleword elements were tacked on. These apply equally to float and integer elements. But nothing for 64-bit integer arithmetic.

Note
The full title in PowerISA 2.06 is Vector-Scalar Floating-Point Operations [Category: VSX].

PowerISA 2.07 (POWER8) did add a significant number of doubleword (64-bit) integer operations, including:

Also a number of new word (32-bit) integer operations:

And some new quadword (128-bit) integer operations:

And some specialized operations:

Note
The operations above are all Vector Category and can only access the 32 original vector registers (VSRs 32-63).

The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:

PowerISA 3.0 (POWER9) adds a few more doubleword (64-bit) integer operations, including:

Also a number of new word (32-bit) integer operations:

And some new quadword (128-bit) integer operations:

The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:

An impressive list of operations that can be used for:

The challenge is that useful operations available for POWER9 will need equivalent implementations for POWER8 and POWER7. Similarly, operations introduced for POWER8 will need POWER7 implementations. Also, there are some obvious missing operations:

Challenges and opportunities

The stated goals for pveclib are:

So the first step is to provide implementations for the key POWER8 doubleword integer operations for older compilers. For example, some of the generic doubleword integer operations were not defined until GCC 6.0. Here we define the specific Compare Equal Unsigned Doubleword implementation:

static inline vb64_t
vec_cmpequd (vui64_t a, vui64_t b)
{
  vb64_t result;
#ifdef _ARCH_PWR8
#if __GNUC__ >= 6
  result = vec_cmpeq (a, b);
#else
  __asm__(
      "vcmpequd %0,%1,%2;\n"
      : "=v" (result)
      : "v" (a),
        "v" (b)
      : );
#endif
#else
  // _ARCH_PWR7 implementation ...
#endif
  return (result);
}

The implementation checks if the compile target is POWER8, then checks if the compiler is new enough to use the generic vector compare built-in. If the generic built-in is not defined in <altivec.h>, then we provide the equivalent inline assembler.

For POWER7 targets we don't have any vector compare doubleword operations and we need to define the equivalent operation using PowerISA 2.06B (and earlier) instructions. For example:

#else
  // _ARCH_PWR7 implementation ...
  vui8_t permute =
    { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03,
      0x0C,0x0D,0x0E,0x0F, 0x08,0x09,0x0A,0x0B };
  vui32_t r, rr;
  r = (vui32_t) vec_cmpeq ((vui32_t) a, (vui32_t) b);
  if (vec_any_ne ((vui32_t) a, (vui32_t) b))
    {
      rr = vec_perm (r, r, permute);
      r = vec_and (r, rr);
    }
  result = (vb64_t) r;
#endif

Here we use Compare Equal Unsigned Word. If all words are equal, use the result as is. Otherwise, if any word elements are not equal, we do some extra work. For each doubleword, rotate the word compare result by 32-bits (here we use permute as we don't have rotate doubleword either). Then logical and the original word compare and rotated results to get the final doubleword compare results.

The same strategy applies to all the doubleword compare variants, and to doubleword add, subtract, maximum, minimum, shift, rotate, count leading zeros, population count, and byte reverse.
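
For example, the not-equal variant can be sketched from the equal compare plus a logical inverse (a minimal sketch of the strategy, not the exact library source; it assumes a compiler where the generic vec_nor accepts vector unsigned long long):

static inline vb64_t
test_cmpneud (vui64_t a, vui64_t b)
{
  vb64_t r = vec_cmpequd (a, b);
  // Logical inverse: NOR the equal compare with itself.
  return (vb64_t) vec_nor ((vui64_t) r, (vui64_t) r);
}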

More Challenges

Now we can look at the case where vector doubleword operations of interest don't have an equivalent instruction. Here interesting operations include those that are supported for other element sizes and types.

The simplest example is absolute difference which was introduced in PowerISA 3.0 for byte, halfword and word elements. From the implementation of vec_absduw() we see how to implement the operation for POWER8 using subtract, maximum, and minimum. For example:

static inline vui64_t
vec_absdud (vui64_t vra, vui64_t vrb)
{
  return vec_subudm (vec_maxud (vra, vrb), vec_minud (vra, vrb));
}

This works because pveclib provides implementations for min, max, and sub operations that work across GCC versions and provide processor specific implementations for POWER8/9 and POWER7.

Now we need to look at the multiply doubleword situation. We need implementations for vec_msumudm(), vec_muleud(), vec_mulhud(), vec_muloud(), and vec_muludm(). We saw in the implementations of vec_int32_ppc.h that multiply high and low/modulo can be implemented using multiply and merge even/odd of that element size. Multiply low can also be implemented using the multiply sum and multiply odd of the next smaller element size. Also multiply-sum can be implemented using multiply even/odd and a couple of adds. And multiply even/odd can be implemented using multiply sum by supplying zeros to appropriate inputs/elements.

The above discussion has many circular dependencies. Eventually we need to get down to an implementation on each processor using actual hardware instructions. So what multiply doubleword operations does the PowerISA actually have from the list above:

It seems the best implementation strategy uses:

We really care about performance and latency for POWER9/8. We need POWER7 to work correctly so we can test on and support legacy hardware. The rest is grade school math.
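
That grade school math, in scalar form, is just 32-bit partial products assembled into a 128-bit result. An illustrative sketch (assuming a compiler with unsigned __int128 support; the function name is hypothetical):

// a = ah*2**32 + al,  b = bh*2**32 + bl
// a*b = ah*bh*2**64 + (ah*bl + al*bh)*2**32 + al*bl
unsigned __int128
test_mul64x64 (unsigned long long a, unsigned long long b)
{
  unsigned long long ah = a >> 32, al = a & 0xFFFFFFFFULL;
  unsigned long long bh = b >> 32, bl = b & 0xFFFFFFFFULL;
  unsigned __int128 prod;
  prod  = (unsigned __int128) ah * bh << 64;
  prod += ((unsigned __int128) ah * bl + (unsigned __int128) al * bh) << 32;
  prod += (unsigned __int128) al * bl;
  return prod;
}

The vector implementations below compute the same four word-size partial products with Multiply Even/Odd Unsigned Word, then align and sum them with merges, shifts, and quadword adds.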

First we need to make sure we have implementations across the GCC versions 6, 7, and 8 for the instructions we need. For example:

static inline vui128_t
vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
{
  vui128_t res;
#if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
  __asm__(
      "vmsumudm %0,%1,%2,%3;\n"
      : "=v" (res)
      : "v" (a), "v" (b), "v" (c)
      : );
#else
  vui128_t p_even, p_odd, p_sum;
  p_even = vec_muleud (a, b);
  p_odd = vec_muloud (a, b);
  p_sum = vec_adduqm (p_even, p_odd);
  res = vec_adduqm (p_sum, c);
#endif
  return (res);
}
Note
The _ARCH_PWR8 implementation above depends on vec_muleud() and vec_muloud() for which there are no hardware instructions. Hold that thought.

While we are at it, we can implement multiply-sum unsigned word modulo.

static inline vui64_t
vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
{
  vui64_t peven, podd, psum;
  peven = vec_muleuw (vra, vrb);
  podd = vec_mulouw (vra, vrb);
  psum = vec_addudm (peven, podd);
  return vec_addudm (psum, vrc);
}

We will need this later.

Now we need to provide implementations of vec_muleud() and vec_muloud(). For example:

static inline vui128_t
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return vec_vmuloud (a, b);
#else
return vec_vmuleud (a, b);
#endif
}

The implementation above is just handling the pesky little endian transforms. The real implementations are in vec_vmuleud() and vec_vmuloud(), which implement the operation as if the PowerISA included such an instruction. These implementations are NOT endian sensitive and their function is stable across BE/LE implementations. For example:

static inline vui128_t
vec_vmuleud (vui64_t a, vui64_t b)
{
  vui64_t res;
#if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
  const vui64_t zero = { 0, 0 };
  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
  __asm__(
      "vmsumudm %0,%1,%2,%3;\n"
      : "=v" (res)
      : "v" (a), "v" (b_eud), "v" (zero)
      : );
#else
#ifdef _ARCH_PWR8
  const vui64_t zero = { 0, 0 };
  vui64_t p0, p1, pp10, pp01;
  vui32_t m0, m1;
  // Need the endian invariant merge word high here
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform
  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);
#else
  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);
#endif
  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 0);
  // Need the endian invariant multiply even/odd word here
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform
  p1 = vec_muleuw (m1, m0);
  p0 = vec_mulouw (m1, m0);
#else
  p1 = vec_mulouw (m1, m0);
  p0 = vec_muleuw (m1, m0);
#endif
  // res[1] = p1[1]; res[0] = p0[0];
  res = vec_pasted (p0, p1);
  // pp10[1] = p1[0]; pp10[0] = 0;
  // pp01[1] = p0[1]; pp01[0] = 0;
  // Need the endian invariant merge algebraic high/low here
  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);
  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);
  // pp01 = pp01 + pp10
  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);
  // res = res + (pp01 << 32)
  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);
  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);
#else
  // _ARCH_PWR7 implementation ...
#endif
#endif
  return ((vui128_t) res);
}

The _ARCH_PWR9 implementation uses the multiply-sum doubleword operation but implements the multiply even behavior by forcing the contents of doubleword element 1 of [VRB] and the contents of [VRC] to 0.

The _ARCH_PWR8 implementation looks ugly but it works. It starts with some merges and splats to get the input columns lined up for the multiply. Then we use (POWER8 instructions) Multiply Even/Odd Unsigned Word to generate doubleword partial products. Then more merges and a rotate line up the partial products for summation as the final quadword product.

Individually vec_vmuleud() and vec_vmuloud() execute with a latency of 21-23 cycles on POWER8. Normally these operations are used and scheduled together as in the POWER8 implementation of vec_msumudm() or vec_mulhud(). Good scheduling by the compiler and pipelining keeps the POWER8 latency in the 28-32 cycle range. For example, the vec_mulhud() implementation:

static inline vui64_t
vec_mulhud (vui64_t vra, vui64_t vrb)
{
  return vec_mrgahd (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));
}

Generates the following code for POWER8:

vspltisw v0,0
xxmrghw vs33,vs35,vs35
xxspltd vs45,vs34,0
xxmrglw vs35,vs35,vs35
vmulouw v11,v13,v1
xxspltd vs34,vs34,1
xxmrghd vs41,vs32,vs43
vmulouw v12,v2,v3
vmuleuw v13,v13,v1
vmuleuw v2,v2,v3
xxmrghd vs42,vs32,vs44
xxmrgld vs33,vs32,vs45
xxmrgld vs32,vs32,vs34
xxpermdi vs44,vs34,vs44,1
vadduqm v1,v1,v9
xxpermdi vs45,vs45,vs43,1
vadduqm v0,v0,v10
vsldoi v1,v1,v1,4
vsldoi v0,v0,v0,4
vadduqm v2,v1,v13
vadduqm v0,v0,v12
xxmrghd vs34,vs34,vs32

The POWER9 latencies for this operation range from 5-7 (for vmsumudm itself) to 11-16 (for vec_mulhud()). The additional latency reflects zero constant vector generation and the merges required to condition the inputs and output. For these operations the vec_msumudm() vrc operand is always zero. Selecting the even/odd doubleword for input requires a merge low/high. And selecting the high doubleword for multiply high requires a final merge high.

vec_mulhud() generates the following code for POWER9:

xxspltib vs32,0
xxmrghd vs33,vs35,vs32
xxmrgld vs35,vs32,vs35
vmsumudm v1,v2,v1,v0
vmsumudm v2,v2,v3,v0
xxmrghd vs34,vs33,vs34

Wrapping up the doubleword multiplies we should look at the multiply low (AKA Multiply Unsigned Doubleword Modulo). The POWER9 implementation is similar to vec_mulhud () and the generated code is similar to the example above.

Multiply low doubleword is a special case, as we are discarding the highest partial doubleword product. For POWER8 we can optimize for that case using multiply odd and multiply-sum word operations. Also, as we are only generating doubleword partial products, we only need add doubleword modulo operations to sum the results. This avoids the more expensive add quadword operation required for the general case. The fact that vec_vmsumuwm() is only a software construct is not an issue. It expands into hardware multiply even/odd word and add doubleword instructions that the compiler can schedule and optimize.

Here vec_mulouw() generates the low-order partial product. Then vec_vrld() and vec_vmsumuwm() generate doubleword sums of the two middle-order partial products. Then vec_vsld() shifts the middle-order partial sum left 32-bits (discarding the unneeded high-order 32-bits). Finally, we sum the low and middle-order partial doubleword products to produce the multiply-low doubleword result. For example, this POWER8-only implementation:

static inline vui64_t
vec_muludm (vui64_t vra, vui64_t vrb)
{
  vui64_t s32 = { 32, 32 }; // shift / rotate amount.
  vui64_t z = { 0, 0 };
  vui64_t t2, t3, t4;
  vui32_t t1;
  t1 = (vui32_t) vec_vrld (vrb, s32);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform, really want mulouw here.
  t2 = vec_muleuw ((vui32_t) vra, (vui32_t) vrb);
#else
  t2 = vec_mulouw ((vui32_t) vra, (vui32_t) vrb);
#endif
  t3 = vec_vmsumuwm ((vui32_t) vra, t1, z);
  t4 = vec_vsld (t3, s32);
  return (vui64_t) vec_vaddudm (t4, t2);
}

Which generates the following for POWER8:

addis r9,r2,.rodata.cst16+0x60@ha
addi r9,r9,.rodata.cst16+0x60@l
lvx v1,0,r9
vmulouw v13,v2,v3
vrld v0,v3,v1
vmulouw v3,v2,v0
vmuleuw v2,v2,v0
vaddudm v2,v3,v2
vsld v2,v2,v1
vaddudm v2,v13,v2
Note
The addition of zeros to the final sum of vec_vmsumuwm() (vec_addudm (psum, vrc)) has been optimized away by the compiler. This eliminates the xxspltib and one vaddudm instruction from the final code sequence.

And we can assume that the constant load of { 32, 32 } will be common-ed with other operations or hoisted out of loops. So the shift constant can be loaded early and vrld is not delayed. This keeps the POWER8 latency in the 19-28 cycle range.

Loading small Doubleword constants

Programming with vector doubleword integers will need doubleword constants for masking and arithmetic operations. Doubleword splat constants are common in vectorized long integer code for arithmetic, comparison, and mask operations. For example:

vui64_t
__test_incud_V0 (vui64_t vra)
{
  // Increment unsigned doubleword elements
  return vra + 1;
}

The endian sensitive macros from vec_common_ppc.h can be used to construct doubleword integer constants. For example:

const vui64_t dw_one = CONST_VINT64_DW(1, 1);
const vui64_t dw_ten = CONST_VINT64_DW(10, 10);
const vui64_t dw_sign_mask = (vui64_t) CONST_VINT128_W(0x80000000, 0x0,
                                                       0x80000000, 0x0);

In most cases the compiler will allocate these constant values to the read-only data (.rodata) section. When these constants are referenced in programming operations, the compiler generates the appropriate vector loads. For example, GCC V11 generates the following for the -mcpu=power8 target:

addis r9,r2,.rodata.cst16+0x30@toc@ha
addi r9,r9,.rodata.cst16+0x30@toc@l
lvx v0,0,r9 # Load { 1, 1 }
vaddudm v2,v2,v0 # vra + 1

The addis/addi/lvx pattern is common to loading most vector constants for POWER8 and earlier.

For some odd reason the compiler might generate the sequence:

addis r9,r2,.rodata.cst16+0x30@toc@ha
addi r9,r9,.rodata.cst16+0x30@toc@l
rldicr r9,r9,0,59
lxvd2x vs0,0,r9
xxswapd vs0,vs0

for -mcpu=power8 ppc64le targets.

The Load VSX Vector Dword*2 Indexed (lxvd2x) would be required if the compiler could not know that the data was quadword aligned. The lxvd2x instruction handles unaligned access but requires the little endian adjustment (xxswapd). However the compiler controls the allocation and alignment of vector constants in .rodata and already insures quadword alignment.

Note
This has the look of a compiler phase error bug, where important information is lost between compiler phases.

For the -mcpu=power9 (and later) target GCC uses the Load VSX Vector (lxv) instruction:

addis r9,r2,.rodata.cst16+0x30@toc@ha
addi r9,r9,.rodata.cst16+0x30@toc@l
lxv v2,0(r9)

The first sequence is expected for POWER8 as PowerISA 2.07B does not have any displacement form (D-Form) vector (VSX) load/store instructions. The compiler allocates constants to the .rodata sections and the linker collects .rodata from object files into a combined executable .rodata section. This section is placed near the Table of Contents (TOC) section. The ABI dedicates R2 to hold the base address (.TOC.) of the TOC and adjacent sections.

The Add Immediate Shifted (addis) / Add Immediate (addi) sequence above computes a signed 32-bit .TOC. relative offset to a specific .rodata quadword. Two instructions are required: addis provides the high adjusted (@ha) 16-bits shifted left 16-bits, while addi provides the low (@l) 16-bits. The sum of R2 and these immediate values is the 64-bit effective address of a .rodata constant value. A signed 32-bit offset is large enough to support most (-mcmodel=medium) program and library executables.
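
A worked example with an illustrative offset (not one from the listings above): for a TOC-relative offset of 0x12348000, @l is 0x8000, which addi sign-extends to -0x8000; @ha compensates by rounding the high half up to 0x1235, so (0x1235 << 16) + (-0x8000) = 0x12348000.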

The load itself has a 5-cycle latency assuming a L1 cache hit. The three instruction sequence is sequentially dependent and requires 9-cycles latency (minimum) to execute. A L1 cache miss will increase the latency by 7-28 cycles, assuming the data resides in the L2/L3 caches.

Optimizing loads from .rodata

However the compiler is not following the recommendations of PowerISA 2.07B, Book II, Chapter 2.1 Performance-Optimized Instruction Sequences. This chapter recommends a specific pattern for the addi/lvx sequence. For example:

addis rA,r2,.rodata.cst16+0x30@toc@ha
addi rx,0,.rodata.cst16+0x30@toc@l
lvx v2,rA,rx

In this case rx can be any GPR (including r0) while RA must be a valid base (r1 <-> r31) register.

The POWER8 implementation allows for Instruction Fusion, combining information from two adjacent instructions into one (internal) instruction so that it executes faster than the non-fused case. Effectively the addi/lvx combination above becomes a D-Form load vector instruction.

There are additional restrictions on the definition of adjacent:

This can reduce the latency from 9 to 7-cycles. This would be true even without Instruction Fusion, as the addis/addi instructions are now independent and can execute in parallel.

The sequence generated for POWER9 is even more disappointing. The lxv is a D-Form (DQ) instruction and the displacement operand could be used to replace the addi instruction. For example, for the -mcpu=power9 target:

addis r9,r2,.rodata.cst16+0x30@toc@ha
lxv v2,.rodata.cst16+0x30@toc@l(r9)

This provides the equivalent 32-bit TOC relative displacement with one less instruction and a reduced latency of 7-cycles.

Alternatives to loading from .rodata

This is all a little cumbersome and it seems like there should be a better/faster way. Any instruction sequence that loads quadword integer constants in:

is a good deal.

The base (Altivec) vector ISA included Vector Splat Immediate Signed Byte/Halfword/Word instructions. These are fast (2-cycle latency) and convenient for small integer constants in the range -16 to 15. So far the ISA has not added doubleword or quadword forms of splat immediate.

POWER9 added a VSX Vector Splat Immediate Byte (xxspltib) instruction. This expands the immediate range to -128 to 127 but does not include larger element sizes. POWER9 does provide Vector Extend Sign Byte To Word/Doubleword (vextsb2w/vextsb2d) instructions. For example the two instruction sequence:

xxspltib vs34,127
vextsb2d v2,v2

can generate a doubleword splat immediate for integers in the range -128 to 127 with a latency of 5-cycles.

Note
POWER10 does add the interesting VSX Vector Splat Immediate Double-Precision instruction. This is a 64-bit instruction with a 32-bit single precision immediate operand. Interesting but not helpful for doubleword integer.

Some special quadword constants

The GCC compiler does recognize some vector constants as special cases. For example:

vi128_t
__test_splatisq_n1_V0 (void)
{
  const vui32_t q_ones = {-1, -1, -1, -1};
  return (vi128_t) q_ones;
}

vi128_t
__test_splatisq_0_V0 (void)
{
  const vui32_t q_zero = {0, 0, 0, 0};
  return (vi128_t) q_zero;
}

will generate:

0000000000000080 <__test_splatisq_n1_V0>:
vspltisw v2,-1
blr
00000000000000a0 <__test_splatisq_0_V0>:
vspltisw v2,0
blr

As we will see, the all zero/ones constants are common building blocks, so the compiler should treat them as common subexpressions across all operations using those constants.

Defining our own vec_splat_s64

So the compiler can do clever things with vector constants. But so far these are the only examples I have found. Other cases that you might expect to be special cases are not. For example:

vui64_t
__test_splatudi_15_V1 (void)
{
  return vec_splats ((unsigned long long) 12);
}

vui64_t
__test_splatudi_15_V0 (void)
{
  const vui64_t dw_15 = CONST_VINT64_DW(15, 15);
  return dw_15;
}

both generate the three-instruction (9-cycle) load from .rodata sequence. Also, constants using the vector long long or __int128 types may fail to compile on older versions of the compiler.

We can generate small constants in the range -16 <-> 15 using the following pattern:

vi64_t
__test_splatsdi_15_V1 (void)
{
  vi32_t vwi = vec_splat_s32 (15);
  return vec_unpackl (vwi);
}

Which should generate:

0000000000000040 <__test_splatisd_15_v2>:
vspltisw v2,15
vupklsw v2,v2
blr

Here we use the vec_splat_s32(15) intrinsic to generate Vector Splat Immediate Signed Word (vspltisw), which splats the value 15 across the word elements of vwi. Then vec_unpackl (vwi) generates Vector Unpack Low Signed Word (vupklsw), which sign-extends the 2 low words of vwi to signed doubleword elements. This sequence is only 2 instructions and will execute with 4-cycle latency.

Note
Unfortunately, GCC compilers after GCC-8 will recognize this sequence and convert it back to the three-instruction .rodata load sequence. See: GCC PR 104124. Until PR 104124 is fixed, the following work-around is used for the PVECLIB implementation.

Putting this all together we can create a static inline function to generate small doubleword constants (in the range -16 to 15). For example:

static inline vi64_t
vec_splat_s64_PWR8 (const int sim)
{
  vi64_t result;
  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))
    {
      vi32_t vwi = vec_splat_s32 (sim);

      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
        {
          // Special case for -1 and 0. Skip vec_unpackl().
          result = (vi64_t) vwi;
        }
      else
        {
          // For P8 can use either vupklsh or vupklsw but P7 only has
          // vupklsh. Given the reduced range, either works here.
          // Unpack signed HW works here because the immediate value fits
          // into the low HW and sign extends to the high HW of each word.
          // Unpack will expand the low HW to the low word and the high HW
          // (sign extend) into the high word of each DW.
          // Unpack low/high (or endian) will not change the result.
#if defined (__GNUC__) && (__GNUC__ == 8)
          // GCC 8 (AT12) handles this correctly.
          result = (vi64_t) vec_vupklsh ((vi16_t) vwi);
#else
          // But GCC 9+ optimizes the above to a load from .rodata.
          // With a little register pressure it adds some gratuitous
          // store/reloads. So the following work-around is required.
          __asm__(
              "vupklsh %0,%1;"
              : "=v" (result)
              : "v" (vwi)
              : );
#endif
        }
    }
  else
    result = vec_splats ((signed long long) sim);
  return (result);
}

This version uses only <altivec.h> intrinsics supported by POWER8 and earlier. For constants in the range (-16 to 15) the range is divided into two groups:

Values outside this range use the vec_splats() intrinsic which will generate the appropriate quadword constant in .rodata and the load sequence to retrieve that value.
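
A hypothetical usage sketch:

vi64_t
__test_splat_s64_10 (void)
{
  // In the range -16..15: expands to vspltisw/vupklsh with no .rodata load.
  return vec_splat_s64_PWR8 (10);
}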

For POWER9 and later we can use the vec_splats() intrinsic which (so far) generates the xxspltib/vextsb2d sequence for the constant range -128 to 127.

static inline vi64_t
vec_splat_s64_PWR9 (const int sim)
{
  return vec_splats ((signed long long) sim);
}

Endian problems with doubleword operations

From the examples above we see that the construction of higher precision multiplies requires significant massaging of input and output elements. Here merge even/odd, merge high/low, swap, and splat doubleword element operations are commonly used.

PowerISA 2.06 VSX (POWER7) added the general purpose Vector Permute Doubleword Immediate (xxpermdi). The compiler generates some form of xxpermdi for the doubleword (double float, long int, bool long) merge/splat/swap operations. As xxpermdi's element selection is an immediate field, most operations require only a single instruction. All the merge/splat/swap doubleword variants are just a specific select mask value and the inputs to xxpermdi.

Which is very useful indeed for assembling, disassembling, merging, splatting, swapping, and pasting doubleword elements.

Of course it took several compiler releases to implement all the generic merge/splat/swap operations for the supported types. GCC 4.8 was the first to support vec_xxpermdi as a built-in. GCC 4.8 also supported the generic built-ins vec_mergeh, vec_mergel, and vec_splat for the vector signed/unsigned/bool long type. But endian sensitive vec_mergeh, vec_mergel, and vec_splat were not supported until GCC 7. And the generic vec_mergee and vec_mergeo built-ins were not supported until GCC 8.

But as we have explained in General Endian Issues and Endian problems with word operations the little endian transforms applied by the compiler can cause problems for developers of multi-precision libraries. The doubleword forms of the generic merge/splat operations etc. are no exception. This is especially annoying when the endian sensitive transforms are applied between releases of the compiler.

So we need a strategy to provide endian invariant merge/splat/swap operations to be used in multi-precision arithmetic. And another set of endian sensitive operations that are mandated by the OpenPOWER ABI.

First we need a safely endian invariant version of xxpermdi to use in building other variants:

Then build the core set of endian invariant permute doubleword operations using vec_permdi():

We use the merge algebraic high/low doubleword operations in the implementation of vec_mulhud(), vec_muludm(), vec_vmuleud(), and vec_vmuloud(). We use the vec_xxspltd operation in the implementation of vec_vrld(), vec_vmuleud(), and vec_vmuloud(). We use the paste doubleword (vec_pasted()) operation in the implementation of vec_vsrad(), vec_vmuleud(), and vec_vmuloud(). We use the swap doubleword operation in the implementation of vec_cmpequq(), vec_cmpneuq(), vec_muludq(), and vec_mulluq().

Then use the compiler's __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ conditional to invert the vec_permdi() select control for endian sensitive merge/splat doubleword operations:
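
For example, the endian invariant doubleword swap can be built directly on vec_permdi() (a minimal sketch; it assumes select control 2 exchanges the two doublewords, per the vec_permdi() select control encoding):

static inline vui64_t
test_swapd (vui64_t vra)
{
  // Exchange the high and low doublewords of vra.
  return vec_permdi (vra, vra, 2);
}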

Vector Doubleword Examples

Suppose we have a requirement to convert an array of 64-bit time-interval values to timespec format. For simplicity we will also assume that the array is nicely (quadword) aligned and an integer multiple of 2 doublewords or 4 words.

The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume we are dealing with longer intervals (greater than ~8.38 seconds), so the full 64-bit TimeBase is required. TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.

The timespec format is a struct of unsigned int fields for seconds and nanoseconds. So the task is to convert the 512MHz 64-bit TimeBase intervals to seconds and remaining clock ticks. Then convert the remaining (subsecond) clock ticks from 512MHz to nanoseconds. The separate seconds and nanoseconds are combined in the timespec structure.

First we need to separate the raw TimeBase into the integer seconds and (subsecond) clock-ticks. Normally scalar code would use integer divide/modulo by 512000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?

Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 64-bit fraction and we have a multiply high (vec_mulhud()) operation. Multiplying a 64-bit unsigned integer by a 64-bit unsigned fraction generates a 128-bit product with 64-bits above (integer) and below (fraction) the radix point. The high 64-bits of the product is the integer quotient.

It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full range may require pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.

See also
"Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

In the chapter above, Figure 10-2, "Computing the magic number for unsigned division," provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, an "add" indicator, and the shift amount).
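
In outline, for unsigned division by a constant d, the quotient is computed as q = (n × M) >> (64 + s), where M = ceil(2^(64+s) / d) and s is the shift amount; the "add" indicator flags divisors where M does not fit in 64 bits and a corrective add is required.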

For the divisor 512000000 this is { 4835703278458516699, 0 , 27 }:

// Magic numbers for multiplicative inverse to divide by 512,000,000
// are 4835703278458516699 and shift right 27 bits.
const vui64_t mul_invs_clock =
{ 4835703278458516699UL, 4835703278458516699UL };
const int shift_clock = 27;
// Need const for TB clocks/second to extract remainder.
const vui64_t tb_clock_sec =
{ 512000000, 512000000};
vui64_t tb_v, tmp, tb_clocks, seconds, nseconds;
vui64_t timespec1, timespec2;
// extract integer seconds from timebase vector.
tmp = vec_mulhud (tb_v, mul_invs_clock);
seconds = vec_srdi (tmp, shift_clock);
// Extract the remainder in tb clock ticks.
tmp = vec_muludm (seconds, tb_clock_sec);
tb_clocks = vec_sub (tb_v, tmp);

Next we need to convert the subseconds from TimeBase clock-ticks to nanoseconds. The subsecond remainder is now small enough (compared to a doubleword) that we can perform the conversion in place. The nanosecond conversion is ((tb_clocks * 1000000000) / 512000000), and we can reduce this to ((tb_clocks * 1000) / 512). We still have a small multiply, but the divide can be converted to a shift right of 9-bits.

const int shift_512 = 9;
const vui64_t nano_512 =
{ 1000, 1000};
// Convert 512MHz timebase to nanoseconds.
// nseconds = tb_clocks * 1000000000 / 512000000
// reduces to (tb_clocks * 1000) >> 9
tmp = vec_muludm (tb_clocks, nano_512);
nseconds = vec_srdi (tmp, shift_512);

Finally we need to merge the vectors of seconds and nanoseconds into vectors of timespec. So far we have been working with 64-bit integers, but the timespec is a struct of unsigned int (32-bit word) fields. Here 32-bit seconds and nanoseconds provide sufficient range and precision. So the final step packs the two vectors of doubleword (second, nanosecond) pairs into a single vector of four words, representing two 32-bit timespec values.

timespec1 = vec_mergeh (seconds, nseconds);
timespec2 = vec_mergel (seconds, nseconds);
// Seconds and nanoseconds fit in 32-bits after conversion.
// So pack the results and store the timespec.
*timespec++ = vec_vpkudum (timespec1, timespec2);
Note
vec_sub(), vec_mergeh(), and vec_mergel() are existing altivec.h generic built-ins.
vec_vpkudum() is an existing altivec.h built-in that is only defined for _ARCH_PWR8 and later. This header insures that vec_vpkudum is defined for older compilers and provides a functional equivalent implementation for POWER7.

Vectorized 64-bit TimeBase conversion example

Here is the complete vectorized 64-bit TimeBase to timespec conversion example:

void
example_dw_convert_timebase (vui64_t *tb, vui32_t *timespec, int n)
{
  // Magic numbers for multiplicative inverse to divide by 512,000,000
  // are 4835703278458516699 and shift right 27 bits.
  const vui64_t mul_invs_clock =
    { 4835703278458516699UL, 4835703278458516699UL };
  const int shift_clock = 27;
  // Need const for TB clocks/second to extract remainder.
  const vui64_t tb_clock_sec =
    { 512000000, 512000000 };
  const int shift_512 = 9;
  const vui64_t nano_512 =
    { 1000, 1000 };
  vui64_t tb_v, tmp, tb_clocks, seconds, nseconds;
  vui64_t timespec1, timespec2;
  int i;

  for (i = 0; i < n; i++)
    {
      tb_v = *tb++;
      // Extract integer seconds from timebase vector.
      tmp = vec_mulhud (tb_v, mul_invs_clock);
      seconds = vec_srdi (tmp, shift_clock);
      // Extract remainder in tb clock ticks.
      tmp = vec_muludm (seconds, tb_clock_sec);
      tb_clocks = vec_sub (tb_v, tmp);
      // Convert 512MHz timebase to nanoseconds.
      // nseconds = tb_clocks * 1000000000 / 512000000
      // reduces to (tb_clocks * 1000) >> 9
      tmp = vec_muludm (tb_clocks, nano_512);
      nseconds = vec_srdi (tmp, shift_512);
      // Use merge high/low to interleave seconds and nseconds
      // into timespec.
      timespec1 = vec_mergeh (seconds, nseconds);
      timespec2 = vec_mergel (seconds, nseconds);
      // Seconds and nanoseconds fit in 32-bits after conversion.
      // So pack the results and store the timespec.
      *timespec++ = vec_vpkudum (timespec1, timespec2);
    }
}

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absdud()

static vui64_t vec_absdud (vui64_t vra, vui64_t vrb) [inline, static]

Vector Absolute Difference Unsigned Doubleword.

Compute the absolute difference for each doubleword. For each unsigned doubleword, subtract VRB[i] from VRA[i] and return the absolute value of the difference.

processor  Latency  Throughput
power8     4        1/cycle
power9     5        1/cycle

Parameters
  vra  vector of 2 x unsigned doublewords
  vrb  vector of 2 x unsigned doublewords

Returns
  vector of the absolute differences.

◆ vec_addudm()

static vui64_t vec_addudm (vui64_t a, vui64_t b) [inline, static]

Vector Add Unsigned Doubleword Modulo.

Add two vector long int values and return the modulo 64-bit results.

processor  Latency  Throughput
power8     2        2/cycle
power9     2        2/cycle

Parameters
  a  128-bit vector long int.
  b  128-bit vector long int.

Returns
  vector long int sums of a and b.

◆ vec_clzd()

static vui64_t vec_clzd (vui64_t vra) [inline, static]

Vector Count Leading Zeros Doubleword for unsigned long long elements.

Count the number of leading '0' bits (0-64) within each doubleword element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Doubleword instruction vclzd. Otherwise use a sequence of pre-2.07 VMX instructions.

processor  Latency  Throughput
power8     2        2/cycle
power9     2        2/cycle

Parameters
  vra  a 128-bit vector treated as 2 x 64-bit unsigned long long (doubleword) elements.

Returns
  128-bit vector with the leading zeros count for each doubleword element.

◆ vec_cmpeqsd()

static vb64_t vec_cmpeqsd (vi64_t a, vi64_t b) [inline, static]

Vector Compare Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.

processor  Latency  Throughput
power8     2        2/cycle
power9     3        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting compare equal result for each element.

◆ vec_cmpequd()

static vb64_t vec_cmpequd (vui64_t a, vui64_t b) [inline, static]

Vector Compare Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.

processor  Latency  Throughput
power8     2        2/cycle
power9     3        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting compare equal result for each element.

◆ vec_cmpgesd()

static vb64_t vec_cmpgesd (vi64_t a, vi64_t b) [inline, static]

Vector Compare Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd, then return the logical inverse.

processor  Latency  Throughput
power8     4        2/cycle
power9     5        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting the compare greater than or equal result for each element.
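
A minimal sketch of that strategy (not the exact library source; it assumes a compiler where the generic vec_nor accepts vector unsigned long long):

static inline vb64_t
test_cmpgesd (vi64_t a, vi64_t b)
{
  vb64_t lt = vec_cmpgtsd (b, a); // (b > a) is (a < b)
  // a >= b is the logical inverse of a < b.
  return (vb64_t) vec_nor ((vui64_t) lt, (vui64_t) lt);
}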

◆ vec_cmpgeud()

static vb64_t vec_cmpgeud (vui64_t a, vui64_t b) [inline, static]

Vector Compare Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud, then return the logical inverse.

processor  Latency  Throughput
power8     4        2/cycle
power9     5        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting the compare greater than or equal result for each element.

◆ vec_cmpgtsd()

static vb64_t vec_cmpgtsd (vi64_t a, vi64_t b) [inline, static]

Vector Compare Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Signed DoubleWord (vcmpgtsd) instruction. Otherwise use boolean logic using word compares.

processor  Latency  Throughput
power8     2        2/cycle
power9     3        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting the compare greater result for each element.

◆ vec_cmpgtud()

static vb64_t vec_cmpgtud (vui64_t a, vui64_t b) [inline, static]

Vector Compare Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Unsigned DoubleWord (vcmpgtud) instruction. Otherwise use boolean logic using word compares.

processor  Latency  Throughput
power8     2        2/cycle
power9     3        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting the compare greater result for each element.

◆ vec_cmplesd()

static vb64_t vec_cmplesd (vi64_t a, vi64_t b) [inline, static]

Vector Compare Less Than Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] <= b[i], otherwise all '0's. Use vec_cmpgtsd, then return the logical inverse.

processor  Latency  Throughput
power8     4        2/cycle
power9     5        2/cycle

Parameters
  a  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
  b  128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns
  128-bit vector with each dword boolean reflecting the compare less than or equal result for each element.

◆ vec_cmpleud()

static vb64_t vec_cmpleud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare Less Than Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud. Use vec_cmpgtud then return the logical inverse.

processor Latency Throughput
power8 4 2/cycle
power9 5 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
128-bit vector with each dword boolean reflecting compare less than or equal result for each element.

◆ vec_cmpltsd()

static vb64_t vec_cmpltsd ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare less Than Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
128-bit vector with each dword boolean reflecting compare less result for each element.

◆ vec_cmpltud()

static vb64_t vec_cmpltud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare less Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
128-bit vector with each dword boolean reflecting compare less result for each element.

◆ vec_cmpnesd()

static vb64_t vec_cmpnesd ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.

processor Latency Throughput
power8 4 2/cycle
power9 5 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
128-bit vector with each dword boolean reflecting compare not equal result for each element.

◆ vec_cmpneud()

static vb64_t vec_cmpneud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.

processor Latency Throughput
power8 4 2/cycle
power9 5 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
128-bit vector with each dword boolean reflecting compare not equal result for each element.

◆ vec_cmpsd_all_eq()

static int vec_cmpsd_all_eq ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a and b are equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all equal, false otherwise.

◆ vec_cmpsd_all_ge()

static int vec_cmpsd_all_ge ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a >= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all greater than or equal, false otherwise.

◆ vec_cmpsd_all_gt()

static int vec_cmpsd_all_gt ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a > b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all Greater Than, false otherwise.

◆ vec_cmpsd_all_le()

static int vec_cmpsd_all_le ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Less than equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a <= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all less than or equal, false otherwise.

◆ vec_cmpsd_all_lt()

static int vec_cmpsd_all_lt ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Less than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a < b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all less than, false otherwise.

◆ vec_cmpsd_all_ne()

static int vec_cmpsd_all_ne ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare all Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a and b are not equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all not equal, false otherwise.

◆ vec_cmpsd_any_eq()

static int vec_cmpsd_any_eq ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a and b are equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any equal, false otherwise.

◆ vec_cmpsd_any_ge()

static int vec_cmpsd_any_ge ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a >= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any greater than or equal, false otherwise.

◆ vec_cmpsd_any_gt()

static int vec_cmpsd_any_gt ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a > b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any greater than, false otherwise.

◆ vec_cmpsd_any_le()

static int vec_cmpsd_any_le ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Less than equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a <= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any less than or equal, false otherwise.

◆ vec_cmpsd_any_lt()

static int vec_cmpsd_any_lt ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Less than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a < b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any less than, false otherwise.

◆ vec_cmpsd_any_ne()

static int vec_cmpsd_any_ne ( vi64_t  a,
vi64_t  b 
)
inlinestatic

Vector Compare any Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a and b are not equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any not equal, false otherwise.

◆ vec_cmpud_all_eq()

static int vec_cmpud_all_eq ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a and b are equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all equal, false otherwise.

◆ vec_cmpud_all_ge()

static int vec_cmpud_all_ge ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a >= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all greater than or equal, false otherwise.

◆ vec_cmpud_all_gt()

static int vec_cmpud_all_gt ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a > b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all Greater Than, false otherwise.

◆ vec_cmpud_all_le()

static int vec_cmpud_all_le ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Less than equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a <= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all less than or equal, false otherwise.

◆ vec_cmpud_all_lt()

static int vec_cmpud_all_lt ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Less than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a < b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all less than, false otherwise.

◆ vec_cmpud_all_ne()

static int vec_cmpud_all_ne ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare all Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a and b are not equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for all 128-bits, true if all not equal, false otherwise.

◆ vec_cmpud_any_eq()

static int vec_cmpud_any_eq ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a and b are equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any equal, false otherwise.

◆ vec_cmpud_any_ge()

static int vec_cmpud_any_ge ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a >= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any greater than or equal, false otherwise.

◆ vec_cmpud_any_gt()

static int vec_cmpud_any_gt ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a > b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any greater than, false otherwise.
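A short usage sketch (find_gt and bound are illustrative names, not library functions): these any/all predicates return a scalar int, so they drop straight into ordinary control flow:

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: index of the first vector containing an element
  // greater than bound, or -1 if none.
  static inline long
  find_gt (vui64_t *vp, long n, vui64_t bound)
  {
    for (long i = 0; i < n; i++)
      if (vec_cmpud_any_gt (vp[i], bound))
        return i;
    return -1;
  }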

◆ vec_cmpud_any_le()

static int vec_cmpud_any_le ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Less than equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a <= b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any less than or equal, false otherwise.

◆ vec_cmpud_any_lt()

static int vec_cmpud_any_lt ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Less than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a < b.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any less than, false otherwise.

◆ vec_cmpud_any_ne()

static int vec_cmpud_any_ne ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Compare any Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a and b are not equal.

processor Latency Throughput
power8 4-9 2/cycle
power9 3 2/cycle
Parameters
a128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
Returns
boolean int for any 128-bits, true if any not equal, false otherwise.

◆ vec_ctzd()

static vui64_t vec_ctzd ( vui64_t  vra)
inlinestatic

Vector Count Trailing Zeros Doubleword for unsigned long long elements.

Count the number of trailing '0' bits (0-64) within each doubleword element of a 128-bit vector.

For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Doubleword instruction vctzd. Otherwise use a sequence of pre ISA 3.0 VMX instructions leveraging the PVECLIB popcntd operation. SIMDized count trailing zeros inspired by:

Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.

processor Latency Throughput
power8 8-10 2/2 cycles
power9 3 2/cycle
Parameters
vra128-bit vector treated as 2 x 64-bit integer (doublewords) elements.
Returns
128-bit vector with the trailing zeros count for each doubleword element.
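A minimal sketch of the cited Hacker's Delight method, composed only from operations in this header plus the generic vec_andc; my_ctzd is a hypothetical name:

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: ctz(x) = popcnt((x - 1) & ~x), per Section 5-4.
  // For x = 0, (x - 1) & ~x is all 1's, giving the required count of 64.
  static inline vui64_t
  my_ctzd (vui64_t vra)
  {
    const vui64_t ones = vec_splat_u64 (1);
    vui64_t tz = vec_andc (vec_subudm (vra, ones), vra); // (x-1) & ~x
    return vec_popcntd (tz);
  }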

◆ vec_maxsd()

static vi64_t vec_maxsd ( vi64_t  vra,
vi64_t  vrb 
)
inlinestatic

Vector Maximum Signed Doubleword.

For each doubleword element [0|1] of vra and vrb compare as signed integers and return the larger value in the result.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector long int.
vrb128-bit vector long int.
Returns
vector long maximum of a and b.

◆ vec_maxud()

static vui64_t vec_maxud ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Maximum Unsigned Doubleword.

For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the larger value in the result.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector long int.
vrb128-bit vector long int.
Returns
vector unsigned long maximum of a and b.

◆ vec_minsd()

static vi64_t vec_minsd ( vi64_t  vra,
vi64_t  vrb 
)
inlinestatic

Vector Minimum Signed Doubleword.

For each doubleword element [0|1] of vra and vrb compare as signed integers and return the smaller value in the result.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector long int.
vrb128-bit vector long int.
Returns
vector long minimum of a and b.

◆ vec_minud()

static vui64_t vec_minud ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Minimum Unsigned Doubleword.

For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the smaller value in the result.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vra128-bit vector unsigned long int.
vrb128-bit vector unsigned long int.
Returns
vector unsigned long minimum of a and b.

◆ vec_mrgahd()

static vui64_t vec_mrgahd ( vui128_t  vra,
vui128_t  vrb 
)
inlinestatic

Vector Merge Algebraic High Doublewords.

Merge only the high doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Even Doubleword operation that is not modified for endian.

For example, merge the high 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply high unsigned doubleword.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned __int128.
vrb128-bit vector unsigned __int128.
Returns
A vector merge from only the high doublewords of the 2 x algebraic quadwords across vra and vrb.
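The multiply-high example above can be sketched as follows; my_mulhud is a hypothetical name, and vec_muleud/vec_muloud are implemented in vec_int128_ppc.h:

  #include <pveclib/vec_int128_ppc.h>

  // Sketch: multiply high unsigned doubleword via even/odd 128-bit
  // products merged back to their high 64 bits.
  static inline vui64_t
  my_mulhud (vui64_t vra, vui64_t vrb)
  {
    vui128_t even = vec_muleud (vra, vrb);
    vui128_t odd  = vec_muloud (vra, vrb);
    return vec_mrgahd (even, odd);
  }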

◆ vec_mrgald()

static vui64_t vec_mrgald ( vui128_t  vra,
vui128_t  vrb 
)
inlinestatic

Vector Merge Algebraic Low Doublewords.

Merge only the low doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Odd Doubleword operation that is not modified for endian.

For example, merge the low 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply low unsigned doubleword.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Parameters
vra128-bit vector unsigned __int128.
vrb128-bit vector unsigned __int128.
Returns
A vector merge from only the low doublewords of the 2 x algebraic quadwords across vra and vrb.

◆ vec_mrged()

static vui64_t vec_mrged ( vui64_t  __VA,
vui64_t  __VB 
)
inlinestatic

Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
__VAa 128-bit vector as the source of the results even doubleword.
__VBa 128-bit vector as the source of the results odd doubleword.
Returns
A vector merge from only the even doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrghd()

static vui64_t vec_mrghd ( vui64_t  __VA,
vui64_t  __VB 
)
inlinestatic

Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
__VAa 128-bit vector as the source of the results even doubleword.
__VBa 128-bit vector as the source of the results odd doubleword.
Returns
A vector merge from only the high doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrgld()

static vui64_t vec_mrgld ( vui64_t  __VA,
vui64_t  __VB 
)
inlinestatic

Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
__VAa 128-bit vector as the source of the results even doubleword.
__VBa 128-bit vector as the source of the results odd doubleword.
Returns
A vector merge from only the low doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrgod()

static vui64_t vec_mrgod ( vui64_t  __VA,
vui64_t  __VB 
)
inlinestatic

Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
__VAa 128-bit vector as the source of the results even doubleword.
__VBa 128-bit vector as the source of the results odd doubleword.
Returns
A vector merge from only the odd doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_msumudm()

static vui128_t vec_msumudm ( vui64_t  a,
vui64_t  b,
vui128_t  c 
)
inlinestatic

Vector Multiply-Sum Unsigned Doubleword Modulo.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_adduqm().

◆ vec_muleud()

static vui128_t vec_muleud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Multiply Even Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_vmuleud() and vec_adduqm().

◆ vec_mulhud()

static vui64_t vec_mulhud ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Multiply High Unsigned Doubleword.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_vmuleud() and vec_vmuloud().

◆ vec_muloud()

static vui128_t vec_muloud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_vmuloud() and vec_adduqm().

◆ vec_muludm()

static vui64_t vec_muludm ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Multiply Unsigned Doubleword Modulo.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_vmuleud() and vec_vmuloud().

◆ vec_pasted()

static vui64_t vec_pasted ( vui64_t  __VH,
vui64_t  __VL 
)
inlinestatic

Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
__VHa 128-bit vector as the source of the high order doubleword.
__VLa 128-bit vector as the source of the low order doubleword.
Returns
The combined 128-bit vector composed of the high order doubleword of __VH and the low order doubleword of __VL.

◆ vec_permdi()

static vui64_t vec_permdi ( vui64_t  vra,
vui64_t  vrb,
const int  ctl 
)
inlinestatic

Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector.

Note
This function implements the operation of a VSX Permute Doubleword Immediate instruction. This implementation is NOT Endian sensitive and the function is stable across BE/LE implementations.

The 2-bit control operand (ctl) selects which doubleword from the 1st and 2nd vector operands are transfered to the result vector. Control table:

ctl vrt[0:63] vrt[64:127]
0 vra[0:63] vrb[0:63]
1 vra[0:63] vrb[64:127]
2 vra[64:127] vrb[0:63]
3 vra[64:127] vrb[64:127]
processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector as the source of the high order doubleword of the result.
vrba 128-bit vector as the source of the low order doubleword of the result.
ctlconst integer where the low order 2 bits control the selection of doublewords from input vector vra and vrb.
Returns
The combined 128-bit vector composed of the doublewords selected from vra and vrb per the ctl table.
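For example, per the control table above, ctl = 2 selects vra[64:127] then vra[0:63], so a doubleword swap is a single vec_permdi (swap_via_permdi is an illustrative name; the effect matches vec_swapd):

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: exchange the high and low doublewords of vra.
  static inline vui64_t
  swap_via_permdi (vui64_t vra)
  {
    return vec_permdi (vra, vra, 2);  // vrt = vra[64:127] || vra[0:63]
  }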

◆ vec_popcntd()

static vui64_t vec_popcntd ( vui64_t  vra)
inlinestatic

Vector Population Count doubleword.

Count the number of '1' bits (0-64) within each doubleword element of a 128-bit vector.

processor Latency Throughput
power8 4 2/2 cycles
power9 3 2/cycle

For POWER8 (PowerISA 2.07B) or later use the Vector Population Count DoubleWord (vpopcntd) instruction. Otherwise use the pveclib vec_popcntw to count each word then sum across with Vector Sum across Half Signed Word Saturate (vsum2sws).

Parameters
vra128-bit vector treated as 2 x 64-bit integer (dwords) elements.
Returns
128-bit vector with the population count for each dword element.
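As a usage sketch (popcntq_sketch is a hypothetical name), the two doubleword counts can be folded into a full quadword population count with a swap and add:

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: after the swap/add, both doubleword elements hold
  // the population count of the whole 128-bit vector.
  static inline vui64_t
  popcntq_sketch (vui64_t vra)
  {
    vui64_t cnt = vec_popcntd (vra);
    return vec_addudm (cnt, vec_swapd (cnt));
  }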

◆ vec_revbd()

static vui64_t vec_revbd ( vui64_t  vra)
inlinestatic

byte reverse each doubleword for a vector unsigned long int.

For each doubleword of the input vector, reverse the order of bytes / octets within the doubleword.

processor Latency Throughput
power8 2-11 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector unsigned long int.
Returns
a 128-bit vector with the bytes of each doubleword reversed.

◆ vec_rldi()

static vui64_t vec_rldi ( vui64_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Rotate left Doubleword Immediate.

Rotate left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The rotate amount is a const unsigned int in the range 0-63. A rotate count of 0 returns the original value of vra. Rotate counts greater than 63 bits are handled modulo 64.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned long int.
shbrotate amount in the range 0-63.
Returns
128-bit vector unsigned long int, rotated left shb bits.
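For example, a rotate of 32 bits exchanges the two word halves of each doubleword (swap_words is an illustrative name):

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: swap the high/low 32-bit halves within each dword.
  static inline vui64_t
  swap_words (vui64_t vra)
  {
    return vec_rldi (vra, 32);
  }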

◆ vec_selsd()

static vi64_t vec_selsd ( vi64_t  vra,
vi64_t  vrb,
vb64_t  vrc 
)
inlinestatic

Vector Select Signed Doubleword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector treated as a vector signed long long int.
vrba 128-bit vector treated as a vector signed long long int.
vrca 128-bit vector treated as vector bool long long int.
Returns
The selected bits from vra and vrb

◆ vec_selud()

static vui64_t vec_selud ( vui64_t  vra,
vui64_t  vrb,
vb64_t  vrc 
)
inlinestatic

Vector Select Unsigned Doubleword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned long long int.
vrba 128-bit vector treated as a vector unsigned long long int.
vrca 128-bit vector treated as vector bool long long int.
Returns
The selected bits from vra and vrb
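As an illustration of the compare-and-select idiom (my_maxud is a hypothetical name; the effect matches vec_maxud above):

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: per-element unsigned maximum via compare + select.
  static inline vui64_t
  my_maxud (vui64_t vra, vui64_t vrb)
  {
    vb64_t gt = vec_cmpgtud (vrb, vra);  // true where vrb > vra
    return vec_selud (vra, vrb, gt);     // pick vrb there, else vra
  }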

◆ vec_setb_sd()

static vb64_t vec_setb_sd ( vi64_t  vra)
inlinestatic

Vector Set Bool from Signed Doubleword.

For each doubleword, propagate the sign bit to all 64-bits of that doubleword. The result is vector bool long long reflecting the sign bit of each 64-bit doubleword.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraVector signed long long.
Returns
vector bool long long reflecting the sign bits of each doubleword.
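A minimal sketch of the effect using the arithmetic shift from this header; my_setb_sd is a hypothetical name:

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: shifting right algebraic by 63 replicates each sign bit
  // across its doubleword.
  static inline vb64_t
  my_setb_sd (vi64_t vra)
  {
    return (vb64_t) vec_sradi (vra, 63);
  }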

◆ vec_sldi()

static vui64_t vec_sldi ( vui64_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift left Doubleword Immediate.

Shift left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned long int.
shbshift amount in the range 0-63.
Returns
128-bit vector unsigned long int, shifted left shb bits.

◆ vec_splat_s64()

static vi64_t vec_splat_s64 ( const int  sim)
inlinestatic

Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent of Vector Splat Immediate Signed (Byte | Halfword | Word).

Note
POWER9/10 will generate the 2 instruction sequence xxspltib/vextsb2d for values -128 to 127. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section. POWER8 (and earlier) does not have the vextsb2d instruction. For a smaller range (-16 -> 15) POWER8 can use the sequence vec_splat_s32/vec_unpackl but the latest compilers are too clever for this and generate a load from .rodata anyway.
processor Latency Throughput
power8 4 - 9 2/cycle
power9 5 2/cycle
Parameters
sima small signed integer const.
Returns
Vector with sim value splatted to doublewords.

◆ vec_splat_u64()

static vui64_t vec_splat_u64 ( const int  sim)
inlinestatic

Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent of Vector Splat Immediate Unsigned (Byte | Halfword | Word).

Note
POWER9/10 will generate the 2 instruction sequence xxspltib/vextsb2d for values -128 to 127. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section. POWER8 (and earlier) does not have the vextsb2d instruction. For a smaller range (-16 -> 15) POWER8 can use the sequence vec_splat_s32/vec_unpackl but the latest compilers are too clever for this and generate a load from .rodata anyway.
processor Latency Throughput
power8 4 - 9 2/cycle
power9 5 2/cycle
Parameters
sima small unsigned integer const.
Returns
Vector with sim value splatted to doublewords.

◆ vec_splatd()

static vui64_t vec_splatd ( vui64_t  vra,
const int  ctl 
)
inlinestatic

Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl vrt[0] vrt[1]
0 vra[0] vra[0]
1 vra[1] vra[1]
Parameters
vraa 128-bit vector.
ctla const integer encoding the source doubleword.
Returns
A 128-bit vector with the selected doubleword element replicated to both doubleword elements of the result.

◆ vec_spltd()

static vui64_t vec_spltd ( vui64_t  vra,
const int  ctl 
)
inlinestatic
Deprecated:
Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result.
processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl vrt[0:63] vrt[64:127]
0 vra[0:63] vra[0:63]
1 vra[64:127] vra[64:127]
Parameters
vraa 128-bit vector.
ctla const integer encoding the source doubleword.
Returns
A 128-bit vector with the selected doubleword element replicated to both doubleword elements of the result.

◆ vec_sradi()

static vi64_t vec_sradi ( vi64_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift Right Algebraic Doubleword Immediate.

Shift Right Algebraic each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return the sign bit propagated to each bit of each element.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraa 128-bit vector treated as a vector signed long int.
shbshift amount in the range 0-63.
Returns
128-bit vector signed long int, shifted right shb bits.

◆ vec_srdi()

static vui64_t vec_srdi ( vui64_t  vra,
const unsigned int  shb 
)
inlinestatic

Vector Shift Right Doubleword Immediate.

Shift Right each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.

processor Latency Throughput
power8 2-4 2/cycle
power9 2-5 2/cycle
Parameters
vraa 128-bit vector treated as a vector unsigned long int.
shbshift amount in the range 0-63.
Returns
128-bit vector unsigned long int, shifted right shb bits.
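For example, pairing vec_srdi with vec_sldi clears low-order bits (round_down_16 is an illustrative name):

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: round each doubleword down to a multiple of 16.
  static inline vui64_t
  round_down_16 (vui64_t vra)
  {
    return vec_sldi (vec_srdi (vra, 4), 4);
  }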

◆ vec_subudm()

static vui64_t vec_subudm ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Subtract Unsigned Doubleword Modulo.

For each unsigned long (64-bit) integer element c[i] = a[i] + NOT(b[i]) + 1.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle

For POWER8 (PowerISA 2.07B) or later use the Vector Subtract Unsigned Doubleword Modulo (vsubudm) instruction. Otherwise use vector add word modulo forms and propagate the carry bits.

Parameters
a128-bit vector treated as 2 X unsigned long int.
b128-bit vector treated as 2 X unsigned long int.
Returns
vector unsigned long int sum of a[0] + NOT(b[0]) + 1 and a[1] + NOT(b[1]) + 1.
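For example, subtract combined with max/min composes an absolute difference, one way to realize vec_absdud (my_absdud is a hypothetical name):

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: |a - b| per doubleword element.
  static inline vui64_t
  my_absdud (vui64_t vra, vui64_t vrb)
  {
    return vec_subudm (vec_maxud (vra, vrb), vec_minud (vra, vrb));
  }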

◆ vec_swapd()

static vui64_t vec_swapd ( vui64_t  vra)
inlinestatic

Vector doubleword swap. Exchange the high and low doubleword elements of a vector.

processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector.
Returns
The original vector with the doubleword elements swapped.

◆ vec_vgluddo()

static vui64_t vec_vgluddo ( unsigned long long *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets.

For each doubleword element [i] of vra, load the doubleword element at *(char*)array+vra[i]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).

Note
As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.
processor Latency Throughput
power8 12 1/cycle
power9 11 1/cycle
Parameters
arrayPointer to array of integer doublewords.
vraVector of doubleword (64-bit) byte offsets from &array.
Returns
vector doubleword containing elements loaded from *(char*)array+vra[0] and *(char*)array+vra[1].
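A usage sketch (gather_head_tail and the offsets are illustrative), assuming the CONST_VINT64_DW initializer from vec_common_ppc.h:

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: gather data[0] and data[3] (byte offsets 0 and 24).
  static inline vui64_t
  gather_head_tail (unsigned long long *data)
  {
    vi64_t offs = (vi64_t) CONST_VINT64_DW (0, 24);
    return vec_vgluddo (data, offs);
  }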

◆ vec_vgluddsx()

static vui64_t vec_vgluddsx ( unsigned long long *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, load the doubleword element array[vra[i] * (1 << scale)]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * 2^scale), which is effected by shifting left (3+scale) bits.

Note
As effective address calculation is modulo 64-bits, signed or unsigned doubleword indexes are equivalent.
processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of integer doublewords.
vraVector of signed doubleword indexes.
scale8-bit integer. Indexes are multiplied by 2^scale.
Returns
vector containing doublewords from array[(vra[0,1]<<scale)].

◆ vec_vgluddx()

static vui64_t vec_vgluddx ( unsigned long long *  array,
vi64_t  vra 
)
inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes.

For each doubleword element [i] of vra, load the doubleword element from array[vra[i]]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by sizeof (array element), which is effected by shifting left 3 bits.

Note
As effective address calculation is modulo 64-bits, signed or unsigned doubleword indexes are equivalent.
processor Latency Throughput
power8 14-23 1/cycle
power9 13-22 1/cycle
Parameters
arrayPointer to array of integer doublewords.
vraVector of signed doubleword indexes.
Returns
vector containing doublewords array[vra[0,1]].

◆ vec_vgludso()

static vui64_t vec_vgludso ( unsigned long long *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Gather-Load Integer Doublewords from Scalar Offsets.

For each scalar offset[0|1], load the doubleword element at *(char*)array+offset[0|1]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets should be doubleword aligned (integer multiple of 8).

processor Latency Throughput
power8 7 1/cycle
power9 8 1/cycle
Parameters
arrayPointer to array of integer doublewords.
offset0Scalar (64-bit) byte offsets from &array.
offset1Scalar (64-bit) byte offsets from &array.
Returns
vector doubleword containing elements loaded from *(char*)array+offset0 and *(char*)array+offset1.

◆ vec_vlsidx()

static vui64_t vec_vlsidx ( const signed long long  ra,
const unsigned long long *  rb 
)
inlinestatic

Vector Load Scalar Integer Doubleword Indexed.

Load the left most doubleword of vector xt as a scalar doubleword from the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be doubleword aligned (integer multiple of 8).

Note
the right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. This simplifies merging elements for gather operations.

Note
This instruction was introduced in PowerISA 2.06 (POWER7). For POWER8/9 there are additional optimizations by effectively converting small constant index values into displacements. For POWER8 a specific pattern of addi/lxsdx instructions is fused into a single load-displacement internal operation. For POWER9 we can use the lxsd (DS-form) instruction directly.
processor Latency Throughput
power8 5 2/cycle
power9 5 2/cycle
Parameters
raconst signed doubleword index (offset/displacement).
rbconst doubleword pointer to an array of doublewords.
Returns
The data stored at (ra + rb) is loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vmadd2eud()

static vui128_t vec_vmadd2eud ( vui64_t  a,
vui64_t  b,
vui64_t  c,
vui64_t  d 
)
inlinestatic

Vector Multiply-Add2 Even Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmadd2euw()

static vui64_t vec_vmadd2euw ( vui32_t  a,
vui32_t  b,
vui32_t  c,
vui32_t  d 
)
inlinestatic

Vector Multiply-Add2 Even Unsigned Words.

Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c and d: (aeven * beven) + EXTZ(ceven) + EXTZ(deven).

Note
The advantage of this form (versus Multiply-Sum) is that the final 64-bit sums can not overflow: the worst case is (2^32-1)^2 + 2*(2^32-1) = 2^64-1.
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 9 1/cycle
power9 9 1/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
c128-bit vector unsigned int.
d128-bit vector unsigned int.
Returns
vector unsigned long int sum (aeven * beven) + EXTZ(ceven) + EXTZ(deven).

◆ vec_vmadd2oud()

static vui128_t vec_vmadd2oud ( vui64_t  a,
vui64_t  b,
vui64_t  c,
vui64_t  d 
)
inlinestatic

Vector Multiply-Add2 Odd Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmadd2ouw()

static vui64_t vec_vmadd2ouw ( vui32_t  a,
vui32_t  b,
vui32_t  c,
vui32_t  d 
)
inlinestatic

Vector Multiply-Add2 Odd Unsigned Words.

Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c and d: (aodd * bodd) + EXTZ(codd) + EXTZ(dodd).

Note
The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums can not overflow.
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 9 1/cycle
power9 9 1/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
c128-bit vector unsigned int.
d128-bit vector unsigned int.
Returns
vector unsigned long int sum (aodd * bodd) + EXTZ(codd) + EXTZ(dodd).

◆ vec_vmaddeud()

static vui128_t vec_vmaddeud ( vui64_t  a,
vui64_t  b,
vui64_t  c 
)
inlinestatic

Vector Multiply-Add Even Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmaddeuw()

static vui64_t vec_vmaddeuw ( vui32_t  a,
vui32_t  b,
vui32_t  c 
)
inlinestatic

Vector Multiply-Add Even Unsigned Words.

Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c (aeven * beven) + EXTZ(ceven).

Note
The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums can not overflow.
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 9 2/cycle
power9 9 2/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
c128-bit vector unsigned int.
Returns
vector unsigned long int sum (aeven * beven) + EXTZ(ceven).

◆ vec_vmaddoud()

static vui128_t vec_vmaddoud ( vui64_t  a,
vui64_t  b,
vui64_t  c 
)
inlinestatic

Vector Multiply-Add Odd Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmaddouw()

static vui64_t vec_vmaddouw ( vui32_t  a,
vui32_t  b,
vui32_t  c 
)
inlinestatic

Vector Multiply-Add Odd Unsigned Words.

Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c (aodd * bodd) + EXTZ(codd).

Note
The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums can not overflow.
This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 9 2/cycle
power9 9 2/cycle
Parameters
a128-bit vector unsigned int.
b128-bit vector unsigned int.
c128-bit vector unsigned int.
Returns
vector unsigned long int sum (aodd * bodd) + EXTZ(codd).

◆ vec_vmsumeud()

static vui128_t vec_vmsumeud ( vui64_t  a,
vui64_t  b,
vui128_t  c 
)
inlinestatic

Vector Multiply-Sum Even Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmsumoud()

static vui128_t vec_vmsumoud ( vui64_t  a,
vui64_t  b,
vui128_t  c 
)
inlinestatic

Vector Multiply-Sum Odd Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmsumuwm()

static vui64_t vec_vmsumuwm ( vui32_t  vra,
vui32_t  vrb,
vui64_t  vrc 
)
inlinestatic

Vector Multiply-Sum Unsigned Word Modulo.

Multiply the unsigned word elements of vra and vrb, internally generating doubleword products. Then generate three-way sum of adjacent doubleword product pairs, plus the doubleword elements from vrc. The final summation is modulo 64-bits.

Note
This function implements the operation of a Vector Multiply-Sum Unsigned Word Modulo instruction, if the PowerISA included such an instruction. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.
processor Latency Throughput
power8 11 1/cycle
power9 11 1/cycle
Parameters
vra128-bit vector unsigned int.
vrb128-bit vector unsigned int.
vrc128-bit vector unsigned long.
Returns
vector of doubleword elements where each is the sum of the even and odd adjacent products of the vra and vrb, plus the corresponding doubleword element of vrc.
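A minimal sketch of this composition (my_vmsumuwm is a hypothetical name; vec_muleuw/vec_mulouw come from vec_int32_ppc.h, which this header includes):

  #include <pveclib/vec_int64_ppc.h>

  // Sketch: three-way sum of even/odd word products plus vrc,
  // modulo 64 bits per doubleword element.
  static inline vui64_t
  my_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
  {
    vui64_t even = vec_muleuw (vra, vrb);
    vui64_t odd  = vec_mulouw (vra, vrb);
    return vec_addudm (vec_addudm (even, odd), vrc);
  }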

◆ vec_vmuleud()

static vui128_t vec_vmuleud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Multiply Even Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmuloud()

static vui128_t vec_vmuloud ( vui64_t  a,
vui64_t  b 
)
inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Note
This implementation exists in vec_int128_ppc.h, as it requires vec_msumudm() and vec_adduqm().

◆ vec_vpkudum()

static vui32_t vec_vpkudum ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Pack Unsigned Doubleword Unsigned Modulo.

The doubleword source is the concatenation of vra and vrb. For each integer word from 0 to 3 of the result vector, do the following: place the contents of bits 32:63 of the corresponding doubleword source element [i] into word element [i] of the result.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Use vec_vpkudum naming but only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 2 x unsigned long integers.
vrba 128-bit vector treated as 2 x unsigned long integers.
Returns
128-bit vector treated as 4 x unsigned integers.

◆ vec_vrld()

static vui64_t vec_vrld ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Rotate Left Doubleword.

Vector Rotate Left Doubleword 0-63 bits. The rotate amount is from bits 58-63 and 122-127 of vrb.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Use vec_vrld naming but only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 2 x unsigned long integers.
vrbshift amount in bits 58:63 and 122:127.
Returns
Left rotated vector unsigned long.

◆ vec_vsld()

static vui64_t vec_vsld ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Shift Left Doubleword.

Vector Shift Left Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Can not use vec_sld naming here as that would conflict with the generic Shift Left Double Vector. Use vec_vsld but only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 2 x unsigned long integers.
vrbshift amount in bits 58:63 and 122:127.
Returns
Left shifted vector unsigned long.

◆ vec_vsrad()

static vi64_t vec_vsrad ( vi64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Shift Right Algebraic Doubleword.

Vector Shift Right Algebraic Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Use the vec_vsrad for consistency with vec_vsld above. Define vec_vsrad only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 2 x unsigned long integers.
vrbshift amount in bits 58:63 and 122:127.
Returns
Right shifted vector unsigned long.

◆ vec_vsrd()

static vui64_t vec_vsrd ( vui64_t  vra,
vui64_t  vrb 
)
inlinestatic

Vector Shift Right Doubleword.

Vector Shift Right Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor Latency Throughput
power8 2 2/cycle
power9 2 2/cycle
Note
Use the vec_vsrd for consistency with vec_vsld above. Define vec_vsrd only if the compiler does not define it in <altivec.h>.
Parameters
vraa 128-bit vector treated as 2 x unsigned long integers.
vrbshift amount in bits 58:63 and 122:127.
Returns
Right shifted vector unsigned long.

◆ vec_vsstuddo()

static void vec_vsstuddo ( vui64_t  xs,
unsigned long long *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets.

For each doubleword element [i] of vra, Store the doubleword element xs[i] at the address *(char*)array+vra[i] For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).

processor Latency Throughput
power8 12 1/cycle
power9 8 1/cycle
Parameters
xsVector of integer doubleword elements to scatter store.
arrayPointer to array of integer doublewords.
vraVector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstuddsx()

static void vec_vsstuddsx ( vui64_t  xs,
unsigned long long *  array,
vi64_t  vra,
const unsigned char  scale 
)
inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, store the doubleword element xs[i] at array[vra[i] * (1 << scale)]. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * 2^scale), which is effected by shifting left (3+scale) bits.

processor Latency Throughput
power8 14-23 1/cycle
power9 10-19 1/cycle
Parameters
xsVector of integer doubleword elements to scatter store.
arrayPointer to array of integer doublewords.
vraVector of signed doubleword indexes.
scale8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstuddx()

static void vec_vsstuddx ( vui64_t  xs,
unsigned long long *  array,
vi64_t  vra 
)
inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes.

For each doubleword element [i] of vra, store the doubleword element xs[i] at array[vra[i]]. Indexes are converted to byte offsets from array by shifting each doubleword of vra left 3 bits (multiplying by sizeof (array element)).

processor Latency Throughput
power8 14-23 1/cycle
power9 10-19 1/cycle
Parameters
xsVector of integer doubleword elements to scatter store.
arrayPointer to array of integer doublewords.
vraVector of signed doubleword indexes.

◆ vec_vsstudso()

static void vec_vsstudso ( vui64_t  xs,
unsigned long long *  array,
const long long  offset0,
const long long  offset1 
)
inlinestatic

Vector Scatter-Store Integer Doublewords to Scalar Offsets.

For each scalar offset [0|1], store the corresponding doubleword element xs[0|1] at *(char*)array+offset[0|1]. For best performance, &array and doubleword offsets should be doubleword aligned (integer multiple of 8).

processor Latency Throughput
power8 12 1/cycle
power9 8 1/cycle
Parameters
xsVector of integer doubleword elements to scatter store.
arrayPointer to array of integer doublewords.
offset0Scalar (64-bit) byte offset from &array.
offset1Scalar (64-bit) byte offset from &array.

◆ vec_vstsidx()

static void vec_vstsidx ( vui64_t  xs,
const signed long long  ra,
unsigned long long *  rb 
)
inlinestatic

Vector Store Scalar Integer Doubleword Indexed.

Stores the left most doubleword of vector xs as a scalar doubleword at the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be doubleword aligned (integer multiple of 8).

This operation is an alternate form of vector store element, with the added simplification that data is always left justified in the vector. This simplifies scatter operations.

Note
This instruction was introduced in PowerISA 2.06 (POWER7). For POWER8/9 there are additional optimizations by effectively converting small constant index values into displacements. For POWER9 we can use the stxsd (DS-form) instruction directly.
processor Latency Throughput
power8 0 - 2 2/cycle
power9 0 - 2 4/cycle
Parameters
xsvector doubleword element 0 to be stored.
raconst signed long long index (offset/displacement).
rbconst doubleword pointer to an array of doublewords.
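A usage sketch (store_pair is a hypothetical name): because the data is left justified, storing both vector elements takes one direct store plus one store after a swap:

  #include <pveclib/vec_int64_ppc.h>

  // Usage sketch: store element 0 at p[0] and element 1 at p[1].
  static inline void
  store_pair (vui64_t xs, unsigned long long *p)
  {
    vec_vstsidx (xs, 0, p);               // element 0 -> byte offset 0
    vec_vstsidx (vec_swapd (xs), 8, p);   // element 1 -> byte offset 8
  }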

◆ vec_xxspltd()

static vui64_t vec_xxspltd ( vui64_t  vra,
const int  ctl 
)
inlinestatic

Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result.

Note
This function implements the operation of a VSX Splat Doubleword Immediate instruction. This implementation is NOT Endian sensitive and the function is stable across BE/LE implementations.

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl vrt[0:63] vrt[64:127]
0 vra[0:63] vra[0:63]
1 vra[64:127] vra[64:127]
processor Latency Throughput
power8 2 2/cycle
power9 3 2/cycle
Parameters
vraa 128-bit vector.
ctla const integer encoding the source doubleword.
Returns
A 128-bit vector with the selected doubleword element replicated to both doubleword elements of the result.