Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements. More...

#include <pveclib/vec_int32_ppc.h>

Functions
static vui64_t	vec_absdud (vui64_t vra, vui64_t vrb)
	Vector Absolute Difference Unsigned Doubleword. More...

static vui64_t	vec_addudm (vui64_t a, vui64_t b)
	Vector Add Unsigned Doubleword Modulo. More...

static vui64_t	vec_clzd (vui64_t vra)
	Vector Count Leading Zeros Doubleword for unsigned long long elements. More...

static vui64_t	vec_ctzd (vui64_t vra)
	Vector Count Trailing Zeros Doubleword for unsigned long long elements. More...

static vb64_t	vec_cmpeqsd (vi64_t a, vi64_t b)
	Vector Compare Equal Signed Doubleword. More...

static vb64_t	vec_cmpequd (vui64_t a, vui64_t b)
	Vector Compare Equal Unsigned Doubleword. More...

static vb64_t	vec_cmpgesd (vi64_t a, vi64_t b)
	Vector Compare Greater Than or Equal Signed Doubleword. More...

static vb64_t	vec_cmpgeud (vui64_t a, vui64_t b)
	Vector Compare Greater Than or Equal Unsigned Doubleword. More...

static vb64_t	vec_cmpgtsd (vi64_t a, vi64_t b)
	Vector Compare Greater Than Signed Doubleword. More...

static vb64_t	vec_cmpgtud (vui64_t a, vui64_t b)
	Vector Compare Greater Than Unsigned Doubleword. More...

static vb64_t	vec_cmplesd (vi64_t a, vi64_t b)
	Vector Compare Less Than or Equal Signed Doubleword. More...

static vb64_t	vec_cmpleud (vui64_t a, vui64_t b)
	Vector Compare Less Than or Equal Unsigned Doubleword. More...

static vb64_t	vec_cmpltsd (vi64_t a, vi64_t b)
	Vector Compare Less Than Signed Doubleword. More...

static vb64_t	vec_cmpltud (vui64_t a, vui64_t b)
	Vector Compare Less Than Unsigned Doubleword. More...

static vb64_t	vec_cmpnesd (vi64_t a, vi64_t b)
	Vector Compare Not Equal Signed Doubleword. More...

static vb64_t	vec_cmpneud (vui64_t a, vui64_t b)
	Vector Compare Not Equal Unsigned Doubleword. More...

static int	vec_cmpsd_all_eq (vi64_t a, vi64_t b)
	Vector Compare all Equal Signed Doubleword. More...

static int	vec_cmpsd_all_ge (vi64_t a, vi64_t b)
	Vector Compare all Greater Than or Equal Signed Doubleword. More...

static int	vec_cmpsd_all_gt (vi64_t a, vi64_t b)
	Vector Compare all Greater Than Signed Doubleword. More...

static int	vec_cmpsd_all_le (vi64_t a, vi64_t b)
	Vector Compare all Less than equal Signed Doubleword. More...

static int	vec_cmpsd_all_lt (vi64_t a, vi64_t b)
	Vector Compare all Less than Signed Doubleword. More...

static int	vec_cmpsd_all_ne (vi64_t a, vi64_t b)
	Vector Compare all Not Equal Signed Doubleword. More...

static int	vec_cmpsd_any_eq (vi64_t a, vi64_t b)
	Vector Compare any Equal Signed Doubleword. More...

static int	vec_cmpsd_any_ge (vi64_t a, vi64_t b)
	Vector Compare any Greater Than or Equal Signed Doubleword. More...

static int	vec_cmpsd_any_gt (vi64_t a, vi64_t b)
	Vector Compare any Greater Than Signed Doubleword. More...

static int	vec_cmpsd_any_le (vi64_t a, vi64_t b)
	Vector Compare any Less than equal Signed Doubleword. More...

static int	vec_cmpsd_any_lt (vi64_t a, vi64_t b)
	Vector Compare any Less than Signed Doubleword. More...

static int	vec_cmpsd_any_ne (vi64_t a, vi64_t b)
	Vector Compare any Not Equal Signed Doubleword. More...

static int	vec_cmpud_all_eq (vui64_t a, vui64_t b)
	Vector Compare all Equal Unsigned Doubleword. More...

static int	vec_cmpud_all_ge (vui64_t a, vui64_t b)
	Vector Compare all Greater Than or Equal Unsigned Doubleword. More...

static int	vec_cmpud_all_gt (vui64_t a, vui64_t b)
	Vector Compare all Greater Than Unsigned Doubleword. More...

static int	vec_cmpud_all_le (vui64_t a, vui64_t b)
	Vector Compare all Less than equal Unsigned Doubleword. More...

static int	vec_cmpud_all_lt (vui64_t a, vui64_t b)
	Vector Compare all Less than Unsigned Doubleword. More...

static int	vec_cmpud_all_ne (vui64_t a, vui64_t b)
	Vector Compare all Not Equal Unsigned Doubleword. More...

static int	vec_cmpud_any_eq (vui64_t a, vui64_t b)
	Vector Compare any Equal Unsigned Doubleword. More...

static int	vec_cmpud_any_ge (vui64_t a, vui64_t b)
	Vector Compare any Greater Than or Equal Unsigned Doubleword. More...

static int	vec_cmpud_any_gt (vui64_t a, vui64_t b)
	Vector Compare any Greater Than Unsigned Doubleword. More...

static int	vec_cmpud_any_le (vui64_t a, vui64_t b)
	Vector Compare any Less than equal Unsigned Doubleword. More...

static int	vec_cmpud_any_lt (vui64_t a, vui64_t b)
	Vector Compare any Less than Unsigned Doubleword. More...

static int	vec_cmpud_any_ne (vui64_t a, vui64_t b)
	Vector Compare any Not Equal Unsigned Doubleword. More...

static vi64_t	vec_maxsd (vi64_t vra, vi64_t vrb)
	Vector Maximum Signed Doubleword. More...

static vui64_t	vec_maxud (vui64_t vra, vui64_t vrb)
	Vector Maximum Unsigned Doubleword. More...

static vi64_t	vec_minsd (vi64_t vra, vi64_t vrb)
	Vector Minimum Signed Doubleword. More...

static vui64_t	vec_minud (vui64_t vra, vui64_t vrb)
	Vector Minimum Unsigned Doubleword. More...

static vui64_t	vec_mrgahd (vui128_t vra, vui128_t vrb)
	Vector Merge Algebraic High Doublewords. More...

static vui64_t	vec_mrgald (vui128_t vra, vui128_t vrb)
	Vector Merge Algebraic Low Doublewords. More...

static vui64_t	vec_mrged (vui64_t __VA, vui64_t __VB)
	Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...

static vui64_t	vec_mrghd (vui64_t __VA, vui64_t __VB)
	Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...

static vui64_t	vec_mrgld (vui64_t __VA, vui64_t __VB)
	Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...

static vui64_t	vec_mrgod (vui64_t __VA, vui64_t __VB)
	Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian. More...

static vui128_t	vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Unsigned Doubleword Modulo. More...

static vui128_t	vec_muleud (vui64_t a, vui64_t b)
	Vector Multiply Even Unsigned Doublewords. More...

static vui64_t	vec_mulhud (vui64_t vra, vui64_t vrb)
	Vector Multiply High Unsigned Doubleword. More...

static vui128_t	vec_muloud (vui64_t a, vui64_t b)
	Vector Multiply Odd Unsigned Doublewords. More...

static vui64_t	vec_muludm (vui64_t vra, vui64_t vrb)
	Vector Multiply Unsigned Doubleword Modulo. More...

static vui64_t	vec_pasted (vui64_t __VH, vui64_t __VL)
	Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector. More...

static vui64_t	vec_permdi (vui64_t vra, vui64_t vrb, const int ctl)
	Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector. More...

static vui64_t	vec_popcntd (vui64_t vra)
	Vector Population Count doubleword. More...

static vui64_t	vec_revbd (vui64_t vra)
	byte reverse each doubleword for a vector unsigned long int. More...

static vui64_t	vec_vrld (vui64_t vra, vui64_t vrb)
	Vector Rotate Left Doubleword. More...

static vui64_t	vec_vsld (vui64_t vra, vui64_t vrb)
	Vector Shift Left Doubleword. More...

static vui64_t	vec_vsrd (vui64_t vra, vui64_t vrb)
	Vector Shift Right Doubleword. More...

static vi64_t	vec_vsrad (vi64_t vra, vui64_t vrb)
	Vector Shift Right Algebraic Doubleword. More...

static vb64_t	vec_setb_sd (vi64_t vra)
	Vector Set Bool from Signed Doubleword. More...

static vui64_t	vec_rldi (vui64_t vra, const unsigned int shb)
	Vector Rotate left Doubleword Immediate. More...

static vui64_t	vec_sldi (vui64_t vra, const unsigned int shb)
	Vector Shift left Doubleword Immediate. More...

static vi64_t	vec_selsd (vi64_t vra, vi64_t vrb, vb64_t vrc)
	Vector Select Signed Doubleword. More...

static vui64_t	vec_selud (vui64_t vra, vui64_t vrb, vb64_t vrc)
	Vector Select Unsigned Doubleword. More...

static vui64_t	vec_splatd (vui64_t vra, const int ctl)
	Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian. More...

static vi64_t	vec_splat_s64 (const int sim)
	Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Signed (Byte \| Halfword \|Word). More...

static vui64_t	vec_splat_u64 (const int sim)
	Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Unsigned (Byte \| Halfword \|Word). More...

static vui64_t	vec_spltd (vui64_t vra, const int ctl)

static vui64_t	vec_srdi (vui64_t vra, const unsigned int shb)
	Vector Shift Right Doubleword Immediate. More...

static vi64_t	vec_sradi (vi64_t vra, const unsigned int shb)
	Vector Shift Right Algebraic Doubleword Immediate. More...

static vui64_t	vec_subudm (vui64_t a, vui64_t b)
	Vector Subtract Unsigned Doubleword Modulo. More...

static vui64_t	vec_swapd (vui64_t vra)
	Vector doubleword swap. Exchange the high and low doubleword elements of a vector. More...

static vui64_t	vec_vgluddo (unsigned long long *array, vi64_t vra)
	Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets. More...

static vui64_t	vec_vgluddsx (unsigned long long *array, vi64_t vra, const unsigned char scale)
	Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes. More...

static vui64_t	vec_vgluddx (unsigned long long *array, vi64_t vra)
	Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes. More...

static vui64_t	vec_vgludso (unsigned long long *array, const long long offset0, const long long offset1)
	Vector Gather-Load Integer Doublewords from Scalar Offsets. More...

static vui64_t	vec_vlsidx (const signed long long ra, const unsigned long long *rb)
	Vector Load Scalar Integer Doubleword Indexed. More...

static vui128_t	vec_vmadd2eud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
	Vector Multiply-Add2 Even Unsigned Doublewords. More...

static vui128_t	vec_vmaddeud (vui64_t a, vui64_t b, vui64_t c)
	Vector Multiply-Add Even Unsigned Doublewords. More...

static vui128_t	vec_vmadd2oud (vui64_t a, vui64_t b, vui64_t c, vui64_t d)
	Vector Multiply-Add2 Odd Unsigned Doublewords. More...

static vui128_t	vec_vmaddoud (vui64_t a, vui64_t b, vui64_t c)
	Vector Multiply-Add Odd Unsigned Doublewords. More...

static vui128_t	vec_vmuleud (vui64_t a, vui64_t b)
	Vector Multiply Even Unsigned Doublewords. More...

static vui128_t	vec_vmuloud (vui64_t a, vui64_t b)
	Vector Multiply Odd Unsigned Doublewords. More...

static vui128_t	vec_vmsumeud (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Even Unsigned Doublewords. More...

static vui128_t	vec_vmsumoud (vui64_t a, vui64_t b, vui128_t c)
	Vector Multiply-Sum Odd Unsigned Doublewords. More...

static vui32_t	vec_vpkudum (vui64_t vra, vui64_t vrb)
	Vector Pack Unsigned Doubleword Unsigned Modulo. More...

static void	vec_vsstuddo (vui64_t xs, unsigned long long *array, vi64_t vra)
	Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets. More...

static void	vec_vsstuddsx (vui64_t xs, unsigned long long *array, vi64_t vra, const unsigned char scale)
	Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes. More...

static void	vec_vsstuddx (vui64_t xs, unsigned long long *array, vi64_t vra)
	Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes. More...

static void	vec_vsstudso (vui64_t xs, unsigned long long *array, const long long offset0, const long long offset1)
	Vector Scatter-Store Integer Doublewords to Scalar Offsets. More...

static void	vec_vstsidx (vui64_t xs, const signed long long ra, unsigned long long *rb)
	Vector Store Scalar Integer Doubleword Indexed. More...

static vui64_t	vec_xxspltd (vui64_t vra, const int ctl)
	Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. More...

static vui64_t	vec_vmaddeuw (vui32_t a, vui32_t b, vui32_t c)
	Vector Multiply-Add Even Unsigned Words. More...

static vui64_t	vec_vmadd2euw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
	Vector Multiply-Add2 Even Unsigned Words. More...

static vui64_t	vec_vmaddouw (vui32_t a, vui32_t b, vui32_t c)
	Vector Multiply-Add Odd Unsigned Words. More...

static vui64_t	vec_vmadd2ouw (vui32_t a, vui32_t b, vui32_t c, vui32_t d)
	Vector Multiply-Add2 Odd Unsigned Words. More...

static vui64_t	vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
	Vector Multiply-Sum Unsigned Word Modulo. More...

Detailed Description

Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

The original VMX (AKA Altivec) did not define any doubleword element (long long integer or double float) operations. The VSX facility (introduced with POWER7) added vector double float but did not add any integer doubleword (64-bit) operations. However, it did add a useful doubleword permute immediate and word-wise merge, shift, and splat immediate operations. Otherwise vector long int (64-bit elements) operations have to be implemented using VMX word and halfword element integer operations for POWER7.

POWER8 (PowerISA 2.07B) adds important doubleword integer (add, subtract, compare, shift, rotate, ...) VMX operations. POWER8 also added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend).

POWER9 (PowerISA 3.0B) adds the Vector Multiply-Sum Unsigned Doubleword Modulo instruction. This is not the expected multiply even/odd/modulo doubleword nor a full multiply modulo quadword. But with a few extra (permutes and splat zero) instructions you can get equivalent function.

Note: The doubleword integer multiply implementations are included in vec_int128_ppc.h. This resolves a circular dependency as 64-bit by 64-bit integer multiplies require 128-bit integer addition (vec_adduqm()) to produce the full product.

See also: vec_msumudm, vec_muleud, vec_mulhud, vec_muloud, vec_muludm, vec_vmuleud, and vec_vmuloud

Most of these intrinsic (compiler built-in) operations are defined in <altivec.h> and described in the compiler documentation. However it took several compiler releases for all the new POWER8 64-bit integer vector intrinsics to be added to altivec.h. This support started with GCC 4.9 but was not complete across function/type and bug free until GCC 6.0.

Note: The compiler disables associated <altivec.h> built-ins if the mcpu target does not enable the specific instruction. For example, if you compile with -mcpu=power7, vec_vclz and vec_vclzd will not be defined. But vec_clzd is always defined in this header, will generate the minimum code, appropriate for the target, and produce correct results.

64-bit integer operations are commonly used in the implementation of optimized double float math library functions and this applies to the vector equivalents of math functions. So missing, incomplete or buggy support for vector long integer intrinsics can be an impediment to the implementation of optimized and portable vector double math libraries. This header is a prerequisite for vec_f64_ppc.h which together are intended to support the implementation of vector math libraries.

Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. So this header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.

This header covers operations that are any of the following:

Implemented in hardware instructions for later processors and useful to programmers, on slightly older processors, even if the equivalent function requires more instructions. Examples include the doubleword operations: Add, Compare, Maximum, Minimum and Subtract.
Defined in the OpenPOWER ABI but not yet defined in <altivec.h> provided by available compilers in common use. Examples include doubleword forms of: Multiply Even/Odd/Modulo, Count Leading Zeros, Population Count, and Byte Reverse operations.
Commonly used operations, not covered by the ABI or <altivec.h>, and require multiple instructions or are not obvious. Examples include doubleword forms of: Merge Algebraic High/Low, Paste, and Rotate/Shift Immediate operations.
Commonly used operations that are useful for doubleword, but are missing from the PowerISA and OpenPOWER ABI. Examples include: Absolute Difference Doubleword and Multiply-Sum Unsigned Word Modulo.

Some missing doubleword operations

The original VMX instruction set extension was limited to byte, halfword, and word size element operations. This limited vector arithmetic operations to char, short, int and float elements. This limitation persisted until PowerISA 2.06 (POWER7) added the Vector Scalar Extensions (VSX) facility. VSX combined/extended the FPRs and VRs into 64 by 128-bit Vector/Scalar Registers (VSRs).

VSX added a large number of scalar double-precision and vector single / double-precision floating-point operations. The double-precision scalar (xs prefix) instructions were largely duplicates of the existing Floating-Point Facility operations, extended to access the whole (64) VSX register set. Similarly the VSX vector single precision floating-point (xv prefix, sp suffix) instructions were added to give vectorized float code access to 64 VSX registers.

The addition of VSX vector double-precision (xv prefix) instructions was the most significant addition. This added vector doubleword floating-point operations and provided access to all 64 VSX registers. Alas, there are no doubleword (64-bit long) integer operations in the initial VSX. A few logical and permute class (xx prefix) operations on word/doubleword elements were tacked on. These apply equally to float and integer elements. But nothing for 64-bit integer arithmetic.

Note: The full title in PowerISA 2.06 is Vector-Scalar Floating-Point Operations [Category: VSX].

PowerISA 2.07 (POWER8) did add a significant number of doubleword (64-bit) integer operations. Including;

Add and subtract modulo
Signed and unsigned compare, maximum, minimum,
Shift and rotate
Count leading zeros and population count

Also a number of new word (32-bit) integer operations;

Multiply even/odd/modulo.
Pack signed/unsigned/saturate and Unpack signed.
Merge even/odd words

And some new quadword (128-bit) integer operations;

Add and Subtract modulo/extend/write-carry
Decimal Add and Subtract modulo

And some specialized operations;

Crypto, Raid, Polynomial multiply-sum

Note: The operations above are all Vector Category and can only access the 32 original vector registers (VSRs 32-63).

The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:

Scalar single precision floating-point
Direct move between GPRs and VSRs
Logical operations; equivalence, not and, or complement

PowerISA 3.0 (POWER9) adds a few more doubleword (64-bit) integer operations. Including;

Compare not equal
Count trailing zeros and parity
Extract and Insert
Multiply-sum modulo
Negate
Rotate Left under mask

Also a number of new word (32-bit) integer operations;

Absolute Difference word
Extend Sign word to doubleword

And some new quadword (128-bit) integer operations;

Multiply-by-10 extend/write-carry
Decimal convert from/to signed (binary) quadword
Decimal convert from/to zoned (ASCII char)
Decimal shift/round/truncate

The new VSX operations (with access to all 64 VSRs) were not directly applicable to 64-bit integer arithmetic:

Scalar quad-precision floating-point
Scalar and Vector convert with rounding
Scalar and Vector extract/insert exponent/significand
Scalar and Vector test data class
Permute and Permute right index

An impressive list of operations that can be used for;

Vectorizing long integer loops
Implementing useful quadword integer operations which do not have corresponding PowerISA instructions
implementing extended precision multiply and multiplicative inverse operations

The challenge is that useful operations available for POWER9 will need equivalent implementations for POWER8 and POWER7. Similarly, operations introduced for POWER8 will need POWER7 implementations. Also, there are some obvious missing operations:

Absolute Difference Doubleword (we have byte, halfword, and word)
Average Doubleword (we have byte, halfword, and word)
Extend Sign Doubleword to quadword (we have byte, halfword, and word)
Multiply-sum Word (we have byte, halfword, and doubleword)
Multiply Even/Odd Doublewords (we have byte, halfword, and word)

Challenges and opportunities

The stated goals for pveclib are:

Provide equivalent functions across versions of the compiler.
Provide equivalent functions across versions of the PowerISA.
Provide complete arithmetic operations across supported C types.

So the first step is to provide implementations for the key POWER8 doubleword integer operations for older compilers. For example, some of the generic doubleword integer operations were not defined until GCC 6.0. Here we define the specific Compare Equal Unsigned Doubleword implementation:

static inline
vb64_t
vec_cmpequd (vui64_t a, vui64_t b)
{
  vb64_t result;
#ifdef _ARCH_PWR8
#if __GNUC__ >= 6
  result = vec_cmpeq(a, b);
#else
  __asm__(
      "vcmpequd %0,%1,%2;\n"
      : "=v" (result)
      : "v" (a),
      "v" (b)
      : );
#endif
#else
  // _ARCH_PWR7 implementation ...
#endif
  return (result);
}

The implementation checks if the compile target is POWER8 then checks if the compiler is new enough to use the generic vector compare built-in. If the generic built-in is not defined in <altivec.h> then we provide the equivalent inline assembler.

For POWER7 targets we don't have any vector compare doubleword operations and we need to define the equivalent operation using PowerISA 2.06B (and earlier) instructions. For example:

#else
  // _ARCH_PWR7 implementation ...
  vui8_t permute =
    { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03,
      0x0C,0x0D,0x0E,0x0F, 0x08,0x09,0x0A,0x0B};
  vui32_t r, rr;
  r = (vui32_t) vec_cmpeq ((vui32_t) a, (vui32_t) b);
  if (vec_any_ne ((vui32_t) a, (vui32_t) b))
    {
       rr = vec_perm (r, r, permute);
       r= vec_and (r, rr);
    }
  result = (vb64_t)r;
#endif

Here we use Compare Equal Unsigned Word. If all words are equal, use the result as is. Otherwise, if any word elements are not equal, we do some extra work. For each doubleword, rotate the word compare result by 32-bits (here we use permute as we don't have rotate doubleword either). Then logical and the original word compare and rotated results to get the final doubleword compare results.

Similarly for all the doubleword compare variants. Similarly for doubleword; add, subtract, maximum, minimum, shift, rotate, count leading zeros, population count, and Byte reverse.

More Challenges

Now we can look at the case where vector doubleword operations of interest don't have an equivalent instruction. Here interesting operations include those that are supported for other element sizes and types.

The simplest example is absolute difference which was introduced in PowerISA 3.0 for byte, halfword and word elements. From the implementation of vec_absduw() we see how to implement the operation for POWER8 using subtract, maximum, and minimum. For example:

static inline vui64_t
vec_absdud (vui64_t vra, vui64_t vrb)
{
  return vec_subudm (vec_maxud (vra, vrb), vec_minud (vra, vrb));
}

This works because pveclib provides implementations for min, max, and sub operations that work across GCC versions and provide processor specific implementations for POWER8/9 and POWER7.

Now we need to look at the multiply doubleword situation. We need implementations for vec_msumudm(), vec_muleud(), vec_mulhud(), vec_muloud(), and vec_muludm(). We saw in the implementations of vec_int32_ppc.h that multiply high and low/modulo can be implemented using multiply and merge even/odd of that element size. Multiply low can also be implemented using the multiply sum and multiply odd of the next smaller element size. Also multiply-sum can be implemented using multiply even/odd and a couple of adds. And multiply even/odd can be implemented using multiply sum by supplying zeros to appropriate inputs/elements.

The above discussion has many circular dependencies. Eventually we need to get down to an implementation on each processor using actual hardware instructions. So what multiply doubleword operations does the PowerISA actually have from the list above:

POWER9 added multiply-sum unsigned doubleword modulo but no multiply doubleword even/odd/modulo instructions.
POWER8 added multiply even/odd/modulo word but no multiply-sum word instructions
POWER7 and earlier we have the original VMX multiply even/odd halfword, and multiply-sum unsigned halfword modulo, but no multiply modulo halfword.

It seems the best implementation strategy uses;

Multiply-sum doubleword for POWER9
Multiply even/odd word for POWER8
Multiply even/odd halfword for POWER7

We really care about performance and latency for POWER9/8. We need POWER7 to work correctly so we can test on and support legacy hardware. The rest is grade school math.

First we need to make sure we have implementations across the GCC versions 6, 7, and 8 for the instructions we need. For example:

static inline vui128_t
vec_msumudm (vui64_t a, vui64_t b, vui128_t c)
{
  vui128_t res;
#if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
  __asm__(
      "vmsumudm %0,%1,%2,%3;\n"
      : "=v" (res)
      : "v" (a), "v" (b), "v" (c)
      : );
#else
  vui128_t p_even, p_odd, p_sum;
 
  p_even = vec_muleud (a, b);
  p_odd  = vec_muloud (a, b);
  p_sum  = vec_adduqm (p_even, p_odd);
  res    = vec_adduqm (p_sum, c);
#endif
  return (res);
}

Note: The _ARCH_PWR8 implementation above depends on vec_muleud() and vec_muloud() for which there are no hardware instructions. Hold that thought.

While we are at it we can implement multiply-sum unsigned word modulo.

static inline vui64_t
vec_vmsumuwm (vui32_t vra, vui32_t vrb, vui64_t vrc)
{
  vui64_t peven, podd, psum;
 
  peven = vec_muleuw (vra, vrb);
  podd  = vec_mulouw (vra, vrb);
  psum  = vec_addudm (peven, podd);
 
  return vec_addudm (psum, vrc);
}

We will need this later.

Now we need to provide implementations of vec_muleud() and vec_muloud(). For example:

static inline vui128_t
vec_muleud (vui64_t a, vui64_t b)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_vmuloud (a, b);
#else
  return vec_vmuleud (a, b);
#endif
}

The implementation above is just handling the pesky little endian transforms. The real implementations are in vec_vmuleud() and vec_vmuloud() which implement the operation as if the PowerISA included such an instruction. These implementations are NOT endian sensitive and the functions are stable across BE/LE implementations. For example:

static inline vui128_t
vec_vmuleud (vui64_t a, vui64_t b)
{
  vui64_t res;
#if defined (_ARCH_PWR9) && ((__GNUC__ >= 6) || (__clang_major__ >= 11))
  const vui64_t zero = { 0, 0 };
  vui64_t b_eud = vec_mrgahd ((vui128_t) b, (vui128_t) zero);
  __asm__(
      "vmsumudm %0,%1,%2,%3;\n"
      : "=v" (res)
      : "v" (a), "v" (b_eud), "v" (zero)
      : );
#else
#ifdef _ARCH_PWR8
  const vui64_t zero = { 0, 0 };
  vui64_t p0, p1, pp10, pp01;
  vui32_t m0, m1;
 
  // Need the endian invariant merge word high here
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform
  m0 = vec_mergel ((vui32_t) b, (vui32_t) b);
#else
  m0 = vec_mergeh ((vui32_t) b, (vui32_t) b);
#endif
  m1 = (vui32_t) vec_xxspltd ((vui64_t) a, 0);
 
  // Need the endian invariant multiply even/odd word here
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform
  p1 = vec_muleuw (m1, m0);
  p0 = vec_mulouw (m1, m0);
#else
  p1 = vec_mulouw (m1, m0);
  p0 = vec_muleuw (m1, m0);
#endif
  // res[1] = p1[1];  res[0] = p0[0];
  res = vec_pasted (p0, p1);
 
  // pp10[1] = p1[0]; pp10[0] = 0;
  // pp01[1] = p0[1]; pp01[0] = 0;
  // Need the endian invariant merge algebraic high/low here
  pp10 = (vui64_t) vec_mrgahd ((vui128_t) zero, (vui128_t) p1);
  pp01 = (vui64_t) vec_mrgald ((vui128_t) zero, (vui128_t) p0);
  // pp01 = pp01 + pp10.
  pp01 = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) pp10);
 
  // res = res + (pp01 << 32)
  pp01 = (vui64_t) vec_sld ((vi32_t) pp01, (vi32_t) pp01, 4);
  res = (vui64_t) vec_adduqm ((vui128_t) pp01, (vui128_t) res);
#else
  // _ARCH_PWR7 implementation ...
#endif
#endif
  return ((vui128_t) res);
}

The _ARCH_PWR9 implementation uses the multiply-sum doubleword operation but implements the multiply even behavior by forcing the contents of doubleword element 1 of [VRB] and the contents of [VRC] to 0.

The _ARCH_PWR8 implementation looks ugly but it works. It starts with some merges and splats to get input columns lined up for the multiply. Then we use (POWER8 instructions) Multiply Even/Odd Unsigned Word to generate doubleword partial products. Then more merges and a rotate to line up the partial products for summation as the final quadword product.

Individually vec_vmuleud() and vec_vmuloud() execute with a latency of 21-23 cycles on POWER8. Normally these operations are used and scheduled together as in the POWER8 implementation of vec_msumudm() or vec_mulhud(). Good scheduling by the compiler and pipelining keeps the POWER8 latency in the 28-32 cycle range. For example, the vec_mulhud() implementation:

static inline vui64_t
vec_mulhud (vui64_t vra, vui64_t vrb)
{
  return vec_mrgahd (vec_vmuleud (vra, vrb), vec_vmuloud (vra, vrb));
}

Generates the following code for POWER8:

vspltisw v0,0
xxmrghw vs33,vs35,vs35
xxspltd vs45,vs34,0
xxmrglw vs35,vs35,vs35
vmulouw v11,v13,v1
xxspltd vs34,vs34,1
xxmrghd vs41,vs32,vs43
vmulouw v12,v2,v3
vmuleuw v13,v13,v1
vmuleuw v2,v2,v3
xxmrghd vs42,vs32,vs44
xxmrgld vs33,vs32,vs45
xxmrgld vs32,vs32,vs34
xxpermdi vs44,vs34,vs44,1
vadduqm v1,v1,v9
xxpermdi vs45,vs45,vs43,1
vadduqm v0,v0,v10
vsldoi  v1,v1,v1,4
vsldoi  v0,v0,v0,4
vadduqm v2,v1,v13
vadduqm v0,v0,v12
xxmrghd vs34,vs34,vs32

The POWER9 latencies for this operation range from 5-7 (for vmsumudm itself) to 11-16 (for vec_mulhud()). The additional latency reflects zero constant vector generation and merges required to condition the inputs and output. For these operations the vec_msumudm() vrc operand is always zero. Selecting the even/odd doubleword for input requires a merge low/high. And selecting the high doubleword for multiply high requires a final merge high.

vec_mulhud() generates the following code for POWER9:

xxspltib vs32,0
xxmrghd vs33,vs35,vs32
xxmrgld vs35,vs32,vs35
vmsumudm v1,v2,v1,v0
vmsumudm v2,v2,v3,v0
xxmrghd vs34,vs33,vs34

Wrapping up the doubleword multiplies we should look at the multiply low (AKA Multiply Unsigned Doubleword Modulo). The POWER9 implementation is similar to vec_mulhud () and the generated code is similar to the example above.

Multiply low doubleword is a special case, as we are discarding the highest partial doubleword product. For POWER8 we can optimize for that case using multiply odd and multiply-sum word operations. Also as we are only generating doubleword partial products we only need add doubleword modulo operations to sum the results. This avoids the more expensive add quadword operation required for the general case. The fact that vec_vmsumuwm() is only a software construct is not an issue. It expands into hardware multiply even/odd word and add doubleword instructions that the compiler can schedule and optimize.

Here vec_mulouw() generates low order partial product. Then vec_vrld () and vec_vmsumuwm() generate doubleword sums of the two middle order partial products. Then vec_vsld() shifts the middle order partial sum left 32-bits (discarding the unneeded high order 32-bits). Finally sum the low and middle order partial doubleword products to produce the multiply-low doubleword result. For example, this POWER8 only implementation:

static inline vui64_t
vec_muludm (vui64_t vra, vui64_t vrb)
{
  vui64_t s32 = { 32, 32 }; // shift / rotate amount.
  vui64_t z = { 0, 0 };
  vui64_t t2, t3, t4;
  vui32_t t1;
 
  t1 = (vui32_t) vec_vrld (vrb, s32);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Nullify the little endian transform, really want mulouw here.
  t2 = vec_muleuw ((vui32_t) vra, (vui32_t) vrb);
#else
  t2 = vec_mulouw ((vui32_t) vra, (vui32_t) vrb);
#endif
  t3 = vec_vmsumuwm ((vui32_t) vra, t1, z);
  t4 = vec_vsld (t3, s32);
  return (vui64_t) vec_vaddudm (t4, t2);
}

Which generates the following for POWER8:

addis   r9,r2,.rodata.cst16+0x60@ha
addi    r9,r9,.rodata.cst16+0x60@l
lxv     vs33,0,r9
vmulouw v13,v2,v3
vrld    v0,v3,v1
vmulouw v3,v2,v0
vmuleuw v2,v2,v0
vaddudm v2,v3,v2
vsld    v2,v2,v1
vaddudm v2,v13,v2

Note: The addition of zeros to the final sum of vec_vmsumuwm() (vec_addudm (psum, vrc)) has been optimized away by the compiler. This eliminates the xxspltib and one vaddudm instruction from the final code sequence.

And we can assume that the constant load of { 32, 32 } will be common-ed with other operations or hoisted out of loops. So the shift constant can be loaded early and vrld is not delayed. This keeps the POWER8 latency in the 19-28 cycle range.

Loading small Doubleword constants

Programming with vector doubleword integers will need doubleword constants for masking and arithmetic operations. Doubleword splat constants are common in vectorized long integer code for arithmetic, comparison, and mask operations. For example:

vui64_t
__test_incud_V0 (vui64_t vra)
{
  // increment unsigned doubleword elements
  return vra + 1;
}

The endian sensitive macros from vec_common_ppc.h can be used to construct doubleword integer constants. For example:

const vui64_t dw_one = CONST_VINT64_DW(1, 1);
const vui64_t dw_ten = CONST_VINT64_DW(10, 10);
const vui64_t dw_sign_mask = (vui64_t) CONST_VINT128_W(0x80000000, 0x0,
                                                       0x80000000, 0x0);

In most cases the compiler will allocate these constant values to the read-only data (.rodata) section. When these constants are referenced in programming operations the compiler generates the appropriate vector loads. For example, GCC V11 generates the following for the -mcpu=power8 target:

addis   r9,r2,.rodata.cst16+0x30@toc@ha
addi    r9,r9,.rodata.cst16+0x30@toc@l
lvx     v0,0,r9    # Load { 1, 1 }
vaddudm v2,v2,v0   # vra + 1

The addis/addi/lvx pattern is common to loading most vector constants for POWER8 and earlier.

For some odd reason the compiler might generate the sequence:

addis   r9,r2,.rodata.cst16+0x30@toc@ha
addi    r9,r9,.rodata.cst16+0x30@toc@l
rldicr  r9,r9,0,59
lxvd2x  vs0,0,r9
xxswapd vs0,vs0

for -mcpu=power8 ppc64le targets.

The Load VSX Vector Dword*2 Indexed (lxvd2x) would be required if the compiler could not know that the data was quadword aligned. The lxvd2x instruction handles unaligned access but requires the little endian adjustment (xxswapd). However the compiler controls the allocation and alignment of vector constants in .rodata and already ensures quadword alignment.

Note: This has the look of a compiler phase error bug where important information is lost between compiler phases.

For the -mcpu=power9 (and later) target GCC uses the Load VSX Vector (lxv) instruction:

addis   r9,r2,.rodata.cst16+0x30@toc@ha
addi    r9,r9,.rodata.cst16+0x30@toc@l
lxv     v2,0(r9)

The first sequence is expected for POWER8 as PowerISA 2.07B does not have any displacement form (D-Form) vector (VSX) load/store instructions. The compiler allocates constants to the .rodata sections and the linker collects .rodata from object files into a combined executable .rodata section. This section is placed near the Table of Contents (TOC) section. The ABI dedicates R2 as the base address .TOC. for the TOC and adjacent sections.

The Add Immediate Shifted (addis) Add Immediate (addi) sequence above computes a signed 32-bit .TOC. relative offset to a specific .rodata quadword. Two instructions are required because addis provides the high adjusted (@ha) 16-bits shifted left 16-bits, while addi provides the low (@l) 16-bits. The sum of R2 and these immediate values is the 64-bit effective address of a .rodata constant value. A signed 32-bit offset is large enough to support most (-mcmodel=medium) program and library executables.

The load itself has a 5-cycle latency assuming a L1 cache hit. The three instruction sequence is sequentially dependent and requires 9-cycles latency (minimum) to execute. A L1 cache miss will increase the latency by 7-28 cycles, assuming the data resides in the L2/L3 caches.

Optimizing loads from .rodata

However the compiler is not following the recommendations of PowerISA 2.07B, Book II, Chapter 2.1 Performance-Optimized Instruction Sequences. This chapter recommends a specific pattern for the addi/lvx sequence. For example:

addis   rA,r2,.rodata.cst16+0x30@toc@ha
addi    rx,0,.rodata.cst16+0x30@toc@l
lvx     v2,rA,rx

In this case rx can be any GPR (including r0) while rA must be a valid base (r1 <-> r31) register.

The POWER8 implementation allows for Instruction Fusion, combining information from two adjacent instructions into one (internal) instruction so that it executes faster than the non-fused case. Effectively the addi/lvx combination above becomes a D-Form load vector instruction.

There are additional restrictions on the definition of adjacent:

The instructions must be in the same dispatch group.
- In single-threaded mode, up to six non-branch and up to two branch instructions (6/2 groups).
- In multi-threaded mode, up to three non-branch and up to one branch instructions (3/1 groups).
Without any intervening branch instructions.
Instructions may span an I-cache line, with both lines fetched and residing in the i-buffer.

This can reduce the latency from 9 to 7-cycles. This would be true even without Instruction Fusion as the addis/addi instructions are now independent and can execute in parallel.

The sequence generated for POWER9 is even more disappointing. The lxv is a D-Form (DQ) instruction and the displacement operand could be used to replace the addi instruction. For example, for the -mcpu=power9 target:

addis r9,r2,.rodata.cst16+0x30@toc@ha

lxv v2,.rodata.cst16+0x30@toc@l(r9)

This provides the equivalent 32-bit TOC relative displacement with one less instruction and a reduced latency of 7-cycles.

Alternatives to loading from .rodata

This is all a little cumbersome and it seems like there should be a better/faster way. Any instruction sequence that loads quadword integer constants in:

three instructions or less,
latency of 6 cycles or less,
and avoids cache misses

is a good deal.

The base (Altivec) vector ISA included Vector Splat Immediate Signed Byte/Halfword/Word instructions. These are fast (2-cycle latency) and convenient for small integer constants in the range -16 to 15. So far the ISA has not added doubleword or quadword forms of splat immediate.

POWER9 added a VSX Vector Splat Immediate Byte (xxspltib) instruction. This expands the immediate range to -128 to 127 but does not include larger element sizes. POWER9 does provide Vector Extend Sign Byte To Word/Doubleword (vextsb2w/vextsb2d) instructions. For example the two instruction sequence:

xxspltib vs34,127

vextsb2d v2,v2

can generate a doubleword splat immediate for integers in the range -128 to 127 with a latency of 5 cycles.

Note: POWER10 does add the interesting VSX Vector Splat Immediate Double-Precision instruction. This is a 64-bit instruction with a 32-bit single precision immediate operand. Interesting but not helpful for doubleword integer.

Some special quadword constants

The GCC compiler does recognize some vector constants as special cases. For example:

vi128_t
__test_splatisq_n1_V0 (void)
{
  const vui32_t q_ones = {-1, -1, -1, -1};
  return (vi128_t) q_ones;
}
 
vi128_t
__test_splatisq_0_V0 (void)
{
  const vui32_t q_zero = {0, 0, 0, 0};
  return (vi128_t) q_zero;
}

will generate:

0000000000000080 <__test_splatisq_n1_V0>:
     vspltisw v2,-1
     blr
00000000000000a0 <__test_splatisq_0_V0>:
     vspltisw v2,0
     blr

As we will see the all zero/ones constants are common building blocks. So the compiler should treat these as common sub expressions across all operations using those constants.

Defining our own vec_splat_s64

So the compiler can do clever things with vector constants. But so far these are the only examples I have found. Other cases that you might expect to be a special case are not. For example:

vui64_t
__test_splatudi_15_V1 (void)
{
  return vec_splats ((unsigned long long) 12);
}
 
vui64_t
__test_splatudi_15_V0 (void)
{
  const vui64_t dw_15 = CONST_VINT64_DW(15, 15);
  return dw_15;
}

both generate the 3 instruction (9-cycle) load from .rodata sequence. Also constants using the vector long long or __int128 types may fail to compile on older versions of the compiler.

We can generate small constants in the range -16 <-> 15 using the following pattern:

vi64_t
__test_splatsdi_15_V1 (void)
{
  vi32_t vwi = vec_splat_s32 (15);
  return vec_unpackl (vwi);
}

Which should generate:

0000000000000040 <__test_splatisd_15_v2>:
      vspltisw v2,15
      vupklsw v2,v2
      blr

Here we use the vec_splat_s32(15) intrinsic to generate Vector Splat Immediate Signed Word (vspltisw) to splat the value 15 across word elements of vwi. Then we use vec_unpackl (vwi) to generate Vector Unpack Low Signed Word (vupklsw), which sign extends the 2 low words of vwi to signed doubleword elements. This sequence is only 2 instructions and will execute with 4-cycle latency.

Note: unfortunately GCC compilers after GCC-8 will recognize this sequence and convert it back to the three instruction .rodata load sequence. See: GCC PR 104124. Until PR 104124 is fixed the following work-around is used for the PVECLIB implementation.

Putting this all together we can create a static inline function to generate small doubleword constants (in the range -16 to 15). For example:

static inline vi64_t
vec_splat_s64_PWR8 (const int sim)
{
  vi64_t result;
  if (__builtin_constant_p (sim) && ((sim >= -16) && (sim < 16)))
    {
      vi32_t vwi = vec_splat_s32 (sim);
 
      if (__builtin_constant_p (sim) && ((sim == 0) || (sim == -1)))
        {
          // Special case for -1 and 0. Skip vec_unpackl().
          result = (vi64_t) vwi;
        } else {
          // For P8 can use either vupklsh or vupklsw but P7 only has
          // vupklsh. Given the reduced range, Either works here.
          // Unpack signed HW works here because immediate value fits
          // into the low HW and sign extends to high HW of each word.
          // Unpack will expand the low HW to low word and high HW
          // (sign extend) into the high word of each DW.
          // Unpack low/high (or endian) will not change the result.
#if defined (__GNUC__) && (__GNUC__ == 8)
          // GCC 8 (AT12) handle this correctly.
          result = (vi64_t) vec_vupklsh ((vi16_t) vwi);
#else
          // But GCC 9+ optimized the above to be load from .rodata.
          // With a little register pressure it adds some gratuitous store/reloads.
          // So the following work-around is required.
          __asm__(
              "vupklsh %0,%1;"
              : "=v" (result)
              : "v" (vwi)
              : );
#endif
        }
    }
  else
    result = vec_splats ((signed long long) sim);
 
  return (result);
}

This version uses only <altivec.h> intrinsics supported by POWER8 and earlier. For constants in the range (-16 to 15) the range is divided into two groups:

Special values -1 and 0 that can be generated in a single instruction.
Values -16 to 15 that require the vwi constant to sign extend.

Values outside this range use the vec_splats() intrinsic which will generate the appropriate quadword constant in .rodata and the load sequence to retrieve that value.

For POWER9 and later we can use the vec_splats() intrinsic which (so far) generates the xxspltib/vextsb2d sequence for the constant range -128 to 127.

static inline vi64_t
vec_splat_s64_PWR9 (const int sim)
{
  return vec_splats ((signed long long) sim);
}

Endian problems with doubleword operations

From the examples above we see that the construction of higher precision multiplies requires significant massaging of input and output elements. Here merge even/odd, merge high/low, swap, and splat doubleword element operations are commonly used.

PowerISA 2.06 VSX (POWER7) added the general purpose Vector Permute Doubleword Immediate (xxpermdi). The compiler generates some form of xxpermdi for the doubleword (double float, long int, bool long) merge/splat/swap operations. As xxpermdi's element selection is an immediate field, most operations require only a single instruction. All the merge/splat/swap doubleword variants are just a specific select mask value and the inputs to xxpermdi.

Which is very useful indeed for assembling, disassembling, merging, splatting, swapping, and pasting doubleword elements.

Of course it took several compiler releases to implement all the generic merge/splat/swap operations for the supported types. GCC 4.8 was the first to support vec_xxpermdi as a built-in. GCC 4.8 also supported the generic built-ins vec_mergeh, vec_mergel, and vec_splat for the vector signed/unsigned/bool long type. But endian sensitive vec_mergeh, vec_mergel, and vec_splat were not supported until GCC 7. And the generic vec_mergee, vec_mergeo built-ins were not supported until GCC 8.

But as we have explained in General Endian Issues and Endian problems with word operations the little endian transforms applied by the compiler can cause problems for developers of multi-precision libraries. The doubleword forms of the generic merge/splat operations etc. are no exception. This is especially annoying when the endian sensitive transforms are applied between releases of the compiler.

So we need a strategy to provide endian invariant merge/splat/swap operations to be used in multi-precision arithmetic. And another set of endian sensitive operations that are mandated by the OpenPOWER ABI.

First we need a safely endian invariant version of xxpermdi to use in building other variants:

vec_permdi() provides the basic xxpermdi operation but nullifies the little endian transforms.

Then build the core set of endian invariant permute doubleword operations using vec_permdi():

Merge algebraic high/low doubleword operations vec_mrgahd() and vec_mrgald().
Merge the left and right most doublewords from a double quadword operation vec_pasted().
Splat from the high/even or low/odd doubleword operation vec_xxspltd().
Swap high and low doublewords operation vec_swapd().

We use the merge algebraic high/low doubleword operations in the implementation of vec_mulhud(), vec_vmuleud(), and vec_vmuloud(). We use the vec_xxspltd operation in the implementation of vec_vrld(), vec_vmuleud(), and vec_vmuloud(). We use the paste doubleword (vec_pasted()) operation in the implementation of vec_vsrad(), vec_vmuleud(), and vec_vmuloud(). We use the swap doubleword operation in the implementation of vec_cmpequq(), vec_cmpneuq(), vec_muludq(), and vec_mulluq().

Then use the compiler's __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ conditional to invert the vec_permdi() select control for endian sensitive merge/splat doubleword operations:

Merge even/odd doubleword operations vec_mrged() and vec_mrgod().
Merge high/low doubleword operations vec_mrghd() and vec_mrgld().
Splat even/odd doubleword operation vec_splatd().

Vector Doubleword Examples

Suppose we have an array of 64-bit time-interval values that need to be converted to timespec format. For simplicity we will also assume that the array is nicely (Quadword) aligned and an integer multiple of 2 doublewords or 4 words.

The PowerISA provides a 64-bit TimeBase register that clocks at a constant 512MHz. The TimeBase can be read directly as either the full 64-bit value or as 32-bit upper and lower halves. For this example we assume we are dealing with longer intervals (greater than ~8.38 seconds) so the full 64-bit TimeBase is required. TimeBase values of adjacent events are subtracted to generate the intervals stored in the array.

The timespec format is a struct of unsigned int fields for seconds and nanoseconds. So the task is to convert the 512MHz 64-bit TimeBase intervals to seconds and remaining clock ticks. Then convert the remaining (subsecond) clock ticks from 512MHz to nanoseconds. The separate seconds and nanoseconds are combined in the timespec structure.

First we need to separate the raw TimeBase into the integer seconds and (subsecond) clock-ticks. Normally scalar codes would use integer divide/modulo by 512000000. Did I mention that the PowerISA vector unit does not have an integer divide operation?

Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 64-bit fraction and we have a multiply high (vec_mulhud()) operation. Multiplying a 64-bit unsigned integer by a 64-bit unsigned fraction generates a 128-bit product with 64-bits above (integer) and below (fraction) the radix point. The high 64-bits of the product is the integer quotient.

It turns out that generating the multiplicative inverse can be tricky. Producing correct results over the full range may require pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic.

See also: "Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.

In the chapter above;

Figure 10-2 Computing the magic number for unsigned division.

provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, the "add" indicator, and the shift amount).

For the divisor 512000000 this is { 4835703278458516699, 0 , 27 }:

the multiplier is 4835703278458516699.
no corrective add of the dividend is required.
the final shift is 27-bits right.

// Magic numbers for multiplicative inverse to divide by 512,000,000
// are 4835703278458516699 and shift right 27 bits.
const vui64_t mul_invs_clock =
  { 4835703278458516699UL, 4835703278458516699UL };
const int shift_clock = 27;
// Need const for TB clocks/second to extract remainder.
const vui64_t tb_clock_sec =
  { 512000000, 512000000};
vui64_t tb_v, tmp, tb_clocks, seconds, nseconds;
vui64_t timespec1, timespec2;
 
// extract integer seconds from timebase vector.
tmp = vec_mulhud (tb_v, mul_invs_clock);
seconds = vec_srdi (tmp, shift_clock);
// Extract the remainder in tb clock ticks.
tmp = vec_muludm (seconds, tb_clock_sec);
tb_clocks = vec_sub (tb_v, tmp);

Next we need to convert the subseconds from TimeBase clock-ticks to nanoseconds. The subsecond remainder is now small enough (compared to a doubleword) that we can perform the conversion in place. The nanosecond conversion is ((tb_clocks * 1000000000) / 512000000). And we can reduce this to ((tb_clocks * 1000) / 512). We still have a small multiply but the divide can be converted to shift right of 9-bits.

const int shift_512 = 9;
const vui64_t nano_512 =
  { 1000, 1000};
 
// Convert 512MHz timebase to nanoseconds.
// nseconds = tb_clocks * 1000000000 / 512000000
// reduces to (tb_clocks * 1000) >> 9
tmp = vec_muludm (tb_clocks, nano_512);
nseconds = vec_srdi (tmp, shift_512);

Finally we need to merge the vectors of seconds and nanoseconds into vectors of timespec. So far we have been working with 64-bit integers but the timespec is a struct of 32-bit (word) integers. Here 32-bit seconds and nanoseconds provide sufficient range and precision. So the final step packs a pair of 64-bit timespec values into a vector of two 32-bit timespec values, each containing 2 32-bit (second, nanosecond) values.

timespec1 = vec_mergeh (seconds, nseconds);
timespec2 = vec_mergel (seconds, nseconds);
// seconds and nanoseconds fit in 32-bits after conversion.
// So pack the results and store the timespec.
*timespec++ = vec_vpkudum (timespec1, timespec2);

Note: vec_sub(), vec_mergeh(), and vec_mergel() are existing altivec.h generic built-ins; vec_vpkudum() is an existing altivec.h built-in that is only defined for _ARCH_PWR8 and later. This header ensures that vec_vpkudum is defined for older compilers and provides a functionally equivalent implementation for POWER7.

Vectorized 64-bit TimeBase conversion example

Here is the complete vectorized 64-bit TimeBase to timespec conversion example:

void
example_dw_convert_timebase (vui64_t *tb, vui32_t *timespec, int n)
{
  // Magic numbers for multiplicative inverse to divide by 512,000,000
  // are 4835703278458516699 and shift right 27 bits.
  const vui64_t mul_invs_clock =
    { 4835703278458516699UL, 4835703278458516699UL };
  const int shift_clock = 27;
  // Need const for TB clocks/second to extract remainder.
  const vui64_t tb_clock_sec =
    { 512000000, 512000000};
  const int shift_512 = 9;
  const vui64_t nano_512 =
    { 1000, 1000};
  vui64_t tb_v, tmp, tb_clocks, seconds, nseconds;
  vui64_t timespec1, timespec2;
  int i;
 
  for (i = 0; i < n; i++)
    {
      tb_v = *tb++;
      // extract integer seconds from timebase vector.
      tmp = vec_mulhud (tb_v, mul_invs_clock);
      seconds = vec_srdi (tmp, shift_clock);
      // Extract remainder in tb clock ticks.
      tmp = vec_muludm (seconds, tb_clock_sec);
      tb_clocks = vec_sub (tb_v, tmp);
      // Convert 512MHz timebase to nanoseconds.
      // nseconds = tb_clocks * 1000000000 / 512000000
      // reduces to (tb_clocks * 1000) >> 9
      tmp = vec_muludm (tb_clocks, nano_512);
      nseconds = vec_srdi (tmp, shift_512);
      // Use merge high/low to interleave seconds and nseconds
      // into timespec.
      timespec1 = vec_mergeh (seconds, nseconds);
      timespec2 = vec_mergel (seconds, nseconds);
      // seconds and nanoseconds fit in 32-bits after conversion.
      // So pack the results and store the timespec.
      *timespec++ = vec_vpkudum (timespec1, timespec2);
    }
}

Performance data.

High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.

Function Documentation

◆ vec_absdud()

static vui64_t vec_absdud	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Absolute Difference Unsigned Doubleword.

Compute the absolute difference for each doubleword. For each unsigned doubleword, subtract VRB[i] from VRA[i] and return the absolute value of the difference.

processor	Latency	Throughput
power8	4	1/cycle
power9	5	1/cycle

Parameters

vra	vector of 2 x unsigned doublewords
vrb	vector of 2 x unsigned doublewords

Returns: vector of the absolute differences.

◆ vec_addudm()

static vui64_t vec_addudm	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Add Unsigned Doubleword Modulo.

Add two vector long int values and return modulo 64-bits result.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

a	128-bit vector long int.
b	128-bit vector long int.

Returns: vector long int sums of a and b.

◆ vec_clzd()

static vui64_t vec_clzd ( vui64_t vra )

inlinestatic

Vector Count Leading Zeros Doubleword for unsigned long long elements.

Count the number of leading '0' bits (0-64) within each doubleword element of a 128-bit vector.

For POWER8 (PowerISA 2.07B) or later use the Vector Count Leading Zeros Doubleword instruction vclzd. Otherwise use sequence of pre 2.07 VMX instructions.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	a 128-bit vector treated as 2 x 64-bit unsigned long long (doubleword) elements.

Returns: 128-bit vector with the leading zeros count for each doubleword element.

◆ vec_cmpeqsd()

static vb64_t vec_cmpeqsd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare equal result for each element.

◆ vec_cmpequd()

static vb64_t vec_cmpequd	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] == b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later, use the Vector Compare Equal Unsigned DoubleWord (vcmpequd) instruction. Otherwise use boolean logic using word compares.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare equal result for each element.

◆ vec_cmpgesd()

static vb64_t vec_cmpgesd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd, then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare greater than or equal result for each element.

◆ vec_cmpgeud()

static vb64_t vec_cmpgeud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] >= b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud, then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare greater than or equal result for each element.

◆ vec_cmpgtsd()

static vb64_t vec_cmpgtsd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Signed DoubleWord (vcmpgtsd) instruction. Otherwise use boolean logic using word compares.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare greater result for each element.

◆ vec_cmpgtud()

static vb64_t vec_cmpgtud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] > b[i], otherwise all '0's.

For POWER8 (PowerISA 2.07B) or later use the Vector Compare Greater Than Unsigned DoubleWord (vcmpgtud) instruction. Otherwise use boolean logic using word compares.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare greater result for each element.

◆ vec_cmplesd()

static vb64_t vec_cmplesd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare Less Than Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] <= b[i], otherwise all '0's. Use vec_cmpgtsd then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare less than or equal result for each element.

◆ vec_cmpleud()

static vb64_t vec_cmpleud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare Less Than Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] <= b[i], otherwise all '0's. Use vec_cmpgtud then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare less than or equal result for each element.

◆ vec_cmpltsd()

static vb64_t vec_cmpltsd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare less Than Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtsd with parameters reversed to implement vec_cmpltsd.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare less result for each element.

◆ vec_cmpltud()

static vb64_t vec_cmpltud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare less Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] < b[i], otherwise all '0's. Use vec_cmpgtud with parameters reversed to implement vec_cmpltud.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare less result for each element.

◆ vec_cmpnesd()

static vb64_t vec_cmpnesd	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare not equal result for each element.

◆ vec_cmpneud()

static vb64_t vec_cmpneud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return all '1's, if a[i] != b[i], otherwise all '0's. Use vec_cmpequd then return the logical inverse.

processor	Latency	Throughput
power8	4	2/cycle
power9	5	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: 128-bit vector with each dword boolean reflecting compare not equal result for each element.

◆ vec_cmpsd_all_eq()

static int vec_cmpsd_all_eq	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a and b are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if equal, false otherwise.

◆ vec_cmpsd_all_ge()

static int vec_cmpsd_all_ge	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a >= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Greater Than or Equal, false otherwise.

◆ vec_cmpsd_all_gt()

static int vec_cmpsd_all_gt	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a > b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Greater Than, false otherwise.

◆ vec_cmpsd_all_le()

static int vec_cmpsd_all_le	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Less than equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a <= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Less Than or Equal, false otherwise.

◆ vec_cmpsd_all_lt()

static int vec_cmpsd_all_lt	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Less than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a < b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Less Than, false otherwise.

◆ vec_cmpsd_all_ne()

static int vec_cmpsd_all_ne	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare all Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if all elements of a and b are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all not equal, false otherwise.

◆ vec_cmpsd_any_eq()

static int vec_cmpsd_any_eq	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a and b are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any equal, false otherwise.

◆ vec_cmpsd_any_ge()

static int vec_cmpsd_any_ge	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Greater Than or Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a >= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Greater Than or Equal, false otherwise.

◆ vec_cmpsd_any_gt()

static int vec_cmpsd_any_gt	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Greater Than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a > b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Greater Than, false otherwise.

◆ vec_cmpsd_any_le()

static int vec_cmpsd_any_le	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Less than equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a <= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Less Than or Equal, false otherwise.

◆ vec_cmpsd_any_lt()

static int vec_cmpsd_any_lt	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Less than Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a < b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Less Than, false otherwise.

◆ vec_cmpsd_any_ne()

static int vec_cmpsd_any_ne	(	vi64_t	a,
		vi64_t	b
	)

inlinestatic

Vector Compare any Not Equal Signed Doubleword.

Compare each signed long (64-bit) integer and return true if any elements of a and b are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit signed long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any not equal, false otherwise.

◆ vec_cmpud_all_eq()

static int vec_cmpud_all_eq	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a and b are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if equal, false otherwise.

◆ vec_cmpud_all_ge()

static int vec_cmpud_all_ge	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a >= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Greater Than or Equal, false otherwise.

◆ vec_cmpud_all_gt()

static int vec_cmpud_all_gt	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a > b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Greater Than, false otherwise.

◆ vec_cmpud_all_le()

static int vec_cmpud_all_le	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Less than equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a <= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Less Than or Equal, false otherwise.

◆ vec_cmpud_all_lt()

static int vec_cmpud_all_lt	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Less than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a < b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all Less Than, false otherwise.

◆ vec_cmpud_all_ne()

static int vec_cmpud_all_ne	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare all Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if all elements of a and b are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for all 128-bits, true if all not equal, false otherwise.

◆ vec_cmpud_any_eq()

static int vec_cmpud_any_eq	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a and b are equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any equal, false otherwise.

◆ vec_cmpud_any_ge()

static int vec_cmpud_any_ge	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Greater Than or Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a >= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Greater Than or Equal, false otherwise.

◆ vec_cmpud_any_gt()

static int vec_cmpud_any_gt	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Greater Than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a > b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Greater Than, false otherwise.

◆ vec_cmpud_any_le()

static int vec_cmpud_any_le	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Less than equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a <= b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Less Than or Equal, false otherwise.

◆ vec_cmpud_any_lt()

static int vec_cmpud_any_lt	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Less than Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a < b.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any Less Than, false otherwise.

◆ vec_cmpud_any_ne()

static int vec_cmpud_any_ne	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Compare any Not Equal Unsigned Doubleword.

Compare each unsigned long (64-bit) integer and return true if any elements of a and b are not equal.

processor	Latency	Throughput
power8	4-9	2/cycle
power9	3	2/cycle

Parameters

a	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.
b	128-bit vector treated as 2 x 64-bit unsigned long integer (dword) elements.

Returns: boolean int for any 128-bits, true if any not equal, false otherwise.

◆ vec_ctzd()

static vui64_t vec_ctzd ( vui64_t vra )

inlinestatic

Vector Count Trailing Zeros Doubleword for unsigned long long elements.

Count the number of trailing '0' bits (0-64) within each doubleword element of a 128-bit vector.

For POWER9 (PowerISA 3.0B) or later use the Vector Count Trailing Zeros Doubleword instruction vctzd. Otherwise use a sequence of pre ISA 3.0 VMX instructions leveraging the PVECLIB popcntd operation. SIMDized count Trailing zeros inspired by:

Warren, Henry S. Jr., Hacker's Delight, 2nd Edition, Addison Wesley, 2013. Chapter 5 Counting Bits, Section 5-4.

processor	Latency	Throughput
power8	8-10	2/2 cycles
power9	3	2/cycle

Parameters

vra	128-bit vector treated as 2 x 64-bit integer (doublewords) elements.

Returns: 128-bit vector with the trailing zeros count for each doubleword element.

◆ vec_maxsd()

static vi64_t vec_maxsd	(	vi64_t	vra,
		vi64_t	vrb
	)

inlinestatic

Vector Maximum Signed Doubleword.

For each doubleword element [0|1] of vra and vrb compare as signed integers and return the larger value in the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector long int.
vrb	128-bit vector long int.

Returns: vector long maximum of a and b.

◆ vec_maxud()

static vui64_t vec_maxud	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Maximum Unsigned Doubleword.

For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the larger value in the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector unsigned long int.
vrb	128-bit vector unsigned long int.

Returns: vector unsigned long maximum of a and b.

◆ vec_minsd()

static vi64_t vec_minsd	(	vi64_t	vra,
		vi64_t	vrb
	)

inlinestatic

Vector Minimum Signed Doubleword.

For each doubleword element [0|1] of vra and vrb compare as signed integers and return the smaller value in the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector long int.
vrb	128-bit vector long int.

Returns: vector long minimum of a and b.

◆ vec_minud()

static vui64_t vec_minud	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Minimum Unsigned Doubleword.

For each doubleword element [0|1] of vra and vrb compare as unsigned integers and return the smaller value in the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	128-bit vector unsigned long int.
vrb	128-bit vector unsigned long int.

Returns: vector unsigned long minimum of a and b.

◆ vec_mrgahd()

static vui64_t vec_mrgahd	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Merge Algebraic High Doublewords.

Merge only the high doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Even Doubleword operation that is not modified for endian.

For example, merge the high 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply high unsigned doubleword.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned __int128.
vrb	128-bit vector unsigned __int128.

Returns: A vector merge from only the high doublewords of the 2 x algebraic quadwords across vra and vrb.

◆ vec_mrgald()

static vui64_t vec_mrgald	(	vui128_t	vra,
		vui128_t	vrb
	)

inlinestatic

Vector Merge Algebraic Low Doublewords.

Merge only the low doublewords from 2 x Algebraic quadwords across vectors vra and vrb. This is effectively the Vector Merge Odd Doubleword operation that is not modified for endian.

For example, merge the low 64-bits from 2 x 128-bit products as generated by vec_muleud/vec_muloud. This result is effectively a vector multiply low unsigned doubleword.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Parameters

vra	128-bit vector unsigned __int128.
vrb	128-bit vector unsigned __int128.

Returns: A vector merge from only the low doublewords of the 2 x algebraic quadwords across vra and vrb.

◆ vec_mrged()

static vui64_t vec_mrged	(	vui64_t	__VA,
		vui64_t	__VB
	)

inlinestatic

Vector Merge Even Doubleword. Merge the even doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

__VA	a 128-bit vector as the source of the results even doubleword.
__VB	a 128-bit vector as the source of the results odd doubleword.

Returns: A vector merge from only the even doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrghd()

static vui64_t vec_mrghd	(	vui64_t	__VA,
		vui64_t	__VB
	)

inlinestatic

Vector Merge High Doubleword. Merge the high doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

__VA	a 128-bit vector as the source of the results even doubleword.
__VB	a 128-bit vector as the source of the results odd doubleword.

Returns: A vector merge from only the high doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrgld()

static vui64_t vec_mrgld	(	vui64_t	__VA,
		vui64_t	__VB
	)

inlinestatic

Vector Merge Low Doubleword. Merge the low doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

__VA	a 128-bit vector as the source of the results even doubleword.
__VB	a 128-bit vector as the source of the results odd doubleword.

Returns: A vector merge from only the low doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_mrgod()

static vui64_t vec_mrgod	(	vui64_t	__VA,
		vui64_t	__VB
	)

inlinestatic

Vector Merge Odd Doubleword. Merge the odd doubleword elements from two vectors into the high and low doubleword elements of the result. This is effectively the VSX Permute Doubleword Immediate operation modified for endian.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

__VA	a 128-bit vector as the source of the results even doubleword.
__VB	a 128-bit vector as the source of the results odd doubleword.

Returns: A vector merge from only the odd doublewords of the 2 x quadwords across __VA and __VB.

◆ vec_msumudm()

static vui128_t vec_msumudm	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Unsigned Doubleword Modulo.

Note: this implementation exists in vec_int128_ppc.h::vec_msumudm() as it requires vec_adduqm().

◆ vec_muleud()

static vui128_t vec_muleud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Even Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_muleud() as it requires vec_vmuleud() and vec_adduqm().

◆ vec_mulhud()

static vui64_t vec_mulhud	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Multiply High Unsigned Doubleword.

Note: this implementation exists in vec_int128_ppc.h::vec_mulhud() as it requires vec_vmuleud() and vec_vmuloud().

◆ vec_muloud()

static vui128_t vec_muloud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_muloud() as it requires vec_vmuloud() and vec_adduqm().

◆ vec_muludm()

static vui64_t vec_muludm	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Multiply Unsigned Doubleword Modulo.

Note: this implementation exists in vec_int128_ppc.h::vec_muludm() as it requires vec_vmuleud() and vec_vmuloud().

◆ vec_pasted()

static vui64_t vec_pasted	(	vui64_t	__VH,
		vui64_t	__VL
	)

inlinestatic

Vector doubleword paste. Concatenate the high doubleword of the 1st vector with the low double word of the 2nd vector.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

__VH	a 128-bit vector as the source of the high order doubleword.
__VL	a 128-bit vector as the source of the low order doubleword.

Returns: The combined 128-bit vector composed of the high order doubleword of __VH and the low order doubleword of __VL.

◆ vec_permdi()

static vui64_t vec_permdi	(	vui64_t	vra,
		vui64_t	vrb,
		const int	ctl
	)

inlinestatic

Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a doubleword selected from the 2nd (vrb) vector.

Note: This function implements the operation of a VSX Permute Doubleword Immediate instruction. This implementation is NOT Endian sensitive and the function is stable across BE/LE implementations.

The 2-bit control operand (ctl) selects which doubleword from the 1st and 2nd vector operands are transfered to the result vector. Control table:

ctl	vrt[0:63]	vrt[64:127]
0	vra[0:63]	vrb[0:63]
1	vra[0:63]	vrb[64:127]
2	vra[64:127]	vrb[0:63]
3	vra[64:127]	vrb[64:127]

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector as the source of the high order doubleword of the result.
vrb	a 128-bit vector as the source of the low order doubleword of the result.
ctl	const integer where the low order 2 bits control the selection of doublewords from input vector vra and vrb.

Returns: The combined 128-bit vector composed of the doubleword selected from vra (high order) and the doubleword selected from vrb (low order), as specified by ctl.

◆ vec_popcntd()

static vui64_t vec_popcntd ( vui64_t vra )

inlinestatic

Vector Population Count doubleword.

Count the number of '1' bits (0-64) within each doubleword element of a 128-bit vector.

processor	Latency	Throughput
power8	4	2/2 cycles
power9	3	2/cycle

For POWER8 (PowerISA 2.07B) or later use the Vector Population Count DoubleWord (vpopcntd) instruction. Otherwise use the pveclib vec_popcntw to count each word then sum across with Vector Sum across Half Signed Word Saturate (vsum2sws).

Parameters

vra	128-bit vector treated as 2 x 64-bit integer (dwords) elements.

Returns: 128-bit vector with the population count for each dword element.

◆ vec_revbd()

static vui64_t vec_revbd ( vui64_t vra )

inlinestatic

byte reverse each doubleword for a vector unsigned long int.

For each doubleword of the input vector, reverse the order of bytes / octets within the doubleword.

processor	Latency	Throughput
power8	2-11	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector unsigned long int.

Returns: a 128-bit vector with the bytes of each doubleword reversed.

◆ vec_rldi()

static vui64_t vec_rldi	(	vui64_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Rotate left Doubleword Immediate.

Rotate left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The rotate amount is a const unsigned int in the range 0-63. A rotate count of 0 returns the original value of vra. Rotate counts greater than 63 bits are handled modulo 64.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned long int.
shb	rotate amount in the range 0-63.

Returns: 128-bit vector unsigned long int, rotated left shb bits.

◆ vec_selsd()

static vi64_t vec_selsd	(	vi64_t	vra,
		vi64_t	vrb,
		vb64_t	vrc
	)

inlinestatic

Vector Select Signed Doubleword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector treated as a vector signed long long int.
vrb	a 128-bit vector treated as a vector signed long long int.
vrc	a 128-bit vector treated as vector bool long long int.

Returns: The selected bits from vra and vrb

◆ vec_selud()

static vui64_t vec_selud	(	vui64_t	vra,
		vui64_t	vrb,
		vb64_t	vrc
	)

inlinestatic

Vector Select Unsigned Doubleword.

Return the value, (vra & ~vrc) | (vrb & vrc).

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned long long int.
vrb	a 128-bit vector treated as a vector unsigned long long int.
vrc	a 128-bit vector treated as vector bool long long int.

Returns: The selected bits from vra and vrb

◆ vec_setb_sd()

static vb64_t vec_setb_sd ( vi64_t vra )

inlinestatic

Vector Set Bool from Signed Doubleword.

For each doubleword, propagate the sign bit to all 64-bits of that doubleword. The result is vector bool long long reflecting the sign bit of each 64-bit doubleword.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	Vector signed long long.

Returns: vector bool long long reflecting the sign bits of each doubleword.

◆ vec_sldi()

static vui64_t vec_sldi	(	vui64_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift left Doubleword Immediate.

Shift left each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned long int.
shb	shift amount in the range 0-63.

Returns: 128-bit vector unsigned long int, shifted left shb bits.

◆ vec_splat_s64()

static vi64_t vec_splat_s64 ( const int sim )

inlinestatic

Vector Splat Immediate Signed Doubleword. Duplicate the signed integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Signed (Byte | Halfword |Word).

Note: POWER9/10 will generate the 2 instruction sequence xxspltib/vextsb2d for values -128 to 127. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section. POWER8 (and earlier) does not have vextsb2d instructions. For a smaller range (-16 -> 15) POWER8 can use the sequence vec_splat_s32/vec_unpackl but the latest compilers are too clever for this and generate a load from .rodata anyway.

processor	Latency	Throughput
power8	4 - 9	2/cycle
power9	5	2/cycle

Parameters

sim	a small signed integer const.

Returns: Vector with sim value splatted to doublewords.

◆ vec_splat_u64()

static vui64_t vec_splat_u64 ( const int sim )

inlinestatic

Vector Splat Immediate Unsigned Doubleword. Duplicate the unsigned integer constant across doubleword elements of the result. This is the doubleword equivalent Vector Splat Immediate Unsigned (Byte | Halfword |Word).

Note: POWER9/10 will generate the 2 instruction sequence xxspltib/vextsb2d for values -128 to 127. Larger values will be loaded as a quadword constant from the read-only data (.rodata) section. POWER8 (and earlier) does not have vextsb2d instructions. For a smaller range (-16 -> 15) POWER8 can use the sequence vec_splat_s32/vec_unpackl but the latest compilers are too clever for this and generate a load from .rodata anyway.

processor	Latency	Throughput
power8	4 - 9	2/cycle
power9	5	2/cycle

Parameters

sim	a small signed integer const.

Returns: Vector with sim value splatted to doublewords.

◆ vec_splatd()

static vui64_t vec_splatd	(	vui64_t	vra,
		const int	ctl
	)

inlinestatic

Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result. This is effectively the VSX Merge doubleword operation modified for endian.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl	vrt[0]	vrt[1]
0	vra[0]	vra[0]
1	vra[1]	vra[1]

Parameters

vra	a 128-bit vector.
ctl	a const integer encoding the source doubleword.

Returns: A vector with the doubleword element selected by ctl replicated to both doubleword elements.

◆ vec_spltd()

static vui64_t vec_spltd	(	vui64_t	vra,
		const int	ctl
	)

inlinestatic

Deprecated: Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl	vrt[0:63]	vrt[64:127]
0	vra[0:63]	vra[0:63]
1	vra[64:127]	vra[64:127]

Parameters

vra	a 128-bit vector.
ctl	a const integer encoding the source doubleword.

Returns: A vector with the doubleword element selected by ctl replicated to both doubleword elements.

◆ vec_sradi()

static vi64_t vec_sradi	(	vi64_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Algebraic Doubleword Immediate.

Shift Right Algebraic each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return the sign bit propagated to each bit of each element.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	a 128-bit vector treated as a vector signed long int.
shb	shift amount in the range 0-63.

Returns: 128-bit vector signed long int, shifted right shb bits.

◆ vec_srdi()

static vui64_t vec_srdi	(	vui64_t	vra,
		const unsigned int	shb
	)

inlinestatic

Vector Shift Right Doubleword Immediate.

Shift Right each doubleword element [0-1], 0-63 bits, as specified by an immediate value. The shift amount is a const unsigned int in the range 0-63. A shift count of 0 returns the original value of vra. Shift counts greater than 63 bits return zero.

processor	Latency	Throughput
power8	2-4	2/cycle
power9	2-5	2/cycle

Parameters

vra	a 128-bit vector treated as a vector unsigned long int.
shb	shift amount in the range 0-63.

Returns: 128-bit vector unsigned long int, shifted right shb bits.

◆ vec_subudm()

static vui64_t vec_subudm	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Subtract Unsigned Doubleword Modulo.

For each unsigned long (64-bit) integer element c[i] = a[i] + NOT(b[i]) + 1.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

For POWER8 (PowerISA 2.07B) or later use the Vector Subtract Unsigned Doubleword Modulo (vsubudm) instruction. Otherwise use vector add word modulo forms and propagate the carry bits.

Parameters

a	128-bit vector treated as 2 X unsigned long int.
b	128-bit vector treated as 2 X unsigned long int.

Returns: vector unsigned long int sum of a[0] + NOT(b[0]) + 1 and a[1] + NOT(b[1]) + 1.

◆ vec_swapd()

static vui64_t vec_swapd ( vui64_t vra )

inlinestatic

Vector doubleword swap. Exchange the high and low doubleword elements of a vector.

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector.

Returns: The original vector with the doubleword elements swapped.

◆ vec_vgluddo()

static vui64_t vec_vgluddo	(	unsigned long long *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Offsets.

For each doubleword element [i] of vra, load the doubleword element at *(char*)array+vra[i]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).

Note: As effective address calculation is modulo 64-bits, signed or unsigned doubleword offsets are equivalent.

processor	Latency	Throughput
power8	12	1/cycle
power9	11	1/cycle

Parameters

array	Pointer to array of integer doublewords.
vra	Vector of doubleword (64-bit) byte offsets from &array.

Returns: vector doubleword containing elements loaded from *(char*)array+vra[0] and *(char*)array+vra[1].

◆ vec_vgluddsx()

static vui64_t vec_vgluddsx	(	unsigned long long *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, load the doubleword element array[vra[i] * (1 << scale)]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * (1 << scale)), which is effected by shifting left (3+scale) bits.

Note: As effective address calculation is modulo 64-bits, signed or unsigned doubleword indexes are equivalent.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of integer doublewords.
vra	Vector of signed doubleword indexes.
scale	8-bit integer. Indexes are multiplied by 2^scale.

Returns: vector containing doublewords from array[(vra[0,1]<<scale)].

◆ vec_vgluddx()

static vui64_t vec_vgluddx	(	unsigned long long *	array,
		vi64_t	vra
	)

inlinestatic

Vector Gather-Load Integer Doublewords from Vector Doubleword Indexes.

For each doubleword element [i] of vra, load the doubleword element from array[vra[i]]. Merge those doubleword elements and return the resulting vector. Array element indices are converted to byte offsets from (array) by multiplying each index by sizeof (array element), which is effected by shifting left 3 bits.

Note: As effective address calculation is modulo 64-bits, signed or unsigned doubleword indexes are equivalent.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	13-22	1/cycle

Parameters

array	Pointer to array of integer doublewords.
vra	Vector of signed doubleword indexes.

Returns: vector containing doublewords array[vra[0,1]].

◆ vec_vgludso()

static vui64_t vec_vgludso	(	unsigned long long *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Gather-Load Integer Doublewords from Scalar Offsets.

For each scalar offset[0|1], load the doubleword element at *(char*)array+offset[0|1]. Merge those doubleword elements and return the resulting vector. For best performance &array and doubleword offsets should be doubleword aligned (integer multiple of 8).

processor	Latency	Throughput
power8	7	1/cycle
power9	8	1/cycle

Parameters

array	Pointer to array of integer doublewords.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.

Returns: vector doubleword containing elements loaded from *(char*)array+offset0 and *(char*)array+offset1.

◆ vec_vlsidx()

static vui64_t vec_vlsidx	(	const signed long long	ra,
		const unsigned long long *	rb
	)

inlinestatic

Vector Load Scalar Integer Doubleword Indexed.

Load the left most doubleword of vector xt as a scalar doubleword from the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. The result xt is returned as a vui64_t vector. For best performance rb and ra should be doubleword aligned (integer multiple of 8).

Note: the right most doubleword of vector xt is left undefined by this operation.

This operation is an alternate form of Vector Load Element (vec_lde), with the added simplification that data is always left justified in the vector. This simplifies merging elements for gather operations.

Note: This instruction was introduced in PowerISA 2.06 (POWER7). For POWER8/9 there are additional optimizations by effectively converting small constant index values into displacements. For POWER8 a specific pattern of addi/lxsdx instructions is fused into a single load displacement internal operation. For POWER9 we can use the lxsd (DS-form) instruction directly.

processor	Latency	Throughput
power8	5	2/cycle
power9	5	2/cycle

Parameters

ra	const signed doubleword index (offset/displacement).
rb	const doubleword pointer to an array of doublewords.

Returns: The data stored at (ra + rb) is loaded into vector doubleword element 0. Element 1 is undefined.

◆ vec_vmadd2eud()

static vui128_t vec_vmadd2eud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c,
		vui64_t	d
	)

inlinestatic

Vector Multiply-Add2 Even Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmadd2eud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmadd2euw()

static vui64_t vec_vmadd2euw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c,
		vui32_t	d
	)

inlinestatic

Vector Multiply-Add2 Even Unsigned Words.

Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c and d: (a_even * b_even) + EXTZ(c_even) + EXTZ(d_even).

Note: The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	9	1/cycle
power9	9	1/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.
c	128-bit vector unsigned int.
d	128-bit vector unsigned int.

Returns: vector unsigned long int sum (a_even * b_even) + EXTZ(c_even) + EXTZ(d_even).

◆ vec_vmadd2oud()

static vui128_t vec_vmadd2oud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c,
		vui64_t	d
	)

inlinestatic

Vector Multiply-Add2 Odd Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmadd2oud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmadd2ouw()

static vui64_t vec_vmadd2ouw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c,
		vui32_t	d
	)

inlinestatic

Vector Multiply-Add2 Odd Unsigned Words.

Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c and d: (a_odd * b_odd) + EXTZ(c_odd) + EXTZ(d_odd).

Note: The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	9	1/cycle
power9	9	1/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.
c	128-bit vector unsigned int.
d	128-bit vector unsigned int.

Returns: vector unsigned long int sum (a_odd * b_odd) + EXTZ(c_odd) + EXTZ(d_odd).

◆ vec_vmaddeud()

static vui128_t vec_vmaddeud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c
	)

inlinestatic

Vector Multiply-Add Even Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmaddeud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmaddeuw()

static vui64_t vec_vmaddeuw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c
	)

inlinestatic

Vector Multiply-Add Even Unsigned Words.

Multiply the even 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the even 32-bit words of c (a_even * b_even) + EXTZ(c_even).

Note: The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	9	2/cycle
power9	9	2/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.
c	128-bit vector unsigned int.

Returns: vector unsigned long int sum (a_even * b_even) + EXTZ(c_even).

◆ vec_vmaddoud()

static vui128_t vec_vmaddoud	(	vui64_t	a,
		vui64_t	b,
		vui64_t	c
	)

inlinestatic

Vector Multiply-Add Odd Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmaddoud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmaddouw()

static vui64_t vec_vmaddouw	(	vui32_t	a,
		vui32_t	b,
		vui32_t	c
	)

inlinestatic

Vector Multiply-Add Odd Unsigned Words.

Multiply the odd 32-bit Words of vector unsigned int values (a * b) and return sums of the unsigned 64-bit product and the odd 32-bit words of c (a_odd * b_odd) + EXTZ(c_odd).

Note: The advantage of this form (versus Multiply-Sum) is that the final 64 bit sums cannot overflow. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	9	2/cycle
power9	9	2/cycle

Parameters

a	128-bit vector unsigned int.
b	128-bit vector unsigned int.
c	128-bit vector unsigned int.

Returns: vector unsigned long int sum (a_odd * b_odd) + EXTZ(c_odd).

◆ vec_vmsumeud()

static vui128_t vec_vmsumeud	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Even Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmsumeud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmsumoud()

static vui128_t vec_vmsumoud	(	vui64_t	a,
		vui64_t	b,
		vui128_t	c
	)

inlinestatic

Vector Multiply-Sum Odd Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmsumoud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmsumuwm()

static vui64_t vec_vmsumuwm	(	vui32_t	vra,
		vui32_t	vrb,
		vui64_t	vrc
	)

inlinestatic

Vector Multiply-Sum Unsigned Word Modulo.

Multiply the unsigned word elements of vra and vrb, internally generating doubleword products. Then generate three-way sum of adjacent doubleword product pairs, plus the doubleword elements from vrc. The final summation is modulo 64-bits.

Note: This function implements the operation of a Vector Multiply-Sum Unsigned Word Modulo instruction, if the PowerISA included such an instruction. This implementation is NOT endian sensitive and the function is stable across BE/LE implementations.

processor	Latency	Throughput
power8	11	1/cycle
power9	11	1/cycle

Parameters

vra	128-bit vector unsigned int.
vrb	128-bit vector unsigned int.
vrc	128-bit vector unsigned long.

Returns: vector of doubleword elements where each is the sum of the even and odd adjacent products of the vra and vrb, plus the corresponding doubleword element of vrc.

◆ vec_vmuleud()

static vui128_t vec_vmuleud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Even Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmuleud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vmuloud()

static vui128_t vec_vmuloud	(	vui64_t	a,
		vui64_t	b
	)

inlinestatic

Vector Multiply Odd Unsigned Doublewords.

Note: this implementation exists in vec_int128_ppc.h::vec_vmuloud() as it requires vec_msumudm() and vec_adduqm().

◆ vec_vpkudum()

static vui32_t vec_vpkudum	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Pack Unsigned Doubleword Unsigned Modulo.

The doubleword source is the concatenation of vra and vrb. For each integer word from 0 to 3, of the result vector, do the following: place the contents of bits 32:63 of the corresponding doubleword source element [i] into word element [i] of the result.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Use vec_vpkudum naming but only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 2 x unsigned long integers.
vrb	a 128-bit vector treated as 2 x unsigned long integers.

Returns: 128-bit vector treated as 4 x unsigned integers.

◆ vec_vrld()

static vui64_t vec_vrld	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Rotate Left Doubleword.

Vector Rotate Left Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Use vec_vrld naming but only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 2 x unsigned long integers.
vrb	shift amount in bits 58:63 and 122:127.

Returns: Left rotated vector unsigned long.

◆ vec_vsld()

static vui64_t vec_vsld	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Shift Left Doubleword.

Vector Shift Left Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Can not use vec_sld naming here as that would conflict with the generic Shift Left Double Vector. Use vec_vsld but only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 2 x unsigned long integers.
vrb	shift amount in bits 58:63 and 122:127.

Returns: Left shifted vector unsigned long.

◆ vec_vsrad()

static vi64_t vec_vsrad	(	vi64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Shift Right Algebraic Doubleword.

Vector Shift Right Algebraic Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Use the vec_vsrad for consistency with vec_vsld above. Define vec_vsrad only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 2 x signed long integers.
vrb	shift amount in bits 58:63 and 122:127.

Returns: Right shifted vector signed long.

◆ vec_vsrd()

static vui64_t vec_vsrd	(	vui64_t	vra,
		vui64_t	vrb
	)

inlinestatic

Vector Shift Right Doubleword.

Vector Shift Right Doubleword 0-63 bits. The shift amount is from bits 58-63 and 122-127 of vrb.

processor	Latency	Throughput
power8	2	2/cycle
power9	2	2/cycle

Note: Use the vec_vsrd for consistency with vec_vsld above. Define vec_vsrd only if the compiler does not define it in <altivec.h>.

Parameters

vra	a 128-bit vector treated as 2 x unsigned long integers.
vrb	shift amount in bits 58:63 and 122:127.

Returns: Right shifted vector unsigned long.

◆ vec_vsstuddo()

static void vec_vsstuddo	(	vui64_t	xs,
		unsigned long long *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doublewords Offsets.

For each doubleword element [i] of vra, store the doubleword element xs[i] at the address *(char*)array+vra[i]. For best performance &array and doubleword offsets vra should be doubleword aligned (integer multiple of 8).

processor	Latency	Throughput
power8	12	1/cycle
power9	8	1/cycle

Parameters

xs	Vector of integer doubleword elements to scatter store.
array	Pointer to array of integer doublewords.
vra	Vector of doubleword (64-bit) byte offsets from &array.

◆ vec_vsstuddsx()

static void vec_vsstuddsx	(	vui64_t	xs,
		unsigned long long *	array,
		vi64_t	vra,
		const unsigned char	scale
	)

inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doubleword Scaled Indexes.

For each doubleword element [i] of vra, store the doubleword element xs[i] at array[(vra[i] << scale)]. Array element indices are converted to byte offsets from (array) by multiplying each index by (sizeof (array element) * (1 << scale)), which is effected by shifting left (3+scale) bits.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector of integer doubleword elements to scatter store.
array	Pointer to array of integer doublewords.
vra	Vector of signed doubleword indexes.
scale	8-bit integer. Indexes are multiplied by 2^scale.

◆ vec_vsstuddx()

static void vec_vsstuddx	(	vui64_t	xs,
		unsigned long long *	array,
		vi64_t	vra
	)

inlinestatic

Vector Scatter-Store Integer Doublewords to Vector Doubleword Indexes.

For each doubleword element [i] of vra, store the doubleword element xs[i] at array[vra[i]]. Indexes are converted to byte offsets from array by shifting each doubleword of vra left 3 bits.

processor	Latency	Throughput
power8	14-23	1/cycle
power9	10-19	1/cycle

Parameters

xs	Vector of integer doubleword elements to scatter store.
array	Pointer to array of integer doublewords.
vra	Vector of signed doubleword indexes.

◆ vec_vsstudso()

static void vec_vsstudso	(	vui64_t	xs,
		unsigned long long *	array,
		const long long	offset0,
		const long long	offset1
	)

inlinestatic

Vector Scatter-Store Integer Doublewords to Scalar Offsets.

For each doubleword element [i] of xs, store the doubleword element xs[i] at *(char*)array+offset[0|1]. For best performance, &array and doubleword offsets should be doubleword aligned (integer multiple of 8).

processor	Latency	Throughput
power8	12	1/cycle
power9	8	1/cycle

Parameters

xs	Vector of integer doubleword elements to scatter store.
array	Pointer to array of integer doublewords.
offset0	Scalar (64-bit) byte offset from &array.
offset1	Scalar (64-bit) byte offset from &array.

◆ vec_vstsidx()

static void vec_vstsidx	(	vui64_t	xs,
		const signed long long	ra,
		unsigned long long *	rb
	)

inlinestatic

Vector Store Scalar Integer Doubleword Indexed.

Stores the left most doubleword of vector xs as a scalar doubleword at the effective address formed by rb+ra. The operand rb is a pointer to an array of doublewords. The operand ra is a doubleword integer byte offset from rb. For best performance rb and ra should be doubleword aligned (integer multiple of 8).

This operation is an alternate form of vector store element, with the added simplification that data is always left justified in the vector. This simplifies scatter operations.

Note: This instruction was introduced in PowerISA 2.06 (POWER7). For POWER9 there are additional optimizations by effectively converting small constant index values into displacements. For POWER9 we can use the stxsd (DS-form) instruction directly.

processor	Latency	Throughput
power8	0 - 2	2/cycle
power9	0 - 2	4/cycle

Parameters

xs	vector doubleword element 0 to be stored.
ra	const signed long long index (offset/displacement).
rb	const doubleword pointer to an array of doublewords.

◆ vec_xxspltd()

static vui64_t vec_xxspltd	(	vui64_t	vra,
		const int	ctl
	)

inlinestatic

Vector splat doubleword. Duplicate the selected doubleword element across the doubleword elements of the result.

Note: This function implements the operation of a VSX Splat Doubleword Immediate instruction. This implementation is NOT Endian sensitive and the function is stable across BE/LE implementations.

The 1-bit control operand (ctl) selects which (0:1) doubleword element, from the vector operand, is replicated to both doublewords of the result vector. Control table:

ctl	vrt[0:63]	vrt[64:127]
0	vra[0:63]	vra[0:63]
1	vra[64:127]	vra[64:127]

processor	Latency	Throughput
power8	2	2/cycle
power9	3	2/cycle

Parameters

vra	a 128-bit vector.
ctl	a const integer encoding the source doubleword.

Returns: A vector with the selected doubleword element replicated to both doubleword elements.

Functions

Detailed Description

Some missing doubleword operations

Challenges and opportunities

More Challenges

Loading small Doubleword constants

Optimizing loads from .rodata

Alternatives to loading from .rodata

Some special quadword constants

Defining our own vec_splat_s64

Endian problems with doubleword operations

Vector Doubleword Examples

Vectorized 64-bit TimeBase conversion example

Performance data.

Function Documentation

◆ vec_absdud()

◆ vec_addudm()

◆ vec_clzd()

◆ vec_cmpeqsd()

◆ vec_cmpequd()

◆ vec_cmpgesd()

◆ vec_cmpgeud()

◆ vec_cmpgtsd()

◆ vec_cmpgtud()

◆ vec_cmplesd()

◆ vec_cmpleud()

◆ vec_cmpltsd()

◆ vec_cmpltud()

◆ vec_cmpnesd()

◆ vec_cmpneud()

◆ vec_cmpsd_all_eq()

◆ vec_cmpsd_all_ge()

◆ vec_cmpsd_all_gt()

◆ vec_cmpsd_all_le()

◆ vec_cmpsd_all_lt()

◆ vec_cmpsd_all_ne()

◆ vec_cmpsd_any_eq()

◆ vec_cmpsd_any_ge()

◆ vec_cmpsd_any_gt()

◆ vec_cmpsd_any_le()

◆ vec_cmpsd_any_lt()

◆ vec_cmpsd_any_ne()

◆ vec_cmpud_all_eq()

◆ vec_cmpud_all_ge()

◆ vec_cmpud_all_gt()

◆ vec_cmpud_all_le()

◆ vec_cmpud_all_lt()

◆ vec_cmpud_all_ne()

◆ vec_cmpud_any_eq()

◆ vec_cmpud_any_ge()

◆ vec_cmpud_any_gt()

◆ vec_cmpud_any_le()

◆ vec_cmpud_any_lt()

◆ vec_cmpud_any_ne()

◆ vec_ctzd()

◆ vec_maxsd()

◆ vec_maxud()

◆ vec_minsd()

◆ vec_minud()

◆ vec_mrgahd()

◆ vec_mrgald()

◆ vec_mrged()

◆ vec_mrghd()

◆ vec_mrgld()

◆ vec_mrgod()

◆ vec_msumudm()

◆ vec_muleud()

◆ vec_mulhud()

◆ vec_muloud()

◆ vec_muludm()

◆ vec_pasted()

◆ vec_permdi()

◆ vec_popcntd()

◆ vec_revbd()

◆ vec_rldi()

◆ vec_selsd()

◆ vec_selud()

◆ vec_setb_sd()

◆ vec_sldi()

◆ vec_splat_s64()