POWER Vector Library Manual
1.0.4
Header package containing a collection of multiple precision quadword integer computation functions implemented with 128-bit PowerISA VMX and VSX instructions. More...
#include <pveclib/vec_int128_ppc.h>
Classes | |
struct | __VEC_U_256 |
A vector representation of a 256-bit unsigned integer. More... | |
struct | __VEC_U_512 |
A vector representation of a 512-bit unsigned integer. More... | |
struct | __VEC_U_640 |
A vector representation of a 640-bit unsigned integer. More... | |
union | __VEC_U_512x1 |
A vector representation of a 512-bit unsigned integer and a 128-bit carry-out. More... | |
struct | __VEC_U_1024 |
A vector representation of a 1024-bit unsigned integer. More... | |
struct | __VEC_U_1152 |
A vector representation of a 1152-bit unsigned integer. More... | |
struct | __VEC_U_2048 |
A vector representation of a 2048-bit unsigned integer. More... | |
union | __VEC_U_1024x512 |
A vector representation of a 1024-bit unsigned integer as two 512-bit fields. More... | |
union | __VEC_U_2048x512 |
A vector representation of a 2048-bit unsigned integer as 4 x 512-bit integer fields. More... | |
struct | __VEC_U_2176 |
A vector representation of a 2176-bit unsigned integer. More... | |
struct | __VEC_U_4096 |
A vector representation of a 4096-bit unsigned integer. More... | |
union | __VEC_U_4096x512 |
A vector representation of a 4096-bit unsigned integer as 8 x 512-bit integer fields. More... | |
Macros | |
#define | CONST_VINT512_Q(__q0, __q1, __q2, __q3) {__q3, __q2, __q1, __q0} |
Generate a 512-bit vector unsigned integer constant from 4 x quadword constants. More... | |
#define | COMPILE_FENCE __asm (";":::) |
A compiler fence to prevent excessive code motion. More... | |
#define | __VEC_PWR_IMP(FNAME) FNAME ## _PWR7 |
Macro to add platform suffix for static calls. | |
Functions | |
static __VEC_U_640 | vec_add512cu (__VEC_U_512 a, __VEC_U_512 b) |
Vector Add 512-bit Unsigned Integer & Write Carry. More... | |
static __VEC_U_640 | vec_add512ecu (__VEC_U_512 a, __VEC_U_512 b, vui128_t c) |
Vector Add Extended 512-bit Unsigned Integer & Write Carry. More... | |
static __VEC_U_512 | vec_add512eum (__VEC_U_512 a, __VEC_U_512 b, vui128_t c) |
Vector Add Extended 512-bit Unsigned Integer Modulo. More... | |
static __VEC_U_512 | vec_add512um (__VEC_U_512 a, __VEC_U_512 b) |
Vector Add 512-bit Unsigned Integer Modulo. More... | |
static __VEC_U_512 | vec_add512ze (__VEC_U_512 a, vui128_t c) |
Vector Add 512-bit to Zero Extended Unsigned Integer Modulo. More... | |
static __VEC_U_512 | vec_add512ze2 (__VEC_U_512 a, vui128_t c1, vui128_t c2) |
Vector Add 512-bit to Zero Extended2 Unsigned Integer Modulo. More... | |
static __VEC_U_256 | vec_mul128x128_inline (vui128_t a, vui128_t b) |
Vector 128x128bit Unsigned Integer Multiply. More... | |
static __VEC_U_512 | vec_mul256x256_inline (__VEC_U_256 m1, __VEC_U_256 m2) |
Vector 256x256-bit Unsigned Integer Multiply. More... | |
static __VEC_U_640 | vec_mul512x128_inline (__VEC_U_512 m1, vui128_t m2) |
Vector 512x128-bit Unsigned Integer Multiply. More... | |
static __VEC_U_640 | vec_madd512x128a128_inline (__VEC_U_512 m1, vui128_t m2, vui128_t a1) |
Vector 512x128-bit Multiply-Add Unsigned Integer. More... | |
static __VEC_U_640 | vec_madd512x128a512_inline (__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2) |
Vector 512x128-bit Multiply-Add Unsigned Integer. More... | |
static __VEC_U_640 | vec_madd512x128a128a512_inline (__VEC_U_512 m1, vui128_t m2, vui128_t a1, __VEC_U_512 a2) |
Vector 512x128-bit Multiply-Add Unsigned Integer. More... | |
static __VEC_U_1024 | vec_mul512x512_inline (__VEC_U_512 m1, __VEC_U_512 m2) |
Vector 512x512-bit Unsigned Integer Multiply. More... | |
static __VEC_U_1024 | vec_madd512x512a512_inline (__VEC_U_512 m1, __VEC_U_512 m2, __VEC_U_512 a1) |
Vector 512-bit Unsigned Integer Multiply-Add. More... | |
__VEC_U_256 | vec_mul128x128 (vui128_t m1, vui128_t m2) |
Vector 128x128bit Unsigned Integer Multiply. More... | |
__VEC_U_512 | vec_mul256x256 (__VEC_U_256 m1, __VEC_U_256 m2) |
Vector 256x256-bit Unsigned Integer Multiply. More... | |
__VEC_U_640 | vec_mul512x128 (__VEC_U_512 m1, vui128_t m2) |
Vector 512x128-bit Unsigned Integer Multiply. More... | |
__VEC_U_640 | vec_madd512x128a512 (__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2) |
Vector 512x128-bit Multiply-Add Unsigned Integer. More... | |
__VEC_U_1024 | vec_mul512x512 (__VEC_U_512 m1, __VEC_U_512 m2) |
Vector 512x512-bit Unsigned Integer Multiply. More... | |
void | vec_mul1024x1024 (__VEC_U_2048 *p2048, __VEC_U_1024 *m1, __VEC_U_1024 *m2) |
Vector 1024x1024-bit Unsigned Integer Multiply. More... | |
void | vec_mul2048x2048 (__VEC_U_4096 *p4096, __VEC_U_2048 *m1, __VEC_U_2048 *m2) |
Vector 2048x2048-bit Unsigned Integer Multiply. More... | |
void | vec_mul128_byMN (vui128_t *p, vui128_t *m1, vui128_t *m2, unsigned long M, unsigned long N) |
Vector Unsigned Integer Quadword MxN Multiply. More... | |
void | vec_mul512_byMN (__VEC_U_512 *p, __VEC_U_512 *m1, __VEC_U_512 *m2, unsigned long M, unsigned long N) |
Vector Unsigned Integer Quadword 4xMxN Multiply. More... | |
Header package containing a collection of multiple precision quadword integer computation functions implemented with 128-bit PowerISA VMX and VSX instructions.
PVECLIB vec_int128_ppc.h provides the 128x128-bit multiply and 128-bit add with carry/extend operations. This is most of what we need to implement multiple precision integer computation. This header builds on those operations to implement 256x256-, 512x128-, 512x512-, 1024x1024- and 2048x2048-bit multiplies. We also provide 512-bit add with carry/extend operations as a general aid to constructing multiple quadword precision arithmetic.
We provide static inline implementations for multiplies up to 512x512 bits and for 512-bit add with carry/extend. These in-line operations are provided as building blocks for coding implementations of larger multiply and sum operations. Beyond that size the in-line code expansion becomes too large for normal coding, so we also provide callable (static and dynamic) library implementations as well (Building libraries for vec_int512_ppc).
The challenge is delivering a 2048x2048-bit multiply, producing a 4096-bit product, while minimizing cache and timing side-channel exploits. The goal is to minimize the memory visibility of intermediate products and sums and to avoid internal conditional logic (like early exit optimizations). The working theory is to use vector registers and operations and to avoid storing intermediate results. This implies keeping multiplicands, partial products, and column sums in vector registers for the duration of the computation.
Achieving these goals requires some knowledge of the Application Binary Interface (ABI) and foibles of the Instruction Set Architecture (PowerISA) and how they impact what the compiler can generate. The compiler itself has internal strategies (and foibles) that need to be managed as well.
The computation requires a number of internal temporary vectors in addition to the inputs and outputs. The Power Architecture, 64-Bit ELF V2 ABI Specification (AKA the ABI) places some generous but important restrictions on how the compiler generates code (and how compliant assembler code is written).
Care is required in selecting the width (256-bit, 512-bit, etc.) of parameter and return values. Parameters totaling more than 12 vector quadwords or return values totaling more than 8 vector quadwords will be spilled to the caller's parameter save area. This may expose intermediate partial products to cache side-channel attacks. A 512x128-bit multiply returning a 640-bit product and a 512x512-bit multiply returning a 1024-bit product both meet these criteria (the parameters and return values fit within the ABI limits). But a 1024x128-bit multiply returning 1152 bits is not OK, because the 1152-bit return value requires 9 vector registers and will be returned in memory.
Also, if any of these sub-functions are used without in-lining, the generated code must be inspected to ensure it is not spilling any local variables. In my experiments with GCC 8.1, the 128x128, 256x256, and 512x128 multiplies all avoid spilling. However the stand-alone 512x512 implementation does require saving 3 non-volatile registers. This can be eliminated by in-lining the 512x512 multiply into the 2048x2048 multiply function.
The Power Instruction Set Architecture (PowerISA) also imposes some restrictions on the registers that vector instructions can access.
The compiler has to find a path through the ABI and ISA restrictions above while it performs instruction selection, function in-lining, register allocation, and instruction scheduling.
Most operations in PVECLIB are defined in terms of AltiVec/VSX built-in functions, so the compiler does not get much choice for instruction selection. The PVECLIB coding style does leverage C language vector extensions to load constants and manage temporary variables. Using compiler AltiVec/VSX built-ins and vector extensions gives the compiler visibility into, and control over, these optimizations.
Internal function calls effectively clobber all volatile vector registers (34 VSRs). As the compiler marshals parameters into the ABI-prescribed VRs it needs to preserve previous live content for later computation. The same applies to volatile registers not used for parameter passing, as they are assumed to be clobbered by the called function. The compiler preserves local live variables before the call by copying their contents to non-volatile registers or spilling them to memory. This may put more pressure on the available non-volatile registers. Small to medium sized functions often require only a fraction of the available volatile registers. In this case, in-lining the function avoids the disruptive volatile register clobber and allows better overall register allocation. So there is a strong incentive to in-line local/static functions.
These compiler optimizations are not independent processes. For example, some VSX instructions can access all 64 VSRs, while others (like the vector integer instructions) are restricted to the 32 VRs. So the compiler prioritizes VRs (the higher 32 VSRs) for allocation to vector integer computation, while the lower 32 VSRs can be used for logical/permute operations and as a level 1 spill area for VRs. These restrictions combined with code size/complexity can increase register pressure to the point where the compiler is forced to spill active (or live) vector registers to secondary storage. This secondary storage can be other registers (for example the lower 32 VSRs for VR spills) or stack memory.
Instruction scheduling can increase register pressure by moving (reordering) instructions. This is more prevalent when there are large differences in instruction latency in the code stream. For example, moving independent / long latency instructions earlier and dependent / short latency instructions later. This tends to increase the distance between the instruction that sets a register result and the next instruction that uses that result in its computation. The distance between a register's set and use is called the live range. This also tends to increase the number of concurrently active and overlapping live ranges.
For this specific (multi-precision integer multiply) example, integer multiply and add/carry/extend instructions predominate. For POWER9, vector integer multiply instructions run 7 cycles, while integer add/carry/extend quadword instructions run 3 cycles. The compiler will want to move the independent multiply instructions earlier while the dependent add/carry instructions are moved later, until the latency of the (multiply) instruction on which each depends is satisfied. Moving dependent instructions apart and moving independent instructions into the scheduling gap increases register pressure.
In extreme cases, this can get out of hand. At high optimization levels, the compiler can push instruction scheduling to the point that it runs out of registers. This forces the compiler to spill live register values, splitting the live range into two smaller live ranges. Any spilled values have to be reloaded later so they can be used in computation. This causes the compiler to generate more instructions that need additional register allocation and scheduling.
Compiler spill code usually needs registers in addition to (and perhaps of a different class than) the registers being spilled. Spilling can be as simple as moving to a register of the same size but a different class, for example register moves between the VRs and the lower 32 VSRs. But it gets more complex when spilling vector registers to memory. For example, vector register spill code needs GPRs to compute stack addresses for vector load/store instructions. Normally this is OK, unless the spill code consumes so many GPRs that it needs to spill GPRs. In that case we can see serious performance bottlenecks.
But remember that a primary goal (Security related implications) was to avoid spilling intermediate results to memory. Spilling between high and low VSRs is acceptable (no cache side-channel), but spilling to memory must be avoided. The compiler should have heuristics to back off in-lining and scheduling-driven code motions just enough to avoid negative performance impacts. But this is difficult to model and may not handle all cases with equal grace. Also this may not prevent spilling VRs to memory if the compiler scheduler's cost computation indicates that is an acceptable trade-off.
So we will have to directly override compiler settings and heuristics to guarantee the result we want/need. The PVECLIB implementation already marks most operations as static inline. But as we use these inline operations as building blocks to implement larger operations we can push the resulting code size over the compiler's default inline limits (-finline-limit). Then the compiler stops in-lining for the duration of compiling the current function.
This may require stronger options/attributes to the compiler like (attribute (always_inline)), (attribute (gnu_inline)), or (attribute (flatten)). The first two are not any help unless you are compiling at a lower optimization level (-O0 or -O1); -O2 defaults to -finline-small-functions and -O3 defaults to the stronger -finline-functions. However attribute (flatten) seems to do exactly what we want: every call inside the flattened function is in-lined unless explicitly told not to (attribute (noinline)), and attribute (flatten) appears to ignore the -finline-limit.
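A minimal sketch of the attribute usage (the function name here is illustrative, not part of the library API); every call inside the flattened body, including the static inline operations it uses, is expanded regardless of -finline-limit:

```c
#include <pveclib/vec_int512_ppc.h>

/* Illustrative wrapper: GCC's flatten attribute in-lines every call within
   this function body (recursively), unless a callee is marked noinline.  */
__VEC_U_1024 __attribute__ ((flatten))
my_flattened_mul512x512 (__VEC_U_512 m1, __VEC_U_512 m2)
{
  return vec_mul512x512_inline (m1, m2);
}
```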
Now we have a large block of code for the compiler's instruction scheduler to work on. In this case the code is very repetitive (multiply, add the column, generate carries, repeat). The instruction scheduler will have lots of opportunity to schedule long vs short latency instructions and to create new and longer live ranges.
Note: In fact, after applying attribute (flatten) to vec_mul2048x2048_PWR9 we see a lot of spill code. This expands the code to over 9300 instructions, with ~3300 instructions associated with spill code.
We need a mechanism to limit (set boundaries) on code motion while preserving optimization over smaller blocks of code. This is normally called a compiler fence but there are multiple definitions so we need to be careful what we use.
We want something that will prevent the compiler from moving instructions (in either direction) across specified lines in the code.
We don't need an atomic memory fence (like __atomic_thread_fence or __sync_synchronize) that forces the processor to order loads and stores relative to a specific synchronization point.
We don't need a compiler memory fence (like asm ("" ::: "memory")). The "memory" clobber forces GCC to assume that any memory may be arbitrarily read or written by the asm block. So any registers holding live local variables will be forced to memory before and need to be reloaded after. This prevents the compiler from reordering loads, stores, and arithmetic operations across it, but does not prevent the processor from reordering them.
Neither of the above are what we want for this case. We specifically want to avoid memory side effects in this computation. We only need the minimal compiler fence (like asm (";" :::)) that prevents the compiler from reordering any code across it but does not prevent the processor from reordering them.
By placing this compiler fence between multiply/sum stages of vec_mul512x128_inline(), vec_mul512x512_inline() and vec_mul2048x2048() we limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure to the point where all 64 VSRs are in use, but no spilling to stack memory is required.
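The sketch below illustrates the placement (it is not the library's internal code): each 512x128 multiply/add stage of a 512x512 multiply is separated by a fence, so the scheduler optimizes within a stage but does not interleave stages. The __VEC_U_640 and __VEC_U_1024 field names (vx0 through vx4, and vx0 through vx7) are assumed here to follow the low-to-high numbering convention described later in this document.

```c
#include <pveclib/vec_int512_ppc.h>

static inline __VEC_U_1024
my_mul512x512_fenced (__VEC_U_512 m1, __VEC_U_512 m2)
{
  __VEC_U_1024 result;
  __VEC_U_640 stage;
  __VEC_U_512 sum;

  /* Stage 0: m1 * low quadword of m2.  The low 128 bits are final;
     the upper 512 bits are the column sum carried into stage 1.  */
  stage = vec_mul512x128_inline (m1, m2.vx0);
  result.vx0 = stage.vx0;
  sum.vx0 = stage.vx1; sum.vx1 = stage.vx2;
  sum.vx2 = stage.vx3; sum.vx3 = stage.vx4;
  COMPILE_FENCE;

  /* Stage 1: m1 * next quadword of m2, plus the previous column sum.  */
  stage = vec_madd512x128a512_inline (m1, m2.vx1, sum);
  result.vx1 = stage.vx0;
  sum.vx0 = stage.vx1; sum.vx1 = stage.vx2;
  sum.vx2 = stage.vx3; sum.vx3 = stage.vx4;
  COMPILE_FENCE;

  /* Stage 2.  */
  stage = vec_madd512x128a512_inline (m1, m2.vx2, sum);
  result.vx2 = stage.vx0;
  sum.vx0 = stage.vx1; sum.vx1 = stage.vx2;
  sum.vx2 = stage.vx3; sum.vx3 = stage.vx4;
  COMPILE_FENCE;

  /* Stage 3: the final 640-bit stage supplies the high 640 bits.  */
  stage = vec_madd512x128a512_inline (m1, m2.vx3, sum);
  result.vx3 = stage.vx0;
  result.vx4 = stage.vx1; result.vx5 = stage.vx2;
  result.vx6 = stage.vx3; result.vx7 = stage.vx4;
  return result;
}
```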
The 2048x2048 multiplicands and the resulting product are so large (8192 bits, 64 quadwords total) that at the outermost function the inputs and the result must be in memory and passed by reference. The implementation of a 2048x2048-bit multiply requires 256 128x128-bit multiplies. Otherwise, the code can be organized into sub-functions generating intermediate partial products and sums.
Coding 256 128x128 products and generating column sums would be tedious. One approach builds up products into larger and larger blocks in stages. For example, code a vec_mul512x128_inline() operation, then use that in the implementation of vec_mul512x512_inline(). We also provide 512-bit add/carry/extend operations to simplify generating sums of 512-bit partial products. Then process 512-bit blocks (4 quadwords, 64 bytes) with vec_mul512x512_inline() to produce 1024-bit partial products (Implications for parameter passing and Product size).
Then multiply the 512-bit blocks across one 2048-bit (4 x 512-bit) multiplicand. The completion of a 2048x512-bit partial product (of 2560-bits) includes the low order 512-bits ready to store to the output operand. Repeat for each 512-bit block of the other 2048-bit multiplicand summing across the 512-bit columns. The final sum, after the final 2048x512 partial product, produces the high order 2048-bits of the 2048x2048 product ready to store to the output operand.
It is best if the sub-function code can be fully in-lined into the 2048x2048-bit multiply, or else the sub-functions must be carefully written. In this case these sub-functions should be leaf functions (they do not call other functions) and should execute without spilling register state or requiring stored (by reference) parameters.
All levels of implementation should avoid conditional logic based on values of inputs or partial products (For example early exits for leading or trailing zero quadwords). Doing so may expose the multiply function to timing side-channel attacks. So the best case would be one large function implemented as straight-line code.
We will need all 64 VSX registers for operations and local variables. So the outer function will need to allocate a stack frame and save all of the non-volatile floating point registers (allowing the use of vs14-vs31 for local vector variables) and vector registers (v20-v31, AKA vs52-vs63) on entry. This frees up (18+12=) 30 additional quadword registers for local vector variables within the outer multiply function.
These saved registers reflect the state of the calling (or higher) function and may not have any crypto sensitive content. These register save areas will not be updated with internal state from the 2048x2048-bit multiply operation itself.
The 128x128-bit vector multiply is implemented with Vector Multiply-Sum Unsigned Doubleword Modulo for POWER9 and Vector Multiply Even/Odd Unsigned Word for POWER8. The timing for vector integer multiply operations is fixed at 7 cycles latency for POWER8/9. The sums of partial products are implemented with Vector Add Unsigned Quadword Modulo/write-Carry/Extended. The timing of integer add quadword operations is fixed at 4 cycles for POWER8 and 3 cycles for POWER9. The rest of the 128x128-bit multiply operation is a combination of Vector Doubleword Permute Immediate, Vector Shift Left Double by Octet Immediate, Vector Splats, and Vector Logical Or (used as a vector register move spanning the 64 VSRs). All of these have fixed timings of 2 or 3 cycles.
So the overall timing of the 2048x2048-bit multiply should be consistent, independent of input values. The only measurable variations would be as the processor changes Simultaneous Multithreading (SMT) modes (controlled by the virtual machine and kernel). The SMT mode (1, 2, 4, 8) controls each hardware thread's priority to issue instructions to the core and whether the instruction stream is dual or single issue (from that thread's perspective).
But the better news is that with some extra function attributes (always_inline and flatten) the entire 2048x2048 multiply function can be flattened into a single function of straight line code (no internal function calls or conditional branches) running ~6.3K instructions. And no spill code was generated for local variables (no register spill within the function body).
As described in General Endian Issues and Endian problems with quadword implementations, supporting both big and little endian in a single implementation has its challenges. But I think we can leave the details of quadword operations to the vec_int128_ppc.h implementation. The decision needed for these implementations is how the quadwords of a multi-quadword integer are ordered in storage. For example, given an array or structure of 16 quadwords representing a single 2048-bit binary number, which quadword contains the low order bits and which the high order bits?
This is largely arbitrary and independent of the system endian. But we should be consistent within the API defined by this header and PVECLIB as a whole. Placing the low order bits in the first (lowest address in memory) quadword and the high order bits in the last (highest address in memory) quadword would be consistent with little endian. Placing the high order bits in the first (lowest address in memory) quadword and the low order bits in the last (highest address in memory) quadword would be consistent with big endian. Either is valid internal to the implementation, where the key issue is accessing the quadwords of the multiplicands in a convenient order to generate the partial products in an order that supports efficient generation of column sums and carries.
It is best for the API if the order of quadwords in multi-quadword integers matches the endian of the platform. This should be helpful where we want to use the PVECLIB implementations under existing APIs using arrays of smaller integer types.
So on powerpc64le systems the low order quadword is the first quadword, while on older powerpc64 (big endian) systems the high order quadword is the first quadword. For example, we can represent a 512-bit integer with the following structure.
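The sketch below shows the layout described here; the authoritative definition is the __VEC_U_512 struct in vec_int512_ppc.h, which may differ in detail.

```c
/* Sketch of the endian-dependent field ordering for a 512-bit integer.  */
typedef struct
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  vui128_t vx0;  /* low order quadword, lowest address (powerpc64le) */
  vui128_t vx1;
  vui128_t vx2;
  vui128_t vx3;  /* high order quadword */
#else
  vui128_t vx3;  /* high order quadword, lowest address (powerpc64 BE) */
  vui128_t vx2;
  vui128_t vx1;
  vui128_t vx0;  /* low order quadword */
#endif
} __VEC_U_512;
```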
In this example the field vx0 is always the low order quadword and vx3 is always the high order quadword, independent of endian. We repeat this pattern for the range of multi-quadword integer sizes (from __VEC_U_256 to __VEC_U_4096) supported by this header. In each case the field name vx0 is consistently the low order quadword. The field name suffix numbering continues from low to high with the highest numbered field name being the high order quadword.
As we have seen, initializing larger multiple precision constants can be challenging (Quadword Integer Constants). The good news is that we can continue to use aggregate initializers for structures and arrays of vector quadwords.
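For example, a sketch (the original manual's example may differ) of a 512-bit constant 1 written with the quadword values listed high to low, using GCC vector compound literals:

```c
#include <pveclib/vec_int512_ppc.h>

const __VEC_U_512 vec512_one =
  {
    (vui128_t) { (unsigned __int128) 0 },  /* intended high order quadword */
    (vui128_t) { (unsigned __int128) 0 },
    (vui128_t) { (unsigned __int128) 0 },
    (vui128_t) { (unsigned __int128) 1 }   /* intended low order quadword */
  };
```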
This example is in the expected high to low order for the 512-bit constant 1. Unfortunately endian raises its ugly head again and this would be a different value on a little endian platform.
So PVECLIB provides another helper macro (CONST_VINT512_Q()) to provide a consistent numerical order for multiple quadword constants.
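For example, a sketch (illustrative, reusing the constant above) of the same 512-bit constant written with the wrapper, which places the quadwords correctly for the platform's struct layout:

```c
const __VEC_U_512 vec512_one = CONST_VINT512_Q
  (
    (vui128_t) { (unsigned __int128) 0 },  /* high order quadword */
    (vui128_t) { (unsigned __int128) 0 },
    (vui128_t) { (unsigned __int128) 0 },
    (vui128_t) { (unsigned __int128) 1 }   /* low order quadword */
  );
```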
Unfortunately the compiler can not help with multi-quadword decimal constants. So we must resort to external tools like bc to compute large constant values and convert them to hexadecimal, which is easier to break into words and doublewords. These can then be used as constants in program source to represent arbitrarily large binary values.
Many of the implementations associated with 512-bit integer operations are uncomfortably large to expand as in-line code (Examples include vec_mul512x512(), vec_mul1024x1024(), and vec_mul2048x2048()). It is better to collect these large implementations in separately compiled run-time libraries. Another consideration is that most of these operations are multiple quadword multiplies and the optimum quadword multiply is processor (and PowerISA version) dependent. This is especially true for Vector integer multiplies across POWER7-POWER9.
This places requirements on the structure of the runtime implementation code and the library build process.
For the first requirement we can collect the runtime implementations for vec_int512_ppc into a single source file (vec_int512_runtime.c). The build system can then collect this and other runtime source files to compile for different targets. This can be as simple as:
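A sketch of one target-qualified runtime source file (the actual file names and include path may differ):

```c
/* vec_runtime_PWR9.c -- compiled with -mcpu=power9.  */
#include "vec_int512_runtime.c"
```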
and similarly for vec_runtime_PWR7.c and vec_runtime_PWR8.c.
As the implementation in vec_int512_runtime.c already leverages _ARCH_PWR7/8/9 tuned static inline operations from vec_int512_ppc.h, vec_int128_ppc.h, etc., all we need to do is apply the appropriate -mcpu=power7/8/9 compile option to each (target qualified) runtime source file.
The second requirement is addressed by applying a target qualifying suffix to each runtime function implementation. Here we use the __VEC_PWR_IMP() macro as a function name wrapper.
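A sketch of how the suffix can be selected from the compile target (the header's actual definition may differ in detail):

```c
#ifdef _ARCH_PWR9
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR9
#else
#ifdef _ARCH_PWR8
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR8
#else
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR7
#endif
#endif
```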
We need to apply the name wrapper to both the function's extern declaration (in vec_int512_ppc.h) and the function implementation (in vec_int512_runtime.c).
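For instance, a sketch for vec_mul512x512 (the signature is taken from this header; the runtime body shown is illustrative):

```c
/* In vec_int512_ppc.h: the target-qualified extern declaration.  */
extern __VEC_U_1024
__VEC_PWR_IMP (vec_mul512x512) (__VEC_U_512 m1, __VEC_U_512 m2);

/* In vec_int512_runtime.c: the matching implementation, which compiles to
   vec_mul512x512_PWR7/8/9 depending on -mcpu.  */
__VEC_U_1024
__VEC_PWR_IMP (vec_mul512x512) (__VEC_U_512 m1, __VEC_U_512 m2)
{
  return vec_mul512x512_inline (m1, m2);
}
```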
This ensures that target specific runtime implementations have unique function symbols. This is important to avoid linker errors (due to duplicate symbol names).
For static linkage the application is compiled for a specific platform target (via -mcpu=). So function calls should be bound to the matching platform specific implementations. The application may select the platform specific function directly by declaring the platform qualified extern and invoking it.
For applications binding to PVECLIB via static archives it is convenient to apply the __VEC_PWR_IMP() wrapper to the function call:
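A sketch of a statically bound call site (variable names are illustrative):

```c
__VEC_U_1024 k;
__VEC_U_512 i, j;
/* ... initialize i and j ... */
k = __VEC_PWR_IMP (vec_mul512x512) (i, j);
```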
The function call symbol picks up the target suffix based on the compile target (-mcpu=) for the application (see Static linkage to platform specific functions). The linker will extract the matching implementations from the PVECLIB archive and (statically) bind them with the application. This simplifies binding the application to the matching target specific implementations.
For applications binding to dynamic libraries, the target qualified naming strategy also simplifies the implementation of IFUNC resolvers for the DSO library (see Building dynamic runtime libraries). Here the target qualified names of the PIC implementations are known to the corresponding resolver function but are not exported from the DSO. Allowing the application to bind to the target qualified names would defeat the automatic selection of target optimized implementations.
Applications using dynamic linkage will call the unqualified function symbol. For example:
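A sketch of a dynamically linked call site (variable names are illustrative):

```c
__VEC_U_1024 k;
__VEC_U_512 i, j;
/* ... initialize i and j ... */
k = vec_mul512x512 (i, j);
```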
This symbol's implementation has a special STT_GNU_IFUNC attribute recognized by the dynamic linker. This attribute associates this symbol with the corresponding runtime resolver function. So in addition to any platform specific implementations we need to provide the resolver function referenced by the IFUNC symbol. For example:
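A sketch of the IFUNC symbol and its resolver, assuming GCC's ifunc attribute and __builtin_cpu_supports(); the library's actual resolver may differ:

```c
/* Target-qualified implementations built into the DSO (not exported).  */
extern __VEC_U_1024 vec_mul512x512_PWR9 (__VEC_U_512, __VEC_U_512);
extern __VEC_U_1024 vec_mul512x512_PWR8 (__VEC_U_512, __VEC_U_512);
extern __VEC_U_1024 vec_mul512x512_PWR7 (__VEC_U_512, __VEC_U_512);

/* The resolver runs once, at the first call, and returns the address of
   the best implementation for the running processor.  */
static __VEC_U_1024
(*resolve_vec_mul512x512 (void)) (__VEC_U_512, __VEC_U_512)
{
#ifdef __BUILTIN_CPU_SUPPORTS__
  if (__builtin_cpu_supports ("arch_3_00"))       /* POWER9 */
    return vec_mul512x512_PWR9;
  else if (__builtin_cpu_supports ("arch_2_07"))  /* POWER8 */
    return vec_mul512x512_PWR8;
  else
#endif
    return vec_mul512x512_PWR7;
}

/* The exported (unqualified) symbol carries the STT_GNU_IFUNC attribute.  */
__VEC_U_1024
vec_mul512x512 (__VEC_U_512 m1, __VEC_U_512 m2)
__attribute__ ((ifunc ("resolve_vec_mul512x512")));
```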
On the program's first call to an IFUNC symbol, the dynamic linker calls the resolver function associated with that symbol. The resolver function performs a runtime check to determine the platform, selects the (closest) matching platform specific function, then returns that function's address to the dynamic linker.
The dynamic linker stores this function address in the caller's Procedure Linkage Table (PLT) before forwarding the call to the resolved implementation. Any subsequent calls to this function symbol branch (via the PLT) directly to the appropriate platform specific implementation.
#define COMPILE_FENCE __asm (";":::)
A compiler fence to prevent excessive code motion.
We use COMPILE_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.
#define CONST_VINT512_Q(__q0, __q1, __q2, __q3)   {__q3, __q2, __q1, __q0}
Generate a 512-bit vector unsigned integer constant from 4 x quadword constants.
Combine 4 x quadwords constants into a 512-bit __VEC_U_512 constant. The 4 parameters are quadword integer constant values in high to low order. For example:
static __VEC_U_640 vec_add512cu (__VEC_U_512 a, __VEC_U_512 b) [inline, static]
Vector Add 512-bit Unsigned Integer & Write Carry.
Compute the 512 bit sum of two 512 bit values a, b and produce the carry. The sum (with-carry) is returned as single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
b | vector representation of a unsigned 512-bit integer. |
static __VEC_U_640 vec_add512ecu (__VEC_U_512 a, __VEC_U_512 b, vui128_t c) [inline, static]
Vector Add Extended 512-bit Unsigned Integer & Write Carry.
Compute the 512 bit sum of two 512 bit values a, b and 1 bit value carry-in value c. Produce the carry out of the high order bit of the sum. The sum (with-carry) is returned as single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
b | vector representation of a unsigned 512-bit integer. |
c | vector representation of a unsigned 1-bit carry. |
static __VEC_U_512 vec_add512eum (__VEC_U_512 a, __VEC_U_512 b, vui128_t c) [inline, static]
Vector Add Extended 512-bit Unsigned Integer Modulo.
Compute the 512 bit sum of two 512 bit values a, b and 1 bit value carry-in value c. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
b | vector representation of a unsigned 512-bit integer. |
c | vector representation of a unsigned 1-bit carry. |
static __VEC_U_512 vec_add512um (__VEC_U_512 a, __VEC_U_512 b) [inline, static]
Vector Add 512-bit Unsigned Integer Modulo.
Compute the 512 bit sum of two 512 bit values a, b. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
b | vector representation of a unsigned 512-bit integer. |
static __VEC_U_512 vec_add512ze (__VEC_U_512 a, vui128_t c) [inline, static]
Vector Add 512-bit to Zero Extended Unsigned Integer Modulo.
The carry-in is zero extended to the left before computing the 512-bit sum a + c. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
c | vector representation of a unsigned 1-bit carry. |
static __VEC_U_512 vec_add512ze2 (__VEC_U_512 a, vui128_t c1, vui128_t c2) [inline, static]
Vector Add 512-bit to Zero Extended2 Unsigned Integer Modulo.
The two carry-ins are zero extended to the left before Computing the 512 bit sum a + c1 + c2. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.
processor | Latency | Throughput |
---|---|---|
power8 | 16 | 1/cycle |
power9 | 12 | 1/cycle |
a | vector representation of a unsigned 512-bit integer. |
c1 | vector representation of a unsigned 1-bit carry. |
c2 | vector representation of a unsigned 1-bit carry. |
static __VEC_U_640 vec_madd512x128a128_inline (__VEC_U_512 m1, vui128_t m2, vui128_t a1) [inline, static]
Vector 512x128-bit Multiply-Add Unsigned Integer.
Compute the 640-bit sum of the product of the 512-bit value m1 and the 128-bit value m2, plus the 128-bit value a1. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
a1 | vector representation of a unsigned 128-bit integer. |
static __VEC_U_640 vec_madd512x128a128a512_inline (__VEC_U_512 m1, vui128_t m2, vui128_t a1, __VEC_U_512 a2) [inline, static]
Vector 512x128-bit Multiply-Add Unsigned Integer.
Compute the 640-bit sum of the product of the 512-bit value m1 and the 128-bit value m2, plus the 128-bit value a1, plus the 512-bit value a2. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
a1 | vector representation of a unsigned 128-bit integer. |
a2 | vector representation of a unsigned 512-bit integer. |
__VEC_U_640 vec_madd512x128a512 (__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2)
Vector 512x128-bit Multiply-Add Unsigned Integer.
Compute the 640 bit sum of the product of the 512 bit value m1 and 128-bit value m2 plus the 512-bit value a2. The sum is returned as single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
a2 | vector representation of a unsigned 512-bit integer. |
static __VEC_U_640 vec_madd512x128a512_inline (__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2) [inline, static]
Vector 512x128-bit Multiply-Add Unsigned Integer.
Compute the 640-bit sum of the product of the 512-bit value m1 and the 128-bit value m2, plus the 512-bit value a2. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
a2 | vector representation of a unsigned 512-bit integer. |
static __VEC_U_1024 vec_madd512x512a512_inline (__VEC_U_512 m1, __VEC_U_512 m2, __VEC_U_512 a1) [inline, static]
Vector 512-bit Unsigned Integer Multiply-Add.
Compute the 1024 bit sum of the product of 512 bit values m1 and m2 and 512 bit addend a1. The sum is returned as single 1024-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | ~600 | 1/cycle |
power9 | ~210 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 512-bit integer. |
a1 | vector representation of a unsigned 512-bit integer. |
void vec_mul1024x1024 (__VEC_U_2048 *p2048, __VEC_U_1024 *m1, __VEC_U_1024 *m2)
Vector 1024x1024-bit Unsigned Integer Multiply.
Compute the 2048 bit product of 1024 bit values m1 and m2. The product is returned as single 2048-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | ~2500 | 1/cycle |
power9 | ~810 | 1/cycle |
p2048 | vector result as a unsigned 2048-bit integer in storage. |
m1 | vector representation of a unsigned 1024-bit integer. |
m2 | vector representation of a unsigned 1024-bit integer. |
void vec_mul128_byMN (vui128_t *p, vui128_t *m1, vui128_t *m2, unsigned long M, unsigned long N)
Vector Unsigned Integer Quadword MxN Multiply.
Compute the M+N quadword product of two quadword arrays m1, m2. The product is returned as M+N quadword array p.
processor | Latency | Throughput |
---|---|---|
power8 | ??? | 1/cycle |
power9 | ??? | 1/cycle |
p | pointer to vector result as an unsigned (M+N)x128-bit integer in storage. |
m1 | pointer to vector representation of an unsigned Mx128-bit integer. |
m2 | pointer to vector representation of an unsigned Nx128-bit integer. |
M | long int specifying the number of quadwords in m1. |
N | long int specifying the number of quadwords in m2. |
__VEC_U_256 vec_mul128x128 (vui128_t m1, vui128_t m2)
Vector 128x128bit Unsigned Integer Multiply.
Compute the 256 bit product of two 128 bit values a, b. The product is returned as single 256-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 48-56 | 1/cycle |
power9 | 16-24 | 1/cycle |
m1 | vector representation of a unsigned 128-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
static __VEC_U_256 vec_mul128x128_inline (vui128_t a, vui128_t b) [inline, static]
Vector 128x128bit Unsigned Integer Multiply.
Compute the 256 bit product of two 128 bit values a, b. The product is returned as single 256-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 56-64 | 1/cycle |
power9 | 33-39 | 1/cycle |
a | vector representation of a unsigned 128-bit integer. |
b | vector representation of a unsigned 128-bit integer. |
void vec_mul2048x2048 (__VEC_U_4096 *p4096, __VEC_U_2048 *m1, __VEC_U_2048 *m2)
Vector 2048x2048-bit Unsigned Integer Multiply.
Compute the 4096 bit product of 2048 bit values m1 and m2. The product is returned as single 4096-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | ~12000 | 1/cycle |
power9 | 4770 | 1/cycle |
p4096 | vector result as a unsigned 4096-bit integer in storage. |
m1 | vector representation of a unsigned 2048-bit integer. |
m2 | vector representation of a unsigned 2048-bit integer. |
__VEC_U_512 vec_mul256x256 (__VEC_U_256 m1, __VEC_U_256 m2)
Vector 256x256-bit Unsigned Integer Multiply.
Compute the 512 bit product of two 256 bit values a, b. The product is returned as single 512-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 140-150 | 1/cycle |
power9 | 46-58 | 1/cycle |
m1 | vector representation of a unsigned 256-bit integer. |
m2 | vector representation of a unsigned 256-bit integer. |
static __VEC_U_512 vec_mul256x256_inline (__VEC_U_256 m1, __VEC_U_256 m2) [inline, static]
Vector 256x256-bit Unsigned Integer Multiply.
Compute the 512 bit product of two 256 bit values a, b. The product is returned as single 512-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 256-bit integer. |
m2 | vector representation of a unsigned 256-bit integer. |
void vec_mul512_byMN (__VEC_U_512 *p, __VEC_U_512 *m1, __VEC_U_512 *m2, unsigned long M, unsigned long N)
Vector Unsigned Integer Quadword 4xMxN Multiply.
Compute the 4xM+N quadword product of two quadword arrays m1, m2. The product is returned as 4xM+N quadword array p.
processor | Latency | Throughput |
---|---|---|
power8 | ~570*(M*N) | 1/cycle |
power9 | ~260*(M*N) | 1/cycle |
p | pointer to vector result as an unsigned (M+N)x512-bit integer in storage. |
m1 | pointer to vector representation of an unsigned Mx512-bit integer. |
m2 | pointer to vector representation of an unsigned Nx512-bit integer. |
M | long int specifying the number of 4x quadwords in m1. |
N | long int specifying the number of 4x quadwords in m2. |
__VEC_U_640 vec_mul512x128 (__VEC_U_512 m1, vui128_t m2)
Vector 512x128-bit Unsigned Integer Multiply.
Compute the 640 bit product of 512 bit value m1 and 128-bit value m2. The product is returned as single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
static __VEC_U_640 vec_mul512x128_inline (__VEC_U_512 m1, vui128_t m2) [inline, static]
Vector 512x128-bit Unsigned Integer Multiply.
Compute the 640 bit product of 512 bit value m1 and 128-bit value m2. The product is returned as single 640-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | 224-232 | 1/cycle |
power9 | 132-135 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 128-bit integer. |
__VEC_U_1024 vec_mul512x512 (__VEC_U_512 m1, __VEC_U_512 m2)
Vector 512x512-bit Unsigned Integer Multiply.
Compute the 1024 bit product of 512 bit values m1 and m2. The product is returned as single 1024-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | ~600 | 1/cycle |
power9 | ~210 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 512-bit integer. |
static __VEC_U_1024 vec_mul512x512_inline (__VEC_U_512 m1, __VEC_U_512 m2) [inline, static]
Vector 512x512-bit Unsigned Integer Multiply.
Compute the 1024 bit product of 512 bit values m1 and m2. The product is returned as single 1024-bit integer in a homogeneous aggregate structure.
processor | Latency | Throughput |
---|---|---|
power8 | ~600 | 1/cycle |
power9 | ~210 | 1/cycle |
m1 | vector representation of a unsigned 512-bit integer. |
m2 | vector representation of a unsigned 512-bit integer. |