POWER Vector Library Manual
1.0.4
|
Header package containing a collection of Binary Coded Decimal (BCD) computation and Zoned Character conversion operations on vector registers. More...
#include <pveclib/vec_common_ppc.h>
#include <pveclib/vec_char_ppc.h>
#include <pveclib/vec_int128_ppc.h>
Go to the source code of this file.
Macros | |
#define | vBCD_t vui32_t |
vector signed BCD integer of up to 31 decimal digits. More... | |
#define | vbBCD_t vb32_t |
vector bool from 128-bit signed BCD integer. | |
#define | _BCD_CONST_PLUS_NINES ((vBCD_t) CONST_VINT128_DW128(0x9999999999999999, 0x999999999999999c)) |
vector signed BCD constant +9s. | |
#define | _BCD_CONST_PLUS_ONE ((vBCD_t) CONST_VINT128_DW128(0, 0x1c)) |
vector signed BCD constant +1. | |
#define | _BCD_CONST_MINUS_ONE ((vBCD_t) CONST_VINT128_DW128(0, 0x1d)) |
vector signed BCD constant -1. | |
#define | _BCD_CONST_ZERO ((vBCD_t) CONST_VINT128_DW128(0, 0x0c)) |
vector signed BCD constant +0. | |
#define | _BCD_CONST_SIGN_MASK ((vBCD_t) CONST_VINT128_DW128(0, 0xf)) |
vector BCD sign mask in bits 124:127. | |
Functions | |
static vui64_t | vec_BCD2BIN (vBCD_t val) |
Convert vector of 2 x unsigned 16-digit BCD values to vector 2 x doubleword binary values. More... | |
static _Decimal128 | vec_BCD2DFP (vBCD_t val) |
Convert a Vector Signed BCD value to __Decimal128. More... | |
static vBCD_t | vec_BIN2BCD (vui64_t val) |
Convert vector unsigned doubleword binary values to Vector unsigned 16-digit BCD values. More... | |
static vBCD_t | vec_DFP2BCD (_Decimal128 val) |
Convert a __Decimal128 value to Vector BCD. More... | |
static vBCD_t | vec_bcdadd (vBCD_t a, vBCD_t b) |
Decimal Add Signed Modulo Quadword. More... | |
static vBCD_t | vec_bcdaddcsq (vBCD_t a, vBCD_t b) |
Decimal Add & write Carry Signed Quadword. More... | |
static vBCD_t | vec_bcdaddecsq (vBCD_t a, vBCD_t b, vBCD_t c) |
Decimal Add Extended & write Carry Signed Quadword. More... | |
static vBCD_t | vec_bcdaddesqm (vBCD_t a, vBCD_t b, vBCD_t c) |
Decimal Add Extended Signed Modulo Quadword. More... | |
static vBCD_t | vec_bcdcfsq (vi128_t vrb) |
Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits. More... | |
static vBCD_t | vec_bcdcfud (vui64_t vrb) |
Vector Decimal Convert From Unsigned doubleword returning up to 2x16 BCD digits. More... | |
static vBCD_t | vec_bcdcfuq (vui128_t vra) |
Vector Decimal Convert From Unsigned Quadword returning up to 32 BCD digits. More... | |
static vBCD_t | vec_bcdcfz (vui8_t vrb) |
Vector Decimal Convert From Zoned. More... | |
static vbBCD_t | vec_bcdcmp_eqsq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for equal. More... | |
static vbBCD_t | vec_bcdcmp_gesq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for greater than or equal. More... | |
static vbBCD_t | vec_bcdcmp_gtsq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for greater than. More... | |
static vbBCD_t | vec_bcdcmp_lesq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for less than or equal. More... | |
static vbBCD_t | vec_bcdcmp_ltsq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for less than. More... | |
static vbBCD_t | vec_bcdcmp_nesq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for not equal. More... | |
static int | vec_bcdcmpeq (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for equal. More... | |
static int | vec_bcdcmpge (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for greater than or equal. More... | |
static int | vec_bcdcmpgt (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for greater than. More... | |
static int | vec_bcdcmple (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for less than or equal. More... | |
static int | vec_bcdcmplt (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for less than. More... | |
static int | vec_bcdcmpne (vBCD_t vra, vBCD_t vrb) |
Vector Compare Signed BCD Quadword for not equal. More... | |
static vBCD_t | vec_bcdcpsgn (vBCD_t vra, vBCD_t vrb) |
Vector copy sign BCD. More... | |
static vi128_t | vec_bcdctsq (vBCD_t vra) |
Vector Decimal Convert to Signed Quadword. More... | |
static vui8_t | vec_bcdctub (vBCD_t vra) |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to binary unsigned bytes. More... | |
static vui16_t | vec_bcdctuh (vBCD_t vra) |
Vector Decimal Convert groups of 4 BCD digits to binary unsigned halfwords. More... | |
static vui32_t | vec_bcdctuw (vBCD_t vra) |
Vector Decimal Convert groups of 8 BCD digits to binary unsigned words. More... | |
static vui64_t | vec_bcdctud (vBCD_t vra) |
Vector Decimal Convert groups of 16 BCD digits to binary unsigned doublewords. More... | |
static vui128_t | vec_bcdctuq (vBCD_t vra) |
Vector Decimal Convert groups of 32 BCD digits to binary unsigned quadword. More... | |
static vui8_t | vec_bcdctz (vBCD_t vrb) |
Vector Decimal Convert To Zoned. More... | |
static vBCD_t | vec_bcddiv (vBCD_t a, vBCD_t b) |
Divide a Vector Signed BCD 31 digit value by another BCD value. More... | |
static vBCD_t | vec_bcddive (vBCD_t a, vBCD_t b) |
Decimal Divide Extended. More... | |
static vBCD_t | vec_bcdmul (vBCD_t a, vBCD_t b) |
Multiply two Vector Signed BCD 31 digit values. More... | |
static vBCD_t | vec_bcdmulh (vBCD_t a, vBCD_t b) |
Vector Signed BCD Multiply High. More... | |
static vBCD_t | vec_bcds (vBCD_t vra, vi8_t vrb) |
Decimal Shift. Shift a vector signed BCD value, left or right a variable amount of digits (nibbles). The sign nibble is preserved. More... | |
static vBCD_t | vec_bcdsetsgn (vBCD_t vrb) |
Vector Set preferred BCD Sign. More... | |
static vBCD_t | vec_bcdslqi (vBCD_t vra, const unsigned int _N) |
Vector BCD Shift Left Signed Quadword Immediate. More... | |
static vBCD_t | vec_bcdsluqi (vBCD_t vra, const unsigned int _N) |
Vector BCD Shift Left Unsigned Quadword Immediate. More... | |
static vBCD_t | vec_bcdsr (vBCD_t vra, vi8_t vrb) |
Decimal Shift and Round. Shift a vector signed BCD value, left or right a variable amount of digits (nibbles). The sign nibble is preserved. If byte element 7 of the shift count is negative (right shift), and the last digit shifted out is greater than or equal to 5, then increment the shifted magnitude by 1. More... | |
static vBCD_t | vec_bcdsrqi (vBCD_t vra, const unsigned int _N) |
Vector BCD Shift Right Signed Quadword Immediate. More... | |
static vBCD_t | vec_bcdsrrqi (vBCD_t vra, const unsigned int _N) |
Vector BCD Shift Right and Round Signed Quadword Immediate. More... | |
static vBCD_t | vec_bcdsruqi (vBCD_t vra, const unsigned int _N) |
Vector BCD Shift Right Unsigned Quadword immediate. More... | |
static vBCD_t | vec_bcdsub (vBCD_t a, vBCD_t b) |
Subtract two Vector Signed BCD 31 digit values. More... | |
static vBCD_t | vec_bcdsubcsq (vBCD_t a, vBCD_t b) |
Decimal Subtract & write Carry Signed Quadword. More... | |
static vBCD_t | vec_bcdsubecsq (vBCD_t a, vBCD_t b, vBCD_t c) |
Decimal Subtract Extended & write Carry Signed Quadword. More... | |
static vBCD_t | vec_bcdsubesqm (vBCD_t a, vBCD_t b, vBCD_t c) |
Decimal Subtract Extended Signed Modulo Quadword. More... | |
static vBCD_t | vec_bcdtrunc (vBCD_t vra, vui16_t vrb) |
Decimal Truncate. Truncate a vector signed BCD value vra to N-digits, where N is the unsigned integer value in bits 48-63 of vrb. The first 31-N digits are set to 0 and the result returned. More... | |
static vBCD_t | vec_bcdtruncqi (vBCD_t vra, const unsigned short _N) |
Decimal Truncate Quadword Immediate. Truncate a vector signed BCD value vra to N-digits, where N is a unsigned short integer constant. The first 31-N digits are set to 0 and the result returned. More... | |
static vBCD_t | vec_bcdus (vBCD_t vra, vi8_t vrb) |
Decimal Unsigned Shift. Shift a vector unsigned BCD value, left or right a variable amount of digits (nibbles). More... | |
static vBCD_t | vec_bcdutrunc (vBCD_t vra, vui16_t vrb) |
Decimal Unsigned Truncate. Truncate a vector unsigned BCD value vra to N-digits, where N is the unsigned integer value in bits 48-63 of vrb. The first 32-N digits are set to 0 and the result returned. More... | |
static vBCD_t | vec_bcdutruncqi (vBCD_t vra, const unsigned short _N) |
Decimal Unsigned Truncate Quadword Immediate. Truncate a vector unsigned BCD value vra to N-digits, where N is a unsigned short integer constant. The first 32-N digits are set to 0 and the result returned. More... | |
static vBCD_t | vec_cbcdaddcsq (vBCD_t *cout, vBCD_t a, vBCD_t b) |
Combined Decimal Add & Write Carry Signed Quadword. More... | |
static vBCD_t | vec_cbcdaddecsq (vBCD_t *cout, vBCD_t a, vBCD_t b, vBCD_t cin) |
Combined Decimal Add Extended & write Carry Signed Quadword. More... | |
static vBCD_t | vec_cbcdmul (vBCD_t *p_high, vBCD_t a, vBCD_t b) |
Combined Vector Signed BCD Multiply High/Low. More... | |
static vBCD_t | vec_cbcdsubcsq (vBCD_t *cout, vBCD_t a, vBCD_t b) |
Combined Decimal Subtract & Write Carry Signed Quadword. More... | |
static vf64_t | vec_pack_Decimal128 (_Decimal128 lval) |
Pack a FPR pair (_Decimal128) to a doubleword vector (vector double). More... | |
static _Decimal128 | vec_quantize0_Decimal128 (_Decimal128 val) |
Quantize (truncate) a _Decimal128 value before convert to BCD. More... | |
static vui8_t | vec_rdxcf100b (vui8_t vra) |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs from radix 100 binary integer bytes. More... | |
static vui8_t | vec_rdxcf10kh (vui16_t vra) |
Vector Decimal Convert radix 10,000 Binary halfwords to pairs of radix 100 binary bytes. More... | |
static vui16_t | vec_rdxcf100mw (vui32_t vra) |
Vector Decimal Convert radix 10**8 Binary words to pairs of radix 10,000 binary halfwords. More... | |
static vui32_t | vec_rdxcf10E16d (vui64_t vra) |
Vector Decimal Convert radix 10**16 Binary doublewords to pairs of radix 10**8 binary words. More... | |
static vui64_t | vec_rdxcf10e32q (vui128_t vra) |
Vector Decimal Convert radix 10**32 Binary quadword to pairs of radix 10**16 binary doublewords. More... | |
static vui8_t | vec_rdxcfzt100b (vui8_t zone00, vui8_t zone16) |
Vector Decimal Convert Zoned Decimal digit pairs to radix 100 binary integer bytes. More... | |
static vui8_t | vec_rdxct100b (vui8_t vra) |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to radix 100 binary integer bytes. More... | |
static vui16_t | vec_rdxct10kh (vui8_t vra) |
Vector Decimal Convert radix 100 digit pairs to radix 10,000 binary integer halfwords. More... | |
static vui32_t | vec_rdxct100mw (vui16_t vra) |
Vector Decimal Convert radix 10,000 digit halfword pairs to radix 100,000,000 binary integer words. More... | |
static vui64_t | vec_rdxct10E16d (vui32_t vra) |
Vector Decimal Convert radix 100,000,000 digit word pairs to radix 10E16 binary integer doublewords. More... | |
static vui128_t | vec_rdxct10e32q (vui64_t vra) |
Vector Decimal Convert radix 10E16 digit pairs to radix 10E32 __int128 quadwords. More... | |
static vb128_t | vec_setbool_bcdinv (vBCD_t vra) |
Vector Set Bool from Signed BCD Quadword if invalid. More... | |
static vb128_t | vec_setbool_bcdsq (vBCD_t vra) |
Vector Set Bool from Signed BCD Quadword. More... | |
static int | vec_signbit_bcdsq (vBCD_t vra) |
Vector Sign bit from Signed BCD Quadword. More... | |
static _Decimal128 | vec_unpack_Decimal128 (vf64_t lval) |
Unpack a doubleword vector (vector double) into a FPR pair. (_Decimal128). More... | |
static vui128_t | vec_zndctuq (vui8_t zone00, vui8_t zone16) |
Vector Zoned Decimal Convert 32 digits to binary unsigned quadword. More... | |
Header package containing a collection of Binary Coded Decimal (BCD) computation and Zoned Character conversion operations on vector registers.
Many of these operations are implemented in a single VMX or DFP instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors (using existing VMX, VSX, and DFP instructions) and provides in-line assembler implementations for older compilers that do not provide the built-ins.
POWER6 introduced a Decimal Floating-point (DFP) Facility implementing the IEEE 754-2008 revision standard. This is implemented in hardware as an independent Decimal Floating-point Unit (DFU). This is supported with ISO C/C++ language bindings and runtime libraries.
The DFP Facility supports a different data format, Densely Packed Decimal (DPD), and a more extensive set of operations than BCD or Zoned. So DFP and the comprehensive C language and runtime library support makes it a better target for new business oriented applications. As the DFP Facility supports conversions between DPD and BCD, existing DFP operations can be used to emulate BCD operations on older processors and fill in operational gaps in the vector BCD instruction set.
As DFP is supported directly in the hardware and has extensive language and runtime support, there is little that PVECLIB can contribute to general decimal radix computation. However the vector unit and recent BCD and Zoned extensions can still be useful in areas including large order multiple precision computation and conversions between binary and decimal radix. Both are required to convert large decimal numeric or floating-point values with extreme exponents for input or print.
So what operations are needed, what does the PowerISA provide, and what does the ABI and/or compiler provide. Some useful operations include:
The original VMX (AKA Altivec) only defined a few instructions that operated on the 128-bit vector as a whole. This included the vector shifts by bit and octet, and generalized vector permute, general binary integer add, subtract and multiply for byte/halfword/word. But no BCD or decimal character operations.
POWER6 introduced the Decimal Floating-point Facility. DFP provides a robust set of operations with 7 (_Decimal32), 16 (_Decimal64), and 34 (_Decimal128) digit precision. Arithmetic operations include add, subtract, multiply, divide, and compare. Special operations insert/extract exponent, quantize, and digit shift. Conversions to and from signed (31-digits) and unsigned (32-digit) BCD. And conversions to and from binary signed long (64-bit) integer. DFP operations use the existing floating-point registers (FPRs). The 128-bit DFP (quadword) instructions operate on even/odd 64-bit Floating-point register pairs (FPRp).
POWER6 also implemented the Vector Facility (VMX) instructions. No additional vector operations were added and the Vector Registers (VRs) were separate from the GPRs and FPRs. The only transfer data path between register sets is via storage. So while the DFP Facility could be used for BCD operations and conversions, there was little synergy with the vector unit, in POWER6.
POWER7 introduced the VSX facility providing 64x128-bit Vector Scalar Registers (VSRs) that overlaid both the FPRs (VSRs 0-31) and VRs (VSRs 32-63). It also added useful doubleword permute immediate (xxpermdi) and logical/select operations with access to all 64 VSRs. This greatly simplifies data transfers between VRs and FPRs (FPRps) (see vec_pack_Decimal128(), vec_unpack_Decimal128()). This makes it more practical to transfer vector contents to the DFP Facility for processing (see vec_BCD2DFP() and vec_DFP2BCD()).
POWER8 added vector add/subtract modulo/carry/extend unsigned quadword for binary integer (vector [unsigned] __int128). This combined with the wider (word) multiply greatly enhances multiple precision operations on large (> 128-bit) binary numbers. POWER8 also added signed BCD add/subtract instructions with up to 31-digits. While the PowerISA did not provide carry/extend forms of bcdadd/bcdsub, it does set a condition code with bits for GT/LT/EQ/OVF. This allows for implementations of BCD compare and the overflow (OVF) bit supports carry/extend operations. Also the lack of BCD multiply/divide in the vector unit is not a problem because we can leverage DFP (see vec_bcdmul(), vec_bcddiv()).
POWER9 (PowerISA 3.0B) adds BCD copy sign, set sign, shift, round, and truncate instructions. There are also unsigned (32-digit) forms of the shift and truncate instructions. And instructions to convert between signed BCD and quadword (__int128) and signed BCD and Zoned. POWER9 also added quadword binary multiply 10 with carry extend forms that can also help with decimal to binary conversion.
The OpenPOWER ABI does have an Appendix B. Binary-Coded Decimal Built-In Functions and proposes that compilers provide a bcd.h header file. At this time no compiler provides this header. GCC does provide compiler built-ins to generate the bcdadd/bcdsub instructions and access the associated condition codes in if statements. GCC also provides built-ins to generate the DFP instruction encode/decode to and from BCD.
This header covers operations that are either:
See Returning extended quadword results. for more background on extended quadword computation.
Technically, operations on quadword elements should not require any endian specific transformation. There is only one element so there can be no confusion about element numbering or order. However some of the more complex quadword operations are constructed from operations on smaller elements. And those operations as provided by <altivec.h> are required by the OpenPOWER ABI to be endian sensitive. See Endian problems with doubleword operations for a more detailed discussion.
In any case, the arithmetic (high to low) order of digit nibbles in BCD or characters in Zoned are defined in the PowerISA. In the vector register, high order digits are on the left while low order digits and the sign are on the right. (See vec_bcdadd() and vec_bcdsub()). So pveclib implementations will need to either:
Binary-coded decimal (also called packed decimal) and the related Zoned Decimal are common representations of signed decimal radix (base 10) numbers. BCD is more compact and usually faster than zoned. Zoned format is more closely aligned with human readable and printable character formats. In both formats the sign indicator is associated (in the same character or byte) with the low order digit.
BCD and Zoned formats and operations were implemented for some of the earliest computers. Then circuitry was costly and arithmetic was often implemented as a digit (or bit) serial operation. Modern computers have more circuitry with wider data paths and more complex arithmetic/logic units. The current trend is for each processor core implementation to include multiple computational units that can operate in parallel.
For POWER server class processors separate and multiple Fixed-Point Units (FXU), (binary) Floating-point Units (FPU), and Vector Processing Units (VPU) are the norm. POWER6 introduced a Decimal Floating-point (DFP) Facility implementing the IEEE 754-2008 revision standard. This is implemented in hardware as an independent Decimal Floating-point Unit (DFU). This is supported with ISO C/C++ language bindings and runtime libraries.
The DFU supports a different data format, Densely Packed Decimal (DPD), and a more extensive set of operations than BCD or Zoned. So hardware DFP and the comprehensive C language and runtime library support makes it a better target for new business oriented applications. As DFP is supported directly in the hardware and has extensive language and runtime support, there is little that PVECLIB can contribute to general decimal radix computation.
However the vector unit and recent BCD and Zoned extensions can still be useful in areas including large order multiple precision computation and conversions between binary and decimal radix. Both are required to convert large decimal numeric or floating-point values with extreme exponents for input or print. And conversions between _Float128 and _Decimal128 types are even more challenging. Basically both POSIX and IEEE 754-2008 require that it be possible to convert floating-point values to an external character decimal representation, with the specified rounding, and back recovering the original value. This always requires more precision for the conversion than is available in the given format and size.
BCD and Zoned Decimal have a long history with multiple computer manufacturers, and this is reflected as multiple encodings of the same basic concept. This is in turn reflected in the PowerISA as Preferred Sign PS immediate operand on BCD instructions.
This header implementation assumes that users of PVECLIB are not interested in this detail and just want access to BCD computation with consistent results. So PVECLIB does not expose preferred sign at the API and provides reasonable defaults in the implementation.
PVECLIB is targeted at the Linux ecosystem with ASCII character encoding, so the implementation defaults for:
The PowerISA implementation is permissive of sign encoding of input values and will accept four (0xA, 0xC, 0xE, 0xF) encodings of positive and two (0xB, 0xD) for negative. But the sign code of the result is always set to the preferred sign.
The BCD encoding allows for signed zeros (-0, +0) but the PowerISA implementation prefers the positive encoding for zero results. Again the implementation is permissive of both encodings for input operands. Usually this is not an issue but can be when dealing with conversions from other formats (DFP also allows signed 0.0) and implementations of BCD operations for older (POWER7/8) processors.
This is most likely to affect user code in comparisons of BCD values for 0. One might expect the following vector binary word compare all
to give the same result as
The vector binary compare is likely to have lower latency (on POWER7/8), but will miscompare on -0. The BCD compare operation (i.e. vec_bcdcmpeq()) is recommended, unless the programmer knows the details of how the source operands were generated and has good (performance and latency) reasons to use the alternative compare. Pveclib strives to provide correct preferred zero results in its implementation of BCD operations.
Extended precision requires carry and extend forms of bcdadd/sub. It also requires BCD multiply with multiply high and double quadword (62-digit) forms. The vector unit does not support BCD multiply so pveclib leverages the DFP Facility to implement these operations. Finally, algorithms and extended precision conversions require BCD divide and divide extended. Again pveclib leverages the DFU to implement these operations.
The PowerISA does not provide the extend and write-carry forms of the bcdadd/sub instructions. But bcdadd/sub instructions do post status to CR field 6 which includes:
which provides a basis for BCD comparison and the overflow may be used for carry/extend logic. The GCC compiler provides built-ins to generate the bcdadd/sub and test the resulting CR bits in if statements.
Unfortunately, the Overflow flag generated by bcdadd/bcdsub is not a true carry/borrow. If the operands have the same sign for bcdadd (different sign for bcdsub) and there is a carry out of the high order digit, then:
This can be used to simulate an Add and Write-Carry operation. However if the operands have different signs for bcdadd (same sign for bcdsub) the operation does the following:
For a simple BCD add this is the desired result (overflow is avoided and the borrow is recorded in the sign). But for multiple precision BCD operation, this will delay propagation of borrows to the higher order digits and the result is a mixture of signs across elements of the larger multiple precision value. This would have to be corrected at some later stage. For example the sum of 32 digits:
This exceeds the 31-digit capacity of Vector signed BCD so we are forced to represent each number as two or more BCD values. For example:
The sum of the low order operands will overflow, so we need to detect this overflow and generate a carry that we can apply to sum of the high order operands. For example the following code using the GCC's __builtin_bcdadd_ov.
The higher operands require a 3-way (a+b+c) sum to propagate the carry.
where vec_bcdadd is a pveclib wrapper around __builtin_bcdadd to simplify the code. The simplified multiple precision BCD use case looks like this:
But we should look at some more examples before we assume we have a complete solution. For example a subtract that requires a borrow:
The multiple precision BCD would look like this:
But with the example code above we expected the result:
instead we see:
The BCD overflow flag only captures carry/borrow when the bcdadd operands have the same sign (or different signs for bcdsub). In this case it looks like (1 - 9 = -8) which does not overflow.
We need a way to detect the borrow and fix up the sum to look like (11 - 9 = 2) and generate a carry digit (-1) to propagate the borrow to the higher order digits.
The secondary borrow is detected by comparing the sign of the result to the sign of the first operand. Something like this:
For multiple precision operations it would be better to retain the sign from the first operand and generate a borrow digit (value of '1' with the sign of the uncorrected result).
This requires re-computing the sum/difference, while applying the effect of borrow, and replacing the carry (currently 0) with a signed borrow digit. The corrected sum is the 10's complement (9's complement +1) of the initial sum (like (10 - 8 = 2) or (9 - 8 + 1 = 2)). As we obviously don't know how to represent signed BCD with more than 31-digits (10**31 is 32-digits), the 9's complement + 1 is a better plan. We know that the initial sum has a different sign from the original first operand. So adding 10**31 with the sign of the first operand to the initial sum applies the borrow operation.
This does not fit well into the separate add modulo and add and write-carry operations commonly used for fixed binary arithmetic. Instead it requires a combined operation returning both the generated borrow and a sum/difference result with a corrected sign code. The combined add with carry looks like this:
and the usage example looks like this:
BCD multiply and divide operations are not directly supported in the current PowerISA. Decimal multiply and divide are supported in the Decimal Floating-point (DFP) Facility, as well as conversion to and from signed (unsigned) BCD.
So BCD multiply and divide operations can be routed through the DFP Facility with a few caveats.
This allows DFP to represent decimal integer and fixed point decimal values with a preferred exponent of 0. The DFP Facility will maintain this preferred exponent for DFP arithmetic operations until:
The implementation can ensure that input operands are derived from 31-digit BCD values. The results of any divide operations can be truncated back to decimal integer with the preferred 0 exponent. This can be achieved with the DFP Quantize Immediate instruction, specifying the ideal exponent of 0 and a rounding mode of round toward 0 (see vec_quantize0_Decimal128()). This allows the following implementation:
The multiply case is a bit more complicated as we need to produce up to 62 digit results without losing precision and DFP only supports 34 digits. This requires splitting the input operands into groups of digits where partial products of any combination of these groups are guaranteed not to exceed 34 digits.
One way to do this is to split each 31-digit operand into two 16-digit chunks (actually 15 and 16-digits). These chunks are converted to DFP extended format and multiplied to produce four 32-digit partial products. These partial products can be aligned and summed to produce the high and low 31-digits of the full 62-digit product. This is the basis for vec_bcdmul(), vec_bcdmulh(), and vec_cbcdmul().
A simple vec_and() can be used to isolate the low order 16 BCD digits. It is simple at this point to detect if both operands are 16-digits or less by comparing the original operand to the isolated value. In this case the product can not exceed 32 digits and we can short circuit the product to a single multiply. Here we can safely use binary compare all.
This is a case where negative 0 can be generated in the DFP multiply and converted unchanged to BCD. This is handled with the following fix up code:
From here the code diverges for multiply low and multiply high (and full combined multiply). Multiply low only needs the 3 lower order partial products. The highest order partial product does not impact the lower order 31-digits and is not needed. Multiply high requires the generation and summation of all 4 partial products. Following code completes the implementation of BCD multiply low:
Here we know that there are higher order digits in one or both operands. First use vec_bcdsrqi() to isolate the high 15-digits of operands a and b. Both Vector unit and DFP Facility have decimal shift operations, but the vector shift operation is faster.
Then convert to DFP and multiply (high_a * low_b and high_b * low_a) for the two middle order partial products which are summed. This sum represents the high 32-digits (the 31-digit sum can carry) of a 48-digit product. Only the lower 16-digits of this sum is needed for the final sum and this needs to be aligned with the high 16 digits of the original lower order partial product.
For this case use DFP Shift Significand Left Immediate and DFP Shift Significand Right Immediate. All the data is in the DFP Facility and the high cost of the DFP Facility shift is offset by avoiding extra format conversions. We use shift left 17 followed by shift right 1 to clear the highest order DFP digit and avoid any overflow. A final DFP add produces the low order 32 digits of the product which will be truncated to 31-digits in the conversion to BCD.
Now we can look at the BCD multiply high (generate the full 62-digit product returning the high 31 digits) and point out the differences. Multiply high also starts by isolating the low order 16 BCD digits, performing the low order multiply (low_a * low_b), and testing for the short circuit (all higher order digits are 0). The first difference (from multiply low) is that in this case only the high digit of the potential 32-digit product is returned.
So the short circuit code shifts the low partial product right 31 digits and returns that value.
If we can not short circuit, Multiply high requires the generation and summation of all four partial products. Following code completes the implementation of BCD multiply high:
Again we know that there are higher order digits in one or both operands and use vec_bcdsrqi() to isolate the high 15-digits of operands a and b. Then convert to DFP and multiply (high_a * low_b and high_b * low_a) for the two middle order partial products (d_hl and d_lh).
The low order partial product (d_p) was generated above but we need only the high order 15 digits for summation. Shift the low partial product right 16 digits then sum (d_hl + d_lh + d_ll) the low and middle order partial products. This produces the high 32 digits of the lower 48 digit partial sum. Shift this right 15 digits to align with the high order 31 digits for the product.
Then multiply (high_a * high_b) to generate the high order partial product. This represents the high 30 digits of a 62-digit product. Shift this left 1 digit to correct the alignment. The sum of the adjusted high and middle order partials gives the high order 31 digits of the 62-digit product.
Conversions between Decimal (BCD, Zoned, or string) and binary are another topic which is more complicated than it first appears. Everyone that takes computer science should have learned about atoi and itoa for conversions between strings of decimal characters and binary integers.
ASCII to integer is basically;
Integer to ASCII is basically;
You may have noticed that the algorithms above are not exactly vector ready. Both are serialized on expensive multiply and divide operations. This is not so bad for 9 digit (32-bit) integers but will be noticeable when converting between 128-bit binary and 31-digit BCD.
For the vector BCD equivalent of atoi we could use the PVECLIB implementation of Vector Multiply by 10 Extended Unsigned Quadword. For POWER8, vec_mul10euq() uses multiply even/odd, a couple of shift left octet immediates, and add quadword. This sequence runs 5-7 instructions and has a minimum latency of 13 cycles. To convert from BCD to binary we need to shift and isolate, one BCD digit at a time, then feed that into vec_mul10euq(). Ignoring for now the latency associated with shifting the BCD digits, we can quickly estimate 13 * 32 = 416 cycles to convert 32 digits.
For the vector BCD equivalent of itoa we could use the POWER8 Decimal Add Modulo instruction. For POWER8 vec_bcdadd() has a latency of 13 cycles. But the conversion would be one bit at a time. Use vec_bcdadd() to multiply by 2 then shift / isolate a bit from the binary value, format / convert that bit to BCD 0/1, and vec_bcdadd() again. So a quick estimate for this conversion is 13 * 2 * 128 = 3328 cycles.
Clearly just using bigger registers for bigger numbers is not helping. So we want to think about algorithms that do more in parallel and leverage the vector unit we have.
For POWER9 we have Decimal Convert From/To Signed Quadword and Decimal Convert From/To Zoned (See vec_bcdcfsq(), vec_bcdctsq(), vec_bcdcfz(), vec_bcdctz()). These provide direct conversion between quadword binary and signed BCD and between signed BCD and zoned characters.
The BCD convert from/to Zoned are simple operations that run 3 cycles latency on POWER9 and 14-27 cycles for the POWER8 implementation. For POWER8 there is some additional complexity verifying and converting the preferred sign code between BCD and Zoned (of course they are different).
But the BCD convert from/to Signed Quadword operations are a bit heavier, running 37 and 23 cycles latency on POWER9. These instructions execute in the DFU and so are single issue. They also keep the DFU pipeline busy (for 25 and 11 cycles) and block execution of the next DFU operation for a while. Still this is better than the serial conversion examples described above.
But part of the value of PVECLIB is to provide support across POWER7/8/9 and across compiler versions. The convert instructions above are not supported in current compilers with built-ins so PVECLIB provides in-line assembler implementations for these operations. Now we need to look into better algorithms for implementing these operations on POWER7/8.
The Vector unit can multiply, add, or subtract integer elements in parallel. The conversion process is basically multiply and add/sub as we can replace divide operations with the multiplicative inverse. So we are looking for a way to break the conversion down into steps that can be performed in parallel on elements of the larger value and that require fewer steps overall.
For now we can simplify the problem to unsigned radix conversion and deal with signed conversion as a later cleanup step based on the complete unsigned conversion.
Starting with BCD (Radix 10) to Binary (Radix 2) conversion. The data is represented as 32 BCD digits encoded as 4-bit nibbles starting with high order digits on the left, to low order digits on the right.
Said differently, unsigned BCD vectors are represented as 16-bytes each containing a pair of BCD digits, each in the range 00-99. This is helpful because the PowerISA has instructions that multiply and add integer bytes, in parallel. So it seems possible to convert bytes containing even/odd pairs of BCD digits to integer bytes, each in the range 0-99: simply multiply the even digit by 10 and add the odd digit.
The result is a vector of 16 x radix-100 bytes (binary integers in the range 0-99). Said differently a radix 100 vector represented as 8 halfwords each containing a pair of radix 100 digits, each in the range 0-99. Again these pairs of digits (bytes) can be converted by multiply and add to radix 10,000 halfwords.
Repeat the process three more times:
So in 5 steps, each only using vector multiply and add, we convert 32 BCD digits to a quadword integer.
Actually, it is a little more complicated than multiply and add. The digits of the digit pair must be isolated and shifted into alignment before the multiply and add. Looking something like this:
The PowerISA does not provide general nibble arithmetic, only byte. So the first operations involve isolating each nibble into separate (high_digit and low_digit) bytes. The high_digit shift also aligns the binary for the multiply and add.
The Multiply Unsigned Byte Modulo (vec_mulubm()) generates vmuleub/vmuloub then loads a permute control vector and permutes the low order bytes of the halfword (even/odd) products into a single vector. Finally, add the x10 product and low_digit to get the binary value in the range 0-99.
This sequence runs 6-10 instructions and 13-22 cycles latency. The lower values assume the shift control and permute control vectors are commoned with other operations.
This is a case where the process on paper is much simpler than the reality of programming computers. The operation is actually (bcd_byte / 16 * 10) + (bcd_byte * 16 / 16) where 16 is the alignment radix and 10 is the decimal radix at this step. The alignment radix operations are (fortunately) strength reduced to vector byte shift left/right.
Let's use a little algebra to eliminate some of these steps. One approach is to generate a correction factor from the high_digit and the difference between the alignment and decimal radix. This correction factor is subtracted directly from the original BCD byte and reduces the operation to (bcd_byte - ((bcd_byte / 16) x (16 - 10))), which looks something like:
Another opportunity is to let the compiler strength reduce the multiply to shift and add. Newer versions of GCC will perform this optimization when using the generic vec_mul built-in for vector integer elements.
This eliminates vector multiply even/odd, the permute, and the load associated with the permute. The final sequence runs 5-7 instructions and 10-12 cycles latency and looks something like this:
The next step converts adjacent byte pairs to halfwords. We use the same basic formula but adjust the radix constants to; (rdx_hword - ((rdx_hword / 256) x (256 - 100))). Here we need a byte multiply producing a halfword correction factor. No shifts are needed as the vmuleub multiply will access the high byte of each halfword directly.
This requires: a constant load, a multiply even byte and subtract halfword. The final sequence runs 2-5 instructions and 9-18 cycles latency and looks something like this:
This pattern continues for converting halfwords to words, words to doublewords, and doublewords to quadwords. For POWER8 the first 4 steps are supported by vector multiply and subtract instructions. The last step requires a vec_vmuleud() operation implemented in vec_int128_ppc.h, based on vec_muleuw(), vec_mulouw() and vec_adduqm(). The vec_adduqm() operation is single instruction for POWER8. For POWER7 we will need to leverage more operations implemented in vec_int64_ppc.h and vec_int128_ppc.h for the last two steps.
The complete set of steps for converting 32 BCD digits to quadword __int128 binary looks like this:
For POWER8 the whole sequence runs 24-36 instructions and 65-78 cycles latency. For POWER9 the whole sequence runs 17-26 instructions and 52-65 cycles latency.
However we can leverage the POWER9 Vector Multiply by 10 Extended Unsigned Quadword instruction to extend the 31-digit convert to a full 32-digits. Basically use the bcdctsq to convert the high 31-digits and then multiply by 10 and add the last digit. See example below:
This adds a few more cycles to split the high digits from the low digit and insert a positive sign code. This requires loading some vector constants which may be commoned with loads from other operations. This adds 2-11 cycles. The mul10euq only adds 3 cycles latency to complete the BCD to Binary conversion. This adds only 21% to 60% latency over the base bcdctsq instruction.
In most senses, binary to BCD is the reverse of BCD to binary. The radix numbers in the conversion formula exchange places and the conversion starts with the largest element size (quadword) and works its way down to the smallest (4-bit nibble).
Let's take a look at the conversion formula. For BCD to Binary we used:
So after swapping the conversion (to / from) radix constants we see:
The effect is to divide vector elements of 4*2N bits by 10**N and return the quotient in the high half of the element (in 4*N bits), and the remainder of this divide in the low half of the element (in 4*N bits), where N = 2**n and n ranges from 0 to 4 (5 steps again).
This is good news and bad news. It is good that the correction subtract became a simple add. This allows the uses of multiply sum instruction (where PowerISA has such instructions for the element size). The bad news is that the radix divisor is not a power of two. And since the PowerISA does not have vector integer divide instructions, we use the multiplicative inverse. So in effect, each step of the binary to BCD conversion requires, two multiplies and an add.
So let's look at the first and last step of the conversion (the two extremes). The first step (after verifying that the quadword value is no greater than 10**32 - 1) looks like this:
The first multiply is an expensive (40 to 60 cycles) operation as it requires a full Multiply High Unsigned Quadword. The next operation requires a Multiply Odd Unsigned Doubleword then Add Unsigned Quadword Modulo. For POWER9 we can replace these two operations with a single Multiply Sum Unsigned Doubleword Modulo. The latency of this single step is of the same order as the complete BCD to Binary conversion (vec_bcdctuq()).
The conversion steps continue with doubleword to word, word to halfword, halfword to byte, byte to BCD (nibbles). The final step is simple by comparison to the first step.
The GCC vector extensions support dividing a vector char / short / int by a constant. So we can let the compiler generate the multiplicative inverse code for the last three steps. This is not supported (yet) for long and __int128 so the first two steps must explicitly code the multiplicative inverse.
Using GCC vector extensions for the following multiply and add works well in this case as it allows the compiler to perform strength reduction. It is not as useful in the other steps as the programmer knows more about the value ranges than the compiler can or should assume. We know the quotient and corrective constant always fit into the lower half of the element. This allows the use of the half sized vector multiply odd unsigned while the compiler will assume it needs to generate a multiply modulo for the full element size.
For example the third step (word to halfword) we can use Multiply Sum Unsigned Halfword Modulo to replace the multiply odd and add. This is similar to the multiply sum usage in the first step and it is a case not recognized by the compiler.
The full binary to BCD conversion requires all 5 steps to complete the operations and this adds up to 200+ cycles. So this is worth another look.
Initially using the DFP Facility for this binary to BCD conversion was rejected because:
Perhaps we can use the vec_rdxcf10e32q() operation we defined above as the first step (factoring quadwords into the 16 digit doublewords). Then use the DFP Facility to convert binary doublewords to BCD. In this case we are not concerned with signed conversion as 10**16 fits in 54-bits binary and guarantees positive binary values. We still have to deal with the VR to/from FPR transfers but that mechanism is already defined and at a reasonable cost (2-4 cycles each way).
If we assume that the second Decimal Convert From Fixed (dcffix) is independent and issues 19 cycles after the first, we get 32+19 = 51 cycles to complete. Then another 13+1 cycles to convert back to BCD. Add a few cycles for the unpack and pack operations and we estimate 69 cycles for POWER8 and 58 cycles for POWER9. The totals for vec_rdxcf10e32q() plus vec_BIN2BCD() come to 154-164 for POWER8 and 114-124 for POWER9. This is a 30-60% improvement over the previous (all vector) attempt. So the final unsigned binary to BCD conversion looks like this:
The simplest case is converting a vector unsigned __int128 to BCD. This requires up to 39 digits across two vectors. This can either be split into 8 and 31 digits for signed conversion or 7 and 32 for unsigned. Signed conversion is preferred where extended BCD result will be input to additional BCD arithmetic. Unsigned is preferred for conversion to Zoned characters for decimal display.
From Converting Vector __int128 values to BCD we see the divide / modulo quadword by constant operations which can be used to factor binary quadwords into high and low digit groups for conversion. For example:
The general multiple precision binary to BCD conversion requires quadword long division as described in Quadword Long Division. After each long division the remainder is in a range for conversion to BCD. In the example below the remainder is converted to 32 digit BCD as the last step.
Each call to example_longbcdcf_10e32 () produces the next 32-digit group. Repeated calls, where the previous iteration's quotient is passed as the dividend to the next step, produce additional 32-digit groups. This continues until the quotient is less than the divisor (in this case 10**32). This final quadword quotient provides the highest order 32-digit group for the conversion. The digit groups are produced in order from lowest to highest significance.
As the conversion process continues the number of quadwords in the extended dividend/quotient shrinks. The divide / modulo quadword by constant operations test for leading zeros and skip over them.
The general multiple precision binary from BCD conversion only requires extended quadword multiply as described in Extended Quadword multiply. Starting with the high order BCD (32 or 31) digit group, multiply by 10**32 (or 10**31) then add the next digit group to the extended product. Continue until the low order digit group is added. For example:
This process starts with a single quadword (the converted high order digit group). As additional digit groups are converted, the extended binary value is multiplied by 10**32 before adding the converted digit group. The number of quadwords in the array d[] expands as needed to hold the binary value.
The interface includes:
High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how Latency and Throughput are derived see: Performance data.
#define vBCD_t vui32_t |
vector signed BCD integer of up to 31 decimal digits.
Convert vector of 2 x unsigned 16-digit BCD values to vector 2 x doubleword binary values.
Convert a vector of 16-digit unsigned BCD doublewords to a vector of unsigned long int doublewords. The vector unsigned long int doublewords are in the range 0-9999999999999999.
processor | Latency | Throughput |
---|---|---|
power8 | 55 | 1/51 cycle |
power9 | 59 | 1/53 cycle |
val | a vector treated as 2 unsigned BCD 16 digit values. |
|
inlinestatic |
Convert a Vector Signed BCD value to __Decimal128.
The BCD vector is permuted into a double float pair before conversion to DPD format via the DFP Encode BCD To DPD Quad instruction.
processor | Latency | Throughput |
---|---|---|
power8 | 17 | 1/cycle |
power9 | 15 | 1/cycle |
val | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Add Signed Modulo Quadword.
Two Signed 31 digit values are added and lower 31 digits of the sum are returned. Overflow (carry-out) is ignored.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Add & write Carry Signed Quadword.
Two Signed 31 digit BCD values are added, and the carry-out (the high order 32nd digit) of the sum is returned.
processor | Latency | Throughput |
---|---|---|
power8 | 15-21 | 1/cycle |
power9 | 6-18 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Add Extended & write Carry Signed Quadword.
Two Signed 31 digit values and a signed carry-in are added together and the carry-out (the high order 32nd digit) of the sum is returned.
processor | Latency | Throughput |
---|---|---|
power8 | 28-37 | 1/cycle |
power9 | 9-21 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
c | a 128-bit vector treated as a signed BCD carry with values -1, 0, or +1. |
Decimal Add Extended Signed Modulo Quadword.
Two Signed 31 digit values and a signed carry-in are added together and lower 31 digits of the sum are returned. Overflow (carry-out) is ignored.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 6 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
c | a 128-bit vector treated as a signed BCD carry with values -1, 0, or +1. |
Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits.
Vector convert a quadword containing a signed __int128 in the range -9999999999999999999999999999999 to +9999999999999999999999999999999 to the equivalent signed BCD value with up to 31 digits.
processor | Latency | Throughput |
---|---|---|
power8 | 166-176 | 1/19 cycle |
power9 | 37 | 1/26 cycle |
vrb | vector signed __int128 number in the range -9999999999999999999999999999999 to +9999999999999999999999999999999. |
Vector Decimal Convert From Unsigned doubleword returning up to 2x16 BCD digits.
Vector convert doubleword containing a unsigned long int each in the range 0-9999999999999999 to the equivalent unsigned BCD doubleword value each up to 16 digits.
processor | Latency | Throughput |
---|---|---|
power8 | 69 | 1/19 cycle |
power9 | 58 | 1/21 cycle |
vrb | a 128-bit vector of unsigned long int numbers, each in the range 0-9999999999999999. |
Vector Decimal Convert From Unsigned Quadword returning up to 32 BCD digits.
Vector convert a quadword containing a unsigned __int128 in the range 0-99999999999999999999999999999999 to the equivalent unsigned BCD value with up to 32 digits.
processor | Latency | Throughput |
---|---|---|
power8 | 154-164 | 1/19 cycle |
power9 | 117-128 | 1/21 cycle |
vra | a 128-bit vector as an unsigned __int128 number in the range 0-99999999999999999999999999999999. |
Vector Decimal Convert From Zoned.
Given a Signed 16-digit signed Zoned value vrb, return equivalent Signed BCD value. For Zoned (PS=0) the sign code is in bits 0:3 of byte 15.
The resulting BCD value has up to 16 digits of magnitude and is set to the preferred BCD sign (0xc or 0xd).
processor | Latency | Throughput |
---|---|---|
power8 | 14-27 | 1/cycle |
power9 | 3 | 2/cycle |
vrb | a 128-bit vector treated as a signed Zoned 16 digit value. |
Vector Compare Signed BCD Quadword for equal.
Compare vector signed BCD values and return vector bool true if vra and vrb are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for greater than or equal.
Compare vector signed BCD values and return vector bool true if vra is greater than or equal to vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for greater than.
Compare vector signed BCD values and return vector bool true if vra is greater than vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for less than or equal.
Compare vector signed BCD values and return vector bool true if vra is less than or equal to vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for less than.
Compare vector signed BCD values and return vector bool true if vra is less than vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for not equal.
Compare vector signed BCD values and return vector bool true if vra and vrb are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 6-9 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for equal.
Compare vector signed BCD values and return boolean true if vra and vrb are equal.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for greater than or equal.
Compare vector signed BCD values and return boolean true if vra is greater than or equal to vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for greater than.
Compare vector signed BCD values and return boolean true if vra is greater than vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for less than or equal.
Compare vector signed BCD values and return boolean true if vra is less than or equal to vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for less than.
Compare vector signed BCD values and return boolean true if vra is less than vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector Compare Signed BCD Quadword for not equal.
Compare vector signed BCD values and return boolean true if vra and vrb are not equal.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an vector signed BCD (qword) element. |
vrb | 128-bit vector treated as an vector signed BCD (qword) element. |
Vector copy sign BCD.
Given Two Signed BCD 31 digit values vra and vrb, return the magnitude from vra (bits 0:123) and the sign (bits 124:127) from vrb.
processor | Latency | Throughput |
---|---|---|
power8 | 2-11 | 1/cycle |
power9 | 3 | 2/cycle |
vra | a 128-bit vector treated as a signed BCD 31 digit value. |
vrb | a 128-bit vector treated as a signed BCD 31 digit value. |
Vector Decimal Convert to Signed Quadword.
Vector convert a BCD quadword containing signed 31 digits values to signed __int128, in the range +/- 0-9999999999999999999999999999999.
processor | Latency | Throughput |
---|---|---|
power8 | 80-95 | 1/cycle |
power9 | 23 | 1/12 cycle |
vra | a 128-bit vector treated as a signed 31-digit BCD number. |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to binary unsigned bytes .
Vector convert 16 bytes each containing 2 BCD digits to the equivalent unsigned char, in the range 0-99. Input values should be valid 2 x BCD nibbles in the range 0-9.
processor | Latency | Throughput |
---|---|---|
power8 | 13-22 | 1/cycle |
power9 | 14-23 | 1/cycle |
vra | a 128-bit vector treated as 16 unsigned 2-digit BCD numbers. |
Vector Decimal Convert groups of 16 BCD digits to binary unsigned doublewords.
Vector convert 2 doublewords each containing 16 BCD digits to the equivalent unsigned long int, in the range 0-9999999999999999. Input values should be valid 16 x BCD nibbles in the range 0-9.
processor | Latency | Throughput |
---|---|---|
power8 | 40-51 | 1/cycle |
power9 | 41-52 | 1/cycle |
vra | a 128-bit vector treated as 2 unsigned 16-digit BCD numbers. |
Vector Decimal Convert groups of 4 BCD digits to binary unsigned halfwords.
Vector convert 8 halfwords each containing 4 BCD digits to the equivalent unsigned short, in the range 0-9999. Input values should be valid 4 x BCD nibbles in the range 0-9.
processor | Latency | Throughput |
---|---|---|
power8 | 22-31 | 1/cycle |
power9 | 23-32 | 1/cycle |
vra | a 128-bit vector treated as 8 unsigned 4-digit BCD numbers. |
Vector Decimal Convert groups of 32 BCD digits to binary unsigned quadword.
Vector convert a quadword containing 32 BCD digits to the equivalent unsigned __int128, in the range 0-99999999999999999999999999999999. Input values should be valid 32 x BCD nibbles in the range 0-9.
processor | Latency | Throughput |
---|---|---|
power8 | 65-78 | 1/cycle |
power9 | 28-37 | 1/12 cycle |
vra | a 128-bit vector treated as an unsigned 32-digit BCD number. |
Vector Decimal Convert groups of 8 BCD digits to binary unsigned words.
Vector convert 4 words each containing 8 BCD digits to the equivalent unsigned int, in the range 0-99999999. Input values should be valid 8 x BCD nibbles in the range 0-9.
processor | Latency | Throughput |
---|---|---|
power8 | 31-42 | 1/cycle |
power9 | 32-43 | 1/cycle |
vra | a 128-bit vector treated as 4 unsigned 8-digit BCD numbers. |
Vector Decimal Convert To Zoned.
Given a Signed 16-digit signed BCD value vrb, return equivalent Signed Zoned value. For Zoned (PS=0) the sign code is in bits 0:3 of byte 15.
The resulting Zoned value will have up to 16 digits of magnitude and be set to the preferred Zoned sign codes (0x30 or 0x70).
processor | Latency | Throughput |
---|---|---|
power8 | 14-27 | 1/cycle |
power9 | 3 | 2/cycle |
vrb | a 128-bit vector treated as a signed BCD 16 digit value. |
Divide a Vector Signed BCD 31 digit value by another BCD value.
One Signed 31 digit value is divided by a second 31 digit value and the quotient is returned.
processor | Latency | Throughput |
---|---|---|
power8 | 102-238 | 1/cycle |
power9 | 96-228 | 1/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Divide Extended.
The dividend a is a Signed BCD 31 digit value extended to the right internally with 31 decimal 0s. The divisor b is a Signed BCD 31 digit value. The quotient of (a || 31 zero digits) / b is truncated to a Decimal integer and returned in Signed BCD format.
processor | Latency | Throughput |
---|---|---|
power8 | 102-238 | 1/cycle |
power9 | 96-228 | 1/cycle |
a | a 128-bit vector treated as the high 31-digits of a 62-digit value extended with 0's. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Multiply two Vector Signed BCD 31 digit values.
Two Signed 31 digit values are multiplied and the lower 31 digits of the product are returned. Overflow is ignored.
The vector unit does not have a BCD multiply, so we convert the operands to _Decimal128 format and use the DFP quadword multiply. This gets tricky as the product can be up to 62 digits, and _Decimal128 format can only hold 34 digits.
To avoid overflow in the DFP Facility, we split each BCD operand into 15 upper and 16 lower digit halves. This requires up to four decimal multiplies and produces up to four 30-32 digit partial products. These are aligned appropriately (via DFP decimal shift) and summed (via DFP Decimal add) to generate the high and low (31-digit) parts of the 62 digit product.
In this case we only need the lower 31-digits of the product. So only 3 (not 4) DFP multiplies are required. Also we can discard any high digits above 31.
processor | Latency | Throughput |
---|---|---|
power8 | 94-194 | 1/cycle |
power9 | 88-227 | 1/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Vector Signed BCD Multiply High.
Two Signed 31 digit values are multiplied and the higher 31 digits of the product are returned.
The vector unit does not have a BCD multiply, so we convert the operands to _Decimal128 format and use the DFP quadword multiply. This gets tricky as the product can be up to 62 digits, and _Decimal128 format can only hold 34 digits.
To avoid overflow in the DFP Facility, we split each BCD operand into 15 upper and 16 lower digit halves. This requires up to four decimal multiplies and produces up to four 30-32 digit partial products. These are aligned appropriately (via DFP decimal shift) and summed (via DFP Decimal add) to generate the high and low (31-digit) parts of the 62 digit product.
In this case we only need the upper 31-digits of the product. The lower 31-digits are discarded.
processor | Latency | Throughput |
---|---|---|
power8 | 106-361 | 1/cycle |
power9 | 99-271 | 1/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Shift. Shift a vector signed BCD value, left or right a variable amount of digits (nibbles). The sign nibble is preserved.
processor | Latency | Throughput |
---|---|---|
power8 | 14-25 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a signed BCD 31 digit value. |
vrb | Digit shift count in vector byte 7. |
Vector Set preferred BCD Sign.
Given a Signed BCD 31 digit value vrb, return the magnitude from vrb (bits 0:123) and the sign (bits 124:127) set to the preferred sign (0xc or 0xd). Valid positive sign codes are; 0xA, 0xC, 0xE, or 0xF. Valid negative sign codes are; 0xB or 0xD.
processor | Latency | Throughput |
---|---|---|
power8 | 6-26 | 1/cycle |
power9 | 3 | 2/cycle |
vrb | a 128-bit vector treated as a signed BCD 31 digit value. |
Vector BCD Shift Right Signed Quadword.
Shift a vector signed BCD value right _N digits.
processor | Latency | Throughput |
---|---|---|
power8 | 6-15 | 2/cycle |
power9 | 3-6 | 2/cycle |
vra | 128-bit vector signed BCD 31 digit value. |
_N | int constant for the number of digits to shift right. |
Vector BCD Shift Right unsigned Quadword.
Shift a vector unsigned BCD value right _N digits.
processor | Latency | Throughput |
---|---|---|
power8 | 6-15 | 2/cycle |
power9 | 3-6 | 2/cycle |
vra | 128-bit vector unsigned BCD 32 digit value. |
_N | int constant for the number of digits to shift right. |
Decimal Shift and Round. Shift a vector signed BCD value, left or right a variable amount of digits (nibbles). The sign nibble is preserved. If byte element 7 of the shift count is negative (right shift), and the last digit shifted out is greater than or equal to 5, then increment the shifted magnitude by 1.
processor | Latency | Throughput |
---|---|---|
power8 | 14-25 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a signed BCD 31 digit value. |
vrb | Digit shift count in vector byte 7. |
Vector BCD Shift Right Signed Quadword Immediate.
Shift a vector signed BCD value right _N digits.
processor | Latency | Throughput |
---|---|---|
power8 | 6-15 | 2/cycle |
power9 | 3-6 | 2/cycle |
vra | 128-bit vector signed BCD 31 digit value. |
_N | int constant for the number of digits to shift right. |
Vector BCD Shift Right and Round Signed Quadword Immediate.
Shift and round a vector signed BCD value right _N digits.
processor | Latency | Throughput |
---|---|---|
power8 | 25-34 | 2/cycle |
power9 | 3-6 | 2/cycle |
vra | 128-bit vector signed BCD 31 digit value. |
_N | int constant for the number of digits to shift right. |
Vector BCD Shift Right Unsigned Quadword immediate.
Shift a vector unsigned BCD value right _N digits.
processor | Latency | Throughput |
---|---|---|
power8 | 6-15 | 2/cycle |
power9 | 3-6 | 2/cycle |
vra | 128-bit vector unsigned BCD 32 digit value. |
_N | int constant for the number of digits to shift right. |
Subtract two Vector Signed BCD 31 digit values.
Subtract Signed 31 digit values and return the lower 31 digits of the result. Overflow (carry-out/borrow) is ignored.
processor | Latency | Throughput |
---|---|---|
power8 | 13 | 1/cycle |
power9 | 3 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Subtract & write Carry Signed Quadword.
Two Signed 31 digit BCD values are subtracted, and the carry-out (the high order 32nd digit) of the difference is returned.
processor | Latency | Throughput |
---|---|---|
power8 | 15-21 | 1/cycle |
power9 | 6-18 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Decimal Subtract Extended & write Carry Signed Quadword.
Two Signed 31 digit values are subtracted with a signed carry-in applied, and the carry-out (the high order 32nd digit) of the difference is returned.
processor | Latency | Throughput |
---|---|---|
power8 | 28-37 | 1/cycle |
power9 | 9-21 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
c | a 128-bit vector treated as a signed BCD carry with values -1, 0, or +1. |
Decimal Subtract Extended Signed Modulo Quadword.
Two Signed 31 digit values and a signed carry-in are subtracted (a - b - c) and the lower 31 digits of the difference are returned. Overflow (carry-out) is ignored.
processor | Latency | Throughput |
---|---|---|
power8 | 26 | 1/cycle |
power9 | 6 | 2/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
c | a 128-bit vector treated as a signed BCD carry with values -1, 0, or +1. |
Decimal Truncate. Truncate a vector signed BCD value vra to N-digits, where N is the unsigned integer value in bits 48-63 of vrb. The first 31-N digits are set to 0 and the result returned.
processor | Latency | Throughput |
---|---|---|
power8 | 18-27 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as a signed BCD 31 digit value. |
vrb | Digit truncate count in vector halfword 3 (bits 48:63). |
Decimal Truncate Quadword Immediate. Truncate a vector signed BCD value vra to N-digits, where N is an unsigned short integer constant. The first 31-N digits are set to 0 and the result returned.
processor | Latency | Throughput |
---|---|---|
power8 | 6-17 | 1/cycle |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as a signed BCD 31 digit value. |
_N | an unsigned short integer constant truncate count. |
Decimal Unsigned Shift. Shift a vector unsigned BCD value, left or right a variable amount of digits (nibbles).
processor | Latency | Throughput |
---|---|---|
power8 | 12-14 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an unsigned BCD 32 digit value. |
vrb | Digit shift count in vector byte 7. |
Decimal Unsigned Truncate. Truncate a vector unsigned BCD value vra to N-digits, where N is the unsigned integer value in bits 48-63 of vrb. The first 32-N digits are set to 0 and the result returned.
processor | Latency | Throughput |
---|---|---|
power8 | 16-25 | 1/cycle |
power9 | 3 | 2/cycle |
vra | 128-bit vector treated as an unsigned BCD 32 digit value. |
vrb | Digit truncate count in vector halfword 3 (bits 48:63). |
Decimal Unsigned Truncate Quadword Immediate. Truncate a vector unsigned BCD value vra to N-digits, where N is an unsigned short integer constant. The first 32-N digits are set to 0 and the result returned.
processor | Latency | Throughput |
---|---|---|
power8 | 6-17 | 1/cycle |
power9 | 6 | 2/cycle |
vra | 128-bit vector treated as an unsigned BCD 32 digit value. |
_N | an unsigned short integer constant truncate count. |
Convert vector unsigned doubleword binary values to Vector unsigned 16-digit BCD values.
Convert a vector of 2 unsigned long int doubleword to 2 16-digit unsigned BCD doublewords. Input doublewords should each be in the range 0-9999999999999999.
processor | Latency | Throughput |
---|---|---|
power8 | 69 | 1/19 cycle |
power9 | 58 | 1/21 cycle |
val | a vector unsigned long int. |
Combined Decimal Add & Write Carry Signed Quadword.
Two Signed 31 digit BCD values are added, and the carry-out (the high order 32nd digit) of the sum is generated. Alternatively, if the intermediate sum changes sign, we need to borrow '1' from the magnitude of the higher BCD value and correct (invert by subtracting from 10**31) the intermediate sum. Both the sum and the carry/borrow are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 15-24 | 1/cycle |
power9 | 6-15 | 2/cycle |
cout | a pointer to a 128-bit vector to receive the BCD carry-out. |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
Combined Decimal Add Extended & write Carry Signed Quadword.
Two Signed 31 digit values and a signed carry-in are added together and the carry-out (the high order 32nd digit) of the sum is generated. Alternatively, if the intermediate sum changes sign, we need to borrow '1' from the magnitude of the next higher BCD value and correct (invert by subtracting from 10**31) the intermediate sum. Both the sum and the carry/borrow are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 54-63 | 1/cycle |
power9 | 15-24 | 2/cycle |
cout | a pointer to a 128-bit vector to receive the BCD carry-out. |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
cin | a 128-bit vector treated as a signed BCD carry with values -1, 0, or +1. |
Combined Vector Signed BCD Multiply High/Low.
Two Signed 31 digit values are multiplied to generate the 62 digit product.
The vector unit does not have a BCD multiply, so we convert the operands to _Decimal128 format and use the DFP quadword multiply. This gets tricky as the product can be up to 62 digits, and _Decimal128 format can only hold 34 digits.
To avoid overflow in the DFP Facility, we split each BCD operand into 15 upper and 16 lower digit halves. This requires up to four decimal multiplies and produces four 30-32 digit partial products. These are aligned appropriately (via DFP decimal shift) and summed (via DFP Decimal add) to generate the high and low (31-digit) parts of the 62 digit product.
In this case we compute and return the whole 62-digit product split into two 31-digit BCD vectors.
processor | Latency | Throughput |
---|---|---|
power8 | 107-413 | 1/cycle |
power9 | 115-294 | 1/cycle |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
p_high | a pointer to a 128-bit vector to receive the high 31-digits of the product (a * b). |
Combined Decimal Subtract & Write Carry Signed Quadword.
Subtract (a - b) Signed 31 digit BCD values and detect the carry/borrow (the high order 32nd digit). If the intermediate sum changes sign, we need to borrow '1' from the magnitude of the higher BCD value and correct (invert by subtracting from 10**31) the intermediate sum. Both the sum and the carry/borrow are returned.
processor | Latency | Throughput |
---|---|---|
power8 | 15-24 | 1/cycle |
power9 | 6-15 | 2/cycle |
cout | a pointer to a 128-bit vector to receive the BCD carry-out (values are -1, 0, and +1). |
a | a 128-bit vector treated as a signed BCD 31 digit value. |
b | a 128-bit vector treated as a signed BCD 31 digit value. |
|
inlinestatic |
Convert a _Decimal128 value to Vector BCD.
The _Decimal128 value is converted to a signed BCD 31 digit value via "DFP Decode DPD To BCD Quad". The conversion result is still in a double float register pair and so is permuted into single vector register for use.
processor | Latency | Throughput |
---|---|---|
power8 | 17 | 1/cycle |
power9 | 15 | 1/cycle |
val | a _Decimal128 in a double float pair. |
|
inlinestatic |
Pack an FPR pair (_Decimal128) to a doubleword vector (vector double).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 2/cycle |
power9 | 3 | 2/cycle |
lval | FPR pair containing a _Decimal128. |
|
inlinestatic |
Quantize (truncate) a _Decimal128 value before convert to BCD.
Truncate (round toward 0) and justify right the input _Decimal128 value so that the unit digit is in the right most position. This supports BCD multiply and divide using DFP instructions by truncating fractional digits before conversion back to BCD.
processor | Latency | Throughput |
---|---|---|
power8 | 15 | 1/cycle |
power9 | 12 | 1/cycle |
val | a _Decimal128 value. |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs from radix 100 binary integer bytes.
Convert 16 radix 100 digits to 32 BCD Format decimal digits. Input is radix 100 digits as binary bytes in the range 0-99. Each byte is converted to the equivalent BCD digit pair in adjacent nibbles.
This can be used as the last stage operation in wider binary to decimal conversions.
processor | Latency | Throughput |
---|---|---|
power8 | 24-34 | 1/cycle |
power9 | 27-37 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned char of radix 100 digits. |
Vector Decimal Convert radix 10**8 Binary words to pairs of radix 10,000 binary halfwords.
Convert 4 radix 10**8 digits to 8 adjacent radix 10,000 digits. Input is radix 10**8 digits as binary words in the range 0-99999999. Each word is converted to the equivalent radix 10,000 pair in adjacent halfwords.
This can be used as an intermediate stage operation in wider binary to decimal conversions.
processor | Latency | Throughput |
---|---|---|
power8 | 18-25 | 1/cycle |
power9 | 19-26 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned int of radix 10**8 digits. |
Vector Decimal Convert radix 10**16 Binary doublewords to pairs of radix 10**8 binary words.
Convert 2 radix 10**16 digits to 4 adjacent radix 10**8 digits. Input is radix 10**16 digits as binary doublewords in the range 0-9999999999999999. Each doubleword is converted to the equivalent radix 10**8 pair in adjacent words.
This can be used as an intermediate stage operation in wider binary to decimal conversions.
processor | Latency | Throughput |
---|---|---|
power8 | 51-61 | 1/cycle |
power9 | 30-40 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned long of radix 10**16 digits. |
Vector Decimal Convert radix 10**32 Binary quadword to pairs of radix 10**16 binary doublewords.
Convert a binary quadword to 2 adjacent radix 10**16 digits. Input is a binary quadword in the range 0-99999999999999999999999999999999. The quadword is converted to the equivalent radix 10**16 pair in adjacent doublewords.
This can be used as a first stage operation in binary to decimal conversions.
processor | Latency | Throughput |
---|---|---|
power8 | 85-95 | 1/cycle |
power9 | 56-66 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned __int128 in the range 0-99999999999999999999999999999999. |
Vector Decimal Convert radix 10,000 Binary halfwords to pairs of radix 100 binary bytes.
Convert 8 radix 10,000 digits to 16 adjacent radix 100 digits. Input is radix 10,000 digits as binary halfwords in the range 0-9999. Each halfword is converted to the equivalent radix 100 pair in adjacent bytes.
This can be used as an intermediate stage operation in wider binary to decimal conversions.
processor | Latency | Throughput |
---|---|---|
power8 | 24-34 | 1/cycle |
power9 | 27-37 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned short of radix 10,000 digits. |
Vector Decimal Convert Zoned Decimal digit pairs to radix 100 binary integer bytes.
Convert 32 decimal digits from Zoned Format (one character per digit, in 2 vectors) to Binary coded century format. Century format is adjacent digit pairs converted to a binary integer in the range 0-99. Each century digit is stored in a byte. Input values should be valid decimal characters in the range 0-9.
This can be used as the first stage operation in wider decimal to binary conversions. Basically the result of this stage are binary coded 100s "digits" that can be passed to vec_bcdctb10ks().
processor | Latency | Throughput |
---|---|---|
power8 | 15-17 | 1/cycle |
power9 | 17-20 | 1/cycle |
zone00 | a 128-bit vector char containing the high order 16 digits of a 32-digit number. |
zone16 | a 128-bit vector char containing the low order 16 digits of a 32-digit number. |
Vector Decimal Convert Binary Coded Decimal (BCD) digit pairs to radix 100 binary integer bytes.
Convert 32 decimal digits from BCD Format (one 4-bit nibble per digit) to Binary coded century format. Century format is adjacent digit pairs converted to a binary integer in the range 0-99. Each century digit is stored in a byte. Input values should be valid BCD nibbles in the range 0-9.
This can be used as the first stage operation in wider decimal to binary conversions. Basically the result of this stage are binary coded Century "digits" that can be passed to vec_bcdctb10ks().
processor | Latency | Throughput |
---|---|---|
power8 | 13-22 | 1/cycle |
power9 | 14-23 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned char of BCD nibble pairs. |
Vector Decimal Convert radix 10,000 digit halfword pairs to radix 100,000,000 binary integer words.
Convert from 10k digit Format (one 10k per halfword) to Binary coded 100m (one per word) format. 100m format is adjacent 10k digit pairs converted to a binary integer in the range 0-99999999. Input halfword values should be valid 10Ks in the range 0-9999. The result will be binary int values in the range 0-99999999.
This can be used as the intermediate stage operation in wider BCD to binary conversions. Basically the result of this stage are binary coded 100,000,000s "digit" words which can be passed to vec_bcdctb10es().
processor | Latency | Throughput |
---|---|---|
power8 | 9-18 | 1/cycle |
power9 | 9-18 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned short of radix 10k digit pairs. |
Vector Decimal Convert radix 100,000,000 digit word pairs to radix 10**16 binary integer doublewords.
Convert from 100m digit format (one 100m digit per word) to Binary coded 10p (one per doubleword) format. 10p format is adjacent 100m digit pairs converted to a binary long integer in the range 0-9999999999999999 (10 quadrillion). Input word values should be valid 100m in the range 0-99999999.
This can be used as the intermediate stage operation in wider BCD to binary conversions. Basically the result of this stage are binary coded 10,000,000,000,000,000s "digits" doublewords which can be passed to vec_bcdctb10e32().
processor | Latency | Throughput |
---|---|---|
power8 | 9-18 | 1/cycle |
power9 | 9-18 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned int of radix 100m digit pairs. |
Vector Decimal Convert radix 10**16 digit pairs to radix 10**32 __int128 quadwords.
Convert from 10p digit format (one 10p digit per doubleword) to binary __int128 (one per quadword) format. Input doubleword values should be valid long integers in the range 0-9999999999999999. The result will be a binary __int128 value in the range 0-99999999999999999999999999999999.
This can be used as the final stage operation in a 32-digit BCD to binary __int128 conversion.
processor | Latency | Throughput |
---|---|---|
power8 | 25-32 | 1/cycle |
power9 | 10-19 | 2/cycle |
vra | a 128-bit vector treated as a vector unsigned long of radix 10**16 digit pairs. |
Vector Decimal Convert radix 100 digit pairs to radix 10,000 binary integer halfwords.
Convert from 16 century digit Format (one century per byte) to 8 Binary coded 10k (one per halfword) format. 10K format is adjacent century digit pairs converted to a binary integer in the range 0-9999. Input byte values should be valid 100s in the range 0-99. The result vector will be 8 short int values in the range 0-9999.
This can be used as the intermediate stage operation in wider BCD to binary conversions. Basically the result of this stage are binary coded 10,000s "digits" which can be passed to vec_bcdctb100ms().
processor | Latency | Throughput |
---|---|---|
power8 | 9-18 | 1/cycle |
power9 | 9-18 | 1/cycle |
vra | a 128-bit vector treated as a vector unsigned char of radix 100 digit pairs. |
Vector Set Bool from Signed BCD Quadword if invalid.
If the quadword's sign nibble is 0xB, 0xD, 0xA, 0xC, 0xE, or 0xF and all 31 digit nibbles are in the range 0-9 then return a vector bool __int128 that is all '0's. Otherwise return all '1's.
processor | Latency | Throughput |
---|---|---|
power8 | 15 - 39 | 1/cycle |
power9 | 3 - 15 | 1/cycle |
vra | a 128-bit vector treated as signed BCD quadword. |
Vector Set Bool from Signed BCD Quadword.
If the quadword's sign nibble is 0xB or 0xD then return a vector bool __int128 that is all '1's. Otherwise if the sign nibble is 0xA, 0xC, 0xE, or 0xF then return all '0's.
Note: For _ARCH_PWR7 and earlier (No vector BCD instructions), this implementation only tests for a valid plus sign nibble. Otherwise the BCD value is assumed to be negative.
processor | Latency | Throughput |
---|---|---|
power8 | 17 - 26 | 2/cycle |
power9 | 5 - 14 | 2/cycle |
vra | a 128-bit vector treated as signed BCD quadword. |
|
inlinestatic |
Vector Sign bit from Signed BCD Quadword.
If the quadword's sign nibble is 0xB or 0xD then return a non-zero value. Otherwise if the sign nibble is 0xA, 0xC, 0xE, or 0xF then return all '0's.
Note: For _ARCH_PWR7 and earlier (No vector BCD instructions), this implementation only tests for a valid minus sign nibble. Otherwise the BCD value is assumed to be positive.
processor | Latency | Throughput |
---|---|---|
power8 | 15 - 26 | 2/cycle |
power9 | 5 - 14 | 2/cycle |
vra | a 128-bit vector treated as signed BCD quadword. |
|
inlinestatic |
Unpack a doubleword vector (vector double) into an FPR pair (_Decimal128).
processor | Latency | Throughput |
---|---|---|
power8 | 2 | 1/cycle |
power9 | 3 | 1/cycle |
lval | doubleword vector (vector double) holding the _Decimal128 value to be unpacked into an FPR pair. |
Vector Zoned Decimal Convert 32 digits to binary unsigned quadword.
Vector convert 2x quadwords each containing 16 digits to the equivalent unsigned __int128, in the range 0-99999999999999999999999999999999. Input values should be valid 32 zoned digits in the range '0'-'9'.
processor | Latency | Throughput |
---|---|---|
power8 | 67-73 | 1/cycle |
power9 | 55-62 | 1/cycle |
zone00 | a 128-bit vector char containing the high order 16 digits of a 32-digit number. |
zone16 | a 128-bit vector char containing the low order 16 digits of a 32-digit number. |