POWER Vector Library Manual  1.0.4
vec_int16_ppc.h
/*
 Copyright (c) [2018] IBM Corporation.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

 vec_int16_ppc.h

 Contributors:
 IBM Corporation, Steven Munroe
 Created on: Apr 06, 2018
 */

#ifndef VEC_INT16_PPC_H_
#define VEC_INT16_PPC_H_

#include <pveclib/vec_char_ppc.h>
#ifndef vec_popcnth
static inline vui16_t vec_popcnth (vui16_t vra);
#else
/* Workaround for GCC PR85830. */
#undef vec_popcnth
#define vec_popcnth __builtin_vec_vpopcnth
#endif
static inline vui16_t vec_vmrgeh (vui16_t vra, vui16_t vrb);
static inline vui16_t vec_vmrgoh (vui16_t vra, vui16_t vrb);
static inline vui16_t
vec_absduh (vui16_t vra, vui16_t vrb)
{
  vui16_t result;
#ifdef _ARCH_PWR9
#ifdef vec_absdh
  result = vec_absdh (vra, vrb);
#else
  __asm__(
      "vabsduh %0,%1,%2;"
      : "=v" (result)
      : "v" (vra), "v" (vrb)
      : );
#endif
#else
  vui16_t vmin, vmax;

  vmin = vec_min (vra, vrb);
  vmax = vec_max (vra, vrb);
  result = vec_sub (vmax, vmin);
#endif
  return (result);
}
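/* Example usage (an illustrative sketch added for this listing; the
   helper name example_absduh is hypothetical, not a PVECLIB API).
   Element-wise: |10-3|=7, |2-9|=7, |0xFFFF-0|=0xFFFF, |0-1|=1. The
   min/max/sub fallback cannot overflow for any unsigned inputs. */
static inline vui16_t
example_absduh (void)
{
  vui16_t a = { 10, 2, 0xFFFF, 0, 0, 0, 0, 0 };
  vui16_t b = {  3, 9, 0,      1, 0, 0, 0, 0 };
  // Expected result: { 7, 7, 0xFFFF, 1, 0, 0, 0, 0 }
  return vec_absduh (a, b);
}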
static inline vui16_t
vec_clzh (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR8
#if defined (vec_vclzh)
  r = vec_vclzh (vra);
#elif defined (__clang__)
  r = vec_cntlz (vra);
#else
  __asm__(
      "vclzh %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
//#warning Implementation pre power8
  vui16_t n, nt, y, x, s, m;
  vui16_t z = { 0,0,0,0, 0,0,0,0 };
  vui16_t one = { 1,1,1,1, 1,1,1,1 };

  /* n = 16, s = 8 */
  s = vec_splat_u16 (8);
  n = vec_add (s, s);
  x = vra;

  /* y = x >> 8; if (y != 0) { n = n - 8; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 4; if (y != 0) { n = n - 4; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 2; if (y != 0) { n = n - 2; x = y; } */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  m = (vui16_t) vec_cmpgt (y, z);
  s = vec_sr (s, one);
  x = vec_sel (x, y, m);
  n = vec_sel (n, nt, m);

  /* y = x >> 1; if (y != 0) return (n - 2) */
  y = vec_sr (x, s);
  nt = vec_sub (n, s);
  nt = vec_sub (nt, s);
  m = (vui16_t) vec_cmpgt (y, z);
  n = vec_sel (n, nt, m);

  /* else return (n - x) */
  nt = vec_sub (n, x);
  r = vec_sel (nt, n, m);
#endif

  return (r);
}
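/* Worked example (editor's sketch; example_clzh is hypothetical).
   For x = 0x00FF the pre-POWER8 binary search proceeds:
     x >> 8 == 0      -> keep n = 16, x = 0x00FF
     x >> 4 == 0x000F -> n = 12, x = 0x000F
     x >> 2 == 0x0003 -> n = 10, x = 0x0003
     x >> 1 == 0x0001 -> return n - 2 = 8
   which matches clz16 (0x00FF) = 8. */
static inline vui16_t
example_clzh (void)
{
  vui16_t x = { 1, 0x8000, 0, 0x00FF, 0x0100, 2, 0x7FFF, 0xFFFF };
  // Expected result: { 15, 0, 16, 8, 7, 14, 1, 0 }
  return vec_clzh (x);
}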
static inline vui16_t
vec_ctzh (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR9
#if defined (vec_cnttz) || defined (__clang__)
  r = vec_cnttz (vra);
#else
  __asm__(
      "vctzh %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
// For _ARCH_PWR8 and earlier. Generate 1's for the trailing zeros
// and 0's otherwise. Then count (popcnt) the 1's. _ARCH_PWR8 uses
// the hardware vpopcnth instruction. _ARCH_PWR7 and earlier use the
// PVECLIB vec_popcnth implementation which runs ~20-26 instructions.
  const vui16_t ones = vec_splat_u16 (1);
  vui16_t tzmask;
  // tzmask = (!vra & (vra - 1))
  tzmask = vec_andc (vec_sub (vra, ones), vra);
  // return = vec_popcnt (!vra & (vra - 1))
  r = vec_popcnth (tzmask);
#endif
  return ((vui16_t) r);
}
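/* Worked example (editor's sketch; example_ctzh is hypothetical).
   For x = 0x0050: x - 1 = 0x004F, ~x & (x - 1) = 0x000F, and
   popcount (0x000F) = 4 = ctz16 (0x0050). For x = 0 the mask is
   0xFFFF & 0xFFFF = 0xFFFF, giving the expected count of 16. */
static inline vui16_t
example_ctzh (void)
{
  vui16_t x = { 0x8000, 1, 0, 0x0050, 2, 0xFFFF, 0x0100, 4 };
  // Expected result: { 15, 0, 16, 4, 1, 0, 8, 2 }
  return vec_ctzh (x);
}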
static inline vui16_t
vec_mrgahh (vui32_t vra, vui32_t vrb)
{
  return vec_vmrgeh ((vui16_t) vra, (vui16_t) vrb);
}

static inline vui16_t
vec_mrgalh (vui32_t vra, vui32_t vrb)
{
  return vec_vmrgoh ((vui16_t) vra, (vui16_t) vrb);
}
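/* Illustrative usage (editor's sketch; example_mrgahh is hypothetical).
   The "Algebraic" merges select the numerically high (or low) halfword
   of each 32-bit word, independent of endian. Passing the same operand
   twice makes the high/low extraction easy to see: */
static inline vui16_t
example_mrgahh (void)
{
  vui32_t a = { 0x11112222, 0x33334444, 0x55556666, 0x77778888 };
  // vec_mrgahh (a, a): { 0x1111, 0x1111, 0x3333, 0x3333,
  //                      0x5555, 0x5555, 0x7777, 0x7777 }
  // vec_mrgalh (a, a): { 0x2222, 0x2222, 0x4444, 0x4444,
  //                      0x6666, 0x6666, 0x8888, 0x8888 }
  // With distinct operands the halfwords of vra and vrb interleave.
  return vec_mrgahh (a, a);
}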
static inline vui16_t
vec_mrgeh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_vmrgoh (vrb, vra);
#else
  return vec_vmrgeh (vra, vrb);
#endif
}

static inline vui16_t
vec_mrgoh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_vmrgeh (vrb, vra);
#else
  return vec_vmrgoh (vra, vrb);
#endif
}
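/* Illustrative usage (editor's sketch; example_mrgeh is hypothetical).
   With elements numbered in array-index order, the endian-corrected
   merges interleave the even (or odd) indexed halfwords of vra and
   vrb, giving the same array-order result on either endian: */
static inline vui16_t
example_mrgeh (void)
{
  vui16_t a = { 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7 };
  vui16_t b = { 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7 };
  // vec_mrgeh (a, b): { 0xA0, 0xB0, 0xA2, 0xB2, 0xA4, 0xB4, 0xA6, 0xB6 }
  // vec_mrgoh (a, b): { 0xA1, 0xB1, 0xA3, 0xB3, 0xA5, 0xB5, 0xA7, 0xB7 }
  return vec_mrgeh (a, b);
}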
static inline vi16_t
vec_mulhsh (vi16_t vra, vi16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (vi16_t) vec_mrgahh ((vui32_t) vec_mulo (vra, vrb),
                              (vui32_t) vec_mule (vra, vrb));
#else
  return (vi16_t) vec_mrgahh ((vui32_t) vec_mule (vra, vrb),
                              (vui32_t) vec_mulo (vra, vrb));
#endif
}

static inline vui16_t
vec_mulhuh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgahh (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgahh (vec_mule (vra, vrb), vec_mulo (vra, vrb));
#endif
}

static inline vui16_t
vec_muluhm (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return vec_mrgalh (vec_mulo (vra, vrb), vec_mule (vra, vrb));
#else
  return vec_mrgalh (vec_mule (vra, vrb), vec_mulo (vra, vrb));
#endif
}
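/* Worked example (editor's sketch; example_mulhuh is hypothetical).
   Each 16x16 product is 32 bits; mulhuh keeps the high 16, muluhm
   the low 16 (modulo 2**16). Full products below: 0x10000,
   90000 (0x15F90), 0xFFFE0001, and 63. */
static inline vui16_t
example_mulhuh (void)
{
  vui16_t a = { 0x8000, 300, 0xFFFF, 7, 0, 0, 0, 0 };
  vui16_t b = { 2,      300, 0xFFFF, 9, 0, 0, 0, 0 };
  // vec_mulhuh (a, b): { 1, 1, 0xFFFE, 0, 0, 0, 0, 0 }
  // vec_muluhm (a, b): { 0, 0x5F90, 0x0001, 63, 0, 0, 0, 0 }
  // Signed variant: vec_mulhsh of -2 and 16384 gives -1
  // (high 16 bits of -32768 = 0xFFFF8000).
  return vec_mulhuh (a, b);
}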
#ifndef vec_popcnth
static inline vui16_t
vec_popcnth (vui16_t vra)
{
  vui16_t r;
#ifdef _ARCH_PWR8
#if defined (vec_vpopcnth)
  r = vec_vpopcnth (vra);
#elif defined (__clang__)
  r = vec_popcnt (vra);
#else
  __asm__(
      "vpopcnth %0,%1;"
      : "=v" (r)
      : "v" (vra)
      : );
#endif
#else
  //#warning Implementation pre power8
  __vector unsigned short n, x1, x2, x, s;
  __vector unsigned short ones = { 1,1,1,1, 1,1,1,1 };
  __vector unsigned short fives =
      { 0x5555,0x5555,0x5555,0x5555, 0x5555,0x5555,0x5555,0x5555 };
  __vector unsigned short threes =
      { 0x3333,0x3333,0x3333,0x3333, 0x3333,0x3333,0x3333,0x3333 };
  __vector unsigned short fs =
      { 0x0f0f,0x0f0f,0x0f0f,0x0f0f, 0x0f0f,0x0f0f,0x0f0f,0x0f0f };
  /* s = 1, doubling after each step (1, 2, 4, 8). */
  s = ones;
  x = vra;

  /* x = x - ((x >> 1) & 0x5555) */
  x2 = vec_and (vec_sr (x, s), fives);
  n = vec_sub (x, x2);
  s = vec_add (s, s);

  /* x = (x & 0x3333) + ((x & 0xcccc) >> 2) */
  x1 = vec_and (n, threes);
  x2 = vec_andc (n, threes);
  n = vec_add (x1, vec_sr (x2, s));
  s = vec_add (s, s);

  /* x = (x + (x >> 4)) & 0x0f0f */
  x1 = vec_add (n, vec_sr (n, s));
  n = vec_and (x1, fs);
  s = vec_add (s, s);

  /* This avoids the extra load const. */
  /* x = (x + (x << 8)) >> 8 */
  x1 = vec_add (n, vec_sl (n, s));
  r = vec_sr (x1, s);
#endif
  return (r);
}
#else
/* Workaround for GCC PR85830. */
#undef vec_popcnth
#define vec_popcnth __builtin_vec_vpopcnth
#endif
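/* Worked example (editor's sketch; example_popcnth is hypothetical).
   Tracing the pre-POWER8 SWAR path for one element, x = 0xB3C5
   (1011 0011 1100 0101, nine 1-bits):
     x - ((x >> 1) & 0x5555)            = 0x6285  (2-bit sums)
     (x & 0x3333) + ((x & 0xcccc) >> 2) = 0x3222  (4-bit sums)
     (x + (x >> 4)) & 0x0f0f            = 0x0504  (byte sums)
     (x + (x << 8)) >> 8                = 0x0009  (total) */
static inline vui16_t
example_popcnth (void)
{
  vui16_t x = { 0xB3C5, 0, 1, 0xFFFF, 0x8001, 0x5555, 0x0F00, 3 };
  // Expected result: { 9, 0, 1, 16, 2, 8, 4, 2 }
  return vec_popcnth (x);
}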
static inline vui16_t
vec_revbh (vui16_t vra)
{
  vui16_t result;

#ifdef _ARCH_PWR9
#if defined (vec_revb) || defined (__clang__)
  result = vec_revb (vra);
#else
  __asm__(
      "xxbrh %x0,%x1;"
      : "=wa" (result)
      : "wa" (vra)
      : );
#endif
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const vui64_t vconstp =
      CONST_VINT64_DW (0x0100030205040706UL, 0x09080B0A0D0C0F0EUL);
#else
  const vui64_t vconstp =
      CONST_VINT64_DW (0x0E0F0C0D0A0B0809UL, 0x0607040502030001UL);
#endif
  result = (vui16_t) vec_perm ((vui8_t) vra, (vui8_t) vra, (vui8_t) vconstp);
#endif

  return (result);
}
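/* Illustrative usage (editor's sketch; example_revbh is hypothetical).
   Each halfword has its two bytes swapped, e.g. 0x1234 -> 0x3412. */
static inline vui16_t
example_revbh (void)
{
  vui16_t x = { 0x1234, 0xABCD, 0x00FF, 0x8000, 0, 1, 2, 3 };
  // Expected: { 0x3412, 0xCDAB, 0xFF00, 0x0080, 0, 0x0100, 0x0200, 0x0300 }
  return vec_revbh (x);
}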
static inline vb16_t
vec_setb_sh (vi16_t vra)
{
  vb16_t result;

#if defined (_ARCH_PWR10) && (__GNUC__ >= 10)
  __asm__(
      "vexpandhm %0,%1"
      : "=v" (result)
      : "v" (vra)
      : );
#else
  const vui16_t rshift = vec_splat_u16 (15);
  // Vector Shift Right Algebraic Halfwords 15-bits.
  result = (vb16_t) vec_sra (vra, rshift);
#endif
  return result;
}
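/* Illustrative usage (editor's sketch; example_setb_sh is hypothetical).
   Arithmetic shift right by 15 replicates the sign bit across the
   element, producing an all-ones bool where the element is negative. */
static inline vb16_t
example_setb_sh (void)
{
  vi16_t x = { -1, 0, 32767, -32768, 1, -2, 0, 100 };
  // Expected: { 0xFFFF, 0, 0, 0xFFFF, 0, 0xFFFF, 0, 0 }
  return vec_setb_sh (x);
}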
static inline vui16_t
vec_slhi (vui16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vui16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Left Halfwords based on the lower 4-bits of
         the corresponding element of lshift. */
      result = vec_vslh (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return zeros. */
      result = vec_xor ((vui16_t) vra, (vui16_t) vra);
    }

  return (vui16_t) result;
}

static inline vui16_t
vec_srhi (vui16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vui16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Right Halfwords based on the lower 4-bits of
         the corresponding element of lshift. */
      result = vec_vsrh (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return zeros. */
      result = vec_xor ((vui16_t) vra, (vui16_t) vra);
    }
  return (vui16_t) result;
}
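/* Illustrative usage (editor's sketch; example_slhi is hypothetical).
   Shift counts are taken modulo nothing here: counts of 16 or more
   return zeros, as the guard above implements. */
static inline vui16_t
example_slhi (void)
{
  vui16_t x = { 0x00FF, 0x8001, 1, 0xFFFF, 0, 0, 0, 0 };
  // vec_slhi (x, 4): { 0x0FF0, 0x0010, 0x0010, 0xFFF0, 0, 0, 0, 0 }
  // vec_srhi (x, 4): { 0x000F, 0x0800, 0x0000, 0x0FFF, 0, 0, 0, 0 }
  // vec_slhi (x, 16): all zeros.
  return vec_slhi (x, 4);
}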
static inline vi16_t
vec_srahi (vi16_t vra, const unsigned int shb)
{
  vui16_t lshift;
  vi16_t result;

  if (shb < 16)
    {
      /* Load the shift const in a vector. The element shifts require
         a shift amount for each element. For the immediate form the
         shift constant is splatted to all elements of the
         shift control. */
      if (__builtin_constant_p (shb))
        lshift = (vui16_t) vec_splat_s16 (shb);
      else
        lshift = vec_splats ((unsigned short) shb);

      /* Vector Shift Right Algebraic Halfwords based on the lower
         4-bits of the corresponding element of lshift. */
      result = vec_vsrah (vra, lshift);
    }
  else
    { /* Shifts greater than 15 bits return the sign bit propagated
         to all bits. This is equivalent to a shift Right Algebraic
         of 15 bits. */
      lshift = (vui16_t) vec_splat_s16 (15);
      result = vec_vsrah (vra, lshift);
    }

  return (vi16_t) result;
}
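/* Illustrative usage (editor's sketch; example_srahi is hypothetical).
   The algebraic shift rounds toward negative infinity, e.g.
   -4081 (0xF00F) >> 4 = -256 (0xFF00). */
static inline vi16_t
example_srahi (void)
{
  vi16_t x = { -32768, -16, 16, -1, 0x7FFF, 0, -4081, 256 };
  // vec_srahi (x, 4): { -2048, -1, 1, -1, 0x07FF, 0, -256, 16 }
  // For shb > 15 the sign fills the element, as if shb were 15.
  return vec_srahi (x, 4);
}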
static inline vui32_t
vec_vmaddeuh (vui16_t a, vui16_t b, vui16_t c)
{
  const vui16_t zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
  vui32_t res;
  vui16_t c_euh = vec_mrgahh ((vui32_t) zero, (vui32_t) c);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  res = vec_vmulouh (a, b);
#else
  res = vec_vmuleuh (a, b);
#endif
  return vec_vadduwm (res, (vui32_t) c_euh);
}

static inline vui32_t
vec_vmaddouh (vui16_t a, vui16_t b, vui16_t c)
{
  const vui16_t zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
  vui32_t res;
  vui16_t c_ouh = vec_mrgalh ((vui32_t) zero, (vui32_t) c);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  res = vec_vmuleuh (a, b);
#else
  res = vec_vmulouh (a, b);
#endif
  return vec_vadduwm (res, (vui32_t) c_ouh);
}
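/* Worked example (editor's sketch; example_vmaddeuh is hypothetical).
   Indices below are array order; c is chosen with both halfwords of
   each word equal, which sidesteps endian questions about which c
   halfword each variant consumes. Note the multiply-add cannot
   overflow 32 bits: 0xFFFF*0xFFFF + 0xFFFF = 0xFFFF0000. */
static inline vui32_t
example_vmaddeuh (void)
{
  vui16_t a = { 2, 3, 100, 200, 0xFFFF, 1, 0, 7 };
  vui16_t b = { 10, 20, 1000, 2000, 0xFFFF, 1, 9, 8 };
  vui16_t c = { 5, 5, 11, 11, 0xFFFF, 0xFFFF, 3, 3 };
  // vec_vmaddeuh (a, b, c): { 2*10+5, 100*1000+11, 0xFFFF*0xFFFF+0xFFFF, 0*9+3 }
  //                       = { 25, 100011, 0xFFFF0000, 3 }
  // vec_vmaddouh (a, b, c): { 3*20+5, 200*2000+11, 1*1+0xFFFF, 7*8+3 }
  //                       = { 65, 400011, 0x10000, 59 }
  return vec_vmaddeuh (a, b, c);
}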
static inline vui16_t
vec_vmrgeh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  const vui16_t permute =
      { 0x0302,0x1312, 0x0706,0x1716, 0x0B0A,0x1B1A, 0x0F0E,0x1F1E };
  return vec_perm (vrb, vra, (vui8_t) permute);
#else
  const vui16_t permute =
      { 0x0001,0x1011, 0x0405,0x1415, 0x0809,0x1819, 0x0C0D,0x1C1D };
  return vec_perm (vra, vrb, (vui8_t) permute);
#endif
}

static inline vui16_t
vec_vmrgoh (vui16_t vra, vui16_t vrb)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  const vui16_t permute =
      { 0x0100,0x1110, 0x0504,0x1514, 0x0908,0x1918, 0x0D0C,0x1D1C };
  return vec_perm (vrb, vra, (vui8_t) permute);
#else
  const vui16_t permute =
      { 0x0203,0x1213, 0x0607,0x1617, 0x0A0B,0x1A1B, 0x0E0F,0x1E1F };
  return vec_perm (vra, vrb, (vui8_t) permute);
#endif
}
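/* Editor's note (sketch): vec_vmrgeh and vec_vmrgoh carry PowerISA
   (big-endian) element-numbering semantics, emulating a vmrgew-style
   even/odd halfword merge via vec_perm for POWER7/POWER8 targets.
   In BE element order:
     vec_vmrgeh (a, b) -> { a0, b0, a2, b2, a4, b4, a6, b6 }
     vec_vmrgoh (a, b) -> { a1, b1, a3, b3, a5, b5, a7, b7 }
   Most code should use the endian-corrected vec_mrgeh / vec_mrgoh
   defined earlier in this header. */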
#endif /* VEC_INT16_PPC_H_ */