POWER Vector Library Manual  1.0.4
vec_int512_ppc.h
Go to the documentation of this file.
1 /*
2  Copyright (c) [2019] Steven Munroe
3 
4  Licensed under the Apache License, Version 2.0 (the "License");
5  you may not use this file except in compliance with the License.
6  You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  vec_int512_ppc.h
17 
18  Contributors:
19  Steven Munroe
20  Created on: Aug 24, 2019
21  Steven Munroe, additional contributions for POWER9.
22  */
23 
24 #ifndef SRC_PVECLIB_VEC_INT512_PPC_H_
25 #define SRC_PVECLIB_VEC_INT512_PPC_H_
26 
27 #include <pveclib/vec_int128_ppc.h>
28 
/*! \brief Generate a 512-bit vector unsigned integer constant from
 * four quadword constants, given high-order (__q0) to low-order (__q3).
 *
 * The field order is reversed for little endian so that vx0 (the
 * low-order quadword) is always first in storage. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define CONST_VINT512_Q(__q0, __q1, __q2, __q3) {__q3, __q2, __q1, __q0}
#else
#define CONST_VINT512_Q(__q0, __q1, __q2, __q3) {__q0, __q1, __q2, __q3}
#endif
815 
816 
823 typedef struct
824 {
826 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
827  vui128_t vx0;
828  vui128_t vx1;
829 #else
830  vui128_t vx1;
831  vui128_t vx0;
832 #endif
833 } __VEC_U_256;
835 
842 typedef struct
843 {
845 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
846  vui128_t vx0;
847  vui128_t vx1;
848  vui128_t vx2;
849  vui128_t vx3;
850 #else
851  vui128_t vx3;
852  vui128_t vx2;
853  vui128_t vx1;
854  vui128_t vx0;
855 #endif
856 } __VEC_U_512;
858 
867 typedef struct
868 {
870 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
871  vui128_t vx0;
872  vui128_t vx1;
873  vui128_t vx2;
874  vui128_t vx3;
875  vui128_t vx4;
876 #else
877  vui128_t vx4;
878  vui128_t vx3;
879  vui128_t vx2;
880  vui128_t vx1;
881  vui128_t vx0;
882 #endif
883 } __VEC_U_640;
885 
901 typedef union
902 {
904  __VEC_U_640 x640;
905  struct
906  {
907 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
908  __VEC_U_512 v0x512;
909  vui128_t v1x128;
910 #else
911  vui128_t v1x128;
912  __VEC_U_512 v0x512;
913 #endif
914  } x2;
915  struct
916  {
917 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
918  vui128_t v1x128;
919  __VEC_U_512 v0x512;
920 #else
921  __VEC_U_512 v0x512;
922  vui128_t v1x128;
923 #endif
924  } x3;
926 } __VEC_U_512x1;
927 
934 typedef struct
935 {
937 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
938  vui128_t vx0;
939  vui128_t vx1;
940  vui128_t vx2;
941  vui128_t vx3;
942  vui128_t vx4;
943  vui128_t vx5;
944  vui128_t vx6;
945  vui128_t vx7;
946 #else
947  vui128_t vx7;
948  vui128_t vx6;
949  vui128_t vx5;
950  vui128_t vx4;
951  vui128_t vx3;
952  vui128_t vx2;
953  vui128_t vx1;
954  vui128_t vx0;
955 #endif
956 } __VEC_U_1024;
958 
971 typedef struct
972 {
974 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
975  vui128_t vx0;
976  vui128_t vx1;
977  vui128_t vx2;
978  vui128_t vx3;
979  vui128_t vx4;
980  vui128_t vx5;
981  vui128_t vx6;
982  vui128_t vx7;
983  vui128_t vx8;
984 #else
985  vui128_t vx8;
986  vui128_t vx7;
987  vui128_t vx6;
988  vui128_t vx5;
989  vui128_t vx4;
990  vui128_t vx3;
991  vui128_t vx2;
992  vui128_t vx1;
993  vui128_t vx0;
994 #endif
995 } __VEC_U_1152;
997 
1008 typedef struct
1009 {
1011 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1012  vui128_t vx0;
1013  vui128_t vx1;
1014  vui128_t vx2;
1015  vui128_t vx3;
1016  vui128_t vx4;
1017  vui128_t vx5;
1018  vui128_t vx6;
1019  vui128_t vx7;
1020  vui128_t vx8;
1021  vui128_t vx9;
1022  vui128_t vx10;
1023  vui128_t vx11;
1024  vui128_t vx12;
1025  vui128_t vx13;
1026  vui128_t vx14;
1027  vui128_t vx15;
1028 #else
1029  vui128_t vx15;
1030  vui128_t vx14;
1031  vui128_t vx13;
1032  vui128_t vx12;
1033  vui128_t vx11;
1034  vui128_t vx10;
1035  vui128_t vx9;
1036  vui128_t vx8;
1037  vui128_t vx7;
1038  vui128_t vx6;
1039  vui128_t vx5;
1040  vui128_t vx4;
1041  vui128_t vx3;
1042  vui128_t vx2;
1043  vui128_t vx1;
1044  vui128_t vx0;
1045 #endif
1046 } __VEC_U_2048;
1048 
1061 typedef union
1062 {
1064  __VEC_U_1024 x1024;
1065  struct
1066  {
1067 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1068  __VEC_U_512 v0x512;
1069  __VEC_U_512 v1x512;
1070 #else
1071  __VEC_U_512 v1x512;
1072  __VEC_U_512 v0x512;
1073 #endif
1074  } x2;
1077 
1095 typedef union
1096 {
1098  __VEC_U_2048 x2048;
1099  struct
1100  {
1101 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1102  __VEC_U_1024 v0x1024;
1103  __VEC_U_1024 v1x1024;
1104 #else
1105  __VEC_U_1024 v1x1024;
1106  __VEC_U_1024 v0x1024;
1107 #endif
1108  } x2;
1109  struct
1110  {
1111 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1112  __VEC_U_512 v0x512;
1113  __VEC_U_512 v1x512;
1114  __VEC_U_512 v2x512;
1115  __VEC_U_512 v3x512;
1116 #else
1117  __VEC_U_512 v3x512;
1118  __VEC_U_512 v2x512;
1119  __VEC_U_512 v1x512;
1120  __VEC_U_512 v0x512;
1121 #endif
1122  } x4;
1125 
1138 typedef struct
1139 {
1141 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1142  vui128_t vx0;
1143  vui128_t vx1;
1144  vui128_t vx2;
1145  vui128_t vx3;
1146  vui128_t vx4;
1147  vui128_t vx5;
1148  vui128_t vx6;
1149  vui128_t vx7;
1150  vui128_t vx8;
1151  vui128_t vx9;
1152  vui128_t vx10;
1153  vui128_t vx11;
1154  vui128_t vx12;
1155  vui128_t vx13;
1156  vui128_t vx14;
1157  vui128_t vx15;
1158  vui128_t vx16;
1159 #else
1160  vui128_t vx16;
1161  vui128_t vx15;
1162  vui128_t vx14;
1163  vui128_t vx13;
1164  vui128_t vx12;
1165  vui128_t vx11;
1166  vui128_t vx10;
1167  vui128_t vx9;
1168  vui128_t vx8;
1169  vui128_t vx7;
1170  vui128_t vx6;
1171  vui128_t vx5;
1172  vui128_t vx4;
1173  vui128_t vx3;
1174  vui128_t vx2;
1175  vui128_t vx1;
1176  vui128_t vx0;
1177 #endif
1178 } __VEC_U_2176;
1180 
1191 typedef struct
1192 {
1194 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1195  vui128_t vx0;
1196  vui128_t vx1;
1197  vui128_t vx2;
1198  vui128_t vx3;
1199  vui128_t vx4;
1200  vui128_t vx5;
1201  vui128_t vx6;
1202  vui128_t vx7;
1203  vui128_t vx8;
1204  vui128_t vx9;
1205  vui128_t vx10;
1206  vui128_t vx11;
1207  vui128_t vx12;
1208  vui128_t vx13;
1209  vui128_t vx14;
1210  vui128_t vx15;
1211  vui128_t vx16;
1212  vui128_t vx17;
1213  vui128_t vx18;
1214  vui128_t vx19;
1215  vui128_t vx20;
1216  vui128_t vx21;
1217  vui128_t vx22;
1218  vui128_t vx23;
1219  vui128_t vx24;
1220  vui128_t vx25;
1221  vui128_t vx26;
1222  vui128_t vx27;
1223  vui128_t vx28;
1224  vui128_t vx29;
1225  vui128_t vx30;
1226  vui128_t vx31;
1227 #else
1228  vui128_t vx31;
1229  vui128_t vx30;
1230  vui128_t vx29;
1231  vui128_t vx28;
1232  vui128_t vx27;
1233  vui128_t vx26;
1234  vui128_t vx25;
1235  vui128_t vx24;
1236  vui128_t vx23;
1237  vui128_t vx22;
1238  vui128_t vx21;
1239  vui128_t vx20;
1240  vui128_t vx19;
1241  vui128_t vx18;
1242  vui128_t vx17;
1243  vui128_t vx16;
1244  vui128_t vx15;
1245  vui128_t vx14;
1246  vui128_t vx13;
1247  vui128_t vx12;
1248  vui128_t vx11;
1249  vui128_t vx10;
1250  vui128_t vx9;
1251  vui128_t vx8;
1252  vui128_t vx7;
1253  vui128_t vx6;
1254  vui128_t vx5;
1255  vui128_t vx4;
1256  vui128_t vx3;
1257  vui128_t vx2;
1258  vui128_t vx1;
1259  vui128_t vx0;
1260 #endif
1261 } __VEC_U_4096;
1263 
1281 typedef union
1282 {
1284  __VEC_U_4096 x4096;
1285  struct
1286  {
1287 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1288  __VEC_U_2048 v0x2048;
1289  __VEC_U_2048 v1x2048;
1290 #else
1291  __VEC_U_2048 v1x2048;
1292  __VEC_U_2048 v0x2048;
1293 #endif
1294  } x2;
1295  struct
1296  {
1297 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1298  __VEC_U_1024 v0x1024;
1299  __VEC_U_1024 v1x1024;
1300  __VEC_U_1024 v2x1024;
1301  __VEC_U_1024 v3x1024;
1302 #else
1303  __VEC_U_1024 v3x1024;
1304  __VEC_U_1024 v2x1024;
1305  __VEC_U_1024 v1x1024;
1306  __VEC_U_1024 v0x1024;
1307 #endif
1308  } x4;
1309  struct
1310  {
1311 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
1312  __VEC_U_512 v0x512;
1313  __VEC_U_512 v1x512;
1314  __VEC_U_512 v2x512;
1315  __VEC_U_512 v3x512;
1316  __VEC_U_512 v4x512;
1317  __VEC_U_512 v5x512;
1318  __VEC_U_512 v6x512;
1319  __VEC_U_512 v7x512;
1320 #else
1321  __VEC_U_512 v7x512;
1322  __VEC_U_512 v6x512;
1323  __VEC_U_512 v5x512;
1324  __VEC_U_512 v4x512;
1325  __VEC_U_512 v3x512;
1326  __VEC_U_512 v2x512;
1327  __VEC_U_512 v1x512;
1328  __VEC_U_512 v0x512;
1329 #endif
1330  } x8;
1333 
/*! \brief A compiler fence to prevent excessive code motion.
 *
 * Expands to an empty inline asm statement that acts as an
 * optimization barrier. Define __VEC_EXPLICIT_FENCE_NOPS__ to
 * emit a real nop instead, making the fences visible in the
 * generated object code. */
#ifdef __VEC_EXPLICIT_FENCE_NOPS__
// Generate NOPS inline to make compiler fences visible in obj code.
#define COMPILE_FENCE __asm ("nop":::)
#else
#define COMPILE_FENCE __asm (";":::)
#endif
1346 
/*! \brief Macro to add a platform suffix (_PWR7 .. _PWR10) to a
 * function name, selecting the implementation matching the
 * compile target (_ARCH_PWRn). */
#ifdef _ARCH_PWR10
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR10
#else
#ifdef _ARCH_PWR9
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR9
#else
#ifdef _ARCH_PWR8
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR8
#else
#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR7
#endif
#endif
#endif
1361 
1379 static inline __VEC_U_640
1381 {
1382  __VEC_U_640 result;
1383  vui128_t mc, mp;
1384 
1385  result.vx0 = vec_addcq (&mc, a.vx0, b.vx0);
1386  result.vx1 = vec_addeq (&mp, a.vx1, b.vx1, mc);
1387  result.vx2 = vec_addeq (&mc, a.vx2, b.vx2, mp);
1388  result.vx3 = vec_addeq (&result.vx4, a.vx3, b.vx3, mc);
1389  return result;
1390 }
1391 
1411 static inline __VEC_U_640
1413 {
1414  __VEC_U_640 result;
1415  vui128_t mp, mq;
1416 
1417  result.vx0 = vec_addeq (&mq, a.vx0, b.vx0, c);
1418  result.vx1 = vec_addeq (&mp, a.vx1, b.vx1, mq);
1419  result.vx2 = vec_addeq (&mq, a.vx2, b.vx2, mp);
1420  result.vx3 = vec_addeq (&result.vx4, a.vx3, b.vx3, mq);
1421  return result;
1422 }
1423 
1443 static inline __VEC_U_512
1445 {
1446  __VEC_U_512 result;
1447  vui128_t mp, mq;
1448 
1449  result.vx0 = vec_addeq (&mq, a.vx0, b.vx0, c);
1450  result.vx1 = vec_addeq (&mp, a.vx1, b.vx1, mq);
1451  result.vx2 = vec_addeq (&mq, a.vx2, b.vx2, mp);
1452  result.vx3 = vec_addeuqm (a.vx3, b.vx3, mq);
1453  return result;
1454 }
1455 
1473 static inline __VEC_U_512
1475 {
1476  __VEC_U_512 result;
1477  vui128_t mc, mp;
1478 
1479  result.vx0 = vec_addcq (&mc, a.vx0, b.vx0);
1480  result.vx1 = vec_addeq (&mp, a.vx1, b.vx1, mc);
1481  result.vx2 = vec_addeq (&mc, a.vx2, b.vx2, mp);
1482  result.vx3 = vec_addeuqm ( a.vx3, b.vx3, mc);
1483  return result;
1484 }
1485 
1506 static inline vec_add512ze (__VEC_U_512 a, vui128_t c)
1507 {
1508  __VEC_U_512 result;
1509  vui128_t mp, mq;
1510 
1511  result.vx0 = vec_adduqm (a.vx0, c);
1512  mq = vec_addcuq (a.vx0, c);
1513  result.vx1 = vec_adduqm (a.vx1, mq);
1514  mp = vec_addcuq (a.vx1, mq);
1515  result.vx2 = vec_adduqm (a.vx2, mp);
1516  mq = vec_addcuq (a.vx2, mp);
1517  result.vx3 = vec_adduqm (a.vx3, mq);
1518  return result;
1519 }
1520 
1542 static inline vec_add512ze2 (__VEC_U_512 a, vui128_t c1, vui128_t c2)
1543 {
1544  __VEC_U_512 result;
1545  vui128_t mp, mq;
1546 
1547  result.vx0 = vec_addeuqm (a.vx0, c1, c2);
1548  mq = vec_addecuq (a.vx0, c1, c2);
1549  result.vx1 = vec_adduqm (a.vx1, mq);
1550  mp = vec_addcuq (a.vx1, mq);
1551  result.vx2 = vec_adduqm (a.vx2, mp);
1552  mq = vec_addcuq (a.vx2, mp);
1553  result.vx3 = vec_adduqm (a.vx3, mq);
1554  return result;
1555 }
1556 
1573 static inline __VEC_U_256
1575 {
1576  __VEC_U_256 result;
1577  result.vx0 = vec_muludq (&result.vx1, a, b);
1578  return result;
1579 }
1580 
1604 static inline __VEC_U_512
1606 {
1607  __VEC_U_512 result;
1608  vui128_t mp, mq;
1609  vui128_t mphh, mphl, mplh, mpll;
1610  mpll = vec_muludq (&mplh, m1.vx0, m2.vx0);
1611 
1612  mp = vec_madduq (&mphl, m1.vx1, m2.vx0, mplh);
1613  mplh = mp;
1614  COMPILE_FENCE;
1615 
1616  mp = vec_madduq (&mq, m1.vx0, m2.vx1, mplh);
1617  mplh = mp;
1618  mp = vec_madd2uq (&mphh, m1.vx1, m2.vx1, mphl, mq);
1619  mphl = mp;
1620 
1621  result.vx0 = mpll;
1622  result.vx1 = mplh;
1623  result.vx2 = mphl;
1624  result.vx3 = mphh;
1625  return result;
1626 }
1627 
1652 static inline __VEC_U_640
1654 {
1655  __VEC_U_640 result;
1656  vui128_t mq3, mq2, mq1, mq0;
1657  vui128_t mpx0, mpx1, mpx2, mpx3;
1658 
1659  mpx0 = vec_muludq (&mq0, m1.vx0, m2);
1660  mpx1 = vec_madduq (&mq1, m1.vx1, m2, mq0);
1661  COMPILE_FENCE;
1662  mpx2 = vec_madduq (&mq2, m1.vx2, m2, mq1);
1663  mpx3 = vec_madduq (&mq3, m1.vx3, m2, mq2);
1664 
1665  result.vx0 = mpx0;
1666  result.vx1 = mpx1;
1667  result.vx2 = mpx2;
1668  result.vx3 = mpx3;
1669  result.vx4 = mq3;
1670  return result;
1671 }
1672 
1701 static inline __VEC_U_640
1703 {
1704  __VEC_U_640 result;
1705  vui128_t mq3, mq2, mq1, mq0;
1706  vui128_t mpx0, mpx1, mpx2, mpx3;
1707 
1708  mpx0 = vec_madduq (&mq0, m1.vx0, m2, a1);
1709  mpx1 = vec_madduq (&mq1, m1.vx1, m2, mq0);
1710  COMPILE_FENCE;
1711  mpx2 = vec_madduq (&mq2, m1.vx2, m2, mq1);
1712  mpx3 = vec_madduq (&mq3, m1.vx3, m2, mq2);
1713 
1714  result.vx0 = mpx0;
1715  result.vx1 = mpx1;
1716  result.vx2 = mpx2;
1717  result.vx3 = mpx3;
1718  result.vx4 = mq3;
1719  return result;
1720 }
1721 
1750 static inline __VEC_U_640
1752 {
1753  __VEC_U_640 result;
1754  vui128_t mq3, mq2, mq1, mq0;
1755  vui128_t mpx0, mpx1, mpx2, mpx3;
1756 
1757  mpx0 = vec_madduq (&mq0, m1.vx0, m2, a2.vx0);
1758  mpx1 = vec_madd2uq (&mq1, m1.vx1, m2, mq0, a2.vx1);
1759  COMPILE_FENCE;
1760  mpx2 = vec_madd2uq (&mq2, m1.vx2, m2, mq1, a2.vx2);
1761  mpx3 = vec_madd2uq (&mq3, m1.vx3, m2, mq2, a2.vx3);
1762 
1763  result.vx0 = mpx0;
1764  result.vx1 = mpx1;
1765  result.vx2 = mpx2;
1766  result.vx3 = mpx3;
1767  result.vx4 = mq3;
1768  return result;
1769 }
1770 
1800 static inline __VEC_U_640
1802 {
1803  __VEC_U_640 result;
1804  vui128_t mq3, mq2, mq1, mq0;
1805  vui128_t mpx0, mpx1, mpx2, mpx3;
1806 
1807  mpx0 = vec_madd2uq (&mq0, m1.vx0, m2, a1, a2.vx0);
1808  mpx1 = vec_madd2uq (&mq1, m1.vx1, m2, mq0, a2.vx1);
1809  COMPILE_FENCE;
1810  mpx2 = vec_madd2uq (&mq2, m1.vx2, m2, mq1, a2.vx2);
1811  mpx3 = vec_madd2uq (&mq3, m1.vx3, m2, mq2, a2.vx3);
1812 
1813  result.vx0 = mpx0;
1814  result.vx1 = mpx1;
1815  result.vx2 = mpx2;
1816  result.vx3 = mpx3;
1817  result.vx4 = mq3;
1818  return result;
1819 }
1820 
1845 static inline __VEC_U_1024
1847 {
1848  __VEC_U_1024 result;
1849  __VEC_U_512x1 mp3, mp2, mp1, mp0;
1850 
1851  mp0.x640 = vec_mul512x128_inline (m1, m2.vx0);
1852  result.vx0 = mp0.x3.v1x128;
1853  COMPILE_FENCE;
1854  mp1.x640 = vec_madd512x128a512_inline (m1, m2.vx1, mp0.x3.v0x512);
1855  result.vx1 = mp1.x3.v1x128;
1856  COMPILE_FENCE;
1857  mp2.x640 = vec_madd512x128a512_inline (m1, m2.vx2, mp1.x3.v0x512);
1858  result.vx2 = mp2.x3.v1x128;
1859  COMPILE_FENCE;
1860  mp3.x640 = vec_madd512x128a512_inline (m1, m2.vx3, mp2.x3.v0x512);
1861 
1862  result.vx3 = mp3.x3.v1x128;
1863  result.vx4 = mp3.x3.v0x512.vx0;
1864  result.vx5 = mp3.x3.v0x512.vx1;
1865  result.vx6 = mp3.x3.v0x512.vx2;
1866  result.vx7 = mp3.x3.v0x512.vx3;
1867  return result;
1868 }
1869 
1897 static inline __VEC_U_1024
1899 {
1900  __VEC_U_1024 result;
1901  __VEC_U_512x1 mp3, mp2, mp1, mp0;
1902 
1903  mp0.x640 = vec_madd512x128a512_inline (m1, m2.vx0, a1);
1904  result.vx0 = mp0.x3.v1x128;
1905  COMPILE_FENCE;
1906  mp1.x640 = vec_madd512x128a512_inline (m1, m2.vx1, mp0.x3.v0x512);
1907  result.vx1 = mp1.x3.v1x128;
1908  COMPILE_FENCE;
1909  mp2.x640 = vec_madd512x128a512_inline (m1, m2.vx2, mp1.x3.v0x512);
1910  result.vx2 = mp2.x3.v1x128;
1911  COMPILE_FENCE;
1912  mp3.x640 = vec_madd512x128a512_inline (m1, m2.vx3, mp2.x3.v0x512);
1913 
1914  result.vx3 = mp3.x3.v1x128;
1915  result.vx4 = mp3.x3.v0x512.vx0;
1916  result.vx5 = mp3.x3.v0x512.vx1;
1917  result.vx6 = mp3.x3.v0x512.vx2;
1918  result.vx7 = mp3.x3.v0x512.vx3;
1919  return result;
1920 }
1921 
1943 extern __VEC_U_256
1945 
1967 extern __VEC_U_512
1969 
1992 extern __VEC_U_640
1994 
2019 extern __VEC_U_640
2021 
2043 extern __VEC_U_1024
2045 
2073 extern void
2075 
2104 extern void
2106  __VEC_U_2048 *m1, __VEC_U_2048 *m2);
2107 
2136 extern void
2138  vui128_t *m1, vui128_t *m2,
2139  unsigned long M, unsigned long N);
2140 
2169 extern void
2171  __VEC_U_512 *m1, __VEC_U_512 *m2,
2172  unsigned long M, unsigned long N);
2173 
2175 /* Doxygen can not handle macros or attributes */
2176 extern __VEC_U_256
2178 
2179 extern __VEC_U_512
2181 
2182 extern __VEC_U_640
2184 
2185 extern __VEC_U_640
2186  __VEC_PWR_IMP (vec_madd512x128a128) (__VEC_U_512 m1, vui128_t m2,
2187  vui128_t a1);
2188 
2189 extern __VEC_U_640
2191  __VEC_U_512 a2);
2192 
2193 extern __VEC_U_640
2194  __VEC_PWR_IMP (vec_madd512x128a128a512) (__VEC_U_512 m1, vui128_t m2,
2195  vui128_t a1, __VEC_U_512 a2);
2196 
2197 extern __VEC_U_1024
2199 
2200 extern __VEC_U_1024
2201 __VEC_PWR_IMP (vec_madd512x512a512) (__VEC_U_512 m1, __VEC_U_512 m2,
2202  __VEC_U_512 a1);
2203 
2204 extern void
2206  __VEC_U_1024 *m1_1024, __VEC_U_1024 *m2_1024);
2207 
2208 extern void
2210  __VEC_U_2048 *m1_2048, __VEC_U_2048 *m2_2048);
2211 
2212 extern void
2214  vui128_t *m1, vui128_t *m2,
2215  unsigned long M, unsigned long N);
2216 
2217 extern void
2219  __VEC_U_512 *m1, __VEC_U_512 *m2,
2220  unsigned long M, unsigned long N);
2222 
2223 #endif /* SRC_PVECLIB_VEC_INT512_PPC_H_ */
__VEC_PWR_IMP
#define __VEC_PWR_IMP(FNAME)
Macro to add platform suffix for static calls.
Definition: vec_int512_ppc.h:1357
__VEC_U_1152
A vector representation of a 1152-bit unsigned integer.
Definition: vec_int512_ppc.h:971
__VEC_U_2048x512
A vector representation of a 2048-bit unsigned integer as 4 x 512-bit integer fields.
Definition: vec_int512_ppc.h:1095
vec_madd512x512a512_inline
static __VEC_U_1024 vec_madd512x512a512_inline(__VEC_U_512 m1, __VEC_U_512 m2, __VEC_U_512 a1)
Vector 512-bit Unsigned Integer Multiply-Add.
Definition: vec_int512_ppc.h:1898
vec_addecuq
static vui128_t vec_addecuq(vui128_t a, vui128_t b, vui128_t ci)
Vector Add Extended & write Carry Unsigned Quadword.
Definition: vec_int128_ppc.h:2622
vec_mul512x512
__VEC_U_1024 vec_mul512x512(__VEC_U_512 m1, __VEC_U_512 m2)
Vector 512x512-bit Unsigned Integer Multiply.
vec_madd512x128a512
__VEC_U_640 vec_madd512x128a512(__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2)
Vector 512x128-bit Multiply-Add Unsigned Integer.
__VEC_U_4096x512
A vector representation of a 4096-bit unsigned integer as 8 x 512-bit integer fields.
Definition: vec_int512_ppc.h:1281
__VEC_U_512
A vector representation of a 512-bit unsigned integer.
Definition: vec_int512_ppc.h:842
vec_mul128x128
__VEC_U_256 vec_mul128x128(vui128_t m1, vui128_t m2)
Vector 128x128bit Unsigned Integer Multiply.
vec_add512cu
static __VEC_U_640 vec_add512cu(__VEC_U_512 a, __VEC_U_512 b)
Vector Add 512-bit Unsigned Integer & Write Carry.
Definition: vec_int512_ppc.h:1380
vec_addeuqm
static vui128_t vec_addeuqm(vui128_t a, vui128_t b, vui128_t ci)
Vector Add Extended Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:2684
__VEC_U_640
A vector representation of a 640-bit unsigned integer.
Definition: vec_int512_ppc.h:867
__VEC_U_2176
A vector representation of a 2176-bit unsigned integer.
Definition: vec_int512_ppc.h:1138
vec_madd2uq
static vui128_t vec_madd2uq(vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c1, vui128_t c2)
Vector Multiply-Add2 Unsigned Quadword.
Definition: vec_int128_ppc.h:6184
__VEC_U_256
A vector representation of a 256-bit unsigned integer.
Definition: vec_int512_ppc.h:823
vec_madd512x128a128_inline
static __VEC_U_640 vec_madd512x128a128_inline(__VEC_U_512 m1, vui128_t m2, vui128_t a1)
Vector 512x128-bit Multiply-Add Unsigned Integer.
Definition: vec_int512_ppc.h:1702
__VEC_U_1024
A vector representation of a 1024-bit unsigned integer.
Definition: vec_int512_ppc.h:934
vec_add512um
static __VEC_U_512 vec_add512um(__VEC_U_512 a, __VEC_U_512 b)
Vector Add 512-bit Unsigned Integer Modulo.
Definition: vec_int512_ppc.h:1474
vec_mul128x128_inline
static __VEC_U_256 vec_mul128x128_inline(vui128_t a, vui128_t b)
Vector 128x128bit Unsigned Integer Multiply.
Definition: vec_int512_ppc.h:1574
vec_mul1024x1024
void vec_mul1024x1024(__VEC_U_2048 *p2048, __VEC_U_1024 *m1, __VEC_U_1024 *m2)
Vector 1024x1024-bit Unsigned Integer Multiply.
vec_int128_ppc.h
Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...
vec_madduq
static vui128_t vec_madduq(vui128_t *mulu, vui128_t a, vui128_t b, vui128_t c)
Vector Multiply-Add Unsigned Quadword.
Definition: vec_int128_ppc.h:5956
vec_muludq
static vui128_t vec_muludq(vui128_t *mulu, vui128_t a, vui128_t b)
Vector Multiply Unsigned Double Quadword.
Definition: vec_int128_ppc.h:5734
vec_madd512x128a128a512_inline
static __VEC_U_640 vec_madd512x128a128a512_inline(__VEC_U_512 m1, vui128_t m2, vui128_t a1, __VEC_U_512 a2)
Vector 512x128-bit Multiply-Add Unsigned Integer.
Definition: vec_int512_ppc.h:1801
vec_add512ecu
static __VEC_U_640 vec_add512ecu(__VEC_U_512 a, __VEC_U_512 b, vui128_t c)
Vector Add Extended 512-bit Unsigned Integer & Write Carry.
Definition: vec_int512_ppc.h:1412
vec_addcuq
static vui128_t vec_addcuq(vui128_t a, vui128_t b)
Vector Add & write Carry Unsigned Quadword.
Definition: vec_int128_ppc.h:2568
vui128_t
__vector unsigned __int128 vui128_t
vector of one 128-bit unsigned __int128 element.
Definition: vec_common_ppc.h:237
__VEC_U_4096
A vector representation of a 4096-bit unsigned integer.
Definition: vec_int512_ppc.h:1191
vec_madd512x128a512_inline
static __VEC_U_640 vec_madd512x128a512_inline(__VEC_U_512 m1, vui128_t m2, __VEC_U_512 a2)
Vector 512x128-bit Multiply-Add Unsigned Integer.
Definition: vec_int512_ppc.h:1751
__VEC_U_512x1
A vector representation of a 512-bit unsigned integer and a 128-bit carry-out.
Definition: vec_int512_ppc.h:901
vec_addcq
static vui128_t vec_addcq(vui128_t *cout, vui128_t a, vui128_t b)
Vector Add with carry Unsigned Quadword.
Definition: vec_int128_ppc.h:2788
vec_add512eum
static __VEC_U_512 vec_add512eum(__VEC_U_512 a, __VEC_U_512 b, vui128_t c)
Vector Add Extended 512-bit Unsigned Integer Modulo.
Definition: vec_int512_ppc.h:1444
vec_add512ze
static __VEC_U_512 vec_add512ze(__VEC_U_512 a, vui128_t c)
Vector Add 512-bit to Zero Extended Unsigned Integer Modulo.
Definition: vec_int512_ppc.h:1506
vec_mul512x128_inline
static __VEC_U_640 vec_mul512x128_inline(__VEC_U_512 m1, vui128_t m2)
Vector 512x128-bit Unsigned Integer Multiply.
Definition: vec_int512_ppc.h:1653
vec_mul256x256
__VEC_U_512 vec_mul256x256(__VEC_U_256 m1, __VEC_U_256 m2)
Vector 256x256-bit Unsigned Integer Multiply.
vec_mul512x512_inline
static __VEC_U_1024 vec_mul512x512_inline(__VEC_U_512 m1, __VEC_U_512 m2)
Vector 512x512-bit Unsigned Integer Multiply.
Definition: vec_int512_ppc.h:1846
vec_mul256x256_inline
static __VEC_U_512 vec_mul256x256_inline(__VEC_U_256 m1, __VEC_U_256 m2)
Vector 256x256-bit Unsigned Integer Multiply.
Definition: vec_int512_ppc.h:1605
vec_adduqm
static vui128_t vec_adduqm(vui128_t a, vui128_t b)
Vector Add Unsigned Quadword Modulo.
Definition: vec_int128_ppc.h:2739
vec_addeq
static vui128_t vec_addeq(vui128_t *cout, vui128_t a, vui128_t b, vui128_t ci)
Vector Add Extend with carry Unsigned Quadword.
Definition: vec_int128_ppc.h:2849
COMPILE_FENCE
#define COMPILE_FENCE
A compiler fence to prevent excessive code motion.
Definition: vec_int512_ppc.h:1344
__VEC_U_2048
A vector representation of a 2048-bit unsigned integer.
Definition: vec_int512_ppc.h:1008
vec_mul512_byMN
void vec_mul512_byMN(__VEC_U_512 *p, __VEC_U_512 *m1, __VEC_U_512 *m2, unsigned long M, unsigned long N)
Vector Unsigned Integer Quadword 4xMxN Multiply.
vec_mul128_byMN
void vec_mul128_byMN(vui128_t *p, vui128_t *m1, vui128_t *m2, unsigned long M, unsigned long N)
Vector Unsigned Integer Quadword MxN Multiply.
vec_add512ze2
static __VEC_U_512 vec_add512ze2(__VEC_U_512 a, vui128_t c1, vui128_t c2)
Vector Add 512-bit to Zero Extended2 Unsigned Integer Modulo.
Definition: vec_int512_ppc.h:1542
vec_mul512x128
__VEC_U_640 vec_mul512x128(__VEC_U_512 m1, vui128_t m2)
Vector 512x128-bit Unsigned Integer Multiply.
__VEC_U_1024x512
A vector representation of a 1024-bit unsigned integer as two 512-bit fields.
Definition: vec_int512_ppc.h:1061
vec_mul2048x2048
void vec_mul2048x2048(__VEC_U_4096 *p4096, __VEC_U_2048 *m1, __VEC_U_2048 *m2)
Vector 2048x2048-bit Unsigned Integer Multiply.