-rw-r--r-- 178340 djbsort-20260210/int64/neon/sort.c raw
/* WARNING: auto-generated (by autogen/sort); do not edit */

#include <arm_neon.h>
#include "int64_sort.h"

#define int64 int64_t
#define int64_largest 0x7fffffffffffffff

#include "crypto_int64.h"
#define int64_min crypto_int64_min
/*
 * Scalar compare-exchange on two int64 lvalues (passed by address).
 * NOTE(review): crypto_int64_minmax lives in crypto_int64.h, not visible
 * here; presumably it leaves the smaller value in a and the larger in b --
 * the sorting code below relies on that.
 */
#define int64_MINMAX(a,b) crypto_int64_minmax(&(a),&(b))

#define NOINLINE __attribute__((noinline))

/* Short aliases for the project integer helpers and the NEON vector types. */
#include "crypto_int8.h"
#define int8 crypto_int8
#define int8_min crypto_int8_min
#define int8x16 int8x16_t
#include "crypto_uint8.h"
#define uint8 crypto_uint8
#define uint8x16 uint8x16_t
#include "crypto_int32.h"
#define int32 crypto_int32
#define int32_min crypto_int32_min
#define int32x4 int32x4_t
#define int64x2 int64x2_t
#define uint64x2 uint64x2_t
/* Generic operation names mapped onto the NEON intrinsics implementing them. */
#define int64x2_load vld1q_s64
#define int64x2_store vst1q_s64
#define int64x2_ifthenelse vbslq_s64
#define int32x4_smaller_umask vcltq_s32
#define int64x2_smaller_umask vcltq_s64
/*
 * Vector compare-exchange: builds the per-lane mask a<b, then selects with
 * it so that a receives ifthenelse(mask,a,b) and b receives
 * ifthenelse(mask,b,a) -- i.e. the per-lane minimum ends up in a and the
 * per-lane maximum in b, given that vbslq_s64 picks its second argument
 * where the mask is set.
 */
#define int64x2_MINMAX(a,b) \
do { \
  uint64x2 t = int64x2_smaller_umask(a,b); \
  int64x2 c = int64x2_ifthenelse(t,a,b); \
  b = int64x2_ifthenelse(t,b,a); \
  a = c; \
} while(0)
#define int8x16_load vld1q_s8
#define int8x16_varextract vqtbl1q_s8
#define int8x16_add vaddq_s8
#define int8x16_sub vsubq_s8
#define int8x16_broadcast vdupq_n_s8
#define uint8x16_load vld1q_u8
#define uint8x16_add vaddq_u8
#define uint8x16_sub vsubq_u8
#define uint8x16_broadcast vdupq_n_u8
#define int8x16_from_int64x2 vreinterpretq_s8_s64
#define uint64x2_from_int64x2 vreinterpretq_u64_s64
#define int64x2_from_uint64x2 vreinterpretq_s64_u64
#define int64x2_from_int8x16 vreinterpretq_s64_s8
#define int32x4_load vld1q_s32
#define int32x4_add vaddq_s32
#define int32x4_sub vsubq_s32
#define int64x2_broadcast vdupq_n_s64

/*
 * Build a 16-byte vector from sixteen scalar bytes: the arguments are
 * written to a local array in argument order and loaded with
 * uint8x16_load.
 */
static inline uint8x16 uint8x16_set(uint8 x0,uint8 x1,uint8 x2,uint8 x3,uint8 x4,uint8 x5,uint8 x6,uint8 x7,uint8 x8,uint8 x9,uint8 x10,uint8 x11,uint8 x12,uint8 x13,uint8 x14,uint8 x15)
{
  uint8 x[16] = {x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15};
  return uint8x16_load(x);
}

/*
 * Build a two-lane int64 vector {x0,x1} via a local array and
 * int64x2_load.
 */
static inline int64x2 int64x2_set(int64 x0,int64 x1)
{
  int64 x[2] = {x0,x1};
  return int64x2_load(x);
}

/*
 * Lane shuffles used by the networks below.  Going by the lane annotations
 * in this file: int64x2_10 swaps the two lanes of v; leftleft(a,b) yields
 * {a[0],b[0]}; rightright(a,b) yields {a[1],b[1]}.
 */
#define int64x2_10(v) vextq_s64(v,v,1)
#define int64x2_leftleft vzip1q_s64
#define int64x2_rightright vzip2q_s64

/*
 * Sort x[0..n-1] in place with fixed sequences of scalar compare-exchanges.
 * Intended for 3 <= n <= 7 (per the name; n is not validated):
 *   - n >= 4: the first four elements are sorted, then elements 4..n-1 are
 *     loaded, ordered among themselves and merged in.  Any n >= 7 takes the
 *     7-element path and touches only x[0..6].
 *   - otherwise: the 3-element path runs, which reads and writes x[0..2]
 *     unconditionally, so callers must not pass n < 3.
 * The resulting order is ascending provided int64_MINMAX(a,b) leaves the
 * smaller value in a (see the note on that macro).
 */
NOINLINE
static void int64_sort_3through7(int64 *x,long long n)
{
  if (n >= 4) {
    int64 x0 = x[0];
    int64 x1 = x[1];
    int64 x2 = x[2];
    int64 x3 = x[3];
    // 5-comparator network ordering x0..x3
    int64_MINMAX(x0,x1);
    int64_MINMAX(x2,x3);
    int64_MINMAX(x0,x2);
    int64_MINMAX(x1,x3);
    int64_MINMAX(x1,x2);
    if (n >= 5) {
      if (n == 5) {
        // insert the single extra element x4 into the ordered x0..x3
        int64 x4 = x[4];
        int64_MINMAX(x0,x4);
        int64_MINMAX(x2,x4);
        int64_MINMAX(x1,x2);
        int64_MINMAX(x3,x4);
        x[4] = x4;
      } else {
        int64 x4 = x[4];
        int64 x5 = x[5];
        int64_MINMAX(x4,x5);
        if (n == 6) {
          // merge ordered pair x4,x5 with x0..x3 (finished by the two
          // shared comparators after this if/else)
          int64_MINMAX(x0,x4);
          int64_MINMAX(x2,x4);
          int64_MINMAX(x1,x5);
          int64_MINMAX(x3,x5);
        } else {
          // order x4..x6 first, then merge the triple with x0..x3
          int64 x6 = x[6];
          int64_MINMAX(x4,x6);
          int64_MINMAX(x5,x6);
          int64_MINMAX(x0,x4);
          int64_MINMAX(x2,x6);
          int64_MINMAX(x2,x4);
          int64_MINMAX(x1,x5);
          int64_MINMAX(x3,x5);
          int64_MINMAX(x5,x6);
          x[6] = x6;
        }
        // final comparators shared by the n == 6 and n >= 7 paths
        int64_MINMAX(x1,x2);
        int64_MINMAX(x3,x4);
        x[4] = x4;
        x[5] = x5;
      }
    }
    x[0] = x0;
    x[1] = x1;
    x[2] = x2;
    x[3] = x3;
  } else {
    // 3-element network
    int64 x0 = x[0];
    int64 x1 = x[1];
    int64 x2 = x[2];
    int64_MINMAX(x0,x1);
    int64_MINMAX(x0,x2);
    int64_MINMAX(x1,x2);
    x[0] = x0;
    x[1] = x1;
    x[2] = x2;
  }
}

/*
 * Sort x[0..n-1] in place with a 16-wire vectorized network, padding the
 * missing wires with int64_largest.  Intended for 8 <= n <= 16 (per the
 * name; n is not validated, and x[0..7] is always loaded and stored).
 *
 * Loading: wires 0..7 come straight from x+0..x+6.  For each later vector
 * (wires k-2,k-1 with k = 10,12,14,16) pos = int32_min(k,n) -- presumably
 * the minimum; helper not visible here -- and two elements are loaded from
 * x+pos-2, so the load stays within x[0..n-1].  When pos is odd, 8 is added
 * to every byte index of the table lookup, which moves the upper loaded
 * element into lane 0.  Lanes whose wire number (int64x2_set(k-2,k-1)) is
 * not below pos are replaced by infty.
 *
 * The trailing "// a b" comments on each statement are the generator's
 * annotations; they appear to give the wire number held in lane 0 and
 * lane 1 of the assigned vector.  Each compare-exchange is written as a
 * mask computation into t followed by two selects (min, then max).
 *
 * Storing: the four tail vectors are written first at x+pos-2 (with the
 * byte shift undone via uint8x16_sub), then wires 0..7 at x+6..x+0; where
 * these ranges overlap for n < 16, the later store overwrites the earlier.
 */
NOINLINE
static void int64_sort_8through16(int64 *x,long long n)
{
  int32_t pos4,pos5,pos6,pos7;
  int64x2 infty,t,x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2,y3,y4,y5,y6;
  x0 = int64x2_load(x+0); // 0 8
  x1 = int64x2_load(x+2); // 1 9
  x2 = int64x2_load(x+4); // 2 10
  x3 = int64x2_load(x+6); // 3 11
  infty = int64x2_broadcast(int64_largest);
  pos4 = int32_min(10,n);
  x4 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(8,9),int64x2_broadcast(pos4)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos4-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos4)&1))))),infty); // 4 12
  pos5 = int32_min(12,n);
  x5 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(10,11),int64x2_broadcast(pos5)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos5-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos5)&1))))),infty); // 5 13
  pos6 = int32_min(14,n);
  x6 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(12,13),int64x2_broadcast(pos6)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos6-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos6)&1))))),infty); // 6 14
  pos7 = int32_min(16,n);
  x7 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(14,15),int64x2_broadcast(pos7)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos7-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos7)&1))))),infty); // 7 15
  // stage (1,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1));
  y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 8
  x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 9
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3));
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 2 10
  x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 3 11
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5));
  y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 4 12
  x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 5 13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7));
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 6 14
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 7 15
  // stage (2,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2));
  x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 0 8
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 2 10
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x3));
  y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x3); // 1 9
  x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x1); // 3 11
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y6));
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y6); // 4 12
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y4); // 6 14
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7));
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 5 13
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 7 15
  // stage (2,0) 1:2 5:6 9:10 13:14
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y1));
  x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y1); // 1 9
  y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y2); // 2 10
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y5));
  x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y5); // 5 13
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y6); // 6 14
  // stage (3,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6));
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 1 9
  x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 5 13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4));
  y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 0 8
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 4 12
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y5));
  x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y5); // 2 10
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y1); // 6 14
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7));
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 3 11
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 7 15
  // stage (3,1) 2:4 3:5 10:12 11:13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,y3));
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,y3); // 3 11
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,x6); // 5 13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x1));
  y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x1); // 2 10
  x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x4); // 4 12
  // stage (3,0) 1:2 3:4 5:6 9:10 11:12 13:14
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,x1));
  x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,x1); // 3 11
  x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,y6); // 4 12
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y2));
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y2); // 1 9
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y4); // 2 10
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y3));
  x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y3); // 5 13
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y5); // 6 14
  // stage (4,3) 0:15 1:14 2:13 3:12 4:11 5:10 6:9 7:8
  // lane swaps line up wire i with wire 15-i for this stage
  x1 = int64x2_10(x1); // 12 4
  y3 = int64x2_10(y3); // 14 6
  y2 = int64x2_10(y2); // 10 2
  x7 = int64x2_10(x7); // 15 7
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x1));
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x1); // 3 4
  x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x6); // 12 11
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,y3));
  y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,y3); // 1 6
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,x4); // 14 9
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,y2));
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,y2); // 5 2
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,x5); // 10 13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,x7));
  x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,x7); // 0 7
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,y0); // 15 8
  // stage (4,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15
  x0 = int64x2_10(x0); // 7 0
  x7 = int64x2_10(x7); // 8 15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,x0));
  x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,x0); // 3 0
  x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,y6); // 7 4
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5));
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 1 2
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 5 6
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y2));
  x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 10 9
  y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 14 13
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x7));
  y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x7); // 8 11
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x1); // 12 15
  // stage (4,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x4));
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 1 0
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 3 2
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,y5));
  y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,y5); // 5 4
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,x0); // 7 6
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,y1));
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y1); // 8 9
  y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,x3); // 10 11
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,x7));
  x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,x7); // 12 13
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,y2); // 14 15
  // stage (4,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15
  // regroup lanes so that adjacent wires sit in the same lane of two vectors
  x6 = int64x2_leftleft(y6,x4); // 1 3
  x4 = int64x2_rightright(y6,x4); // 0 2
  x0 = int64x2_leftleft(y0,y5); // 5 7
  y5 = int64x2_rightright(y0,y5); // 4 6
  x3 = int64x2_leftleft(y3,y1); // 8 10
  y1 = int64x2_rightright(y3,y1); // 9 11
  y2 = int64x2_leftleft(x2,x7); // 12 14
  x7 = int64x2_rightright(x2,x7); // 13 15
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x4));
  y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 0 2
  x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 1 3
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,y5));
  y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,y5); // 4 6
  y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,x0); // 5 7
  t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,y1));
  y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y1); // 8 10
  y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,x3); // 9 11
  t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,x7));
  x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,x7); // 12 14
  x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,y2); // 13 15
  // interleave back into memory order: each vector holds wires 2k,2k+1
  x6 = int64x2_leftleft(y6,x4); // 0 1
  x4 = int64x2_rightright(y6,x4); // 2 3
  x0 = int64x2_leftleft(y0,y5); // 4 5
  y5 = int64x2_rightright(y0,y5); // 6 7
  x3 = int64x2_leftleft(y3,y1); // 8 9
  y1 = int64x2_rightright(y3,y1); // 10 11
  y2 = int64x2_leftleft(x2,x7); // 12 13
  x7 = int64x2_rightright(x2,x7); // 14 15
  // tail vectors first (highest wires first), then wires 0..7; later stores
  // overwrite any overlap left by the clamped tail positions
  int64x2_store(x+pos7-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x7),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos7)&1))))));
  int64x2_store(x+pos6-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y2),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos6)&1))))));
  int64x2_store(x+pos5-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y1),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos5)&1))))));
  int64x2_store(x+pos4-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x3),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos4)&1))))));
  int64x2_store(x+6,y5);
  int64x2_store(x+4,x0);
  int64x2_store(x+2,x4);
  int64x2_store(x+0,x6);
}

NOINLINE
static void int64_sort_16through32(int64 *x,long long n)
{
  int32_t pos10,pos11,pos12,pos13,pos14,pos15,pos8,pos9;
  int64x2 infty,t,x0,x1,x10,x11,x12,x13,x14,x15,x2,x3,x4,x5,x6,x7,x8,x9,y0,y1,y10,y11,y12,y13,y14,y2,y3,y4,y5,y6,y7,y8,y9;
  x0 = int64x2_load(x+0); // 0 16
  x1 = int64x2_load(x+2); // 1 17
  x2 =
int64x2_load(x+4); // 2 18 x3 = int64x2_load(x+6); // 3 19 x4 = int64x2_load(x+8); // 4 20 x5 = int64x2_load(x+10); // 5 21 x6 = int64x2_load(x+12); // 6 22 x7 = int64x2_load(x+14); // 7 23 infty = int64x2_broadcast(int64_largest); pos8 = int32_min(18,n); x8 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(16,17),int64x2_broadcast(pos8)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos8-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos8)&1))))),infty); // 8 24 pos9 = int32_min(20,n); x9 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(18,19),int64x2_broadcast(pos9)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos9-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos9)&1))))),infty); // 9 25 pos10 = int32_min(22,n); x10 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(20,21),int64x2_broadcast(pos10)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos10-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos10)&1))))),infty); // 10 26 pos11 = int32_min(24,n); x11 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(22,23),int64x2_broadcast(pos11)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos11-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos11)&1))))),infty); // 11 27 pos12 = int32_min(26,n); x12 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(24,25),int64x2_broadcast(pos12)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos12-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos12)&1))))),infty); // 12 28 pos13 = int32_min(28,n); x13 = 
int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(26,27),int64x2_broadcast(pos13)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos13-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos13)&1))))),infty); // 13 29 pos14 = int32_min(30,n); x14 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(28,29),int64x2_broadcast(pos14)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos14-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos14)&1))))),infty); // 14 30 pos15 = int32_min(32,n); x15 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(30,31),int64x2_broadcast(pos15)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos15-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos15)&1))))),infty); // 15 31 // stage (1,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 16 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 17 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 2 18 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 3 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 4 20 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 5 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 6 22 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 7 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x9)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x9); // 8 24 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x8); // 9 25 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x10,x11)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x11); // 10 26 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x10); // 11 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x13); // 12 28 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x12); // 13 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x15)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x15); // 14 30 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x14); // 15 31 // stage (2,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 0 16 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 2 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x3)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x3); // 1 17 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x1); // 3 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y6)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y6); // 4 20 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y4); // 6 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 5 21 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 7 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y10)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y10); // 8 24 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y8); // 10 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x11)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x11); // 9 25 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x9); // 11 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y14)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y14); // 12 28 y14 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y12); // 14 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,x15)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x15); // 13 29 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x13); // 15 31 // stage (2,0) 1:2 5:6 9:10 13:14 17:18 21:22 25:26 29:30 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y1)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y1); // 1 17 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y2); // 2 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y5)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y5); // 5 21 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y6); // 6 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y9)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y9); // 9 25 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y10); // 10 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y13)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y13); // 13 29 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y14); // 14 30 // stage (3,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 1 17 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 5 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 0 16 x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 4 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y5)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y5); // 2 18 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y1); // 6 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 3 19 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 7 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x14)); y10 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x14); // 9 25 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x10); // 13 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x12)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x12); // 8 24 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x8); // 12 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y13)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y13); // 10 26 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y9); // 14 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,x15)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x15); // 11 27 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x11); // 15 31 // stage (3,1) 2:4 3:5 10:12 11:13 18:20 19:21 26:28 27:29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,y3)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,y3); // 3 19 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,x6); // 5 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x1)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x1); // 2 18 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x4); // 4 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,y11)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y11); // 11 27 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,x14); // 13 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x9)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x9); // 10 26 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x12); // 12 28 // stage (3,0) 1:2 3:4 5:6 9:10 11:12 13:14 17:18 19:20 21:22 25:26 27:28 29:30 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,x1)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,x1); // 3 19 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,y6); // 4 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y2)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y2); // 1 17 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y4); // 2 18 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(y5,y3)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y3); // 5 21 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y5); // 6 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,x9)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,x9); // 11 27 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,y14); // 12 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y10)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y10); // 9 25 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y12); // 10 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,y11)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y11); // 13 29 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y13); // 14 30 // stage (4,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x14)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x14); // 3 19 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x6); // 11 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x12)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x12); // 1 17 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x4); // 9 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x13)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x13); // 5 21 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x5); // 13 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y8)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y8); // 0 16 y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y0); // 8 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x9)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x9); // 4 20 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x1); // 12 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y10)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y10); // 2 18 y10 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y2); // 10 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y11)); x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y11); // 6 22 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y3); // 14 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x15)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x15); // 7 23 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x7); // 15 31 // stage (4,2) 4:8 5:9 6:10 7:11 20:24 21:25 22:26 23:27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,y7)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y7); // 7 23 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,x14); // 11 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,y5)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,y5); // 5 21 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,x12); // 9 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y1)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y1); // 4 20 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y8); // 8 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,x3)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,x3); // 6 22 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y10); // 10 26 // stage (4,1) 2:4 3:5 6:8 7:9 10:12 11:13 18:20 19:21 22:24 23:25 26:28 27:29 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y5)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y5); // 7 23 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y14); // 9 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y6)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y6); // 3 19 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y12); // 5 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,y7)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,y7); // 11 27 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,x13); // 13 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x2)); y8 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x2); // 2 18 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x8); // 4 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x3)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x3); // 10 26 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x9); // 12 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,y1)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,y1); // 6 22 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,x10); // 8 24 // stage (4,0) 1:2 3:4 5:6 7:8 9:10 11:12 13:14 17:18 19:20 21:22 23:24 25:26 27:28 29:30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,y1)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y1); // 7 23 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,x14); // 8 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x2)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x2); // 3 19 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x12); // 4 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,x3)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,x3); // 11 27 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y13); // 12 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y4)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y4); // 1 17 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y8); // 2 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y5)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y5); // 9 25 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y9); // 10 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y6)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y6); // 5 21 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y10); // 6 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y11,y7)); x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y7); // 13 29 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y11); // 14 30 // stage (5,4) 0:31 1:30 2:29 3:28 4:27 5:26 6:25 7:24 8:23 9:22 10:21 
11:20 12:19 13:18 14:17 15:16 y1 = int64x2_10(y1); // 24 8 x3 = int64x2_10(x3); // 28 12 x2 = int64x2_10(x2); // 20 4 y7 = int64x2_10(y7); // 30 14 y6 = int64x2_10(y6); // 22 6 y5 = int64x2_10(y5); // 26 10 y4 = int64x2_10(y4); // 18 2 x15 = int64x2_10(x15); // 31 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y1)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y1); // 7 8 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y14); // 24 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,x3)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,x3); // 3 12 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y12); // 28 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,x2)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x2); // 11 4 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x13); // 20 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,y7)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,y7); // 1 14 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,x8); // 30 17 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,y6)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,y6); // 9 6 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,x9); // 22 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,y5)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,y5); // 5 10 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,x10); // 26 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,y4)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,y4); // 13 2 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,x11); // 18 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x15)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x15); // 0 15 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x0); // 31 16 // stage (5,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 y0 = int64x2_10(y0); // 15 0 x15 = int64x2_10(x15); // 16 31 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x14,y0)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y0); // 7 0 y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,x14); // 15 8 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,y13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,y13); // 3 4 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,x12); // 11 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x2)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 20 19 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 28 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y9)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y9); // 1 6 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y8); // 9 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y7,y6)); x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y6); // 22 17 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y7); // 30 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y11)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y11); // 5 2 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y10); // 13 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y4)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 18 21 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 26 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,x15)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,x15); // 16 23 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,y1); // 24 31 // stage (5,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y12)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y12); // 3 0 y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y14); // 7 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y13)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y13); // 11 8 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y0); 
// 15 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,x1)); x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,x1); // 16 19 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,y3); // 20 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x10)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x10); // 1 2 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x8); // 5 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x5)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 18 17 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 22 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y11)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y11); // 9 10 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y9); // 13 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y4)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y4); // 26 25 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y6); // 30 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x15)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x15); // 24 27 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x2); // 28 31 // stage (5,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,y8)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y8); // 1 0 y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,x14); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x9)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x9); // 9 8 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x0); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,y7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y7); // 16 17 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,x3); // 18 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,x10)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,x10); // 5 4 x10 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,y12); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 20 21 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,y11)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y11); // 13 12 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y13); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,y2)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,y2); // 24 25 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,x6); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,x15)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,x15); // 28 29 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,y4); // 30 31 // stage (5,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 x14 = int64x2_leftleft(y14,y8); // 1 3 y8 = int64x2_rightright(y14,y8); // 0 2 x0 = int64x2_leftleft(y0,x9); // 9 11 x9 = int64x2_rightright(y0,x9); // 8 10 x3 = int64x2_leftleft(y3,y7); // 16 18 y7 = int64x2_rightright(y3,y7); // 17 19 y12 = int64x2_leftleft(x12,x10); // 5 7 x10 = int64x2_rightright(x12,x10); // 4 6 x1 = int64x2_leftleft(y1,x5); // 20 22 x5 = int64x2_rightright(y1,x5); // 21 23 y13 = int64x2_leftleft(x13,y11); // 13 15 y11 = int64x2_rightright(x13,y11); // 12 14 x6 = int64x2_leftleft(y6,y2); // 24 26 y2 = int64x2_rightright(y6,y2); // 25 27 y4 = int64x2_leftleft(x4,x15); // 28 30 x15 = int64x2_rightright(x4,x15); // 29 31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,y8)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,y8); // 0 2 y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,x14); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x9)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x9); // 8 10 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x0); // 9 11 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x3,y7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,y7); // 16 18 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,x3); // 17 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,x10)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,x10); // 4 6 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,y12); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 20 22 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 21 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,y11)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y11); // 12 14 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y13); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,y2)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,y2); // 24 26 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,x6); // 25 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,x15)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,x15); // 28 30 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,y4); // 29 31 x14 = int64x2_leftleft(y14,y8); // 0 1 y8 = int64x2_rightright(y14,y8); // 2 3 y12 = int64x2_leftleft(x12,x10); // 4 5 x10 = int64x2_rightright(x12,x10); // 6 7 x0 = int64x2_leftleft(y0,x9); // 8 9 x9 = int64x2_rightright(y0,x9); // 10 11 y13 = int64x2_leftleft(x13,y11); // 12 13 y11 = int64x2_rightright(x13,y11); // 14 15 x3 = int64x2_leftleft(y3,y7); // 16 17 y7 = int64x2_rightright(y3,y7); // 18 19 x1 = int64x2_leftleft(y1,x5); // 20 21 x5 = int64x2_rightright(y1,x5); // 22 23 x6 = int64x2_leftleft(y6,y2); // 24 25 y2 = int64x2_rightright(y6,y2); // 26 27 y4 = int64x2_leftleft(x4,x15); // 28 29 x15 = int64x2_rightright(x4,x15); // 30 31 int64x2_store(x+pos15-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x15),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos15)&1)))))); 
int64x2_store(x+pos14-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y4),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos14)&1)))))); int64x2_store(x+pos13-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y2),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos13)&1)))))); int64x2_store(x+pos12-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x6),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos12)&1)))))); int64x2_store(x+pos11-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x5),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos11)&1)))))); int64x2_store(x+pos10-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x1),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos10)&1)))))); int64x2_store(x+pos9-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y7),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos9)&1)))))); int64x2_store(x+pos8-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x3),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos8)&1)))))); int64x2_store(x+14,y11); int64x2_store(x+12,y13); int64x2_store(x+10,x9); int64x2_store(x+8,x0); int64x2_store(x+6,x10); int64x2_store(x+4,y12); int64x2_store(x+2,y8); int64x2_store(x+0,x14); }

/*
 * int64_sort_32_xor: run the compare-exchange stages labelled (1,0) through
 * (5,0) over the 32 values x[0..31], in place, comparing every value after
 * XORing it with `xor`.
 *
 *   x    must point to at least 32 readable and writable int64 values; the
 *        function loads from and stores to x+0, x+2, ..., x+30, two values
 *        per vector.
 *   xor  is broadcast to both lanes, applied to each vector as it is loaded
 *        and applied again as it is stored, so the second XOR undoes the
 *        first.  With xor = 0 the keys compare as plain signed integers;
 *        with xor = -1 every key is complemented, which reverses signed
 *        order.  NOTE(review): presumably this is how callers obtain a
 *        descending pass -- confirm against the call sites.
 *
 * The body is straight-line code with no loops or branches.  The trailing
 * "// a b" annotations are carried over from the generator output and give
 * the two network positions held by a vector after the statement; for a
 * compare-exchange the annotation reads "<lo vector> | <hi vector>".
 *
 * int64x2_10, int64x2_leftleft and int64x2_rightright are defined elsewhere
 * in this file.  Judging by the position annotations, int64x2_10 swaps the
 * two lanes of a vector, and leftleft/rightright gather the first/second
 * lanes of their two arguments -- TODO confirm against their definitions.
 */

/*
 * Lane-wise compare-exchange: lo receives the smaller and b the larger of
 * each (a,b) lane pair, using a signed comparison.  lo must be a different
 * variable from a and b, because lo is written before b is recomputed.
 */
#define SORT32XOR_CEX(lo,a,b) \
  do { \
    uint64x2 lt_mask_ = int64x2_smaller_umask(a,b); \
    lo = int64x2_ifthenelse(lt_mask_,a,b); \
    b = int64x2_ifthenelse(lt_mask_,b,a); \
  } while(0)

NOINLINE
static void int64_sort_32_xor(int64 *x,int64 xor)
{
  int64x2 flip;
  int64x2 x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
  int64x2 y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14;

  flip = int64x2_broadcast(xor);

  // load all 32 values, XORing each vector with the broadcast key mask
  x0  = flip^int64x2_load(x+0);   // 0 16
  x1  = flip^int64x2_load(x+2);   // 1 17
  x2  = flip^int64x2_load(x+4);   // 2 18
  x3  = flip^int64x2_load(x+6);   // 3 19
  x4  = flip^int64x2_load(x+8);   // 4 20
  x5  = flip^int64x2_load(x+10);  // 5 21
  x6  = flip^int64x2_load(x+12);  // 6 22
  x7  = flip^int64x2_load(x+14);  // 7 23
  x8  = flip^int64x2_load(x+16);  // 8 24
  x9  = flip^int64x2_load(x+18);  // 9 25
  x10 = flip^int64x2_load(x+20);  // 10 26
  x11 = flip^int64x2_load(x+22);  // 11 27
  x12 = flip^int64x2_load(x+24);  // 12 28
  x13 = flip^int64x2_load(x+26);  // 13 29
  x14 = flip^int64x2_load(x+28);  // 14 30
  x15 = flip^int64x2_load(x+30);  // 15 31

  // stage (1,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31
  SORT32XOR_CEX(y0,x0,x1);      // 0 16 | 1 17
  SORT32XOR_CEX(y2,x2,x3);      // 2 18 | 3 19
  SORT32XOR_CEX(y4,x4,x5);      // 4 20 | 5 21
  SORT32XOR_CEX(y6,x6,x7);      // 6 22 | 7 23
  SORT32XOR_CEX(y8,x8,x9);      // 8 24 | 9 25
  SORT32XOR_CEX(y10,x10,x11);   // 10 26 | 11 27
  SORT32XOR_CEX(y12,x12,x13);   // 12 28 | 13 29
  SORT32XOR_CEX(y14,x14,x15);   // 14 30 | 15 31

  // stage (2,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31
  SORT32XOR_CEX(x0,y0,y2);      // 0 16 | 2 18
  SORT32XOR_CEX(y1,x1,x3);      // 1 17 | 3 19
  SORT32XOR_CEX(x4,y4,y6);      // 4 20 | 6 22
  SORT32XOR_CEX(y5,x5,x7);      // 5 21 | 7 23
  SORT32XOR_CEX(x8,y8,y10);     // 8 24 | 10 26
  SORT32XOR_CEX(y9,x9,x11);     // 9 25 | 11 27
  SORT32XOR_CEX(x12,y12,y14);   // 12 28 | 14 30
  SORT32XOR_CEX(y13,x13,x15);   // 13 29 | 15 31

  // stage (2,0) 1:2 5:6 9:10 13:14 17:18 21:22 25:26 29:30
  SORT32XOR_CEX(x2,y2,y1);      // 1 17 | 2 18
  SORT32XOR_CEX(x6,y6,y5);      // 5 21 | 6 22
  SORT32XOR_CEX(x10,y10,y9);    // 9 25 | 10 26
  SORT32XOR_CEX(x14,y14,y13);   // 13 29 | 14 30

  // stage (3,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31
  SORT32XOR_CEX(y2,x2,x6);      // 1 17 | 5 21
  SORT32XOR_CEX(y0,x0,x4);      // 0 16 | 4 20
  SORT32XOR_CEX(x1,y1,y5);      // 2 18 | 6 22
  SORT32XOR_CEX(y3,x3,x7);      // 3 19 | 7 23
  SORT32XOR_CEX(y10,x10,x14);   // 9 25 | 13 29
  SORT32XOR_CEX(y8,x8,x12);     // 8 24 | 12 28
  SORT32XOR_CEX(x9,y9,y13);     // 10 26 | 14 30
  SORT32XOR_CEX(y11,x11,x15);   // 11 27 | 15 31

  // stage (3,1) 2:4 3:5 10:12 11:13 18:20 19:21 26:28 27:29
  SORT32XOR_CEX(y6,x6,y3);      // 3 19 | 5 21
  SORT32XOR_CEX(y4,x4,x1);      // 2 18 | 4 20
  SORT32XOR_CEX(y14,x14,y11);   // 11 27 | 13 29
  SORT32XOR_CEX(y12,x12,x9);    // 10 26 | 12 28

  // stage (3,0) 1:2 3:4 5:6 9:10 11:12 13:14 17:18 19:20 21:22 25:26 27:28 29:30
  SORT32XOR_CEX(x6,y6,x1);      // 3 19 | 4 20
  SORT32XOR_CEX(x4,y4,y2);      // 1 17 | 2 18
  SORT32XOR_CEX(x5,y5,y3);      // 5 21 | 6 22
  SORT32XOR_CEX(x14,y14,x9);    // 11 27 | 12 28
  SORT32XOR_CEX(x12,y12,y10);   // 9 25 | 10 26
  SORT32XOR_CEX(x13,y13,y11);   // 13 29 | 14 30

  // stage (4,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31
  SORT32XOR_CEX(y6,x6,x14);     // 3 19 | 11 27
  SORT32XOR_CEX(y4,x4,x12);     // 1 17 | 9 25
  SORT32XOR_CEX(y5,x5,x13);     // 5 21 | 13 29
  SORT32XOR_CEX(x0,y0,y8);      // 0 16 | 8 24
  SORT32XOR_CEX(y1,x1,x9);      // 4 20 | 12 28
  SORT32XOR_CEX(x2,y2,y10);     // 2 18 | 10 26
  SORT32XOR_CEX(x3,y3,y11);     // 6 22 | 14 30
  SORT32XOR_CEX(y7,x7,x15);     // 7 23 | 15 31

  // stage (4,2) 4:8 5:9 6:10 7:11 20:24 21:25 22:26 23:27
  SORT32XOR_CEX(y14,x14,y7);    // 7 23 | 11 27
  SORT32XOR_CEX(y12,x12,y5);    // 5 21 | 9 25
  SORT32XOR_CEX(x8,y8,y1);      // 4 20 | 8 24
  SORT32XOR_CEX(x10,y10,x3);    // 6 22 | 10 26

  // stage (4,1) 2:4 3:5 6:8 7:9 10:12 11:13 18:20 19:21 22:24 23:25 26:28 27:29
  SORT32XOR_CEX(x14,y14,y5);    // 7 23 | 9 25
  SORT32XOR_CEX(x12,y12,y6);    // 3 19 | 5 21
  SORT32XOR_CEX(y13,x13,y7);    // 11 27 | 13 29
  SORT32XOR_CEX(y8,x8,x2);      // 2 18 | 4 20
  SORT32XOR_CEX(y9,x9,x3);      // 10 26 | 12 28
  SORT32XOR_CEX(y10,x10,y1);    // 6 22 | 8 24

  // stage (4,0) 1:2 3:4 5:6 7:8 9:10 11:12 13:14 17:18 19:20 21:22 23:24 25:26 27:28 29:30
  SORT32XOR_CEX(y14,x14,y1);    // 7 23 | 8 24
  SORT32XOR_CEX(y12,x12,x2);    // 3 19 | 4 20
  SORT32XOR_CEX(x13,y13,x3);    // 11 27 | 12 28
  SORT32XOR_CEX(x8,y8,y4);      // 1 17 | 2 18
  SORT32XOR_CEX(x9,y9,y5);      // 9 25 | 10 26
  SORT32XOR_CEX(x10,y10,y6);    // 5 21 | 6 22
  SORT32XOR_CEX(x11,y11,y7);    // 13 29 | 14 30

  // stage (5,4) 0:31 1:30 2:29 3:28 4:27 5:26 6:25 7:24 8:23 9:22 10:21 11:20 12:19 13:18 14:17 15:16
  y1  = int64x2_10(y1);   // 24 8
  x3  = int64x2_10(x3);   // 28 12
  x2  = int64x2_10(x2);   // 20 4
  y7  = int64x2_10(y7);   // 30 14
  y6  = int64x2_10(y6);   // 22 6
  y5  = int64x2_10(y5);   // 26 10
  y4  = int64x2_10(y4);   // 18 2
  x15 = int64x2_10(x15);  // 31 15
  SORT32XOR_CEX(x14,y14,y1);    // 7 8 | 24 23
  SORT32XOR_CEX(x12,y12,x3);    // 3 12 | 28 19
  SORT32XOR_CEX(y13,x13,x2);    // 11 4 | 20 27
  SORT32XOR_CEX(y8,x8,y7);      // 1 14 | 30 17
  SORT32XOR_CEX(y9,x9,y6);      // 9 6 | 22 25
  SORT32XOR_CEX(y10,x10,y5);    // 5 10 | 26 21
  SORT32XOR_CEX(y11,x11,y4);    // 13 2 | 18 29
  SORT32XOR_CEX(y0,x0,x15);     // 0 15 | 31 16

  // stage (5,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31
  y0  = int64x2_10(y0);   // 15 0
  x15 = int64x2_10(x15);  // 16 31
  SORT32XOR_CEX(y14,x14,y0);    // 7 0 | 15 8
  SORT32XOR_CEX(y12,x12,y13);   // 3 4 | 11 12
  SORT32XOR_CEX(y3,x3,x2);      // 20 19 | 28 27
  SORT32XOR_CEX(x8,y8,y9);      // 1 6 | 9 14
  SORT32XOR_CEX(x7,y7,y6);      // 22 17 | 30 25
  SORT32XOR_CEX(x10,y10,y11);   // 5 2 | 13 10
  SORT32XOR_CEX(x5,y5,y4);      // 18 21 | 26 29
  SORT32XOR_CEX(x1,y1,x15);     // 16 23 | 24 31

  // stage (5,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31
  SORT32XOR_CEX(x14,y14,y12);   // 3 0 | 7 4
  SORT32XOR_CEX(x0,y0,y13);     // 11 8 | 15 12
  SORT32XOR_CEX(x3,y3,x1);      // 16 19 | 20 23
  SORT32XOR_CEX(y8,x8,x10);     // 1 2 | 5 6
  SORT32XOR_CEX(y7,x7,x5);      // 18 17 | 22 21
  SORT32XOR_CEX(x9,y9,y11);     // 9 10 | 13 14
  SORT32XOR_CEX(x6,y6,y4);      // 26 25 | 30 29
  SORT32XOR_CEX(y2,x2,x15);     // 24 27 | 28 31

  // stage (5,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31
  SORT32XOR_CEX(y14,x14,y8);    // 1 0 | 3 2
  SORT32XOR_CEX(y0,x0,x9);      // 9 8 | 11 10
  SORT32XOR_CEX(y3,x3,y7);      // 16 17 | 18 19
  SORT32XOR_CEX(x12,y12,x10);   // 5 4 | 7 6
  SORT32XOR_CEX(y1,x1,x5);      // 20 21 | 22 23
  SORT32XOR_CEX(x13,y13,y11);   // 13 12 | 15 14
  SORT32XOR_CEX(y6,x6,y2);      // 24 25 | 26 27
  SORT32XOR_CEX(x4,y4,x15);     // 28 29 | 30 31

  // stage (5,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31
  x14 = int64x2_leftleft(y14,y8);     // 1 3
  y8  = int64x2_rightright(y14,y8);   // 0 2
  x0  = int64x2_leftleft(y0,x9);      // 9 11
  x9  = int64x2_rightright(y0,x9);    // 8 10
  x3  = int64x2_leftleft(y3,y7);      // 16 18
  y7  = int64x2_rightright(y3,y7);    // 17 19
  y12 = int64x2_leftleft(x12,x10);    // 5 7
  x10 = int64x2_rightright(x12,x10);  // 4 6
  x1  = int64x2_leftleft(y1,x5);      // 20 22
  x5  = int64x2_rightright(y1,x5);    // 21 23
  y13 = int64x2_leftleft(x13,y11);    // 13 15
  y11 = int64x2_rightright(x13,y11);  // 12 14
  x6  = int64x2_leftleft(y6,y2);      // 24 26
  y2  = int64x2_rightright(y6,y2);    // 25 27
  y4  = int64x2_leftleft(x4,x15);     // 28 30
  x15 = int64x2_rightright(x4,x15);   // 29 31
  SORT32XOR_CEX(y14,x14,y8);    // 0 2 | 1 3
  SORT32XOR_CEX(y0,x0,x9);      // 8 10 | 9 11
  SORT32XOR_CEX(y3,x3,y7);      // 16 18 | 17 19
  SORT32XOR_CEX(x12,y12,x10);   // 4 6 | 5 7
  SORT32XOR_CEX(y1,x1,x5);      // 20 22 | 21 23
  SORT32XOR_CEX(x13,y13,y11);   // 12 14 | 13 15
  SORT32XOR_CEX(y6,x6,y2);      // 24 26 | 25 27
  SORT32XOR_CEX(x4,y4,x15);     // 28 30 | 29 31

  // regroup lanes so that each vector holds two adjacent positions
  x14 = int64x2_leftleft(y14,y8);     // 0 1
  y8  = int64x2_rightright(y14,y8);   // 2 3
  y12 = int64x2_leftleft(x12,x10);    // 4 5
  x10 = int64x2_rightright(x12,x10);  // 6 7
  x0  = int64x2_leftleft(y0,x9);      // 8 9
  x9  = int64x2_rightright(y0,x9);    // 10 11
  y13 = int64x2_leftleft(x13,y11);    // 12 13
  y11 = int64x2_rightright(x13,y11);  // 14 15
  x3  = int64x2_leftleft(y3,y7);      // 16 17
  y7  = int64x2_rightright(y3,y7);    // 18 19
  x1  = int64x2_leftleft(y1,x5);      // 20 21
  x5  = int64x2_rightright(y1,x5);    // 22 23
  x6  = int64x2_leftleft(y6,y2);      // 24 25
  y2  = int64x2_rightright(y6,y2);    // 26 27
  y4  = int64x2_leftleft(x4,x15);     // 28 29
  x15 = int64x2_rightright(x4,x15);   // 30 31

  // XOR again (undoing the XOR applied on load) and write back in order
  int64x2_store(x+0,flip^x14);
  int64x2_store(x+2,flip^y8);
  int64x2_store(x+4,flip^y12);
  int64x2_store(x+6,flip^x10);
  int64x2_store(x+8,flip^x0);
  int64x2_store(x+10,flip^x9);
  int64x2_store(x+12,flip^y13);
  int64x2_store(x+14,flip^y11);
  int64x2_store(x+16,flip^x3);
  int64x2_store(x+18,flip^y7);
  int64x2_store(x+20,flip^x1);
  int64x2_store(x+22,flip^x5);
  int64x2_store(x+24,flip^x6);
  int64x2_store(x+26,flip^y2);
  int64x2_store(x+28,flip^y4);
  int64x2_store(x+30,flip^x15);
}

#undef SORT32XOR_CEX

NOINLINE static void int64_V_sort_8through16(int64 *x,long long n) { int32_t pos4,pos5,pos6,pos7; int64x2 infty,t,x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2,y3,y4,y5,y6; x0 = int64x2_load(x+0); // 7 6 x1 = int64x2_load(x+2); // 5 4 x2 = int64x2_load(x+4); // 3 2 x3 = int64x2_load(x+6); 
// 1 0 infty = int64x2_broadcast(int64_largest); pos4 = int32_min(10,n); x4 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(8,9),int64x2_broadcast(pos4)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos4-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos4)&1))))),infty); // 8 9 pos5 = int32_min(12,n); x5 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(10,11),int64x2_broadcast(pos5)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos5-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos5)&1))))),infty); // 10 11 pos6 = int32_min(14,n); x6 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(12,13),int64x2_broadcast(pos6)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos6-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos6)&1))))),infty); // 12 13 pos7 = int32_min(16,n); x7 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(14,15),int64x2_broadcast(pos7)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos7-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos7)&1))))),infty); // 14 15 // stage (4,3) 0:15 1:14 2:13 3:12 4:11 5:10 6:9 7:8 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 7 6 x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 8 9 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 3 2 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 12 13 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 5 4 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 10 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7)); y3 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 1 0 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 14 15 // stage (4,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 3 2 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x6)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 8 9 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 12 13 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y3)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y3); // 1 0 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 10 11 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 14 15 // stage (4,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 1 0 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 9 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 10 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 5 4 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 13 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 14 15 // stage (4,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 x0 = int64x2_leftleft(y0,x1); // 1 3 x1 = int64x2_rightright(y0,x1); // 0 2 y4 = int64x2_leftleft(x4,y5); // 8 10 y5 = int64x2_rightright(x4,y5); // 9 11 y2 = int64x2_leftleft(x2,y3); // 5 7 y3 = int64x2_rightright(x2,y3); 
// 4 6 x6 = int64x2_leftleft(y6,x7); // 12 14 x7 = int64x2_rightright(y6,x7); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 2 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 10 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 4 6 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 14 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 13 15 x0 = int64x2_leftleft(y0,x1); // 0 1 x1 = int64x2_rightright(y0,x1); // 2 3 y2 = int64x2_leftleft(x2,y3); // 4 5 y3 = int64x2_rightright(x2,y3); // 6 7 y4 = int64x2_leftleft(x4,y5); // 8 9 y5 = int64x2_rightright(x4,y5); // 10 11 x6 = int64x2_leftleft(y6,x7); // 12 13 x7 = int64x2_rightright(y6,x7); // 14 15 int64x2_store(x+pos7-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x7),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos7)&1)))))); int64x2_store(x+pos6-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x6),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos6)&1)))))); int64x2_store(x+pos5-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y5),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos5)&1)))))); int64x2_store(x+pos4-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y4),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos4)&1)))))); int64x2_store(x+6,y3); int64x2_store(x+4,y2); int64x2_store(x+2,x1); 
int64x2_store(x+0,x0); } NOINLINE static void int64_V_sort_16through32(int64 *x,long long n) { int32_t pos10,pos11,pos12,pos13,pos14,pos15,pos8,pos9; int64x2 infty,t,x0,x1,x10,x11,x12,x13,x14,x15,x2,x3,x4,x5,x6,x7,x8,x9,y0,y1,y10,y11,y12,y13,y14,y2,y3,y4,y5,y6,y7,y8,y9; x0 = int64x2_load(x+0); // 15 14 x1 = int64x2_load(x+2); // 13 12 x2 = int64x2_load(x+4); // 11 10 x3 = int64x2_load(x+6); // 9 8 x4 = int64x2_load(x+8); // 7 6 x5 = int64x2_load(x+10); // 5 4 x6 = int64x2_load(x+12); // 3 2 x7 = int64x2_load(x+14); // 1 0 infty = int64x2_broadcast(int64_largest); pos8 = int32_min(18,n); x8 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(16,17),int64x2_broadcast(pos8)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos8-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos8)&1))))),infty); // 16 17 pos9 = int32_min(20,n); x9 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(18,19),int64x2_broadcast(pos9)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos9-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos9)&1))))),infty); // 18 19 pos10 = int32_min(22,n); x10 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(20,21),int64x2_broadcast(pos10)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos10-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos10)&1))))),infty); // 20 21 pos11 = int32_min(24,n); x11 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(22,23),int64x2_broadcast(pos11)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos11-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos11)&1))))),infty); // 22 23 pos12 = int32_min(26,n); x12 = 
int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(24,25),int64x2_broadcast(pos12)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos12-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos12)&1))))),infty); // 24 25 pos13 = int32_min(28,n); x13 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(26,27),int64x2_broadcast(pos13)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos13-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos13)&1))))),infty); // 26 27 pos14 = int32_min(30,n); x14 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(28,29),int64x2_broadcast(pos14)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos14-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos14)&1))))),infty); // 28 29 pos15 = int32_min(32,n); x15 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(30,31),int64x2_broadcast(pos15)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos15-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos15)&1))))),infty); // 30 31 // stage (5,4) 0:31 1:30 2:29 3:28 4:27 5:26 6:25 7:24 8:23 9:22 10:21 11:20 12:19 13:18 14:17 15:16 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x8)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x8); // 15 14 x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x0); // 16 17 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x10)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x10); // 11 10 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x2); // 20 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x12)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x12); // 7 6 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x4); // 24 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x14)); y6 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x14); // 3 2 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x6); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x9)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x9); // 13 12 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x1); // 18 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x11)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x11); // 9 8 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x3); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x13)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x13); // 5 4 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x5); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x15)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x15); // 1 0 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x7); // 30 31 // stage (5,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y4)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y4); // 7 6 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y0); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y6)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y6); // 3 2 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y2); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x12)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x12); // 16 17 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x8); // 24 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x14)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x14); // 20 21 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x10); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y5)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y5); // 5 4 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y1); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y7)); 
x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y7); // 1 0 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y3); // 9 8 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x13)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x13); // 18 19 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x9); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,x15)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x15); // 22 23 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x11); // 30 31 // stage (5,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x2)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x2); // 3 2 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y6)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y6); // 11 10 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y4); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y10)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y10); // 16 17 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y8); // 20 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x14)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x14); // 24 25 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x12); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x3)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x3); // 1 0 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y7)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y7); // 9 8 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y5); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y11)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y11); // 18 19 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y9); // 22 23 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x13,x15)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x15); // 26 27 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x13); // 30 31 // stage (5,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y1)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y1); // 1 0 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 9 8 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x9)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x9); // 16 17 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x8); // 18 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y13)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y13); // 24 25 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y12); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 5 4 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y7)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y7); // 13 12 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y6); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y11)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y11); // 20 21 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y10); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x15)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x15); // 28 29 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x14); // 30 31 // stage (5,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 y0 = int64x2_leftleft(x0,y1); // 1 3 y1 = int64x2_rightright(x0,y1); // 0 2 x4 
= int64x2_leftleft(y4,x5); // 9 11 x5 = int64x2_rightright(y4,x5); // 8 10 x8 = int64x2_leftleft(y8,x9); // 16 18 x9 = int64x2_rightright(y8,x9); // 17 19 y12 = int64x2_leftleft(x12,y13); // 24 26 y13 = int64x2_rightright(x12,y13); // 25 27 x2 = int64x2_leftleft(y2,x3); // 5 7 x3 = int64x2_rightright(y2,x3); // 4 6 y6 = int64x2_leftleft(x6,y7); // 13 15 y7 = int64x2_rightright(x6,y7); // 12 14 y10 = int64x2_leftleft(x10,y11); // 20 22 y11 = int64x2_rightright(x10,y11); // 21 23 x14 = int64x2_leftleft(y14,x15); // 28 30 x15 = int64x2_rightright(y14,x15); // 29 31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y1)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y1); // 0 2 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 8 10 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x9)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x9); // 16 18 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x8); // 17 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y13)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y13); // 24 26 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y12); // 25 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 4 6 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y7)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y7); // 12 14 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y6); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y11)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y11); // 20 22 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y10); // 21 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x15)); y14 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x15); // 28 30 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x14); // 29 31 y0 = int64x2_leftleft(x0,y1); // 0 1 y1 = int64x2_rightright(x0,y1); // 2 3 x2 = int64x2_leftleft(y2,x3); // 4 5 x3 = int64x2_rightright(y2,x3); // 6 7 x4 = int64x2_leftleft(y4,x5); // 8 9 x5 = int64x2_rightright(y4,x5); // 10 11 y6 = int64x2_leftleft(x6,y7); // 12 13 y7 = int64x2_rightright(x6,y7); // 14 15 x8 = int64x2_leftleft(y8,x9); // 16 17 x9 = int64x2_rightright(y8,x9); // 18 19 y10 = int64x2_leftleft(x10,y11); // 20 21 y11 = int64x2_rightright(x10,y11); // 22 23 y12 = int64x2_leftleft(x12,y13); // 24 25 y13 = int64x2_rightright(x12,y13); // 26 27 x14 = int64x2_leftleft(y14,x15); // 28 29 x15 = int64x2_rightright(y14,x15); // 30 31 int64x2_store(x+pos15-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x15),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos15)&1)))))); int64x2_store(x+pos14-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x14),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos14)&1)))))); int64x2_store(x+pos13-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y13),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos13)&1)))))); int64x2_store(x+pos12-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y12),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos12)&1)))))); int64x2_store(x+pos11-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y11),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos11)&1)))))); int64x2_store(x+pos10-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y10),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos10)&1)))))); 
int64x2_store(x+pos9-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x9),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos9)&1)))))); int64x2_store(x+pos8-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x8),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos8)&1)))))); int64x2_store(x+14,y7); int64x2_store(x+12,y6); int64x2_store(x+10,x5); int64x2_store(x+8,x4); int64x2_store(x+6,x3); int64x2_store(x+4,x2); int64x2_store(x+2,y1); int64x2_store(x+0,y0); } NOINLINE static void int64_V_sort_32through64(int64 *x,long long n) { int32_t pos16,pos17,pos18,pos19,pos20,pos21,pos22,pos23,pos24,pos25,pos26,pos27,pos28,pos29,pos30,pos31; int64x2 infty,t,x0,x1,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x2,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x3,x30,x31,x4,x5,x6,x7,x8,x9,y0,y1,y10,y11,y12,y13,y14,y15,y16,y17,y18,y19,y2,y20,y21,y22,y23,y24,y25,y26,y27,y28,y29,y3,y30,y4,y5,y6,y7,y8,y9; x0 = int64x2_load(x+0); // 31 30 x1 = int64x2_load(x+2); // 29 28 x2 = int64x2_load(x+4); // 27 26 x3 = int64x2_load(x+6); // 25 24 x4 = int64x2_load(x+8); // 23 22 x5 = int64x2_load(x+10); // 21 20 x6 = int64x2_load(x+12); // 19 18 x7 = int64x2_load(x+14); // 17 16 x8 = int64x2_load(x+16); // 15 14 x9 = int64x2_load(x+18); // 13 12 x10 = int64x2_load(x+20); // 11 10 x11 = int64x2_load(x+22); // 9 8 x12 = int64x2_load(x+24); // 7 6 x13 = int64x2_load(x+26); // 5 4 x14 = int64x2_load(x+28); // 3 2 x15 = int64x2_load(x+30); // 1 0 infty = int64x2_broadcast(int64_largest); pos16 = int32_min(34,n); x16 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(32,33),int64x2_broadcast(pos16)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos16-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos16)&1))))),infty); // 32 33 pos17 = int32_min(36,n); x17 = 
int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(34,35),int64x2_broadcast(pos17)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos17-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos17)&1))))),infty); // 34 35 pos18 = int32_min(38,n); x18 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(36,37),int64x2_broadcast(pos18)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos18-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos18)&1))))),infty); // 36 37 pos19 = int32_min(40,n); x19 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(38,39),int64x2_broadcast(pos19)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos19-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos19)&1))))),infty); // 38 39 pos20 = int32_min(42,n); x20 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(40,41),int64x2_broadcast(pos20)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos20-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos20)&1))))),infty); // 40 41 pos21 = int32_min(44,n); x21 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(42,43),int64x2_broadcast(pos21)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos21-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos21)&1))))),infty); // 42 43 pos22 = int32_min(46,n); x22 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(44,45),int64x2_broadcast(pos22)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos22-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos22)&1))))),infty); // 44 45 pos23 = int32_min(48,n); x23 = 
int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(46,47),int64x2_broadcast(pos23)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos23-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos23)&1))))),infty); // 46 47 pos24 = int32_min(50,n); x24 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(48,49),int64x2_broadcast(pos24)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos24-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos24)&1))))),infty); // 48 49 pos25 = int32_min(52,n); x25 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(50,51),int64x2_broadcast(pos25)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos25-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos25)&1))))),infty); // 50 51 pos26 = int32_min(54,n); x26 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(52,53),int64x2_broadcast(pos26)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos26-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos26)&1))))),infty); // 52 53 pos27 = int32_min(56,n); x27 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(54,55),int64x2_broadcast(pos27)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos27-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos27)&1))))),infty); // 54 55 pos28 = int32_min(58,n); x28 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(56,57),int64x2_broadcast(pos28)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos28-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos28)&1))))),infty); // 56 57 pos29 = int32_min(60,n); x29 = 
int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(58,59),int64x2_broadcast(pos29)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos29-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos29)&1))))),infty); // 58 59 pos30 = int32_min(62,n); x30 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(60,61),int64x2_broadcast(pos30)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos30-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos30)&1))))),infty); // 60 61 pos31 = int32_min(64,n); x31 = int64x2_ifthenelse(int64x2_smaller_umask(int64x2_set(62,63),int64x2_broadcast(pos31)),int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(int64x2_load(x+pos31-2)),uint8x16_add(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos31)&1))))),infty); // 62 63 // stage (6,5) 0:63 1:62 2:61 3:60 4:59 5:58 6:57 7:56 8:55 9:54 10:53 11:52 12:51 13:50 14:49 15:48 16:47 17:46 18:45 19:44 20:43 21:42 22:41 23:40 24:39 25:38 26:37 27:36 28:35 29:34 30:33 31:32 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x16)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x16); // 31 30 x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x0); // 32 33 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x18)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x18); // 27 26 x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x2); // 36 37 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x20)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x20); // 23 22 x20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x4); // 40 41 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x22)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x22); // 19 18 x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x22,x6); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x24)); y8 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x24); // 15 14 x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x8); // 48 49 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x26)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x26); // 11 10 x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x10); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x28)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x28); // 7 6 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x12); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x30)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x30); // 3 2 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x14); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x17)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x17); // 29 28 x17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x1); // 34 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x19)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x19); // 25 24 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x3); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x21)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x21); // 21 20 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x5); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x23)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x23); // 17 16 x23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x23,x7); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x25)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x25); // 13 12 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x9); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,x27)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x27); // 9 8 x27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x11); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,x29)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x29); // 
5 4 x29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x13); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x15,x31)); y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x31); // 1 0 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x15); // 62 63 // stage (6,4) 0:16 1:17 2:18 3:19 4:20 5:21 6:22 7:23 8:24 9:25 10:26 11:27 12:28 13:29 14:30 15:31 32:48 33:49 34:50 35:51 36:52 37:53 38:54 39:55 40:56 41:57 42:58 43:59 44:60 45:61 46:62 47:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y8)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y8); // 15 14 y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y0); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y10)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y10); // 11 10 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y2); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y12)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y12); // 7 6 y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y4); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y14)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y14); // 3 2 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y6); // 19 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(x16,x24)); y16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x24); // 32 33 x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x16); // 48 49 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x26)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x26); // 36 37 x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x18); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x28)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x28); // 40 41 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x20); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x22,x30)); y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x22,x30); // 44 45 x30 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x22); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y9)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y9); // 13 12 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y1); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y11)); x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y11); // 9 8 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y3); // 25 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y13)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y13); // 5 4 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y5); // 21 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y7,y15)); x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y15); // 1 0 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y7); // 17 16 t = int64x2_from_uint64x2(int64x2_smaller_umask(x17,x25)); y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x25); // 34 35 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x17); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(x19,x27)); y19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x27); // 38 39 x27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x19); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x21,x29)); y21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x29); // 42 43 x29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x21); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x23,x31)); y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x23,x31); // 46 47 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x23); // 62 63 // stage (6,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 32:40 33:41 34:42 35:43 36:44 37:45 38:46 39:47 48:56 49:57 50:58 51:59 52:60 53:61 54:62 55:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 7 6 x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 15 14 t 
= int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 3 2 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y12)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y12); // 23 22 y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y8); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y14)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y14); // 19 18 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y10); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y20)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y20); // 32 33 y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y20,y16); // 40 41 t = int64x2_from_uint64x2(int64x2_smaller_umask(y18,y22)); x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y18,y22); // 36 37 y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y18); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x28)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x28); // 48 49 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x24); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x26,x30)); y26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x30); // 52 53 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x26); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 5 4 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 1 0 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 9 8 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y13)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y13); // 21 20 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y9); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y11,y15)); x11 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y15); // 17 16 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y11); // 25 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(y17,y21)); x17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y21); // 34 35 y21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y21,y17); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(y19,y23)); x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y19,y23); // 38 39 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y19); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(x25,x29)); y25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x29); // 50 51 x29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x25); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x27,x31)); y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x31); // 54 55 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x27); // 62 63 // stage (6,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 32:36 33:37 34:38 35:39 40:44 41:45 42:46 43:47 48:52 49:53 50:54 51:55 56:60 57:61 58:62 59:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 3 2 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x6)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 11 10 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x10)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x10); // 19 18 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x8); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y14)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y14); // 27 26 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y12); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x16,x18)); y16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x18); 
// 32 33 x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x16); // 36 37 t = int64x2_from_uint64x2(int64x2_smaller_umask(y20,y22)); x20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y20,y22); // 40 41 y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y20); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(y24,y26)); x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y24,y26); // 48 49 y26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y24); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x28,x30)); y28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x30); // 56 57 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x28); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y3)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y3); // 1 0 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 9 8 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x11)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x11); // 17 16 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x9); // 21 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,y15)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y15); // 25 24 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y13); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(x17,x19)); y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x19); // 34 35 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x17); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y21,y23)); x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y21,y23); // 42 43 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y21); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y25,y27)); x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y25,y27); // 50 51 y27 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y25); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x29,x31)); y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x31); // 58 59 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x29); // 62 63 // stage (6,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 32:34 33:35 36:38 37:39 40:42 41:43 44:46 45:47 48:50 49:51 52:54 53:55 56:58 57:59 60:62 61:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 1 0 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 9 8 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y9)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y9); // 17 16 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y8); // 19 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x13); // 25 24 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x12); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y17)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y17); // 32 33 y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y16); // 34 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x21)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x21); // 40 41 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x20); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x25)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x25); // 48 49 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x24); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(y28,y29)); x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y28,y29); // 56 57 y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y29,y28); // 58 
59 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 5 4 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 13 12 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x11)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x11); // 21 20 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x10); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y15)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y15); // 29 28 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y14); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x19)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x19); // 36 37 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x18); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y22,y23)); x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y23); // 44 45 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y22); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y26,y27)); x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y27); // 52 53 y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y26); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x30,x31)); y30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x31); // 60 61 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x30); // 62 63 // stage (6,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 32:33 34:35 36:37 38:39 40:41 42:43 44:45 46:47 48:49 50:51 52:53 54:55 56:57 58:59 60:61 62:63 x0 = int64x2_leftleft(y0,x1); // 1 3 x1 = int64x2_rightright(y0,x1); // 0 2 y4 = int64x2_leftleft(x4,y5); // 9 11 y5 = int64x2_rightright(x4,y5); // 8 10 y8 = int64x2_leftleft(x8,y9); // 17 19 y9 = int64x2_rightright(x8,y9); // 16 18 x12 = 
int64x2_leftleft(y12,x13); // 25 27 x13 = int64x2_rightright(y12,x13); // 24 26 y16 = int64x2_leftleft(x16,y17); // 32 34 y17 = int64x2_rightright(x16,y17); // 33 35 x20 = int64x2_leftleft(y20,x21); // 40 42 x21 = int64x2_rightright(y20,x21); // 41 43 x24 = int64x2_leftleft(y24,x25); // 48 50 x25 = int64x2_rightright(y24,x25); // 49 51 y28 = int64x2_leftleft(x28,y29); // 56 58 y29 = int64x2_rightright(x28,y29); // 57 59 y2 = int64x2_leftleft(x2,y3); // 5 7 y3 = int64x2_rightright(x2,y3); // 4 6 x6 = int64x2_leftleft(y6,x7); // 13 15 x7 = int64x2_rightright(y6,x7); // 12 14 x10 = int64x2_leftleft(y10,x11); // 21 23 x11 = int64x2_rightright(y10,x11); // 20 22 y14 = int64x2_leftleft(x14,y15); // 29 31 y15 = int64x2_rightright(x14,y15); // 28 30 x18 = int64x2_leftleft(y18,x19); // 36 38 x19 = int64x2_rightright(y18,x19); // 37 39 y22 = int64x2_leftleft(x22,y23); // 44 46 y23 = int64x2_rightright(x22,y23); // 45 47 y26 = int64x2_leftleft(x26,y27); // 52 54 y27 = int64x2_rightright(x26,y27); // 53 55 x30 = int64x2_leftleft(y30,x31); // 60 62 x31 = int64x2_rightright(y30,x31); // 61 63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 2 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 10 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y9)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y9); // 16 18 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y8); // 17 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x13); // 24 26 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x12); // 25 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y17)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y17); // 32 34 y17 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y16); // 33 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x21)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x21); // 40 42 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x20); // 41 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x25)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x25); // 48 50 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x24); // 49 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(y28,y29)); x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y28,y29); // 56 58 y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y29,y28); // 57 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 4 6 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 14 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x11)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x11); // 20 22 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x10); // 21 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y15)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y15); // 28 30 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y14); // 29 31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x19)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x19); // 36 38 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x18); // 37 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y22,y23)); x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y23); // 44 46 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y22); // 45 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y26,y27)); x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y27); // 52 54 y27 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y26); // 53 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x30,x31)); y30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x31); // 60 62 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x30); // 61 63 x0 = int64x2_leftleft(y0,x1); // 0 1 x1 = int64x2_rightright(y0,x1); // 2 3 y2 = int64x2_leftleft(x2,y3); // 4 5 y3 = int64x2_rightright(x2,y3); // 6 7 y4 = int64x2_leftleft(x4,y5); // 8 9 y5 = int64x2_rightright(x4,y5); // 10 11 x6 = int64x2_leftleft(y6,x7); // 12 13 x7 = int64x2_rightright(y6,x7); // 14 15 y8 = int64x2_leftleft(x8,y9); // 16 17 y9 = int64x2_rightright(x8,y9); // 18 19 x10 = int64x2_leftleft(y10,x11); // 20 21 x11 = int64x2_rightright(y10,x11); // 22 23 x12 = int64x2_leftleft(y12,x13); // 24 25 x13 = int64x2_rightright(y12,x13); // 26 27 y14 = int64x2_leftleft(x14,y15); // 28 29 y15 = int64x2_rightright(x14,y15); // 30 31 y16 = int64x2_leftleft(x16,y17); // 32 33 y17 = int64x2_rightright(x16,y17); // 34 35 x18 = int64x2_leftleft(y18,x19); // 36 37 x19 = int64x2_rightright(y18,x19); // 38 39 x20 = int64x2_leftleft(y20,x21); // 40 41 x21 = int64x2_rightright(y20,x21); // 42 43 y22 = int64x2_leftleft(x22,y23); // 44 45 y23 = int64x2_rightright(x22,y23); // 46 47 x24 = int64x2_leftleft(y24,x25); // 48 49 x25 = int64x2_rightright(y24,x25); // 50 51 y26 = int64x2_leftleft(x26,y27); // 52 53 y27 = int64x2_rightright(x26,y27); // 54 55 y28 = int64x2_leftleft(x28,y29); // 56 57 y29 = int64x2_rightright(x28,y29); // 58 59 x30 = int64x2_leftleft(y30,x31); // 60 61 x31 = int64x2_rightright(y30,x31); // 62 63 int64x2_store(x+pos31-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x31),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos31)&1)))))); int64x2_store(x+pos30-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x30),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos30)&1)))))); 
int64x2_store(x+pos29-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y29),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos29)&1)))))); int64x2_store(x+pos28-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y28),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos28)&1)))))); int64x2_store(x+pos27-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y27),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos27)&1)))))); int64x2_store(x+pos26-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y26),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos26)&1)))))); int64x2_store(x+pos25-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x25),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos25)&1)))))); int64x2_store(x+pos24-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x24),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos24)&1)))))); int64x2_store(x+pos23-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y23),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos23)&1)))))); int64x2_store(x+pos22-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y22),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos22)&1)))))); int64x2_store(x+pos21-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x21),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos21)&1)))))); int64x2_store(x+pos20-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x20),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos20)&1)))))); 
int64x2_store(x+pos19-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x19),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos19)&1)))))); int64x2_store(x+pos18-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(x18),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos18)&1)))))); int64x2_store(x+pos17-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y17),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos17)&1)))))); int64x2_store(x+pos16-2,int64x2_from_int8x16(int8x16_varextract(int8x16_from_int64x2(y16),uint8x16_sub(uint8x16_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),uint8x16_broadcast(8*((-pos16)&1)))))); int64x2_store(x+30,y15); int64x2_store(x+28,y14); int64x2_store(x+26,x13); int64x2_store(x+24,x12); int64x2_store(x+22,x11); int64x2_store(x+20,x10); int64x2_store(x+18,y9); int64x2_store(x+16,y8); int64x2_store(x+14,x7); int64x2_store(x+12,x6); int64x2_store(x+10,y5); int64x2_store(x+8,y4); int64x2_store(x+6,y3); int64x2_store(x+4,y2); int64x2_store(x+2,x1); int64x2_store(x+0,x0); } NOINLINE static void int64_V_sort_16_xor(int64 *x,int64 xor) { int64x2 t,vecxor,x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2,y3,y4,y5,y6; vecxor = int64x2_broadcast(xor); x0 = vecxor^int64x2_load(x+0); // 7 6 x1 = vecxor^int64x2_load(x+2); // 5 4 x2 = vecxor^int64x2_load(x+4); // 3 2 x3 = vecxor^int64x2_load(x+6); // 1 0 x4 = vecxor^int64x2_load(x+8); // 8 9 x5 = vecxor^int64x2_load(x+10); // 10 11 x6 = vecxor^int64x2_load(x+12); // 12 13 x7 = vecxor^int64x2_load(x+14); // 14 15 // stage (4,3) 0:15 1:14 2:13 3:12 4:11 5:10 6:9 7:8 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 7 6 x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 8 9 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 3 2 
x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 12 13 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 5 4 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 10 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 1 0 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 14 15 // stage (4,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 3 2 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x6)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 8 9 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 12 13 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y3)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y3); // 1 0 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 10 11 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 14 15 // stage (4,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 1 0 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 9 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 10 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 5 4 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 13 x7 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 14 15 // stage (4,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 x0 = int64x2_leftleft(y0,x1); // 1 3 x1 = int64x2_rightright(y0,x1); // 0 2 y4 = int64x2_leftleft(x4,y5); // 8 10 y5 = int64x2_rightright(x4,y5); // 9 11 y2 = int64x2_leftleft(x2,y3); // 5 7 y3 = int64x2_rightright(x2,y3); // 4 6 x6 = int64x2_leftleft(y6,x7); // 12 14 x7 = int64x2_rightright(y6,x7); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 2 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 10 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 4 6 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 14 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 13 15 x0 = int64x2_leftleft(y0,x1); // 0 1 x1 = int64x2_rightright(y0,x1); // 2 3 y2 = int64x2_leftleft(x2,y3); // 4 5 y3 = int64x2_rightright(x2,y3); // 6 7 y4 = int64x2_leftleft(x4,y5); // 8 9 y5 = int64x2_rightright(x4,y5); // 10 11 x6 = int64x2_leftleft(y6,x7); // 12 13 x7 = int64x2_rightright(y6,x7); // 14 15 int64x2_store(x+0,vecxor^x0); int64x2_store(x+2,vecxor^x1); int64x2_store(x+4,vecxor^y2); int64x2_store(x+6,vecxor^y3); int64x2_store(x+8,vecxor^y4); int64x2_store(x+10,vecxor^y5); int64x2_store(x+12,vecxor^x6); int64x2_store(x+14,vecxor^x7); } NOINLINE static void int64_V_sort_32_xor(int64 *x,int64 xor) { int64x2 t,vecxor,x0,x1,x10,x11,x12,x13,x14,x15,x2,x3,x4,x5,x6,x7,x8,x9,y0,y1,y10,y11,y12,y13,y14,y2,y3,y4,y5,y6,y7,y8,y9; vecxor = int64x2_broadcast(xor); x0 = vecxor^int64x2_load(x+0); // 15 14 x1 = 
vecxor^int64x2_load(x+2); // 13 12 x2 = vecxor^int64x2_load(x+4); // 11 10 x3 = vecxor^int64x2_load(x+6); // 9 8 x4 = vecxor^int64x2_load(x+8); // 7 6 x5 = vecxor^int64x2_load(x+10); // 5 4 x6 = vecxor^int64x2_load(x+12); // 3 2 x7 = vecxor^int64x2_load(x+14); // 1 0 x8 = vecxor^int64x2_load(x+16); // 16 17 x9 = vecxor^int64x2_load(x+18); // 18 19 x10 = vecxor^int64x2_load(x+20); // 20 21 x11 = vecxor^int64x2_load(x+22); // 22 23 x12 = vecxor^int64x2_load(x+24); // 24 25 x13 = vecxor^int64x2_load(x+26); // 26 27 x14 = vecxor^int64x2_load(x+28); // 28 29 x15 = vecxor^int64x2_load(x+30); // 30 31 // stage (5,4) 0:31 1:30 2:29 3:28 4:27 5:26 6:25 7:24 8:23 9:22 10:21 11:20 12:19 13:18 14:17 15:16 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x8)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x8); // 15 14 x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x0); // 16 17 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x10)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x10); // 11 10 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x2); // 20 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x12)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x12); // 7 6 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x4); // 24 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x14)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x14); // 3 2 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x6); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x9)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x9); // 13 12 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x1); // 18 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x11)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x11); // 9 8 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x3); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x13)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x13); // 5 4 x13 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x5); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x15)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x15); // 1 0 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x7); // 30 31 // stage (5,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y4)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y4); // 7 6 y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y0); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y6)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y6); // 3 2 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y2); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x12)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x12); // 16 17 x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x8); // 24 25 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x14)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x14); // 20 21 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x10); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y5)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y5); // 5 4 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y1); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y7)); x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y7); // 1 0 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y3); // 9 8 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x13)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x13); // 18 19 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x9); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,x15)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x15); // 22 23 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x11); // 30 31 // stage (5,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x0,x2)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x2); // 3 2 x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y6)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y6); // 11 10 y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y4); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y10)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y10); // 16 17 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y8); // 20 21 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x14)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x14); // 24 25 x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x12); // 28 29 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x3)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x3); // 1 0 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y7)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y7); // 9 8 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y5); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y11)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y11); // 18 19 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y9); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,x15)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x15); // 26 27 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x13); // 30 31 // stage (5,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y1)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y1); // 1 0 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 9 8 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 11 10 t = 
int64x2_from_uint64x2(int64x2_smaller_umask(x8,x9)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x9); // 16 17 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x8); // 18 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y13)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y13); // 24 25 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y12); // 26 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 5 4 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y7)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y7); // 13 12 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y6); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y11)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y11); // 20 21 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y10); // 22 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x15)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x15); // 28 29 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x14); // 30 31 // stage (5,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 y0 = int64x2_leftleft(x0,y1); // 1 3 y1 = int64x2_rightright(x0,y1); // 0 2 x4 = int64x2_leftleft(y4,x5); // 9 11 x5 = int64x2_rightright(y4,x5); // 8 10 x8 = int64x2_leftleft(y8,x9); // 16 18 x9 = int64x2_rightright(y8,x9); // 17 19 y12 = int64x2_leftleft(x12,y13); // 24 26 y13 = int64x2_rightright(x12,y13); // 25 27 x2 = int64x2_leftleft(y2,x3); // 5 7 x3 = int64x2_rightright(y2,x3); // 4 6 y6 = int64x2_leftleft(x6,y7); // 13 15 y7 = int64x2_rightright(x6,y7); // 12 14 y10 = int64x2_leftleft(x10,y11); // 20 22 y11 = int64x2_rightright(x10,y11); // 21 23 x14 = int64x2_leftleft(y14,x15); // 28 30 x15 = int64x2_rightright(y14,x15); // 29 31 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y1)); x0 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y1); // 0 2 y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x5)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x5); // 8 10 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x9)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x9); // 16 18 x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x8); // 17 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y13)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y13); // 24 26 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y12); // 25 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x3)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x3); // 4 6 x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y7)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y7); // 12 14 y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y6); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y11)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y11); // 20 22 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y10); // 21 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x15)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x15); // 28 30 x15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x14); // 29 31 y0 = int64x2_leftleft(x0,y1); // 0 1 y1 = int64x2_rightright(x0,y1); // 2 3 x2 = int64x2_leftleft(y2,x3); // 4 5 x3 = int64x2_rightright(y2,x3); // 6 7 x4 = int64x2_leftleft(y4,x5); // 8 9 x5 = int64x2_rightright(y4,x5); // 10 11 y6 = int64x2_leftleft(x6,y7); // 12 13 y7 = int64x2_rightright(x6,y7); // 14 15 x8 = int64x2_leftleft(y8,x9); // 16 17 x9 = int64x2_rightright(y8,x9); // 18 19 y10 = int64x2_leftleft(x10,y11); // 20 21 y11 = int64x2_rightright(x10,y11); // 22 23 y12 = int64x2_leftleft(x12,y13); // 24 25 y13 = 
int64x2_rightright(x12,y13); // 26 27 x14 = int64x2_leftleft(y14,x15); // 28 29 x15 = int64x2_rightright(y14,x15); // 30 31 int64x2_store(x+0,vecxor^y0); int64x2_store(x+2,vecxor^y1); int64x2_store(x+4,vecxor^x2); int64x2_store(x+6,vecxor^x3); int64x2_store(x+8,vecxor^x4); int64x2_store(x+10,vecxor^x5); int64x2_store(x+12,vecxor^y6); int64x2_store(x+14,vecxor^y7); int64x2_store(x+16,vecxor^x8); int64x2_store(x+18,vecxor^x9); int64x2_store(x+20,vecxor^y10); int64x2_store(x+22,vecxor^y11); int64x2_store(x+24,vecxor^y12); int64x2_store(x+26,vecxor^y13); int64x2_store(x+28,vecxor^x14); int64x2_store(x+30,vecxor^x15); } NOINLINE static void int64_V_sort_64_xor(int64 *x,int64 xor) { int64x2 t,vecxor,x0,x1,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x2,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x3,x30,x31,x4,x5,x6,x7,x8,x9,y0,y1,y10,y11,y12,y13,y14,y15,y16,y17,y18,y19,y2,y20,y21,y22,y23,y24,y25,y26,y27,y28,y29,y3,y30,y4,y5,y6,y7,y8,y9; vecxor = int64x2_broadcast(xor); x0 = vecxor^int64x2_load(x+0); // 31 30 x1 = vecxor^int64x2_load(x+2); // 29 28 x2 = vecxor^int64x2_load(x+4); // 27 26 x3 = vecxor^int64x2_load(x+6); // 25 24 x4 = vecxor^int64x2_load(x+8); // 23 22 x5 = vecxor^int64x2_load(x+10); // 21 20 x6 = vecxor^int64x2_load(x+12); // 19 18 x7 = vecxor^int64x2_load(x+14); // 17 16 x8 = vecxor^int64x2_load(x+16); // 15 14 x9 = vecxor^int64x2_load(x+18); // 13 12 x10 = vecxor^int64x2_load(x+20); // 11 10 x11 = vecxor^int64x2_load(x+22); // 9 8 x12 = vecxor^int64x2_load(x+24); // 7 6 x13 = vecxor^int64x2_load(x+26); // 5 4 x14 = vecxor^int64x2_load(x+28); // 3 2 x15 = vecxor^int64x2_load(x+30); // 1 0 x16 = vecxor^int64x2_load(x+32); // 32 33 x17 = vecxor^int64x2_load(x+34); // 34 35 x18 = vecxor^int64x2_load(x+36); // 36 37 x19 = vecxor^int64x2_load(x+38); // 38 39 x20 = vecxor^int64x2_load(x+40); // 40 41 x21 = vecxor^int64x2_load(x+42); // 42 43 x22 = vecxor^int64x2_load(x+44); // 44 45 x23 = vecxor^int64x2_load(x+46); // 46 47 x24 = vecxor^int64x2_load(x+48); // 48 49 x25 = 
vecxor^int64x2_load(x+50); // 50 51 x26 = vecxor^int64x2_load(x+52); // 52 53 x27 = vecxor^int64x2_load(x+54); // 54 55 x28 = vecxor^int64x2_load(x+56); // 56 57 x29 = vecxor^int64x2_load(x+58); // 58 59 x30 = vecxor^int64x2_load(x+60); // 60 61 x31 = vecxor^int64x2_load(x+62); // 62 63 // stage (6,5) 0:63 1:62 2:61 3:60 4:59 5:58 6:57 7:56 8:55 9:54 10:53 11:52 12:51 13:50 14:49 15:48 16:47 17:46 18:45 19:44 20:43 21:42 22:41 23:40 24:39 25:38 26:37 27:36 28:35 29:34 30:33 31:32 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x16)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x16); // 31 30 x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x0); // 32 33 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x18)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x18); // 27 26 x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x2); // 36 37 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x20)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x20); // 23 22 x20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x4); // 40 41 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x22)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x22); // 19 18 x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x22,x6); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x24)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x24); // 15 14 x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x8); // 48 49 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x26)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x26); // 11 10 x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x10); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x28)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x28); // 7 6 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x12); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x14,x30)); y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x14,x30); // 3 2 x30 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x14); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x17)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x17); // 29 28 x17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x1); // 34 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x19)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x19); // 25 24 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x3); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x21)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x21); // 21 20 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x5); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x7,x23)); y7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x23); // 17 16 x23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x23,x7); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x25)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x25); // 13 12 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x9); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(x11,x27)); y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x27); // 9 8 x27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x11); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x13,x29)); y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x29); // 5 4 x29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x13); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x15,x31)); y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x15,x31); // 1 0 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x15); // 62 63 // stage (6,4) 0:16 1:17 2:18 3:19 4:20 5:21 6:22 7:23 8:24 9:25 10:26 11:27 12:28 13:29 14:30 15:31 32:48 33:49 34:50 35:51 36:52 37:53 38:54 39:55 40:56 41:57 42:58 43:59 44:60 45:61 46:62 47:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y8)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y8); // 15 14 y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y0); // 31 30 
t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y10)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y10); // 11 10 y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y2); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y12)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y12); // 7 6 y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y4); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y6,y14)); x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y6,y14); // 3 2 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y6); // 19 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(x16,x24)); y16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x24); // 32 33 x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x16); // 48 49 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x26)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x26); // 36 37 x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x18); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x28)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x28); // 40 41 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x20); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x22,x30)); y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x22,x30); // 44 45 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x22); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y9)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y9); // 13 12 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y1); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y3,y11)); x3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y11); // 9 8 y11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y3); // 25 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(y5,y13)); x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y13); // 5 4 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y5); // 21 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y7,y15)); x7 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),y7,y15); // 1 0 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y7); // 17 16 t = int64x2_from_uint64x2(int64x2_smaller_umask(x17,x25)); y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x25); // 34 35 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x17); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(x19,x27)); y19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x27); // 38 39 x27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x19); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x21,x29)); y21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x29); // 42 43 x29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x21); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x23,x31)); y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x23,x31); // 46 47 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x23); // 62 63 // stage (6,3) 0:8 1:9 2:10 3:11 4:12 5:13 6:14 7:15 16:24 17:25 18:26 19:27 20:28 21:29 22:30 23:31 32:40 33:41 34:42 35:43 36:44 37:45 38:46 39:47 48:56 49:57 50:58 51:59 52:60 53:61 54:62 55:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x4)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x4); // 7 6 x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x0); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x2,x6)); y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x2,x6); // 3 2 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x2); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y12)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y12); // 23 22 y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y8); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(y10,y14)); x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y10,y14); // 19 18 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y10); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y20)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y20); // 
32 33 y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y20,y16); // 40 41 t = int64x2_from_uint64x2(int64x2_smaller_umask(y18,y22)); x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y18,y22); // 36 37 y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y18); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x28)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x28); // 48 49 x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x24); // 56 57 t = int64x2_from_uint64x2(int64x2_smaller_umask(x26,x30)); y26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x26,x30); // 52 53 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x26); // 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(x1,x5)); y1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x5); // 5 4 x5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x1); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(x3,x7)); y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x3,x7); // 1 0 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x3); // 9 8 t = int64x2_from_uint64x2(int64x2_smaller_umask(y9,y13)); x9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y13); // 21 20 y13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y9); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(y11,y15)); x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y11,y15); // 17 16 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y11); // 25 24 t = int64x2_from_uint64x2(int64x2_smaller_umask(y17,y21)); x17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y21); // 34 35 y21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y21,y17); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(y19,y23)); x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y19,y23); // 38 39 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y19); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(x25,x29)); y25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x29); // 50 51 x29 = 
int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x25); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(x27,x31)); y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x27,x31); // 54 55 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x27); // 62 63 // stage (6,2) 0:4 1:5 2:6 3:7 8:12 9:13 10:14 11:15 16:20 17:21 18:22 19:23 24:28 25:29 26:30 27:31 32:36 33:37 34:38 35:39 40:44 41:45 42:46 43:47 48:52 49:53 50:54 51:55 56:60 57:61 58:62 59:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(y0,y2)); x0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y0,y2); // 3 2 y2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y0); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x4,x6)); y4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x4,x6); // 11 10 x6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x4); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x8,x10)); y8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x8,x10); // 19 18 x10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x8); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y12,y14)); x12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y12,y14); // 27 26 y14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y12); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x16,x18)); y16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x16,x18); // 32 33 x18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x16); // 36 37 t = int64x2_from_uint64x2(int64x2_smaller_umask(y20,y22)); x20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y20,y22); // 40 41 y22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y20); // 44 45 t = int64x2_from_uint64x2(int64x2_smaller_umask(y24,y26)); x24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y24,y26); // 48 49 y26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y24); // 52 53 t = int64x2_from_uint64x2(int64x2_smaller_umask(x28,x30)); y28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x28,x30); // 56 57 x30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x28); 
// 60 61 t = int64x2_from_uint64x2(int64x2_smaller_umask(y1,y3)); x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y1,y3); // 1 0 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y1); // 5 4 t = int64x2_from_uint64x2(int64x2_smaller_umask(x5,x7)); y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x5,x7); // 9 8 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x5); // 13 12 t = int64x2_from_uint64x2(int64x2_smaller_umask(x9,x11)); y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x9,x11); // 17 16 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x9); // 21 20 t = int64x2_from_uint64x2(int64x2_smaller_umask(y13,y15)); x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y13,y15); // 25 24 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y13); // 29 28 t = int64x2_from_uint64x2(int64x2_smaller_umask(x17,x19)); y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x17,x19); // 34 35 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x17); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y21,y23)); x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y21,y23); // 42 43 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y21); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y25,y27)); x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y25,y27); // 50 51 y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y25); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x29,x31)); y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x29,x31); // 58 59 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x29); // 62 63 // stage (6,1) 0:2 1:3 4:6 5:7 8:10 9:11 12:14 13:15 16:18 17:19 20:22 21:23 24:26 25:27 28:30 29:31 32:34 33:35 36:38 37:39 40:42 41:43 44:46 45:47 48:50 49:51 52:54 53:55 56:58 57:59 60:62 61:63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 1 0 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 3 2 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); 
x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 9 8 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 11 10 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y9)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y9); // 17 16 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y8); // 19 18 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x13); // 25 24 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x12); // 27 26 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y17)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y17); // 32 33 y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y16); // 34 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x21)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x21); // 40 41 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x20); // 42 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x25)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x25); // 48 49 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x24); // 50 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(y28,y29)); x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y28,y29); // 56 57 y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y29,y28); // 58 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 5 4 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 7 6 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 13 12 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 15 14 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x11)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x11); // 21 20 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x10); // 23 22 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y15)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y15); // 29 
28 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y14); // 31 30 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x19)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x19); // 36 37 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x18); // 38 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y22,y23)); x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y23); // 44 45 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y22); // 46 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y26,y27)); x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y27); // 52 53 y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y26); // 54 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x30,x31)); y30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x31); // 60 61 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x30); // 62 63 // stage (6,0) 0:1 2:3 4:5 6:7 8:9 10:11 12:13 14:15 16:17 18:19 20:21 22:23 24:25 26:27 28:29 30:31 32:33 34:35 36:37 38:39 40:41 42:43 44:45 46:47 48:49 50:51 52:53 54:55 56:57 58:59 60:61 62:63 x0 = int64x2_leftleft(y0,x1); // 1 3 x1 = int64x2_rightright(y0,x1); // 0 2 y4 = int64x2_leftleft(x4,y5); // 9 11 y5 = int64x2_rightright(x4,y5); // 8 10 y8 = int64x2_leftleft(x8,y9); // 17 19 y9 = int64x2_rightright(x8,y9); // 16 18 x12 = int64x2_leftleft(y12,x13); // 25 27 x13 = int64x2_rightright(y12,x13); // 24 26 y16 = int64x2_leftleft(x16,y17); // 32 34 y17 = int64x2_rightright(x16,y17); // 33 35 x20 = int64x2_leftleft(y20,x21); // 40 42 x21 = int64x2_rightright(y20,x21); // 41 43 x24 = int64x2_leftleft(y24,x25); // 48 50 x25 = int64x2_rightright(y24,x25); // 49 51 y28 = int64x2_leftleft(x28,y29); // 56 58 y29 = int64x2_rightright(x28,y29); // 57 59 y2 = int64x2_leftleft(x2,y3); // 5 7 y3 = int64x2_rightright(x2,y3); // 4 6 x6 = int64x2_leftleft(y6,x7); // 13 15 x7 = int64x2_rightright(y6,x7); // 12 14 x10 = int64x2_leftleft(y10,x11); // 21 23 x11 = int64x2_rightright(y10,x11); // 20 22 y14 = int64x2_leftleft(x14,y15); // 
29 31 y15 = int64x2_rightright(x14,y15); // 28 30 x18 = int64x2_leftleft(y18,x19); // 36 38 x19 = int64x2_rightright(y18,x19); // 37 39 y22 = int64x2_leftleft(x22,y23); // 44 46 y23 = int64x2_rightright(x22,y23); // 45 47 y26 = int64x2_leftleft(x26,y27); // 52 54 y27 = int64x2_rightright(x26,y27); // 53 55 x30 = int64x2_leftleft(y30,x31); // 60 62 x31 = int64x2_rightright(y30,x31); // 61 63 t = int64x2_from_uint64x2(int64x2_smaller_umask(x0,x1)); y0 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x0,x1); // 0 2 x1 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x1,x0); // 1 3 t = int64x2_from_uint64x2(int64x2_smaller_umask(y4,y5)); x4 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y4,y5); // 8 10 y5 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y5,y4); // 9 11 t = int64x2_from_uint64x2(int64x2_smaller_umask(y8,y9)); x8 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y8,y9); // 16 18 y9 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y9,y8); // 17 19 t = int64x2_from_uint64x2(int64x2_smaller_umask(x12,x13)); y12 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x12,x13); // 24 26 x13 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x13,x12); // 25 27 t = int64x2_from_uint64x2(int64x2_smaller_umask(y16,y17)); x16 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y16,y17); // 32 34 y17 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y17,y16); // 33 35 t = int64x2_from_uint64x2(int64x2_smaller_umask(x20,x21)); y20 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x20,x21); // 40 42 x21 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x21,x20); // 41 43 t = int64x2_from_uint64x2(int64x2_smaller_umask(x24,x25)); y24 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x24,x25); // 48 50 x25 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x25,x24); // 49 51 t = int64x2_from_uint64x2(int64x2_smaller_umask(y28,y29)); x28 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y28,y29); // 56 58 y29 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y29,y28); // 57 59 t = int64x2_from_uint64x2(int64x2_smaller_umask(y2,y3)); x2 
= int64x2_ifthenelse(uint64x2_from_int64x2(t),y2,y3); // 4 6 y3 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y3,y2); // 5 7 t = int64x2_from_uint64x2(int64x2_smaller_umask(x6,x7)); y6 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x6,x7); // 12 14 x7 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x7,x6); // 13 15 t = int64x2_from_uint64x2(int64x2_smaller_umask(x10,x11)); y10 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x10,x11); // 20 22 x11 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x11,x10); // 21 23 t = int64x2_from_uint64x2(int64x2_smaller_umask(y14,y15)); x14 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y14,y15); // 28 30 y15 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y15,y14); // 29 31 t = int64x2_from_uint64x2(int64x2_smaller_umask(x18,x19)); y18 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x18,x19); // 36 38 x19 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x19,x18); // 37 39 t = int64x2_from_uint64x2(int64x2_smaller_umask(y22,y23)); x22 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y22,y23); // 44 46 y23 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y23,y22); // 45 47 t = int64x2_from_uint64x2(int64x2_smaller_umask(y26,y27)); x26 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y26,y27); // 52 54 y27 = int64x2_ifthenelse(uint64x2_from_int64x2(t),y27,y26); // 53 55 t = int64x2_from_uint64x2(int64x2_smaller_umask(x30,x31)); y30 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x30,x31); // 60 62 x31 = int64x2_ifthenelse(uint64x2_from_int64x2(t),x31,x30); // 61 63 x0 = int64x2_leftleft(y0,x1); // 0 1 x1 = int64x2_rightright(y0,x1); // 2 3 y2 = int64x2_leftleft(x2,y3); // 4 5 y3 = int64x2_rightright(x2,y3); // 6 7 y4 = int64x2_leftleft(x4,y5); // 8 9 y5 = int64x2_rightright(x4,y5); // 10 11 x6 = int64x2_leftleft(y6,x7); // 12 13 x7 = int64x2_rightright(y6,x7); // 14 15 y8 = int64x2_leftleft(x8,y9); // 16 17 y9 = int64x2_rightright(x8,y9); // 18 19 x10 = int64x2_leftleft(y10,x11); // 20 21 x11 = int64x2_rightright(y10,x11); // 22 23 x12 = 
int64x2_leftleft(y12,x13); // 24 25 x13 = int64x2_rightright(y12,x13); // 26 27 y14 = int64x2_leftleft(x14,y15); // 28 29 y15 = int64x2_rightright(x14,y15); // 30 31 y16 = int64x2_leftleft(x16,y17); // 32 33 y17 = int64x2_rightright(x16,y17); // 34 35 x18 = int64x2_leftleft(y18,x19); // 36 37 x19 = int64x2_rightright(y18,x19); // 38 39 x20 = int64x2_leftleft(y20,x21); // 40 41 x21 = int64x2_rightright(y20,x21); // 42 43 y22 = int64x2_leftleft(x22,y23); // 44 45 y23 = int64x2_rightright(x22,y23); // 46 47 x24 = int64x2_leftleft(y24,x25); // 48 49 x25 = int64x2_rightright(y24,x25); // 50 51 y26 = int64x2_leftleft(x26,y27); // 52 53 y27 = int64x2_rightright(x26,y27); // 54 55 y28 = int64x2_leftleft(x28,y29); // 56 57 y29 = int64x2_rightright(x28,y29); // 58 59 x30 = int64x2_leftleft(y30,x31); // 60 61 x31 = int64x2_rightright(y30,x31); // 62 63 int64x2_store(x+0,vecxor^x0); int64x2_store(x+2,vecxor^x1); int64x2_store(x+4,vecxor^y2); int64x2_store(x+6,vecxor^y3); int64x2_store(x+8,vecxor^y4); int64x2_store(x+10,vecxor^y5); int64x2_store(x+12,vecxor^x6); int64x2_store(x+14,vecxor^x7); int64x2_store(x+16,vecxor^y8); int64x2_store(x+18,vecxor^y9); int64x2_store(x+20,vecxor^x10); int64x2_store(x+22,vecxor^x11); int64x2_store(x+24,vecxor^x12); int64x2_store(x+26,vecxor^x13); int64x2_store(x+28,vecxor^y14); int64x2_store(x+30,vecxor^y15); int64x2_store(x+32,vecxor^y16); int64x2_store(x+34,vecxor^y17); int64x2_store(x+36,vecxor^x18); int64x2_store(x+38,vecxor^x19); int64x2_store(x+40,vecxor^x20); int64x2_store(x+42,vecxor^x21); int64x2_store(x+44,vecxor^y22); int64x2_store(x+46,vecxor^y23); int64x2_store(x+48,vecxor^x24); int64x2_store(x+50,vecxor^x25); int64x2_store(x+52,vecxor^y26); int64x2_store(x+54,vecxor^y27); int64x2_store(x+56,vecxor^y28); int64x2_store(x+58,vecxor^y29); int64x2_store(x+60,vecxor^x30); int64x2_store(x+62,vecxor^x31); } NOINLINE static void int64_threestages_8_up(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 
= int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2 x6 = int64x2_load(&x[6*p+i]); int64x2 x7 = int64x2_load(&x[7*p+i]); int64x2_MINMAX(x0,x4); int64x2_MINMAX(x1,x5); int64x2_MINMAX(x2,x6); int64x2_MINMAX(x3,x7); int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x4,x6); int64x2_MINMAX(x5,x7); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_MINMAX(x4,x5); int64x2_MINMAX(x6,x7); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); int64x2_store(&x[6*p+i],x6); int64x2_store(&x[7*p+i],x7); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64 x4 = x[4*p+i]; int64 x5 = x[5*p+i]; int64 x6 = x[6*p+i]; int64 x7 = x[7*p+i]; int64_MINMAX(x0,x4); int64_MINMAX(x1,x5); int64_MINMAX(x2,x6); int64_MINMAX(x3,x7); int64_MINMAX(x0,x2); int64_MINMAX(x1,x3); int64_MINMAX(x4,x6); int64_MINMAX(x5,x7); int64_MINMAX(x0,x1); int64_MINMAX(x2,x3); int64_MINMAX(x4,x5); int64_MINMAX(x6,x7); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; x[4*p+i] = x4; x[5*p+i] = x5; x[6*p+i] = x6; x[7*p+i] = x7; } } NOINLINE static void int64_threestages_7_up(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2 x6 = int64x2_load(&x[6*p+i]); int64x2_MINMAX(x0,x4); int64x2_MINMAX(x1,x5); int64x2_MINMAX(x2,x6); int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x4,x6); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_MINMAX(x4,x5); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); 
int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); int64x2_store(&x[6*p+i],x6); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64 x4 = x[4*p+i]; int64 x5 = x[5*p+i]; int64 x6 = x[6*p+i]; int64_MINMAX(x0,x4); int64_MINMAX(x1,x5); int64_MINMAX(x2,x6); int64_MINMAX(x0,x2); int64_MINMAX(x1,x3); int64_MINMAX(x4,x6); int64_MINMAX(x0,x1); int64_MINMAX(x2,x3); int64_MINMAX(x4,x5); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; x[4*p+i] = x4; x[5*p+i] = x5; x[6*p+i] = x6; } } NOINLINE static void int64_threestages_6_up(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2_MINMAX(x0,x4); int64x2_MINMAX(x1,x5); int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_MINMAX(x4,x5); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64 x4 = x[4*p+i]; int64 x5 = x[5*p+i]; int64_MINMAX(x0,x4); int64_MINMAX(x1,x5); int64_MINMAX(x0,x2); int64_MINMAX(x1,x3); int64_MINMAX(x0,x1); int64_MINMAX(x2,x3); int64_MINMAX(x4,x5); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; x[4*p+i] = x4; x[5*p+i] = x5; } } NOINLINE static void int64_threestages_5_up(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2_MINMAX(x0,x4); 
int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64 x4 = x[4*p+i]; int64_MINMAX(x0,x4); int64_MINMAX(x0,x2); int64_MINMAX(x1,x3); int64_MINMAX(x0,x1); int64_MINMAX(x2,x3); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; x[4*p+i] = x4; } } NOINLINE static void int64_threestages_4_up(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64_MINMAX(x0,x2); int64_MINMAX(x1,x3); int64_MINMAX(x0,x1); int64_MINMAX(x2,x3); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; } } NOINLINE static void int64_threestages_8_up_16(int64 *x) { long long i; long long p = 16; long long n = p; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2 x6 = int64x2_load(&x[6*p+i]); int64x2 x7 = int64x2_load(&x[7*p+i]); int64x2_MINMAX(x0,x4); int64x2_MINMAX(x1,x5); int64x2_MINMAX(x2,x6); int64x2_MINMAX(x3,x7); int64x2_MINMAX(x0,x2); int64x2_MINMAX(x1,x3); int64x2_MINMAX(x4,x6); int64x2_MINMAX(x5,x7); int64x2_MINMAX(x0,x1); int64x2_MINMAX(x2,x3); int64x2_MINMAX(x4,x5); int64x2_MINMAX(x6,x7); int64x2_store(&x[i],x0); 
int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); int64x2_store(&x[6*p+i],x6); int64x2_store(&x[7*p+i],x7); } } NOINLINE static void int64_threestages_8_down_16(int64 *x) { long long i; long long p = 16; long long n = p; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2 x6 = int64x2_load(&x[6*p+i]); int64x2 x7 = int64x2_load(&x[7*p+i]); int64x2_MINMAX(x4,x0); int64x2_MINMAX(x5,x1); int64x2_MINMAX(x6,x2); int64x2_MINMAX(x7,x3); int64x2_MINMAX(x2,x0); int64x2_MINMAX(x3,x1); int64x2_MINMAX(x6,x4); int64x2_MINMAX(x7,x5); int64x2_MINMAX(x1,x0); int64x2_MINMAX(x3,x2); int64x2_MINMAX(x5,x4); int64x2_MINMAX(x7,x6); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); int64x2_store(&x[6*p+i],x6); int64x2_store(&x[7*p+i],x7); } } NOINLINE static void int64_threestages_8_down_atleast32(int64 *x,long long p) { long long i; long long n = p; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2 x6 = int64x2_load(&x[6*p+i]); int64x2 x7 = int64x2_load(&x[7*p+i]); int64x2_MINMAX(x4,x0); int64x2_MINMAX(x5,x1); int64x2_MINMAX(x6,x2); int64x2_MINMAX(x7,x3); int64x2_MINMAX(x2,x0); int64x2_MINMAX(x3,x1); int64x2_MINMAX(x6,x4); int64x2_MINMAX(x7,x5); int64x2_MINMAX(x1,x0); int64x2_MINMAX(x3,x2); int64x2_MINMAX(x5,x4); int64x2_MINMAX(x7,x6); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); 
int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); int64x2_store(&x[6*p+i],x6); int64x2_store(&x[7*p+i],x7); } } NOINLINE static void int64_threestages_6_down(int64 *x,long long p,long long n) { long long i; for (i = 0;i+2 <= n;i += 2) { int64x2 x0 = int64x2_load(&x[i]); int64x2 x1 = int64x2_load(&x[p+i]); int64x2 x2 = int64x2_load(&x[2*p+i]); int64x2 x3 = int64x2_load(&x[3*p+i]); int64x2 x4 = int64x2_load(&x[4*p+i]); int64x2 x5 = int64x2_load(&x[5*p+i]); int64x2_MINMAX(x4,x0); int64x2_MINMAX(x5,x1); int64x2_MINMAX(x2,x0); int64x2_MINMAX(x3,x1); int64x2_MINMAX(x1,x0); int64x2_MINMAX(x3,x2); int64x2_MINMAX(x5,x4); int64x2_store(&x[i],x0); int64x2_store(&x[p+i],x1); int64x2_store(&x[2*p+i],x2); int64x2_store(&x[3*p+i],x3); int64x2_store(&x[4*p+i],x4); int64x2_store(&x[5*p+i],x5); } for (;i < n;++i) { int64 x0 = x[i]; int64 x1 = x[p+i]; int64 x2 = x[2*p+i]; int64 x3 = x[3*p+i]; int64 x4 = x[4*p+i]; int64 x5 = x[5*p+i]; int64_MINMAX(x4,x0); int64_MINMAX(x5,x1); int64_MINMAX(x2,x0); int64_MINMAX(x3,x1); int64_MINMAX(x1,x0); int64_MINMAX(x3,x2); int64_MINMAX(x5,x4); x[i] = x0; x[p+i] = x1; x[2*p+i] = x2; x[3*p+i] = x3; x[4*p+i] = x4; x[5*p+i] = x5; } } // XXX: currently xor must be 0 or -1 NOINLINE static void int64_V_sort_2poweratleast16_xor(int64 *x,long long n,int64 xor) { if (n == 16) { int64_V_sort_16_xor(x,xor); return; } if (n == 32) { int64_V_sort_32_xor(x,xor); return; } if (n == 64) { int64_V_sort_64_xor(x,xor); return; } if (n == 128) { if (xor) int64_threestages_8_down_16(x); else int64_threestages_8_up_16(x); for (long long i = 0;i < 8;++i) int64_V_sort_16_xor(x+16*i,xor); return; } if (xor) int64_threestages_8_down_atleast32(x,n>>3); else int64_threestages_8_up(x,n>>3,n>>3); for (long long i = 0;i < 8;++i) int64_V_sort_2poweratleast16_xor(x+(n>>3)*i,n>>3,xor); } /* q is power of 2; want only merge stages q,q/2,q/4,...,1 */ // XXX: assuming 8 <= q < n <= 2q; q is a power of 2 NOINLINE static void int64_V_sort(int64 *x,long long q,long long n) { if 
(!(n & (n - 1))) { int64_V_sort_2poweratleast16_xor(x,n,0); return; } if (n <= 16) { int64_V_sort_8through16(x,n); return; } if (n <= 32) { int64_V_sort_16through32(x,n); return; } if (n <= 64) { int64_V_sort_32through64(x,n); return; } // 64 <= q < n < 2q q >>= 2; // 64 <= 4q < n < 8q if (7*q < n) { int64_threestages_8_up(x,q,n-7*q); int64_threestages_7_up(x+n-7*q,q,8*q-n); } else if (6*q < n) { int64_threestages_7_up(x,q,n-6*q); int64_threestages_6_up(x+n-6*q,q,7*q-n); } else if (5*q < n) { int64_threestages_6_up(x,q,n-5*q); int64_threestages_5_up(x+n-5*q,q,6*q-n); } else { int64_threestages_5_up(x,q,n-4*q); int64_threestages_4_up(x+n-4*q,q,5*q-n); } // now want to handle each batch of q entries separately int64_V_sort(x,q>>1,q); int64_V_sort(x+q,q>>1,q); int64_V_sort(x+2*q,q>>1,q); int64_V_sort(x+3*q,q>>1,q); x += 4*q; n -= 4*q; while (n >= q) { int64_V_sort(x,q>>1,q); x += q; n -= q; } // have n entries left in last batch, with 0 <= n < q if (n <= 1) return; while (q >= n) q >>= 1; // empty merge stage // now 1 <= q < n <= 2q if (q >= 8) { int64_V_sort(x,q,n); return; } if (n == 8) { int64_MINMAX(x[0],x[4]); int64_MINMAX(x[1],x[5]); int64_MINMAX(x[2],x[6]); int64_MINMAX(x[3],x[7]); int64_MINMAX(x[0],x[2]); int64_MINMAX(x[1],x[3]); int64_MINMAX(x[0],x[1]); int64_MINMAX(x[2],x[3]); int64_MINMAX(x[4],x[6]); int64_MINMAX(x[5],x[7]); int64_MINMAX(x[4],x[5]); int64_MINMAX(x[6],x[7]); return; } if (4 <= n) { for (long long i = 0;i < n-4;++i) int64_MINMAX(x[i],x[4+i]); int64_MINMAX(x[0],x[2]); int64_MINMAX(x[1],x[3]); int64_MINMAX(x[0],x[1]); int64_MINMAX(x[2],x[3]); n -= 4; x += 4; } if (3 <= n) int64_MINMAX(x[0],x[2]); if (2 <= n) int64_MINMAX(x[0],x[1]); } // XXX: currently xor must be 0 or -1 NOINLINE static void int64_sort_2poweratleast32_xor(int64 *x,long long n,int64 xor) { if (n == 32) { int64_sort_32_xor(x,xor); return; } int64_sort_2poweratleast32_xor(x,n>>1,~xor); int64_sort_2poweratleast32_xor(x+(n>>1),n>>1,xor); int64_V_sort_2poweratleast16_xor(x,n,xor); } 
/*
 * int64_sort: sort the n entries x[0..n-1] in place.
 *
 * Dispatch by size:
 *   n <= 1            nothing to do
 *   n == 2            one int64_MINMAX on (x[0],x[1])
 *   3 <= n <= 7       int64_sort_3through7
 *   8 <= n <= 16      int64_sort_8through16
 *   n a power of 2    int64_sort_2poweratleast32_xor (n >= 32 at that point)
 *   17 <= n <= 31     int64_sort_16through32
 *   otherwise         split around a power of 2 and merge (see below)
 *
 * The xor argument handed to the *_xor helpers is always 0 or -1 here.
 * NOTE(review): presumably 0 requests the normal output direction and -1
 * the reversed one; confirm against the helper definitions.
 */
void int64_sort(int64 *x,long long n)
{
  long long top;

  if (n < 2) return;

  if (n == 2) {
    int64_MINMAX(x[0],x[1]);
    return;
  }

  if (n < 8) {
    int64_sort_3through7(x,n);
    return;
  }

  if (n < 17) {
    int64_sort_8through16(x,n);
    return;
  }

  if ((n & (n - 1)) == 0) {
    /* n is a power of 2, and n > 16, so n >= 32 */
    int64_sort_2poweratleast32_xor(x,n,0);
    return;
  }

  if (n < 33) {
    int64_sort_16through32(x,n);
    return;
  }

  /*
   * Find the power of 2 with 32 <= top < n < 2*top.
   * Doubling only happens while 2*top < n, so top stays below n;
   * n is not a power of 2 here, so n != 2*top on exit.
   */
  top = 32;
  while (top < n - top) top *= 2;

  /*
   * This test must come before the n == 3*top/2 test below: whenever
   * top >= 512 and n == 3*top/2, this condition already holds, so the
   * second special case is only reached for top in {64,128,256}.
   */
  if (n >= 512 && n <= ((7*top) >> 2)) {
    /* strategy: sort m, sort n-m, merge */
    long long m = (3*top) >> 2;
    /* n >= 512 and n < 2*top force top >= 512, hence r >= 64 */
    long long r = top >> 3;
    int64 *mid = x + 4*r;

    int64_sort_2poweratleast32_xor(x,4*r,0);
    int64_sort_2poweratleast32_xor(mid,r,0);
    int64_sort_2poweratleast32_xor(mid + r,r,-1);
    int64_V_sort_2poweratleast16_xor(mid,2*r,-1);

    /* combine the 4r-block and the 2r-block: six batches of r entries */
    int64_threestages_6_down(x,r,r);
    for (long long off = 0;off < 6*r;off += r)
      int64_V_sort_2poweratleast16_xor(x + off,r,-1);

    int64_sort(x + m,n - m);
    int64_V_sort(x,top,n);
    return;
  }

  if (top >= 64 && n == ((3*top) >> 1)) {
    /* strategy: sort top, sort top/2, merge; no final int64_V_sort needed */
    /* top >= 64, hence r >= 16 and 2*r >= 32 */
    long long r = top >> 2;

    int64_sort_2poweratleast32_xor(x,4*r,-1);
    int64_sort_2poweratleast32_xor(x + 4*r,2*r,0);

    int64_threestages_6_up(x,r,r);
    for (long long off = 0;off < 6*r;off += r)
      int64_V_sort_2poweratleast16_xor(x + off,r,0);
    return;
  }

  /* general case: first top entries with xor = -1, recurse on the rest, merge */
  int64_sort_2poweratleast32_xor(x,top,-1);
  int64_sort(x + top,n - top);
  int64_V_sort(x,top,n);
}