#!/usr/bin/env python3

print(r'''/* WARNING: auto-generated (by autogen/test); do not edit */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <sys/resource.h>
#include "crypto_uint8.h"
#include "crypto_uint32.h"
#include "crypto_uint64.h"
#include "crypto_declassify.h"
#include <djbsort.h> /* -ldjbsort */

static const char *targeto = 0;
static const char *targetp = 0;
static const char *targeti = 0;
static const char *targetn = 0;
static const char *targetoffset = 0;

static int ok = 1;

#define fail ((ok = 0),printf)

/* ----- valgrind support */

static int valgrind = 0;
static unsigned char valgrind_undefined_byte = 0;
static char *volatile valgrind_pointer = 0;

static char *valgrind_malloc_1(void)
{
  char *x = (char *) malloc(1);
  if (!x) abort();
  *(char **volatile) &valgrind_pointer = x;
  return valgrind_pointer;
}

static void valgrind_init(void)
{
  char *e = getenv("valgrind_multiplier");
  char *x;
  if (!e) return;
  x = valgrind_malloc_1();
  valgrind_undefined_byte = x[0]+1;
  valgrind_undefined_byte *= atoi(e);
  valgrind_undefined_byte ^= x[0]+1;
  free(x);
  valgrind = 1;
}

static void secret(void *xvoid,long long xlen)
{
  unsigned char *x = (unsigned char *) xvoid;
  while (xlen > 0) {
    *x ^= valgrind_undefined_byte;
    ++x;
    --xlen;
  }
}

static void declassify(void *x,long long xlen)
{
  crypto_declassify(x,xlen);
}

/* ----- rng and hash, from supercop/try-anything.c */

typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;

#define FOR(i,n) for (i = 0;i < n;++i)

static u32 L32(u32 x,int c) { return (x << c) | ((x&0xffffffff) >> (32 - c)); }

static u32 ld32(const u8 *x)
{
  u32 u = x[3];
  u = (u<<8)|x[2];
  u = (u<<8)|x[1];
  return (u<<8)|x[0];
}

static void st32(u8 *x,u32 u)
{
  int i;
  FOR(i,4) { x[i] = u; u >>= 8; }
}

static const u8 sigma[17] = "expand 32-byte k";

static void core_salsa(u8 *out,const u8 *in,const u8 *k)
{
  u32 w[16],x[16],y[16],t[4];
  int i,j,m;

  FOR(i,4) {
    x[5*i] = ld32(sigma+4*i);
    x[1+i] = ld32(k+4*i);
    x[6+i] = ld32(in+4*i);
    x[11+i] = ld32(k+16+4*i);
  }

  FOR(i,16) y[i] = x[i];

  FOR(i,20) {
    FOR(j,4) {
      FOR(m,4) t[m] = x[(5*j+4*m)%16];
      t[1] ^= L32(t[0]+t[3], 7);
      t[2] ^= L32(t[1]+t[0], 9);
      t[3] ^= L32(t[2]+t[1],13);
      t[0] ^= L32(t[3]+t[2],18);
      FOR(m,4) w[4*j+(j+m)%4] = t[m];
    }
    FOR(m,16) x[m] = w[m];
  }

  FOR(i,16) st32(out + 4 * i,x[i] + y[i]);
}

static void salsa20(u8 *c,u64 b,const u8 *n,const u8 *k)
{
  u8 z[16],x[64];
  u32 u,i;
  if (!b) return;
  FOR(i,16) z[i] = 0;
  FOR(i,8) z[i] = n[i];
  while (b >= 64) {
    core_salsa(x,z,k);
    FOR(i,64) c[i] = x[i];
    u = 1;
    for (i = 8;i < 16;++i) {
      u += (u32) z[i];
      z[i] = u;
      u >>= 8;
    }
    b -= 64;
    c += 64;
  }
  if (b) {
    core_salsa(x,z,k);
    FOR(i,b) c[i] = x[i];
  }
}

static void increment(u8 *n)
{
  if (!++n[0])
    if (!++n[1])
      if (!++n[2])
        if (!++n[3])
          if (!++n[4])
            if (!++n[5])
              if (!++n[6])
                if (!++n[7])
                  ;
}

static unsigned char testvector_n[8];

static void testvector_mutate(unsigned long long seed)
{
  const static unsigned char reseed[33] = "inject new seed for test vectors";
  unsigned char tmp[32];
  long long i;
  salsa20(tmp,sizeof tmp,testvector_n,reseed);
  for (i = 0;i < 8;++i) {
    tmp[i] ^= seed;
    seed >>= 8;
  }
  salsa20(testvector_n,sizeof testvector_n,testvector_n,tmp);
}

static void testvector(unsigned char *x,unsigned long long xlen)
{
  const static unsigned char testvector_k[33] = "generate inputs for test vectors";
  salsa20(x,xlen,testvector_n,testvector_k);
  increment(testvector_n);
}

static unsigned long long myrandom(void)
{
  unsigned char x[8];
  unsigned long long result;
  testvector(x,8);
  result = x[7];
  result = (result<<8)|x[6];
  result = (result<<8)|x[5];
  result = (result<<8)|x[4];
  result = (result<<8)|x[3];
  result = (result<<8)|x[2];
  result = (result<<8)|x[1];
  result = (result<<8)|x[0];
  return result;
}

static unsigned char canary_n[8];

static void canary(unsigned char *x,unsigned long long xlen)
{
  const static unsigned char canary_k[33] = "generate pad to catch overwrites";
  salsa20(x,xlen,canary_n,canary_k);
  increment(canary_n);
}

static void double_canary(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
  if (valgrind) return;
  canary(x - 16,16);
  canary(x + xlen,16);
  memcpy(x2 - 16,x - 16,16);
  memcpy(x2 + xlen,x + xlen,16);
}

static void input_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
  testvector(x,xlen);
  if (valgrind) {
    memcpy(x2,x,xlen);
    return;
  }
  canary(x - 16,16);
  canary(x + xlen,16);
  memcpy(x2 - 16,x - 16,xlen + 32);
}

static void output_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
  if (valgrind) return;
  if (memcmp(x2 - 16,x - 16,16)) {
    fail("failure: %s writes before output\n",fun);
  }
  if (memcmp(x2 + xlen,x + xlen,16)) {
    fail("failure: %s writes after output\n",fun);
  }
}

/* ----- knownrandombytes */

static const int knownrandombytes_is_only_for_testing_not_for_cryptographic_use = 1;
extern void knownrandombytes(void *,long long);

#define QUARTERROUND(a,b,c,d) \
  a += b; d = L32(d^a,16); \
  c += d; b = L32(b^c,12); \
  a += b; d = L32(d^a, 8); \
  c += d; b = L32(b^c, 7);

static void core_chacha(u8 *out,const u8 *in,const u8 *k)
{
  u32 x[16],y[16];
  int i,j;
  FOR(i,4) {
    x[i] = ld32(sigma+4*i);
    x[12+i] = ld32(in+4*i);
  }
  FOR(i,8) x[4+i] = ld32(k+4*i);
  FOR(i,16) y[i] = x[i];
  FOR(i,10) {
    FOR(j,4) { QUARTERROUND(x[j],x[j+4],x[j+8],x[j+12]) }
    FOR(j,4) { QUARTERROUND(x[j],x[((j+1)&3)+4],x[((j+2)&3)+8],x[((j+3)&3)+12]) }
  }
  FOR(i,16) st32(out+4*i,x[i]+y[i]);
}

static void chacha20(u8 *c,u64 b,const u8 *n,const u8 *k)
{
  u8 z[16],x[64];
  u32 u,i;
  if (!b) return;
  FOR(i,16) z[i] = 0;
  FOR(i,8) z[i+8] = n[i];
  while (b >= 64) {
    core_chacha(x,z,k);
    FOR(i,64) c[i] = x[i];
    u = 1;
    FOR(i,8) {
      u += (u32) z[i];
      z[i] = u;
      u >>= 8;
    }
    b -= 64;
    c += 64;
  }
  if (b) {
    core_chacha(x,z,k);
    FOR(i,b) c[i] = x[i];
  }
}

#define crypto_rng_OUTPUTBYTES 736

static int crypto_rng(
        unsigned char *r, /* random output */
        unsigned char *n, /* new key */
  const unsigned char *g  /* old key */
)
{
  static const unsigned char nonce[8] = {0};
  unsigned char x[32+crypto_rng_OUTPUTBYTES];
  chacha20(x,sizeof x,nonce,g);
  memcpy(n,x,32);
  memcpy(r,x+32,crypto_rng_OUTPUTBYTES);
  return 0;
}

static unsigned char knownrandombytes_g[32];
static unsigned char knownrandombytes_r[crypto_rng_OUTPUTBYTES];
static unsigned long long knownrandombytes_pos = crypto_rng_OUTPUTBYTES;

void knownrandombytes_main(void *xvoid,long long xlen)
{
  unsigned char *x = (unsigned char *) xvoid;
  assert(knownrandombytes_is_only_for_testing_not_for_cryptographic_use);

  while (xlen > 0) {
    if (knownrandombytes_pos == crypto_rng_OUTPUTBYTES) {
      crypto_rng(knownrandombytes_r,knownrandombytes_g,knownrandombytes_g);
      knownrandombytes_pos = 0;
    }
    *x++ = knownrandombytes_r[knownrandombytes_pos];
    xlen -= 1;
    knownrandombytes_r[knownrandombytes_pos++] = 0;
  }
}

void knownrandombytes(void *xvoid,long long xlen)
{
  knownrandombytes_main(xvoid,xlen);
  secret(xvoid,xlen);
}

/* ----- memory handling */

static void *callocplus(long long len)
{
  if (valgrind) {
    unsigned char *x = (unsigned char *) malloc(len);
    if (!x) abort();
    return x;
  } else {
    unsigned char *x = (unsigned char *) calloc(1,len + 256);
    long long i;
    if (!x) abort();
    for (i = 0;i < len + 256;++i) x[i] = random();
    return x;
  }
}

static void *aligned(void *x,long long len)
{
  if (valgrind)
    return x;
  else {
    long long i;
    unsigned char *y = (unsigned char *) x;
    y += 64;
    y += 63 & (-(unsigned long) y);
    for (i = 0;i < len;++i) y[i] = 0;
    return y;
  }
}

/* ----- catching SIGILL, SIGBUS, SIGSEGV, etc. */

#include "limits.inc"

static void forked(void (*test)(long long),long long impl)
{
  if (valgrind) {
    test(impl);
    return;
  }
  fflush(stdout);
  pid_t child = fork();
  int childstatus = -1;
  if (child == -1) {
    fprintf(stderr,"fatal: fork failed: %s",strerror(errno));
    exit(111);
  }
  if (child == 0) {
    ok = 1;
    limits();
    test(impl);
    if (!ok) exit(100);
    exit(0);
  }
  if (waitpid(child,&childstatus,0) != child) {
    fprintf(stderr,"fatal: wait failed: %s",strerror(errno));
    exit(111);
  }
  if (childstatus)
    fail("failure: process failed, status %d\n",childstatus);
  fflush(stdout);
}

/* ----- sorting */
''')

tests = ''
measuresort = ''
mutation = 0
for bytes in 4,8:
  bits = 8*bytes
  I = f'int{bits}'
  U = f'uint{bits}'
  F = f'float{bits}'
  for S,T in (
    (I,f'{I}_t'),
    (I+'down',f'{I}_t'),
    (U,f'{U}_t'),
    (U+'down',f'{U}_t'),
    (F,f'{F}_t'),
    (F+'down',f'{F}_t'),
  ):
    mutation += 1
    if T == 'float16_t': T = 'void' # compiler support for float16 is currently spotty
    if T == 'float32_t': T = 'float'
    if T == 'float64_t': T = 'double'

    print(fr'''static void *storage_sort_{S}_x;
static unsigned char *test_sort_{S}_x;
static void *storage_sort_{S}_x2;
static unsigned char *test_sort_{S}_x2;
static long long test_sort_{S}_offset;
''')

    down = 'down' in S
    retleq = 1 if down else -1
    retgeq = -1 if down else 1
    if S in (F,F+'down'):
      print(fr'''static int {S}_cmp(const void *x,const void *y)
{{
  const {T} a = *({T} *) x;
  const {T} b = *({T} *) y;
  if (a != a || b != b) {{ /* NaN handling */
    {I}_t ai = *({I}_t *) x;
    {I}_t bi = *({I}_t *) y;
    ai ^= (({U}_t) (ai >> {bits-1})) >> 1;
    bi ^= (({U}_t) (bi >> {bits-1})) >> 1;
    if (ai < bi) return {retleq};
    if (ai > bi) return {retgeq};
    return 0;
  }}
  if (a < b) return {retleq};
  if (a > b) return {retgeq};
  return 0;
}}
''')
    else:
      print(fr'''static int {S}_cmp(const void *x,const void *y)
{{
  const {T} a = *({T} *) x;
  const {T} b = *({T} *) y;
  if (a < b) return {retleq};
  if (a > b) return {retgeq};
  return 0;
}}
''')

    print(fr'''static void test_sort_{S}_impl(long long impl)
{{
  unsigned char *x = test_sort_{S}_x;
  unsigned char *x2 = test_sort_{S}_x2;
  long long xlen;
  long long xwords;
  void (*crypto_sort)({T} *,long long);

  testvector_mutate(valgrind);
  testvector_mutate({mutation});
  testvector_mutate(impl);
  testvector_mutate(test_sort_{S}_offset);
  if (targeti && strcmp(targeti,".") && strcmp(targeti,djbsort_dispatch_{S}_implementation(impl))) return;
  if (targetn && atol(targetn) != impl) return;
  if (impl >= 0) {{
    crypto_sort = djbsort_dispatch_{S}(impl);
    printf("sort_{S} %lld implementation %s compiler %s\n",impl,djbsort_dispatch_{S}_implementation(impl),djbsort_dispatch_{S}_compiler(impl));
  }} else {{
    crypto_sort = djbsort_{S};
    printf("sort_{S} selected implementation %s compiler %s\n",djbsort_{S}_implementation(),djbsort_{S}_compiler());
  }}
  for (long long stage = 0;stage < 4;++stage) {{
    long long loops,maxtest;
    switch(stage) {{
      case 0: loops = 1024; maxtest = 128; break;
      case 1: loops = 4096; maxtest = 4096; break;
      case 2: loops = 128; maxtest = 65536; break;
      default: loops = 4; maxtest = 1048576; break;
    }}

    for (long long loop = 0;loop < loops;++loop) {{
      {T} *xs = ({T} *) x;
      xwords = myrandom() % (maxtest + 1);
      xlen = {bytes}*xwords;

      input_prepare(x2,x,xlen);
      secret(x,xlen);
      crypto_sort(xs,xwords);
      declassify(x,xlen);
      output_compare(x2,x,xlen,"crypto_sort");

      for (long long i = 1;i < xwords;++i)
        if ({S}_cmp(&xs[i-1],&xs[i]) > 0) {{
          fail("failure: crypto_sort output is not in order\n");
          break;
        }}

      double_canary(x2,x,xlen);
      qsort(x2,xwords,{bytes},{S}_cmp);
      output_compare(x2,x,xlen,"crypto_sort");
      if (memcmp(x2,x,xlen) != 0) fail("failure: crypto_sort does not match qsort\n");
    }}
  }}
}}

static void test_sort_{S}(void)
{{
  if (targeto && strcmp(targeto,"sort")) return;
  if (targetp && strcmp(targetp,"{S}")) return;
  storage_sort_{S}_x = callocplus({bytes}*1048576);
  test_sort_{S}_x = (unsigned char *) aligned(storage_sort_{S}_x,{bytes}*1048576);
  storage_sort_{S}_x2 = callocplus({bytes}*1048576);
  test_sort_{S}_x2 = (unsigned char *) aligned(storage_sort_{S}_x2,{bytes}*1048576);

  for (long long offset = 0;offset < 2;++offset) {{
    if (targetoffset && atol(targetoffset) != offset) continue;
    if (offset && valgrind) break;
    printf("sort_{S} offset %lld\n",offset);
    test_sort_{S}_offset = offset;
    for (long long impl = -1;impl < djbsort_numimpl_{S}();++impl)
      forked(test_sort_{S}_impl,impl);
    test_sort_{S}_x += {bytes};
    test_sort_{S}_x2 += {bytes};
  }}
  free(storage_sort_{S}_x2);
  free(storage_sort_{S}_x);
}}
''')
    tests += f'  test_sort_{S}();\n'

print(r'''/* ----- top level */

#include "print_cpuid.inc"

int main(int argc,char **argv)
{
  valgrind_init();
  if (valgrind) limits();

  setvbuf(stdout,0,_IOLBF,0);
  printf("djbsort version %s\n",djbsort_version());
  printf("djbsort arch %s\n",djbsort_arch());
  print_cpuid();

  if (valgrind) {
    printf("valgrind %d",(int) valgrind);
    printf(" declassify %d",(int) crypto_declassify_uses_valgrind);
    if (!crypto_declassify_uses_valgrind)
      printf(" (expect false positives)");
    printf("\n");
  }

  if (*argv) ++argv;
  if (*argv) {
    targeto = *argv++;
    if (*argv) {
      targetp = *argv++;
      if (*argv) {
        targeti = *argv++;
        if (*argv) {
          targetn = *argv++;
          if (*argv) {
            targetoffset = *argv++;
          }
        }
      }
    }
  }
''')

print(tests)

print(r'''  if (!ok) {
    printf("some tests failed\n");
    return 100;
  }
  printf("all tests succeeded\n");
  return 0;
}''')
