monocypher.cc (58201B)
1 #include "monocypher.h" 2 3 ///////////////// 4 /// Utilities /// 5 ///////////////// 6 7 // By default, EdDSA signatures use blake2b. SHA-512 is provided as 8 // an option for full ed25519 compatibility (a must for test vectors). 9 // Compile with option -DED25519_SHA512 to use with sha512. If you do 10 // so, you must provide the "sha512" header with suitable functions. 11 #ifdef ED25519_SHA512 12 #include "sha512.h" 13 #define HASH crypto_sha512 14 #else 15 #define HASH crypto_blake2b 16 #endif 17 #define COMBINE1(x, y) x ## y 18 #define COMBINE2(x, y) COMBINE1(x, y) 19 #define HASH_CTX COMBINE2(HASH, _ctx) 20 #define HASH_INIT COMBINE2(HASH, _init) 21 #define HASH_UPDATE COMBINE2(HASH, _update) 22 #define HASH_FINAL COMBINE2(HASH, _final) 23 24 #define FOR(i, start, end) for (size_t (i) = (start); (i) < (end); (i)++) 25 typedef uint8_t u8; 26 typedef uint32_t u32; 27 typedef int32_t i32; 28 typedef int64_t i64; 29 typedef uint64_t u64; 30 31 static u32 load24_le(const u8 s[3]) 32 { 33 return (u32)s[0] 34 | ((u32)s[1] << 8) 35 | ((u32)s[2] << 16); 36 } 37 38 static u32 load32_le(const u8 s[4]) 39 { 40 return (u32)s[0] 41 | ((u32)s[1] << 8) 42 | ((u32)s[2] << 16) 43 | ((u32)s[3] << 24); 44 } 45 46 static u64 load64_le(const u8 s[8]) 47 { 48 return (u64)s[0] 49 | ((u64)s[1] << 8) 50 | ((u64)s[2] << 16) 51 | ((u64)s[3] << 24) 52 | ((u64)s[4] << 32) 53 | ((u64)s[5] << 40) 54 | ((u64)s[6] << 48) 55 | ((u64)s[7] << 56); 56 } 57 58 static void store32_le(u8 out[4], u32 in) 59 { 60 out[0] = in & 0xff; 61 out[1] = (in >> 8) & 0xff; 62 out[2] = (in >> 16) & 0xff; 63 out[3] = (in >> 24) & 0xff; 64 } 65 66 static void store64_le(u8 out[8], u64 in) 67 { 68 out[0] = in & 0xff; 69 out[1] = (in >> 8) & 0xff; 70 out[2] = (in >> 16) & 0xff; 71 out[3] = (in >> 24) & 0xff; 72 out[4] = (in >> 32) & 0xff; 73 out[5] = (in >> 40) & 0xff; 74 out[6] = (in >> 48) & 0xff; 75 out[7] = (in >> 56) & 0xff; 76 } 77 78 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); } 79 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); } 80 81 int crypto_memcmp(const u8 *p1, const u8 *p2, size_t n) 82 { 83 unsigned diff = 0; 84 FOR (i, 0, n) { 85 diff |= (p1[i] ^ p2[i]); 86 } 87 return (1 & ((diff - 1) >> 8)) - 1; 88 } 89 90 int crypto_zerocmp(const u8 *p, size_t n) 91 { 92 unsigned diff = 0; 93 FOR (i, 0, n) { 94 diff |= p[i]; 95 } 96 return (1 & ((diff - 1) >> 8)) - 1; 97 } 98 99 ///////////////// 100 /// Chacha 20 /// 101 ///////////////// 102 #define QUARTERROUND(a, b, c, d) \ 103 a += b; d ^= a; d = rotl32(d, 16); \ 104 c += d; b ^= c; b = rotl32(b, 12); \ 105 a += b; d ^= a; d = rotl32(d, 8); \ 106 c += d; b ^= c; b = rotl32(b, 7) 107 108 static void chacha20_rounds(u32 out[16], const u32 in[16]) 109 { 110 // The temporary variables make Chacha20 10% faster. 111 u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3]; 112 u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7]; 113 u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11]; 114 u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15]; 115 116 FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. 
117 QUARTERROUND(t0, t4, t8 , t12); // column 0 118 QUARTERROUND(t1, t5, t9 , t13); // column 1 119 QUARTERROUND(t2, t6, t10, t14); // column 2 120 QUARTERROUND(t3, t7, t11, t15); // column 3 121 QUARTERROUND(t0, t5, t10, t15); // diagonal 0 122 QUARTERROUND(t1, t6, t11, t12); // diagonal 1 123 QUARTERROUND(t2, t7, t8 , t13); // diagonal 2 124 QUARTERROUND(t3, t4, t9 , t14); // diagonal 3 125 } 126 out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3; 127 out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7; 128 out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11; 129 out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; 130 } 131 132 static void chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32]) 133 { 134 // constant 135 ctx->input[0] = load32_le((u8*)"expa"); 136 ctx->input[1] = load32_le((u8*)"nd 3"); 137 ctx->input[2] = load32_le((u8*)"2-by"); 138 ctx->input[3] = load32_le((u8*)"te k"); 139 // key 140 FOR (i, 0, 8) { 141 ctx->input[i+4] = load32_le(key + i*4); 142 } 143 } 144 145 static u8 chacha20_pool_byte(crypto_chacha_ctx *ctx) 146 { 147 u32 pool_word = ctx->pool[ctx->pool_idx / 4]; 148 u8 pool_byte = pool_word >> (8*(ctx->pool_idx % 4)); 149 ctx->pool_idx++; 150 return pool_byte; 151 } 152 153 // Fill the pool if needed, update the counters 154 static void chacha20_refill_pool(crypto_chacha_ctx *ctx) 155 { 156 if (ctx->pool_idx == 64) { 157 chacha20_rounds(ctx->pool, ctx->input); 158 FOR (j, 0, 16) { 159 ctx->pool[j] += ctx->input[j]; 160 } 161 ctx->pool_idx = 0; 162 ctx->input[12]++; 163 if (ctx->input[12] == 0) { 164 ctx->input[13]++; 165 } 166 } 167 } 168 169 void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16]) 170 { 171 crypto_chacha_ctx ctx; 172 chacha20_init_key(&ctx, key); 173 FOR (i, 0, 4) { 174 ctx.input[i+12] = load32_le(in + i*4); 175 } 176 u32 buffer[16]; 177 chacha20_rounds(buffer, ctx.input); 178 // prevents reversal of the rounds by revealing only half of the buffer. 
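// (This is the HChaCha20 construction: only the constant words 0-3 and the
// input words 12-15 are written out below. Their initial values are public
// anyway, so skipping the usual feed-forward addition gives nothing away.)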
179 FOR (i, 0, 4) { 180 store32_le(out + i*4, buffer[i ]); // constant 181 store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce 182 } 183 } 184 185 void crypto_chacha20_init(crypto_chacha_ctx *ctx, 186 const u8 key[32], 187 const u8 nonce[8]) 188 { 189 chacha20_init_key (ctx, key); // key 190 crypto_chacha20_set_ctr(ctx, 0 ); // counter 191 ctx->input[14] = load32_le(nonce + 0); // nonce 192 ctx->input[15] = load32_le(nonce + 4); // nonce 193 } 194 195 void crypto_chacha20_x_init(crypto_chacha_ctx *ctx, 196 const u8 key[32], 197 const u8 nonce[24]) 198 { 199 u8 derived_key[32]; 200 crypto_chacha20_H(derived_key, key, nonce); 201 crypto_chacha20_init(ctx, derived_key, nonce + 16); 202 } 203 204 void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, u64 ctr) 205 { 206 ctx->input[12] = ctr & 0xffffffff; 207 ctx->input[13] = ctr >> 32; 208 ctx->pool_idx = 64; // The random pool (re)starts empty 209 } 210 211 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, 212 u8 *cipher_text, 213 const u8 *plain_text, 214 size_t text_size) 215 { 216 // Align ourselves with 4 byte words 217 while (ctx->pool_idx % 4 != 0 && text_size > 0) { 218 u8 stream = chacha20_pool_byte(ctx); 219 u8 plain = 0; 220 if (plain_text != 0) { 221 plain = *plain_text; 222 plain_text++; 223 } 224 *cipher_text = stream ^ plain; 225 text_size--; 226 cipher_text++; 227 } 228 // Main processing by 4 byte chunks 229 size_t nb_words = text_size / 4; 230 size_t remainder = text_size % 4; 231 FOR (i, 0, nb_words) { 232 chacha20_refill_pool(ctx); 233 u32 txt = 0; 234 if (plain_text) { 235 txt = load32_le(plain_text); 236 plain_text += 4; 237 } 238 store32_le(cipher_text, ctx->pool[ctx->pool_idx / 4] ^ txt); 239 cipher_text += 4; 240 ctx->pool_idx += 4; 241 } 242 // Remaining input, byte by byte 243 FOR (i, 0, remainder) { 244 chacha20_refill_pool(ctx); 245 u8 stream = chacha20_pool_byte(ctx); 246 u8 plain = 0; 247 if (plain_text != 0) { 248 plain = *plain_text; 249 plain_text++; 250 } 251 *cipher_text = stream ^ plain; 252 cipher_text++; 253 } 254 } 255 256 void crypto_chacha20_stream(crypto_chacha_ctx *ctx, 257 uint8_t *stream, size_t size) 258 { 259 crypto_chacha20_encrypt(ctx, stream, 0, size); 260 } 261 262 263 ///////////////// 264 /// Poly 1305 /// 265 ///////////////// 266 267 // h = (h + c) * r 268 // preconditions: 269 // ctx->h <= 7_ffffffff_ffffffff_ffffffff_ffffffff 270 // ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff 271 // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff 272 // Postcondition: 273 // ctx->h <= 4_87ffffe4_8fffffe2_97ffffe0_9ffffffa 274 static void poly_block(crypto_poly1305_ctx *ctx) 275 { 276 // s = h + c, without carry propagation 277 const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe 278 const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe 279 const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe 280 const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe 281 const u64 s4 = ctx->h[4] + (u64)ctx->c[4]; // s4 <= 00000004 282 283 // Local all the things! 284 const u32 r0 = ctx->r[0]; // r0 <= 0fffffff 285 const u32 r1 = ctx->r[1]; // r1 <= 0ffffffc 286 const u32 r2 = ctx->r[2]; // r2 <= 0ffffffc 287 const u32 r3 = ctx->r[3]; // r3 <= 0ffffffc 288 const u32 rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits... 
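// Reduction modulo 2^130-5 works by folding overflow back multiplied by 5,
// since 2^130 == 5 (mod 2^130-5). For r1..r3 the two low bits are cleared by
// clamping, so (ri >> 2) + ri is exactly (ri >> 2) * 5; r0 keeps its low two
// bits, hence the "lose 2 bits" above and the "recover 2 bits" below.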
289 const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5 290 const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5 291 const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5 292 293 // (h + c) * r, without carry propagation 294 const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 + s4*rr0;//<=97ffffe007fffff8 295 const u64 x1 = s0*r1 + s1*r0 + s2*rr3 + s3*rr2 + s4*rr1;//<=8fffffe20ffffff6 296 const u64 x2 = s0*r2 + s1*r1 + s2*r0 + s3*rr3 + s4*rr2;//<=87ffffe417fffff4 297 const u64 x3 = s0*r3 + s1*r2 + s2*r1 + s3*r0 + s4*rr3;//<=7fffffe61ffffff2 298 const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits //<=0000000000000018 299 300 // partial reduction modulo 2^130 - 5 301 const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffffe 302 const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff); 303 const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32); 304 const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32); 305 const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32); 306 const u64 u4 = (u3 >> 32) + (u5 & 3); 307 308 // Update the hash 309 ctx->h[0] = u0 & 0xffffffff; // u0 <= 1_9ffffffa 310 ctx->h[1] = u1 & 0xffffffff; // u1 <= 1_97ffffe0 311 ctx->h[2] = u2 & 0xffffffff; // u2 <= 1_8fffffe2 312 ctx->h[3] = u3 & 0xffffffff; // u3 <= 1_87ffffe4 313 ctx->h[4] = u4; // u4 <= 4 314 } 315 316 // (re-)initializes the input counter and input buffer 317 static void poly_clear_c(crypto_poly1305_ctx *ctx) 318 { 319 FOR (i, 0, 4) { 320 ctx->c[i] = 0; 321 } 322 ctx->c_idx = 0; 323 } 324 325 static void poly_end_block(crypto_poly1305_ctx *ctx) 326 { 327 if (ctx->c_idx == 16) { 328 poly_block(ctx); 329 poly_clear_c(ctx); 330 } 331 } 332 333 static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input) 334 { 335 size_t word = ctx->c_idx / 4; 336 size_t byte = ctx->c_idx % 4; 337 ctx->c[word] |= (u32)input << (byte * 8); 338 ctx->c_idx++; 339 } 340 341 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) 342 { 343 // Initial hash is zero 344 FOR (i, 0, 5) { 345 ctx->h [i] = 0; 346 } 347 // add 2^130 to every input block 348 ctx->c [4] = 1; 349 poly_clear_c(ctx); 350 // load r and pad (r has some of its bits cleared) 351 FOR (i, 0, 1) { ctx->r [0] = load32_le(key ) & 0x0fffffff; } 352 FOR (i, 1, 4) { ctx->r [i] = load32_le(key + i*4 ) & 0x0ffffffc; } 353 FOR (i, 0, 4) { ctx->pad[i] = load32_le(key + i*4 + 16); } 354 } 355 356 void crypto_poly1305_update(crypto_poly1305_ctx *ctx, 357 const u8 *message, size_t message_size) 358 { 359 // Align ourselves with 4 byte words 360 while (ctx->c_idx % 4 != 0 && message_size > 0) { 361 poly_take_input(ctx, *message); 362 message++; 363 message_size--; 364 } 365 366 // Process the input 4 bytes at a time 367 size_t nb_words = message_size / 4; 368 size_t remainder = message_size % 4; 369 FOR (i, 0, nb_words) { 370 poly_end_block(ctx); 371 ctx->c[ctx->c_idx / 4] = load32_le(message); 372 message += 4; 373 ctx->c_idx += 4; 374 } 375 376 // Input the remaining bytes 377 if (remainder != 0) { 378 poly_end_block(ctx); 379 } 380 FOR (i, 0, remainder) { 381 poly_take_input(ctx, message[i]); 382 } 383 } 384 385 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16]) 386 { 387 // Process the last block (if any) 388 if (ctx->c_idx != 0) { 389 // move the final 1 according to remaining input length 390 // (We may add less than 2^130 to the last input block) 391 ctx->c[4] = 0; 392 poly_take_input(ctx, 1); 393 // one last hash update 394 poly_block(ctx); 395 } 396 397 // check if we should subtract 2^130-5
by performing the 398 // corresponding carry propagation. 399 u64 u = 5; 400 u += ctx->h[0]; u >>= 32; 401 u += ctx->h[1]; u >>= 32; 402 u += ctx->h[2]; u >>= 32; 403 u += ctx->h[3]; u >>= 32; 404 u += ctx->h[4]; u >>= 2; 405 // now u indicates how many times we should subtract 2^130-5 (0 or 1) 406 407 // store h + pad, minus 2^130-5 if u tells us to. 408 u *= 5; 409 u += (i64)(ctx->h[0]) + ctx->pad[0]; store32_le(mac , u); u >>= 32; 410 u += (i64)(ctx->h[1]) + ctx->pad[1]; store32_le(mac + 4, u); u >>= 32; 411 u += (i64)(ctx->h[2]) + ctx->pad[2]; store32_le(mac + 8, u); u >>= 32; 412 u += (i64)(ctx->h[3]) + ctx->pad[3]; store32_le(mac + 12, u); 413 } 414 415 void crypto_poly1305_auth(u8 mac[16], const u8 *message, 416 size_t message_size, const u8 key[32]) 417 { 418 crypto_poly1305_ctx ctx; 419 crypto_poly1305_init (&ctx, key); 420 crypto_poly1305_update(&ctx, message, message_size); 421 crypto_poly1305_final (&ctx, mac); 422 } 423 424 //////////////// 425 /// Blake2 b /// 426 //////////////// 427 static const u64 iv[8] = { 428 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 429 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 430 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 431 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, 432 }; 433 434 // increment the input offset 435 static void blake2b_incr(crypto_blake2b_ctx *ctx) 436 { 437 u64 *x = ctx->input_offset; 438 size_t y = ctx->input_idx; 439 x[0] += y; 440 if (x[0] < y) { 441 x[1]++; 442 } 443 } 444 445 static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input) 446 { 447 size_t word = ctx->input_idx / 8; 448 size_t byte = ctx->input_idx % 8; 449 ctx->input[word] |= (u64)input << (byte * 8); 450 ctx->input_idx++; 451 } 452 453 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) 454 { 455 static const u8 sigma[12][16] = { 456 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 457 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 458 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, 459 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, 460 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, 461 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, 462 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, 463 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, 464 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, 465 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, 466 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 467 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 468 }; 469 470 // init work vector 471 u64 v[16]; 472 FOR (i, 0, 8) { 473 v[i ] = ctx->hash[i]; 474 v[i+8] = iv[i]; 475 } 476 v[12] ^= ctx->input_offset[0]; 477 v[13] ^= ctx->input_offset[1]; 478 if (is_last_block) { 479 v[14] = ~v[14]; 480 } 481 482 // mangle work vector 483 uint64_t *input = ctx->input; 484 FOR (i, 0, 12) { 485 #define BLAKE2_G(v, a, b, c, d, x, y) \ 486 v[a] += v[b] + x; v[d] = rotr64(v[d] ^ v[a], 32); \ 487 v[c] += v[d]; v[b] = rotr64(v[b] ^ v[c], 24); \ 488 v[a] += v[b] + y; v[d] = rotr64(v[d] ^ v[a], 16); \ 489 v[c] += v[d]; v[b] = rotr64(v[b] ^ v[c], 63); \ 490 491 BLAKE2_G(v, 0, 4, 8, 12, input[sigma[i][ 0]], input[sigma[i][ 1]]); 492 BLAKE2_G(v, 1, 5, 9, 13, input[sigma[i][ 2]], input[sigma[i][ 3]]); 493 BLAKE2_G(v, 2, 6, 10, 14, input[sigma[i][ 4]], input[sigma[i][ 5]]); 494 BLAKE2_G(v, 3, 7, 11, 15, input[sigma[i][ 6]], input[sigma[i][ 7]]); 495 BLAKE2_G(v, 0, 5, 10, 15, input[sigma[i][ 8]], input[sigma[i][ 9]]); 496 BLAKE2_G(v, 1, 6, 11, 12, input[sigma[i][10]], 
input[sigma[i][11]]); 497 BLAKE2_G(v, 2, 7, 8, 13, input[sigma[i][12]], input[sigma[i][13]]); 498 BLAKE2_G(v, 3, 4, 9, 14, input[sigma[i][14]], input[sigma[i][15]]); 499 } 500 // update hash 501 FOR (i, 0, 8) { 502 ctx->hash[i] ^= v[i] ^ v[i+8]; 503 } 504 } 505 506 static void blake2b_reset_input(crypto_blake2b_ctx *ctx) 507 { 508 FOR(i, 0, 16) { 509 ctx->input[i] = 0; 510 } 511 ctx->input_idx = 0; 512 } 513 514 static void blake2b_end_block(crypto_blake2b_ctx *ctx) 515 { 516 if (ctx->input_idx == 128) { // If buffer is full, 517 blake2b_incr(ctx); // update the input offset 518 blake2b_compress(ctx, 0); // and compress the (not last) block 519 blake2b_reset_input(ctx); 520 } 521 } 522 523 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size, 524 const u8 *key, size_t key_size) 525 { 526 // initial hash 527 FOR (i, 0, 8) { 528 ctx->hash[i] = iv[i]; 529 } 530 ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size; 531 532 ctx->input_offset[0] = 0; // beginning of the input, no offset 533 ctx->input_offset[1] = 0; // beginning of the input, no offset 534 ctx->input_idx = 0; // buffer is empty 535 ctx->hash_size = hash_size; // remember the hash size we want 536 blake2b_reset_input(ctx); // clear the input buffer 537 538 // if there is a key, the first block is that key 539 if (key_size > 0) { 540 crypto_blake2b_update(ctx, key, key_size); 541 ctx->input_idx = 128; 542 } 543 } 544 545 void crypto_blake2b_init(crypto_blake2b_ctx *ctx) 546 { 547 crypto_blake2b_general_init(ctx, 64, 0, 0); 548 } 549 550 void crypto_blake2b_update(crypto_blake2b_ctx *ctx, 551 const u8 *message, size_t message_size) 552 { 553 // Align ourselves with 8 byte words 554 while (ctx->input_idx % 8 != 0 && message_size > 0) { 555 blake2b_set_input(ctx, *message); 556 message++; 557 message_size--; 558 } 559 560 // Process the input 8 bytes at a time 561 size_t nb_words = message_size / 8; 562 size_t remainder = message_size % 8; 563 FOR (i, 0, nb_words) { 564 blake2b_end_block(ctx); 565 ctx->input[ctx->input_idx / 8] = load64_le(message); 566 message += 8; 567 ctx->input_idx += 8; 568 } 569 570 // Load the remainder 571 if (remainder != 0) { 572 blake2b_end_block(ctx); 573 } 574 FOR (i, 0, remainder) { 575 blake2b_set_input(ctx, message[i]); 576 } 577 } 578 579 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash) 580 { 581 blake2b_incr(ctx); // update the input offset 582 blake2b_compress(ctx, 1); // compress the last block 583 size_t nb_words = ctx->hash_size / 8; 584 FOR (i, 0, nb_words) { 585 store64_le(hash + i*8, ctx->hash[i]); 586 } 587 FOR (i, nb_words * 8, ctx->hash_size) { 588 hash[i] = (ctx->hash[i / 8] >> (8 * (i % 8))) & 0xff; 589 } 590 } 591 592 void crypto_blake2b_general(u8 *hash , size_t hash_size, 593 const u8 *key , size_t key_size, 594 const u8 *message, size_t message_size) 595 { 596 crypto_blake2b_ctx ctx; 597 crypto_blake2b_general_init(&ctx, hash_size, key, key_size); 598 crypto_blake2b_update(&ctx, message, message_size); 599 crypto_blake2b_final(&ctx, hash); 600 } 601 602 void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size) 603 { 604 crypto_blake2b_general(hash, 64, 0, 0, message, message_size); 605 } 606 607 608 //////////////// 609 /// Argon2 i /// 610 //////////////// 611 // references to R, Z, Q etc. come from the spec 612 613 // Argon2 operates on 1024 byte blocks. 614 typedef struct { u64 a[128]; } block; 615 616 static u32 min(u32 a, u32 b) { return a <= b ? a : b; } 617 618 // updates a blake2 hash with a 32 bit word, little endian.
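// (Argon2's initial hash absorbs every parameter, and every input length,
// as such a word.)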
619 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input) 620 { 621 u8 buf[4]; 622 store32_le(buf, input); 623 crypto_blake2b_update(ctx, buf, 4); 624 } 625 626 static void load_block(block *b, const u8 bytes[1024]) 627 { 628 FOR (i, 0, 128) { 629 b->a[i] = load64_le(bytes + i*8); 630 } 631 } 632 633 static void store_block(u8 bytes[1024], const block *b) 634 { 635 FOR (i, 0, 128) { 636 store64_le(bytes + i*8, b->a[i]); 637 } 638 } 639 640 static void copy_block(block *o,const block*in){FOR(i,0,128) o->a[i] = in->a[i];} 641 static void xor_block(block *o,const block*in){FOR(i,0,128) o->a[i]^= in->a[i];} 642 643 // Hash with a virtually unlimited digest size. 644 // Doesn't extract more entropy than the base hash function. 645 // Mainly used for filling a whole kilobyte block with pseudo-random bytes. 646 // (One could use a stream cipher with a seed hash as the key, but 647 // this would introduce another dependency and point of failure.) 648 static void extended_hash(u8 *digest, u32 digest_size, 649 const u8 *input , u32 input_size) 650 { 651 crypto_blake2b_ctx ctx; 652 crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0); 653 blake_update_32 (&ctx, digest_size); 654 crypto_blake2b_update (&ctx, input, input_size); 655 crypto_blake2b_final (&ctx, digest); 656 657 if (digest_size > 64) { 658 // the conversion to u64 avoids integer overflow on 659 // ludicrously big hash sizes. 660 u32 r = (((u64)digest_size + 31) / 32) - 2; 661 u32 i = 1; 662 u32 in = 0; 663 u32 out = 32; 664 while (i < r) { 665 // Input and output overlap. This is intentional 666 crypto_blake2b(digest + out, digest + in, 64); 667 i += 1; 668 in += 32; 669 out += 32; 670 } 671 crypto_blake2b_general(digest + out, digest_size - (32 * r), 672 0, 0, // no key 673 digest + in , 64); 674 } 675 } 676 677 #define LSB(x) ((x) & 0xffffffff) 678 #define G(a, b, c, d) \ 679 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ 680 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ 681 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ 682 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) 683 #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ 684 v8, v9, v10, v11, v12, v13, v14, v15) \ 685 G(v0, v4, v8, v12); G(v1, v5, v9, v13); \ 686 G(v2, v6, v10, v14); G(v3, v7, v11, v15); \ 687 G(v0, v5, v10, v15); G(v1, v6, v11, v12); \ 688 G(v2, v7, v8, v13); G(v3, v4, v9, v14) 689 690 // Core of the compression function G. Computes Z from R in place.
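// Each ROUND below is a Blake2b round whose additions are replaced by
// a + b + 2 * LSB(a) * LSB(b). It is applied to eight groups of 16
// consecutive words, then to eight interleaved groups, spreading changes
// across the whole 1024 byte block.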
691 static void g_rounds(block *work_block) 692 { 693 // column rounds (work_block = Q) 694 for (int i = 0; i < 128; i += 16) { 695 ROUND(work_block->a[i ], work_block->a[i + 1], 696 work_block->a[i + 2], work_block->a[i + 3], 697 work_block->a[i + 4], work_block->a[i + 5], 698 work_block->a[i + 6], work_block->a[i + 7], 699 work_block->a[i + 8], work_block->a[i + 9], 700 work_block->a[i + 10], work_block->a[i + 11], 701 work_block->a[i + 12], work_block->a[i + 13], 702 work_block->a[i + 14], work_block->a[i + 15]); 703 } 704 // row rounds (work_block = Z) 705 for (int i = 0; i < 16; i += 2) { 706 ROUND(work_block->a[i ], work_block->a[i + 1], 707 work_block->a[i + 16], work_block->a[i + 17], 708 work_block->a[i + 32], work_block->a[i + 33], 709 work_block->a[i + 48], work_block->a[i + 49], 710 work_block->a[i + 64], work_block->a[i + 65], 711 work_block->a[i + 80], work_block->a[i + 81], 712 work_block->a[i + 96], work_block->a[i + 97], 713 work_block->a[i + 112], work_block->a[i + 113]); 714 } 715 } 716 717 // The compression function G (copy version for the first pass) 718 static void g_copy(block *result, const block *x, const block *y) 719 { 720 block tmp; 721 copy_block(&tmp , x ); // tmp = X 722 xor_block (&tmp , y ); // tmp = X ^ Y = R 723 copy_block(result, &tmp); // result = R (only difference with g_xor) 724 g_rounds (&tmp); // tmp = Z 725 xor_block (result, &tmp); // result = R ^ Z 726 } 727 728 // The compression function G (xor version for subsequent passes) 729 static void g_xor(block *result, const block *x, const block *y) 730 { 731 block tmp; 732 copy_block(&tmp , x ); // tmp = X 733 xor_block (&tmp , y ); // tmp = X ^ Y = R 734 xor_block (result, &tmp); // result = R ^ old (only difference with g_copy) 735 g_rounds (&tmp); // tmp = Z 736 xor_block (result, &tmp); // result = R ^ old ^ Z 737 } 738 739 // unary version of the compression function. 740 // The missing argument is implied zero. 741 // Does the transformation in place. 742 static void unary_g(block *work_block) 743 { 744 // work_block == R 745 block tmp; 746 copy_block(&tmp, work_block); // tmp = R 747 g_rounds(work_block); // work_block = Z 748 xor_block(work_block, &tmp); // work_block = Z ^ R 749 } 750 751 // Argon2i uses a kind of stream cipher to determine which reference 752 // block it will take to synthesise the next block. This context holds 753 // that stream's state. (It's very similar to Chacha20. The block b 754 // is analogous to Chacha's own pool) 755 typedef struct { 756 block b; 757 u32 pass_number; 758 u32 slice_number; 759 u32 nb_blocks; 760 u32 nb_iterations; 761 u32 ctr; 762 u32 offset; 763 } gidx_ctx; 764 765 // The block in the context will determine array indices. To avoid 766 // timing attacks, it only depends on public information. No looking 767 // at a previous block to seed the next. This makes offline attacks 768 // easier, but timing attacks are the bigger threat in many settings. 769 static void gidx_refresh(gidx_ctx *ctx) 770 { 771 // seed the beginning of the block... 772 ctx->b.a[0] = ctx->pass_number; 773 ctx->b.a[1] = 0; // lane number (we have only one) 774 ctx->b.a[2] = ctx->slice_number; 775 ctx->b.a[3] = ctx->nb_blocks; 776 ctx->b.a[4] = ctx->nb_iterations; 777 ctx->b.a[5] = 1; // type: Argon2i 778 ctx->b.a[6] = ctx->ctr; 779 FOR (i, 7, 128) { ctx->b.a[i] = 0; } // ...then zero the rest out 780 781 // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero) 782 // (G "square" function), to get cheap pseudo-random numbers.
783 unary_g(&(ctx->b)); 784 unary_g(&(ctx->b)); 785 } 786 787 static void gidx_init(gidx_ctx *ctx, 788 u32 pass_number, u32 slice_number, 789 u32 nb_blocks, u32 nb_iterations) 790 { 791 ctx->pass_number = pass_number; 792 ctx->slice_number = slice_number; 793 ctx->nb_blocks = nb_blocks; 794 ctx->nb_iterations = nb_iterations; 795 ctx->ctr = 0; 796 797 // Offset from the beginning of the segment. For the first slice 798 // of the first pass, we start at the *third* block, so the offset 799 // starts at 2, not 0. 800 if (pass_number != 0 || slice_number != 0) { 801 ctx->offset = 0; 802 } else { 803 ctx->offset = 2; 804 ctx->ctr++; // Compensates for missed lazy creation 805 gidx_refresh(ctx); // at the start of gidx_next() 806 } 807 } 808 809 static u32 gidx_next(gidx_ctx *ctx) 810 { 811 // lazily creates the offset block we need 812 if (ctx->offset % 128 == 0) { 813 ctx->ctr++; 814 gidx_refresh(ctx); 815 } 816 u32 index = ctx->offset % 128; // save index for current call 817 u32 offset = ctx->offset; // save offset for current call 818 ctx->offset++; // update offset for next call 819 820 // Computes the area size. 821 // Pass 0 : all already finished segments plus already constructed 822 // blocks in this segment 823 // Pass 1+: 3 last segments plus already constructed 824 // blocks in this segment. THE SPEC SUGGESTS OTHERWISE. 825 // I CONFORM TO THE REFERENCE IMPLEMENTATION. 826 int first_pass = ctx->pass_number == 0; 827 u32 slice_size = ctx->nb_blocks / 4; 828 u32 nb_segments = first_pass ? ctx->slice_number : 3; 829 u32 area_size = nb_segments * slice_size + offset - 1; 830 831 // Computes the starting position of the reference area. 832 // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE 833 // NEXT SEGMENT, NOT THE NEXT BLOCK. 834 u32 next_slice = ((ctx->slice_number + 1) % 4) * slice_size; 835 u32 start_pos = first_pass ?
0 : next_slice; 836 837 // Generate offset from J1 (no need for J2, there's only one lane) 838 u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number 839 u64 x = (j1 * j1) >> 32; 840 u64 y = (area_size * x) >> 32; 841 u64 z = (area_size - 1) - y; 842 return (start_pos + z) % ctx->nb_blocks; 843 } 844 845 // Main algorithm 846 void crypto_argon2i(u8 *hash, u32 hash_size, 847 void *work_area, u32 nb_blocks, u32 nb_iterations, 848 const u8 *password, u32 password_size, 849 const u8 *salt, u32 salt_size, 850 const u8 *key, u32 key_size, 851 const u8 *ad, u32 ad_size) 852 { 853 // work area seen as blocks (must be suitably aligned) 854 block *blocks = (block*)work_area; 855 { 856 crypto_blake2b_ctx ctx; 857 crypto_blake2b_init(&ctx); 858 859 blake_update_32 (&ctx, 1 ); // p: number of threads 860 blake_update_32 (&ctx, hash_size ); 861 blake_update_32 (&ctx, nb_blocks ); 862 blake_update_32 (&ctx, nb_iterations); 863 blake_update_32 (&ctx, 0x13 ); // v: version number 864 blake_update_32 (&ctx, 1 ); // y: Argon2i 865 blake_update_32 (&ctx, password_size); 866 crypto_blake2b_update(&ctx, password, password_size); 867 blake_update_32 (&ctx, salt_size); 868 crypto_blake2b_update(&ctx, salt, salt_size); 869 blake_update_32 (&ctx, key_size); 870 crypto_blake2b_update(&ctx, key, key_size); 871 blake_update_32 (&ctx, ad_size); 872 crypto_blake2b_update(&ctx, ad, ad_size); 873 874 u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes 875 crypto_blake2b_final(&ctx, initial_hash); 876 877 // fill first 2 blocks 878 block tmp_block; 879 u8 hash_area[1024]; 880 store32_le(initial_hash + 64, 0); // first additional word 881 store32_le(initial_hash + 68, 0); // second additional word 882 extended_hash(hash_area, 1024, initial_hash, 72); 883 load_block(&tmp_block, hash_area); 884 copy_block(blocks, &tmp_block); 885 886 store32_le(initial_hash + 64, 1); // slight modification 887 extended_hash(hash_area, 1024, initial_hash, 72); 888 load_block(&tmp_block, hash_area); 889 copy_block(blocks + 1, &tmp_block); 890 } 891 892 // Actual number of blocks 893 nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread) 894 const u32 segment_size = nb_blocks / 4; 895 896 // fill (then re-fill) the rest of the blocks 897 FOR (pass_number, 0, nb_iterations) { 898 int first_pass = pass_number == 0; 899 900 FOR (segment, 0, 4) { 901 gidx_ctx ctx; 902 gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations); 903 904 // On the first segment of the first pass, 905 // blocks 0 and 1 are already filled. 906 // We use the offset to skip them. 907 u32 start_offset = first_pass && segment == 0 ? 2 : 0; 908 u32 segment_start = segment * segment_size + start_offset; 909 u32 segment_end = (segment + 1) * segment_size; 910 FOR (current_block, segment_start, segment_end) { 911 u32 reference_block = gidx_next(&ctx); 912 u32 previous_block = current_block == 0 913 ? nb_blocks - 1 914 : current_block - 1; 915 block *c = blocks + current_block; 916 block *p = blocks + previous_block; 917 block *r = blocks + reference_block; 918 if (first_pass) { g_copy(c, p, r); } 919 else { g_xor (c, p, r); } 920 } 921 } 922 } 923 // hash the very last block with H' into the output hash 924 u8 final_block[1024]; 925 store_block(final_block, blocks + (nb_blocks - 1)); 926 extended_hash(hash, hash_size, final_block, 1024); 927 } 928 929 //////////////////////////////////// 930 /// Arithmetic modulo 2^255 - 19 /// 931 //////////////////////////////////// 932 // Taken from Supercop's ref10 implementation. 
933 // A bit bigger than TweetNaCl, over 4 times faster. 934 935 // field element 936 typedef i32 fe[10]; 937 938 static void fe_0 (fe h) { FOR(i,0,10) h[i] = 0; } 939 static void fe_1 (fe h) { h[0] = 1; FOR(i,1,10) h[i] = 0; } 940 static void fe_neg (fe h,const fe f) {FOR(i,0,10) h[i] = -f[i]; } 941 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];} 942 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];} 943 static void fe_copy(fe h,const fe f) {FOR(i,0,10) h[i] = f[i]; } 944 945 static void fe_cswap(fe f, fe g, int b) 946 { 947 FOR (i, 0, 10) { 948 i32 x = (f[i] ^ g[i]) & -b; 949 f[i] = f[i] ^ x; 950 g[i] = g[i] ^ x; 951 } 952 } 953 954 static void fe_carry(fe h, i64 t[10]) 955 { 956 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; 957 c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= c9 * (1 << 25); 958 c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1; t[1] -= c1 * (1 << 25); 959 c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3; t[3] -= c3 * (1 << 25); 960 c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5; t[5] -= c5 * (1 << 25); 961 c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7; t[7] -= c7 * (1 << 25); 962 c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0; t[0] -= c0 * (1 << 26); 963 c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2; t[2] -= c2 * (1 << 26); 964 c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4; t[4] -= c4 * (1 << 26); 965 c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6; t[6] -= c6 * (1 << 26); 966 c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8; t[8] -= c8 * (1 << 26); 967 FOR (i, 0, 10) { h[i] = t[i]; } 968 } 969 970 static void fe_frombytes(fe h, const u8 s[32]) 971 { 972 i64 t[10]; // intermediate result (may overflow 32 bits) 973 t[0] = load32_le(s); 974 t[1] = load24_le(s + 4) << 6; 975 t[2] = load24_le(s + 7) << 5; 976 t[3] = load24_le(s + 10) << 3; 977 t[4] = load24_le(s + 13) << 2; 978 t[5] = load32_le(s + 16); 979 t[6] = load24_le(s + 20) << 7; 980 t[7] = load24_le(s + 23) << 5; 981 t[8] = load24_le(s + 26) << 4; 982 t[9] = (load24_le(s + 29) & 8388607) << 2; 983 fe_carry(h, t); 984 } 985 986 static void fe_mul_small(fe h, const fe f, i32 g) 987 { 988 i64 t[10]; 989 FOR(i, 0, 10) { 990 t[i] = f[i] * (i64) g; 991 } 992 fe_carry(h, t); 993 } 994 static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); } 995 static void fe_mul973324(fe h, const fe f) { fe_mul_small(h, f, 973324); } 996 997 static void fe_mul(fe h, const fe f, const fe g) 998 { 999 // Everything is unrolled and put in temporary variables. 1000 // We could roll the loop, but that would make curve25519 twice as slow. 
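// A field element is ten limbs in radix 2^25.5: even limbs hold 26 bits,
// odd limbs 25, so f = f0 + f1*2^26 + f2*2^51 + ... + f9*2^230.
// The *19 factors below fold the high product limbs back down, because
// 2^255 == 19 (mod 2^255-19); the *2 factors on odd limbs account for the
// extra bit that appears when two odd-position limbs are multiplied.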
1001 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; 1002 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; 1003 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4]; 1004 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9]; 1005 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2; 1006 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19; 1007 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19; 1008 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19; 1009 1010 i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6 1011 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1; 1012 i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7 1013 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2; 1014 i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8 1015 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3; 1016 i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9 1017 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4; 1018 i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0 1019 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5; 1020 i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1 1021 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6; 1022 i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2 1023 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7; 1024 i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3 1025 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8; 1026 i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4 1027 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9; 1028 i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 1029 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0; 1030 1031 #define CARRY \ 1032 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \ 1033 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \ 1034 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \ 1035 c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 * (1 << 25); \ 1036 c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 * (1 << 25); \ 1037 c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 * (1 << 26); \ 1038 c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 * (1 << 26); \ 1039 c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 * (1 << 25); \ 1040 c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 * (1 << 25); \ 1041 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \ 1042 c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 * (1 << 26); \ 1043 c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \ 1044 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \ 1045 h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; \ 1046 h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; \ 1047 1048 CARRY; 1049 } 1050 1051 // we could use fe_mul() for this, but this is significantly faster 1052 static void fe_sq(fe h, const fe f) 1053 { 1054 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; 1055 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; 1056 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; 1057 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; 
i32 f7_2 = f7*2; 1058 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; 1059 i32 f8_19 = f8*19; i32 f9_38 = f9*38; 1060 1061 i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 1062 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; 1063 i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 1064 + f4 *(i64)f7_38 + f5_2*(i64)f6_19; 1065 i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 1066 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; 1067 i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 1068 + f5_2*(i64)f8_19 + f6 *(i64)f7_38; 1069 i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 1070 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; 1071 i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 1072 + f6 *(i64)f9_38 + f7_2*(i64)f8_19; 1073 i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 1074 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; 1075 i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 1076 + f3_2*(i64)f4 + f8 *(i64)f9_38; 1077 i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 1078 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; 1079 i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 1080 + f3_2*(i64)f6 + f4 *(i64)f5_2; 1081 1082 CARRY; 1083 } 1084 1085 // This could be simplified, but it would be slower 1086 static void fe_invert(fe out, const fe z) 1087 { 1088 fe t0, t1, t2, t3; 1089 fe_sq(t0, z ); 1090 fe_sq(t1, t0); 1091 fe_sq(t1, t1); 1092 fe_mul(t1, z, t1); 1093 fe_mul(t0, t0, t1); 1094 fe_sq(t2, t0); fe_mul(t1 , t1, t2); 1095 fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1096 fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); 1097 fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); 1098 fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1099 fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); 1100 fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); 1101 fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1102 fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); 1103 } 1104 1105 // This could be simplified, but it would be slower 1106 void fe_pow22523(fe out, const fe z) 1107 { 1108 fe t0, t1, t2; 1109 fe_sq(t0, z); 1110 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); 1111 fe_mul(t0, t0, t1); 1112 fe_sq(t0, t0); fe_mul(t0, t1, t0); 1113 fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1114 fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); 1115 fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); 1116 fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1117 fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); 1118 fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); 1119 fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1120 fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); 1121 } 1122 1123 static void fe_tobytes(u8 s[32], const fe h) 1124 { 1125 i32 t[10]; 1126 FOR (i, 0, 10) { 1127 t[i] = h[i]; 1128 } 1129 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25; 1130 FOR (i, 0, 5) { 1131 q += t[2*i ]; q >>= 26; 1132 q += t[2*i+1]; q >>= 25; 1133 } 1134 t[0] += 19 * q; 1135 1136 i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= c0 * (1 << 26); 1137 i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= c1 * (1 << 25); 1138 i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= c2 * (1 << 26); 1139 i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= c3 * (1 << 25); 1140 i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= c4 * (1 << 26); 1141 i32 c5 = t[5] >> 25; t[6] 
+= c5; t[5] -= c5 * (1 << 25); 1142 i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= c6 * (1 << 26); 1143 i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= c7 * (1 << 25); 1144 i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= c8 * (1 << 26); 1145 i32 c9 = t[9] >> 25; t[9] -= c9 * (1 << 25); 1146 1147 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26)); 1148 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19)); 1149 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13)); 1150 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6)); 1151 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25)); 1152 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19)); 1153 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12)); 1154 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6)); 1155 } 1156 1157 // Parity check. Returns 0 if even, 1 if odd 1158 static int fe_isnegative(const fe f) 1159 { 1160 u8 s[32]; 1161 fe_tobytes(s, f); 1162 return s[0] & 1; 1163 } 1164 1165 static int fe_isnonzero(const fe f) 1166 { 1167 u8 s[32]; 1168 fe_tobytes(s, f); 1169 return crypto_zerocmp(s, 32); 1170 } 1171 1172 /////////////// 1173 /// X-25519 /// Taken from Supercop's ref10 implementation. 1174 /////////////// 1175 1176 static void trim_scalar(u8 s[32]) 1177 { 1178 s[ 0] &= 248; 1179 s[31] &= 127; 1180 s[31] |= 64; 1181 } 1182 1183 static void x25519_ladder(const fe x1, fe x2, fe z2, fe x3, fe z3, 1184 const u8 scalar[32]) 1185 { 1186 // Montgomery ladder 1187 // In projective coordinates, to avoid divisions: x = X / Z 1188 // We don't care about the y coordinate, it's only 1 bit of information 1189 fe_1(x2); fe_0(z2); // "zero" point 1190 fe_copy(x3, x1); fe_1(z3); // "one" point 1191 int swap = 0; 1192 for (int pos = 254; pos >= 0; --pos) { 1193 // constant time conditional swap before ladder step 1194 int b = (scalar[pos / 8] >> (pos & 7)) & 1; 1195 swap ^= b; // xor trick avoids swapping at the end of the loop 1196 fe_cswap(x2, x3, swap); 1197 fe_cswap(z2, z3, swap); 1198 swap = b; // anticipates one last swap after the loop 1199 1200 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) 1201 // with differential addition 1202 fe t0, t1; 1203 fe_sub(t0, x3, z3); fe_sub(t1, x2, z2); fe_add(x2, x2, z2); 1204 fe_add(z2, x3, z3); fe_mul(z3, t0, x2); fe_mul(z2, z2, t1); 1205 fe_sq (t0, t1 ); fe_sq (t1, x2 ); fe_add(x3, z3, z2); 1206 fe_sub(z2, z3, z2); fe_mul(x2, t1, t0); fe_sub(t1, t1, t0); 1207 fe_sq (z2, z2 ); fe_mul121666(z3, t1); fe_sq (x3, x3 ); 1208 fe_add(t0, t0, z3); fe_mul(z3, x1, z2); fe_mul(z2, t1, t0); 1209 } 1210 // last swap is necessary to compensate for the xor trick 1211 // Note: after this swap, P3 == P2 + P1.
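// (ge_scalarmult() below relies on that: from P1, P2, and P2 + P1 it can
// recover the y coordinate that the ladder drops.)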
1212 fe_cswap(x2, x3, swap); 1213 fe_cswap(z2, z3, swap); 1214 } 1215 1216 int crypto_x25519(u8 shared_secret [32], 1217 const u8 your_secret_key [32], 1218 const u8 their_public_key[32]) 1219 { 1220 // computes the scalar product 1221 fe x1; 1222 fe_frombytes(x1, their_public_key); 1223 1224 // restrict the possible scalar values 1225 u8 e[32]; 1226 FOR (i, 0, 32) { 1227 e[i] = your_secret_key[i]; 1228 } 1229 trim_scalar(e); 1230 1231 // computes the actual scalar product (the result is in x2 and z2) 1232 fe x2, z2, x3, z3; 1233 x25519_ladder(x1, x2, z2, x3, z3, e); 1234 1235 // normalises the coordinates: x == X / Z 1236 fe_invert(z2, z2); 1237 fe_mul(x2, x2, z2); 1238 fe_tobytes(shared_secret, x2); 1239 1240 // Returns -1 if the input is all zero 1241 // (happens with some malicious public keys) 1242 return -1 - crypto_zerocmp(shared_secret, 32); 1243 } 1244 1245 void crypto_x25519_public_key(u8 public_key[32], 1246 const u8 secret_key[32]) 1247 { 1248 static const u8 base_point[32] = {9}; 1249 crypto_x25519(public_key, secret_key, base_point); 1250 } 1251 1252 /////////////// 1253 /// Ed25519 /// 1254 /////////////// 1255 1256 // Point in a twisted Edwards curve, 1257 // in extended projective coordinates. 1258 // x = X/Z, y = Y/Z, T = XY/Z 1259 typedef struct { fe X; fe Y; fe Z; fe T; } ge; 1260 1261 static void ge_from_xy(ge *p, const fe x, const fe y) 1262 { 1263 FOR (i, 0, 10) { 1264 p->X[i] = x[i]; 1265 p->Y[i] = y[i]; 1266 } 1267 fe_1 (p->Z); 1268 fe_mul(p->T, x, y); 1269 } 1270 1271 static void ge_tobytes(u8 s[32], const ge *h) 1272 { 1273 fe recip, x, y; 1274 fe_invert(recip, h->Z); 1275 fe_mul(x, h->X, recip); 1276 fe_mul(y, h->Y, recip); 1277 fe_tobytes(s, y); 1278 s[31] ^= fe_isnegative(x) << 7; 1279 } 1280 1281 // Variable time! s must not be secret! 
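// It is only used to decode public keys during signature verification,
// which are public anyway. Note that it returns the negated point -A;
// crypto_check() below relies on that.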
1282 static int ge_frombytes_neg(ge *h, const u8 s[32]) 1283 { 1284 static const fe d = { 1285 -10913610, 13857413, -15372611, 6949391, 114729, 1286 -8787816, -6275908, -3247719, -18696448, -12055116 1287 } ; 1288 static const fe sqrtm1 = { 1289 -32595792, -7943725, 9377950, 3500415, 12389472, 1290 -272473, -25146209, -2005654, 326686, 11406482 1291 } ; 1292 fe u, v, v3, vxx, check; 1293 fe_frombytes(h->Y, s); 1294 fe_1(h->Z); 1295 fe_sq(u, h->Y); // y^2 1296 fe_mul(v, u, d); 1297 fe_sub(u, u, h->Z); // u = y^2-1 1298 fe_add(v, v, h->Z); // v = dy^2+1 1299 1300 fe_sq(v3, v); 1301 fe_mul(v3, v3, v); // v3 = v^3 1302 fe_sq(h->X, v3); 1303 fe_mul(h->X, h->X, v); 1304 fe_mul(h->X, h->X, u); // x = uv^7 1305 1306 fe_pow22523(h->X, h->X); // x = (uv^7)^((q-5)/8) 1307 fe_mul(h->X, h->X, v3); 1308 fe_mul(h->X, h->X, u); // x = uv^3(uv^7)^((q-5)/8) 1309 1310 fe_sq(vxx, h->X); 1311 fe_mul(vxx, vxx, v); 1312 fe_sub(check, vxx, u); // vx^2-u 1313 if (fe_isnonzero(check)) { 1314 fe_add(check, vxx, u); // vx^2+u 1315 if (fe_isnonzero(check)) return -1; 1316 fe_mul(h->X, h->X, sqrtm1); 1317 } 1318 1319 if (fe_isnegative(h->X) == (s[31] >> 7)) { 1320 fe_neg(h->X, h->X); 1321 } 1322 fe_mul(h->T, h->X, h->Y); 1323 return 0; 1324 } 1325 1326 static void ge_add(ge *s, const ge *p, const ge *q) 1327 { 1328 static const fe D2 = { // - 2 * 121665 / 121666 1329 0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052, 1330 0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67 1331 }; 1332 fe a, b, c, d, e, f, g, h; 1333 // A = (Y1-X1) * (Y2-X2) 1334 // B = (Y1+X1) * (Y2+X2) 1335 fe_sub(a, p->Y, p->X); fe_sub(h, q->Y, q->X); fe_mul(a, a, h); 1336 fe_add(b, p->X, p->Y); fe_add(h, q->X, q->Y); fe_mul(b, b, h); 1337 fe_mul(c, p->T, q->T); fe_mul(c, c, D2 ); // C = T1 * k * T2 1338 fe_add(d, p->Z, p->Z); fe_mul(d, d, q->Z); // D = Z1 * 2 * Z2 1339 fe_sub(e, b, a); // E = B - A 1340 fe_sub(f, d, c); // F = D - C 1341 fe_add(g, d, c); // G = D + C 1342 fe_add(h, b, a); // H = B + A 1343 fe_mul(s->X, e, f); // X3 = E * F 1344 fe_mul(s->Y, g, h); // Y3 = G * H 1345 fe_mul(s->Z, f, g); // Z3 = F * G 1346 fe_mul(s->T, e, h); // T3 = E * H 1347 } 1348 1349 // Performing the scalar multiplication directly in Twisted Edwards 1350 // space would be simpler, but also slower. So we do it in Montgomery 1351 // space instead. The sign of the Y coordinate however gets lost in 1352 // translation, so we use a dirty trick to recover it. 1353 static void ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) 1354 { 1355 // sqrt(-486664) 1356 static const fe K = { 54885894, 25242303, 55597453, 9067496, 51808079, 1357 33312638, 25456129, 14121551, 54921728, 3972023 }; 1358 1359 // convert q to montgomery format 1360 fe x1, y1, z1, x2, z2, x3, z3, t1, t2, t3, t4; 1361 fe_sub(z1, q->Z, q->Y); fe_mul(z1, z1, q->X); fe_invert(z1, z1); 1362 fe_add(t1, q->Z, q->Y); 1363 fe_mul(x1, q->X, t1 ); fe_mul(x1, x1, z1); 1364 fe_mul(y1, q->Z, t1 ); fe_mul(y1, y1, z1); fe_mul(y1, K, y1); 1365 fe_1(z1); // implied in the ladder, needed to convert back. 1366 1367 // montgomery scalarmult 1368 x25519_ladder(x1, x2, z2, x3, z3, scalar); 1369 1370 // Recover the y coordinate (Katsuyuki Okeya & Kouichi Sakurai, 2001) 1371 // Note the shameless reuse of x1: (x1, y1, z1) will correspond to 1372 // what was originally (x2, z2).
1373 fe_mul(t1, x1, z2); // t1 = x1 * z2 1374 fe_add(t2, x2, t1); // t2 = x2 + t1 1375 fe_sub(t3, x2, t1); // t3 = x2 - t1 1376 fe_sq (t3, t3); // t3 = t3^2 1377 fe_mul(t3, t3, x3); // t3 = t3 * x3 1378 fe_mul973324(t1, z2);// t1 = 2a * z2 1379 fe_add(t2, t2, t1); // t2 = t2 + t1 1380 fe_mul(t4, x1, x2); // t4 = x1 * x2 1381 fe_add(t4, t4, z2); // t4 = t4 + z2 1382 fe_mul(t2, t2, t4); // t2 = t2 * t4 1383 fe_mul(t1, t1, z2); // t1 = t1 * z2 1384 fe_sub(t2, t2, t1); // t2 = t2 - t1 1385 fe_mul(t2, t2, z3); // t2 = t2 * z3 1386 fe_add(t1, y1, y1); // t1 = y1 + y1 1387 fe_mul(t1, t1, z2); // t1 = t1 * z2 1388 fe_mul(t1, t1, z3); // t1 = t1 * z3 1389 fe_mul(x1, t1, x2); // x1 = t1 * x2 1390 fe_sub(y1, t2, t3); // y1 = t2 - t3 1391 fe_mul(z1, t1, z2); // z1 = t1 * z2 1392 1393 // convert back to twisted edwards 1394 fe_sub(t1 , x1, z1); 1395 fe_add(t2 , x1, z1); 1396 fe_mul(x1 , K , x1); 1397 fe_mul(p->X, x1, t2); 1398 fe_mul(p->Y, y1, t1); 1399 fe_mul(p->Z, y1, t2); 1400 fe_mul(p->T, x1, t1); 1401 } 1402 1403 static void ge_scalarmult_base(ge *p, const u8 scalar[32]) 1404 { 1405 // Calls the general ge_scalarmult() with the base point. 1406 // Other implementations use a precomputed table, but it 1407 // takes way too much code. 1408 static const fe X = { 1409 0x325d51a, 0x18b5823, 0x0f6592a, 0x104a92d, 0x1a4b31d, 1410 0x1d6dc5c, 0x27118fe, 0x07fd814, 0x13cd6e5, 0x085a4db}; 1411 static const fe Y = { 1412 0x2666658, 0x1999999, 0x0cccccc, 0x1333333, 0x1999999, 1413 0x0666666, 0x3333333, 0x0cccccc, 0x2666666, 0x1999999}; 1414 ge base_point; 1415 ge_from_xy(&base_point, X, Y); 1416 ge_scalarmult(p, &base_point, scalar); 1417 } 1418 1419 static void modL(u8 *r, i64 x[64]) 1420 { 1421 static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 1422 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, 1423 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1424 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; 1425 for (unsigned i = 63; i >= 32; i--) { 1426 i64 carry = 0; 1427 FOR (j, i-32, i-12) { 1428 x[j] += carry - 16 * x[i] * L[j - (i - 32)]; 1429 carry = (x[j] + 128) >> 8; 1430 x[j] -= carry * (1 << 8); 1431 } 1432 x[i-12] += carry; 1433 x[i] = 0; 1434 } 1435 i64 carry = 0; 1436 FOR(i, 0, 32) { 1437 x[i] += carry - (x[31] >> 4) * L[i]; 1438 carry = x[i] >> 8; 1439 x[i] &= 255; 1440 } 1441 FOR(i, 0, 32) { 1442 x[i] -= carry * L[i]; 1443 } 1444 FOR(i, 0, 32) { 1445 x[i+1] += x[i] >> 8; 1446 r[i ] = x[i] & 255; 1447 } 1448 } 1449 1450 static void reduce(u8 r[64]) 1451 { 1452 i64 x[64]; 1453 FOR(i, 0, 64) { 1454 x[i] = (u64) r[i]; 1455 r[i] = 0; 1456 } 1457 modL(r, x); 1458 } 1459 1460 // hashes R || A || M, reduces it modulo L 1461 static void hash_ram(u8 k[64], const u8 R[32], const u8 A[32], 1462 const u8 *M, size_t M_size) 1463 { 1464 HASH_CTX ctx; 1465 HASH_INIT (&ctx); 1466 HASH_UPDATE(&ctx, R , 32 ); 1467 HASH_UPDATE(&ctx, A , 32 ); 1468 HASH_UPDATE(&ctx, M , M_size); 1469 HASH_FINAL (&ctx, k); 1470 reduce(k); 1471 } 1472 1473 void crypto_sign_public_key(u8 public_key[32], 1474 const u8 secret_key[32]) 1475 { 1476 u8 a[64]; 1477 HASH(a, secret_key, 32); 1478 trim_scalar(a); 1479 ge A; 1480 ge_scalarmult_base(&A, a); 1481 ge_tobytes(public_key, &A); 1482 } 1483 1484 void crypto_sign(u8 signature[64], 1485 const u8 secret_key[32], 1486 const u8 public_key[32], 1487 const u8 *message, size_t message_size) 1488 { 1489 u8 a[64], *prefix = a + 32; 1490 HASH(a, secret_key, 32); 1491 trim_scalar(a); 1492 1493 u8 pk_buf[32]; 1494 const u8 *pk = public_key; 1495 if (public_key == 0) { 1496
crypto_sign_public_key(pk_buf, secret_key); 1497 pk = pk_buf; 1498 } 1499 1500 // Constructs the "random" nonce from the secret key and message. 1501 // An actual random number would work just fine, and would save us 1502 // the trouble of hashing the message twice. If we did that 1503 // however, the user could fuck it up and reuse the nonce. 1504 u8 r[64]; 1505 HASH_CTX ctx; 1506 HASH_INIT (&ctx); 1507 HASH_UPDATE(&ctx, prefix , 32 ); 1508 HASH_UPDATE(&ctx, message, message_size); 1509 HASH_FINAL (&ctx, r); 1510 reduce(r); 1511 1512 // first half of the signature = "random" nonce times basepoint 1513 ge R; 1514 ge_scalarmult_base(&R, r); 1515 ge_tobytes(signature, &R); 1516 1517 u8 h_ram[64]; 1518 hash_ram(h_ram, signature, pk, message, message_size); 1519 1520 i64 s[64]; // s = r + h_ram * a 1521 FOR(i, 0, 32) { s[i] = (u64) r[i]; } 1522 FOR(i, 32, 64) { s[i] = 0; } 1523 FOR(i, 0, 32) { 1524 FOR(j, 0, 32) { 1525 s[i+j] += h_ram[i] * (u64) a[j]; 1526 } 1527 } 1528 modL(signature + 32, s); // second half of the signature = s 1529 } 1530 1531 int crypto_check(const u8 signature[64], 1532 const u8 public_key[32], 1533 const u8 *message, size_t message_size) 1534 { 1535 ge A, p, sB, diff; 1536 u8 h_ram[64], R_check[32]; 1537 if (ge_frombytes_neg(&A, public_key)) { // -A 1538 return -1; 1539 } 1540 hash_ram(h_ram, signature, public_key, message, message_size); 1541 ge_scalarmult(&p, &A, h_ram); // p = -A*h_ram 1542 ge_scalarmult_base(&sB, signature + 32); 1543 ge_add(&diff, &p, &sB); // diff = s - A*h_ram 1544 ge_tobytes(R_check, &diff); 1545 return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? OK : fail 1546 } 1547 1548 //////////////////// 1549 /// Key exchange /// 1550 //////////////////// 1551 int crypto_key_exchange(u8 shared_key[32], 1552 const u8 your_secret_key [32], 1553 const u8 their_public_key[32]) 1554 { 1555 static const u8 zero[16] = {0}; 1556 u8 shared_secret[32]; 1557 int status = crypto_x25519(shared_secret, your_secret_key, their_public_key); 1558 crypto_chacha20_H(shared_key, shared_secret, zero); 1559 return status; 1560 } 1561 1562 //////////////////////////////// 1563 /// Authenticated encryption /// 1564 //////////////////////////////// 1565 static void authenticate2(u8 mac[16] , const u8 auth_key[32], 1566 const u8 *t1, size_t size1, 1567 const u8 *t2, size_t size2) 1568 { 1569 crypto_poly1305_ctx a_ctx; 1570 crypto_poly1305_init (&a_ctx, auth_key); 1571 crypto_poly1305_update(&a_ctx, t1, size1); 1572 crypto_poly1305_update(&a_ctx, t2, size2); 1573 crypto_poly1305_final (&a_ctx, mac); 1574 } 1575 1576 void crypto_aead_lock(u8 mac[16], 1577 u8 *cipher_text, 1578 const u8 key[32], 1579 const u8 nonce[24], 1580 const u8 *ad , size_t ad_size, 1581 const u8 *plain_text, size_t text_size) 1582 { // encrypt then mac 1583 u8 auth_key[32]; 1584 crypto_chacha_ctx e_ctx; 1585 crypto_chacha20_x_init (&e_ctx, key, nonce); 1586 crypto_chacha20_stream (&e_ctx, auth_key, 32); 1587 crypto_chacha20_encrypt(&e_ctx, cipher_text, plain_text, text_size); 1588 authenticate2(mac, auth_key, ad, ad_size, cipher_text, text_size); 1589 } 1590 1591 int crypto_aead_unlock(u8 *plain_text, 1592 const u8 key[32], 1593 const u8 nonce[24], 1594 const u8 mac[16], 1595 const u8 *ad , size_t ad_size, 1596 const u8 *cipher_text, size_t text_size) 1597 { 1598 u8 auth_key[32], real_mac[16]; 1599 crypto_chacha_ctx e_ctx; 1600 crypto_chacha20_x_init(&e_ctx, key, nonce); 1601 crypto_chacha20_stream(&e_ctx, auth_key, 32); 1602 authenticate2(real_mac, auth_key, ad, ad_size, cipher_text, text_size); 
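// Encrypt-then-MAC: the tag is checked over ad and cipher_text, in
// constant time, before anything gets decrypted.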
1603 if (crypto_memcmp(real_mac, mac, 16)) { 1604 return -1; // reject forgeries 1605 } 1606 crypto_chacha20_encrypt(&e_ctx, plain_text, cipher_text, text_size); 1607 return 0; 1608 } 1609 1610 void crypto_lock(u8 mac[16], 1611 u8 *cipher_text, 1612 const u8 key[32], 1613 const u8 nonce[24], 1614 const u8 *plain_text, size_t text_size) 1615 { 1616 crypto_aead_lock(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size); 1617 } 1618 1619 int crypto_unlock(u8 *plain_text, 1620 const u8 key[32], 1621 const u8 nonce[24], 1622 const u8 mac[16], 1623 const u8 *cipher_text, size_t text_size) 1624 { 1625 return crypto_aead_unlock(plain_text, key, nonce, mac, 0, 0, 1626 cipher_text, text_size); 1627 }
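////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (not part of the library). Buffer names and the
// my_random_bytes() RNG below are hypothetical placeholders: Monocypher
// never generates randomness itself, the caller must bring an OS RNG.
//
//     u8 sk[32], pk[32], shared_key[32];
//     my_random_bytes(sk, 32);                  // hypothetical OS RNG
//     crypto_x25519_public_key(pk, sk);
//     if (crypto_key_exchange(shared_key, sk, their_pk) != 0) {
//         // their_pk gave an all-zero shared secret (low order point): reject
//     }
//
//     u8 nonce[24], mac[16];
//     my_random_bytes(nonce, 24);               // 24 byte nonces may be random
//     crypto_lock(mac, cipher_text, shared_key, nonce, plain_text, text_size);
//     if (crypto_unlock(plain_text, shared_key, nonce, mac,
//                       cipher_text, text_size)) {
//         // corrupted or forged message: ignore it
//     }
//
//     u8 sign_sk[32], sign_pk[32], sig[64];
//     my_random_bytes(sign_sk, 32);
//     crypto_sign_public_key(sign_pk, sign_sk);
//     crypto_sign(sig, sign_sk, sign_pk, msg, msg_size);
//     if (crypto_check(sig, sign_pk, msg, msg_size)) {
//         // invalid signature: reject
//     }
////////////////////////////////////////////////////////////////////////////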