medfall

A super great game engine

commit d9955ba697a3ef0d5d248c633705a7fb953d70f0
parent deb315bc225a227e18d4dc19da17f8c36c01fa65
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Wed, 25 Oct 2017 20:04:27 +0300

Monocypher 1.0.1

Diffstat:
libs/monocypher/monocypher.cc | 640+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
libs/monocypher/monocypher.h | 82++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
2 files changed, 446 insertions(+), 276 deletions(-)

diff --git a/libs/monocypher/monocypher.cc b/libs/monocypher/monocypher.cc
@@ -22,13 +22,19 @@
 #define HASH_FINAL  COMBINE2(HASH, _final)
 
 #define FOR(i, start, end) for (size_t (i) = (start); (i) < (end); (i)++)
-#define sv static void
-typedef uint8_t  u8;
+typedef uint8_t   u8;
 typedef uint32_t u32;
-typedef int32_t  i32;
-typedef int64_t  i64;
+typedef int32_t   i32;
+typedef int64_t   i64;
 typedef uint64_t u64;
 
+static u32 load24_le(const u8 s[3])
+{
+    return (u32)s[0]
+        | ((u32)s[1] <<  8)
+        | ((u32)s[2] << 16);
+}
+
 static u32 load32_le(const u8 s[4])
 {
     return (u32)s[0]
@@ -49,24 +55,24 @@ static u64 load64_le(const u8 s[8])
         | ((u64)s[7] << 56);
 }
 
-sv store32_le(u8 output[4], u32 input)
+static void store32_le(u8 out[4], u32 in)
 {
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
+    out[0] =  in        & 0xff;
+    out[1] = (in >>  8) & 0xff;
+    out[2] = (in >> 16) & 0xff;
+    out[3] = (in >> 24) & 0xff;
 }
 
-sv store64_le(u8 output[8], u64 input)
+static void store64_le(u8 out[8], u64 in)
 {
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
-    output[4] = (input >> 32) & 0xff;
-    output[5] = (input >> 40) & 0xff;
-    output[6] = (input >> 48) & 0xff;
-    output[7] = (input >> 56) & 0xff;
+    out[0] =  in        & 0xff;
+    out[1] = (in >>  8) & 0xff;
+    out[2] = (in >> 16) & 0xff;
+    out[3] = (in >> 24) & 0xff;
+    out[4] = (in >> 32) & 0xff;
+    out[5] = (in >> 40) & 0xff;
+    out[6] = (in >> 48) & 0xff;
+    out[7] = (in >> 56) & 0xff;
 }
 
 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
@@ -75,14 +81,18 @@ static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
 int crypto_memcmp(const u8 *p1, const u8 *p2, size_t n)
 {
     unsigned diff = 0;
-    FOR (i, 0, n) { diff |= (p1[i] ^ p2[i]); }
+    FOR (i, 0, n) {
+        diff |= (p1[i] ^ p2[i]);
+    }
     return (1 & ((diff - 1) >> 8)) - 1;
 }
 
 int crypto_zerocmp(const u8 *p, size_t n)
 {
     unsigned diff = 0;
-    FOR (i, 0, n) { diff |= p[i]; }
+    FOR (i, 0, n) {
+        diff |= p[i];
+    }
     return (1 & ((diff - 1) >> 8)) - 1;
 }
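Both comparison functions above run in time that depends only on n, never on the bytes themselves: diff accumulates every differing bit, and (1 & ((diff - 1) >> 8)) - 1 maps diff == 0 to 0 and anything else to -1. A minimal caller-side sketch, with hypothetical mac buffers:

    #include <stdint.h>
    #include "monocypher.h"

    /* Constant-time MAC check: never use memcmp() for this. */
    int mac_is_valid(const uint8_t expected[16], const uint8_t received[16])
    {
        /* crypto_memcmp returns 0 on equality, -1 otherwise,
           without data-dependent branches */
        return crypto_memcmp(expected, received, 16) == 0;
    }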
@@ -95,7 +105,7 @@ int crypto_zerocmp(const u8 *p, size_t n)
     a += b;  d ^= a;  d = rotl32(d,  8);    \
     c += d;  b ^= c;  b = rotl32(b,  7)
 
-sv chacha20_rounds(u32 out[16], const u32 in[16])
+static void chacha20_rounds(u32 out[16], const u32 in[16])
 {
     // The temporary variables make Chacha20 10% faster.
     u32 t0 = in[ 0];  u32 t1 = in[ 1];  u32 t2 = in[ 2];  u32 t3 = in[ 3];
@@ -119,7 +129,7 @@ sv chacha20_rounds(u32 out[16], const u32 in[16])
     out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
 }
 
-sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
+static void chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
 {
     // constant
     ctx->input[0] = load32_le((u8*)"expa");
@@ -132,6 +142,30 @@ sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
     }
 }
 
+static u8 chacha20_pool_byte(crypto_chacha_ctx *ctx)
+{
+    u32 pool_word = ctx->pool[ctx->pool_idx / 4];
+    u8  pool_byte = pool_word >> (8*(ctx->pool_idx % 4));
+    ctx->pool_idx++;
+    return pool_byte;
+}
+
+// Fill the pool if needed, update the counters
+static void chacha20_refill_pool(crypto_chacha_ctx *ctx)
+{
+    if (ctx->pool_idx == 64) {
+        chacha20_rounds(ctx->pool, ctx->input);
+        FOR (j, 0, 16) {
+            ctx->pool[j] += ctx->input[j];
+        }
+        ctx->pool_idx = 0;
+        ctx->input[12]++;
+        if (ctx->input[12] == 0) {
+            ctx->input[13]++;
+        }
+    }
+}
+
 void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16])
 {
     crypto_chacha_ctx ctx;
@@ -152,8 +186,8 @@ void crypto_chacha20_init(crypto_chacha_ctx *ctx,
                           const u8           key[32],
                           const u8           nonce[8])
 {
-    chacha20_init_key(ctx, key       );    // key
-    crypto_chacha20_set_ctr(ctx, 0);       // counter
+    chacha20_init_key      (ctx, key);     // key
+    crypto_chacha20_set_ctr(ctx, 0  );     // counter
     ctx->input[14] = load32_le(nonce + 0); // nonce
     ctx->input[15] = load32_le(nonce + 4); // nonce
 }
@@ -169,35 +203,53 @@ void crypto_chacha20_x_init(crypto_chacha_ctx *ctx,
 
 void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, u64 ctr)
 {
-    ctx->input[12]  = ctr & 0xffffffff;
-    ctx->input[13]  = ctr >> 32;
-    ctx->pool_index = 64; // The random pool (re)starts empty
+    ctx->input[12] = ctr & 0xffffffff;
+    ctx->input[13] = ctr >> 32;
+    ctx->pool_idx  = 64;  // The random pool (re)starts empty
 }
 
 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
                              u8                *cipher_text,
                              const u8          *plain_text,
-                             size_t             message_size)
-{
-    FOR (i, 0, message_size) {
-        // refill the pool if empty
-        if (ctx->pool_index == 64) {
-            // fill the pool
-            u32 buffer[16];
-            chacha20_rounds(buffer, ctx->input);
-            FOR (j, 0, 16) {
-                store32_le(ctx->random_pool + j*4, buffer[j] + ctx->input[j]);
-            }
-            // update the counters
-            ctx->pool_index = 0;
-            ctx->input[12]++;
-            if (ctx->input[12] == 0) { ctx->input[13]++; }
+                             size_t             text_size)
+{
+    // Align ourselves with 4 byte words
+    while (ctx->pool_idx % 4 != 0 && text_size > 0) {
+        u8 stream = chacha20_pool_byte(ctx);
+        u8 plain  = 0;
+        if (plain_text != 0) {
+            plain = *plain_text;
+            plain_text++;
         }
-        // use the pool for encryption (or random stream)
-        cipher_text[i] =
-            (plain_text == 0 ? 0 : plain_text[i]) // ignore null plaintext
-            ^ ctx->random_pool[ctx->pool_index];
-        ctx->pool_index++;
+        *cipher_text = stream ^ plain;
+        text_size--;
+        cipher_text++;
+    }
+    // Main processing by 4 byte chunks
+    size_t nb_words  = text_size / 4;
+    size_t remainder = text_size % 4;
+    FOR (i, 0, nb_words) {
+        chacha20_refill_pool(ctx);
+        u32 txt = 0;
+        if (plain_text) {
+            txt = load32_le(plain_text);
+            plain_text += 4;
+        }
+        store32_le(cipher_text, ctx->pool[ctx->pool_idx / 4] ^ txt);
+        cipher_text   += 4;
+        ctx->pool_idx += 4;
+    }
+    // Remaining input, byte by byte
+    FOR (i, 0, remainder) {
+        chacha20_refill_pool(ctx);
+        u8 stream = chacha20_pool_byte(ctx);
+        u8 plain  = 0;
+        if (plain_text != 0) {
+            plain = *plain_text;
+            plain_text++;
+        }
+        *cipher_text = stream ^ plain;
+        cipher_text++;
    }
 }
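The word-at-a-time rewrite keeps the external behaviour of crypto_chacha20_encrypt: a null plain_text still yields the raw keystream, and decryption is the same XOR. A round-trip sketch with placeholder key and nonce:

    #include <stdint.h>
    #include "monocypher.h"

    void chacha20_roundtrip(void)
    {
        crypto_chacha_ctx ctx;
        uint8_t key[32]  = {0};  /* placeholder; use random secret bytes */
        uint8_t nonce[8] = {0};  /* placeholder; never reuse with a key  */
        uint8_t text[12] = "hello world";
        uint8_t cipher[sizeof text];

        crypto_chacha20_init(&ctx, key, nonce);
        crypto_chacha20_encrypt(&ctx, cipher, text, sizeof text);

        /* decryption regenerates the same keystream from counter 0 */
        crypto_chacha20_init(&ctx, key, nonce);
        crypto_chacha20_encrypt(&ctx, text, cipher, sizeof text);
    }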
@@ -219,7 +271,7 @@ void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
 //     ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
 // Postcondition:
 //     ctx->h <= 4_87ffffe4_8fffffe2_97ffffe0_9ffffffa
-sv poly_block(crypto_poly1305_ctx *ctx)
+static void poly_block(crypto_poly1305_ctx *ctx)
 {
     // s = h + c, without carry propagation
     const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
@@ -234,9 +286,9 @@
     const u32 r2  = ctx->r[2];      // r2  <= 0ffffffc
     const u32 r3  = ctx->r[3];      // r3  <= 0ffffffc
     const u32 rr0 = (r0 >> 2) * 5;  // rr0 <= 13fffffb // lose 2 bits...
-    const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // * 5 trick
-    const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // * 5 trick
-    const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // * 5 trick
+    const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
+    const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
+    const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
 
     // (h + c) * r, without carry propagation
     const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 + s4*rr0;//<=97ffffe007fffff8
@@ -262,47 +314,82 @@
 }
 
 // (re-)initializes the input counter and input buffer
-sv poly_clear_c(crypto_poly1305_ctx *ctx)
+static void poly_clear_c(crypto_poly1305_ctx *ctx)
+{
+    FOR (i, 0, 4) {
+        ctx->c[i] = 0;
+    }
+    ctx->c_idx = 0;
+}
+
+static void poly_end_block(crypto_poly1305_ctx *ctx)
+{
+    if (ctx->c_idx == 16) {
+        poly_block(ctx);
+        poly_clear_c(ctx);
+    }
+}
+
+static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
 {
-    FOR (i, 0, 4) { ctx->c[i] = 0; }
-    ctx->c_index = 0;
+    size_t word = ctx->c_idx / 4;
+    size_t byte = ctx->c_idx % 4;
+    ctx->c[word] |= (u32)input << (byte * 8);
+    ctx->c_idx++;
 }
 
 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
 {
-    // constant init
-    FOR (i, 0, 5) { ctx->h [i] = 0; } // initial hash: zero
-    ctx->c  [4] = 1; // add 2^130 to every input block
-    ctx->pad[4] = 0; // poly_add() compatibility
+    // Initial hash is zero
+    FOR (i, 0, 5) {
+        ctx->h [i] = 0;
+    }
+    // add 2^130 to every input block
+    ctx->c [4] = 1;
    poly_clear_c(ctx);
     // load r and pad (r has some of its bits cleared)
-    /**/            ctx->r  [0] = load32_le(key           ) & 0x0fffffff;
-    FOR (i, 1, 4) { ctx->r  [i] = load32_le(key + i*4     ) & 0x0ffffffc; }
-    FOR (i, 0, 4) { ctx->pad[i] = load32_le(key + i*4 + 16);              }
+    FOR (i, 0, 1) { ctx->r  [0] = load32_le(key           ) & 0x0fffffff; }
+    FOR (i, 1, 4) { ctx->r  [i] = load32_le(key + i*4     ) & 0x0ffffffc; }
+    FOR (i, 0, 4) { ctx->pad[i] = load32_le(key + i*4 + 16);              }
 }
 
 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
-                            const u8 *msg, size_t msg_size)
+                            const u8 *message, size_t message_size)
 {
-    FOR (i, 0, msg_size) {
-        if (ctx->c_index == 16) {
-            poly_block(ctx);
-            poly_clear_c(ctx);
-        }
-        // feed the input buffer
-        ctx->c[ctx->c_index / 4] |= (u32)msg[i] << ((ctx->c_index % 4) * 8);
-        ctx->c_index++;
+    // Align ourselves with 4 byte words
+    while (ctx->c_idx % 4 != 0 && message_size > 0) {
+        poly_take_input(ctx, *message);
+        message++;
+        message_size--;
+    }
+
+    // Process the input 4 bytes at a time
+    size_t nb_words  = message_size / 4;
+    size_t remainder = message_size % 4;
+    FOR (i, 0, nb_words) {
+        poly_end_block(ctx);
+        ctx->c[ctx->c_idx / 4] = load32_le(message);
+        message    += 4;
+        ctx->c_idx += 4;
+    }
+
+    // Input the remaining bytes
+    if (remainder != 0) {
+        poly_end_block(ctx);
+    }
+    FOR (i, 0, remainder) {
+        poly_take_input(ctx, message[i]);
     }
 }
 
 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
 {
     // Process the last block (if any)
-    if (ctx->c_index != 0) {
+    if (ctx->c_idx != 0) {
         // move the final 1 according to remaining input length
         // (We may add less than 2^130 to the last input block)
         ctx->c[4] = 0;
-        ctx->c[ctx->c_index / 4] |= (u32)1 << ((ctx->c_index % 4) * 8);
+        poly_take_input(ctx, 1);
         // one last hash update
         poly_block(ctx);
     }
@@ -325,12 +412,12 @@ void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
     u += (i64)(ctx->h[3]) + ctx->pad[3];  store32_le(mac + 12, u);
 }
 
-void crypto_poly1305_auth(u8 mac[16], const u8 *msg,
-                          size_t msg_size, const u8 key[32])
+void crypto_poly1305_auth(u8 mac[16], const u8 *message,
+                          size_t message_size, const u8 key[32])
 {
     crypto_poly1305_ctx ctx;
     crypto_poly1305_init  (&ctx, key);
-    crypto_poly1305_update(&ctx, msg, msg_size);
+    crypto_poly1305_update(&ctx, message, message_size);
     crypto_poly1305_final (&ctx, mac);
 }
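Poly1305 remains a one-time authenticator: a key must never authenticate two different messages. A one-shot sketch with a placeholder key:

    #include <stdint.h>
    #include "monocypher.h"

    void poly1305_example(const uint8_t *message, size_t message_size)
    {
        uint8_t one_time_key[32] = {0}; /* placeholder; fresh random key per message */
        uint8_t mac[16];
        crypto_poly1305_auth(mac, message, message_size, one_time_key);
    }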
@@ -345,22 +432,25 @@ static const u64 iv[8] = {
 };
 
 // increment the input offset
-sv incr(crypto_blake2b_ctx *ctx)
+static void blake2b_incr(crypto_blake2b_ctx *ctx)
 {
     u64   *x = ctx->input_offset;
-    size_t y = ctx->buffer_idx;
-    x[0] += y;                 // increment low word
-    if (x[0] < y) { x[1]++; }  // carry overflow to high word
+    size_t y = ctx->input_idx;
+    x[0] += y;
+    if (x[0] < y) {
+        x[1]++;
+    }
 }
 
-// pad the buffer with zeroes
-sv pad(crypto_blake2b_ctx *ctx)
+static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input)
 {
-    FOR (i, ctx->buffer_idx, 128) { ctx->buffer[i] = 0; }
-    ctx->buffer_idx = 128; // mark the buffer as filled
+    size_t word = ctx->input_idx / 8;
+    size_t byte = ctx->input_idx % 8;
+    ctx->input[word] |= (u64)input << (byte * 8);
+    ctx->input_idx++;
 }
 
-sv compress(crypto_blake2b_ctx *ctx, int is_last_block)
+static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
 {
     static const u8 sigma[12][16] = {
         {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -377,10 +467,6 @@
         { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
     };
 
-    // load input buffer
-    u64 input[16];
-    FOR(i, 0, 16) { input[i] = load64_le(ctx->buffer + i*8); }
-
     // init work vector
     u64 v[16];
     FOR (i, 0, 8) {
@@ -389,9 +475,12 @@
     }
     v[12] ^= ctx->input_offset[0];
     v[13] ^= ctx->input_offset[1];
-    if (is_last_block) { v[14] = ~v[14]; }
+    if (is_last_block) {
+        v[14] = ~v[14];
+    }
 
     // mangle work vector
+    uint64_t *input = ctx->input;
     FOR (i, 0, 12) {
 #define BLAKE2_G(v, a, b, c, d, x, y)                      \
         v[a] += v[b] + x;  v[d] = rotr64(v[d] ^ v[a], 32); \
@@ -409,27 +498,47 @@
         BLAKE2_G(v, 3, 4, 9, 14, input[sigma[i][14]], input[sigma[i][15]]);
     }
     // update hash
-    FOR (i, 0, 8) { ctx->hash[i] ^= v[i] ^ v[i+8]; }
-    // mark buffer as empty
-    ctx->buffer_idx = 0;
+    FOR (i, 0, 8) {
+        ctx->hash[i] ^= v[i] ^ v[i+8];
+    }
+}
+
+static void blake2b_reset_input(crypto_blake2b_ctx *ctx)
+{
+    FOR(i, 0, 16) {
+        ctx->input[i] = 0;
+    }
+    ctx->input_idx = 0;
+}
+
+static void blake2b_end_block(crypto_blake2b_ctx *ctx)
+{
+    if (ctx->input_idx == 128) {   // If buffer is full,
+        blake2b_incr(ctx);         // update the input offset
+        blake2b_compress(ctx, 0);  // and compress the (not last) block
+        blake2b_reset_input(ctx);
+    }
 }
 
-void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,
+void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
                                  const u8 *key, size_t key_size)
 {
     // initial hash
-    FOR (i, 0, 8) { ctx->hash[i] = iv[i]; }
-    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size;
+    FOR (i, 0, 8) {
+        ctx->hash[i] = iv[i];
+    }
+    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
 
     ctx->input_offset[0] = 0;         // begining of the input, no offset
     ctx->input_offset[1] = 0;         // begining of the input, no offset
-    ctx->buffer_idx      = 0;         // buffer is empty
-    ctx->hash_size       = out_size;  // remember the hash size we want
+    ctx->input_idx       = 0;         // buffer is empty
+    ctx->hash_size       = hash_size; // remember the hash size we want
+    blake2b_reset_input(ctx);         // clear the input buffer
 
     // if there is a key, the first block is that key
     if (key_size > 0) {
         crypto_blake2b_update(ctx, key, key_size);
-        pad(ctx);
+        ctx->input_idx = 128;
     }
 }
@@ -438,41 +547,61 @@
 void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
 {
     crypto_blake2b_general_init(ctx, 64, 0, 0);
 }
 
-void crypto_blake2b_update(crypto_blake2b_ctx *ctx, const u8 *in, size_t in_size)
+void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
+                           const u8 *message, size_t message_size)
 {
-    FOR (i, 0, in_size) {
-        if (ctx->buffer_idx == 128) { // If buffer is full,
-            incr(ctx);                // update the input offset
-            compress(ctx, 0);         // compress the (not last) block
-        }
-        ctx->buffer[ctx->buffer_idx] = in[i];
-        ctx->buffer_idx++;
+    // Align ourselves with 8 byte words
+    while (ctx->input_idx % 8 != 0 && message_size > 0) {
+        blake2b_set_input(ctx, *message);
+        message++;
+        message_size--;
+    }
+
+    // Process the input 8 bytes at a time
+    size_t nb_words  = message_size / 8;
+    size_t remainder = message_size % 8;
+    FOR (i, 0, nb_words) {
+        blake2b_end_block(ctx);
+        ctx->input[ctx->input_idx / 8] = load64_le(message);
+        message        += 8;
+        ctx->input_idx += 8;
+    }
+
+    // Load the remainder
+    if (remainder != 0) {
+        blake2b_end_block(ctx);
+    }
+    FOR (i, 0, remainder) {
+        blake2b_set_input(ctx, message[i]);
     }
 }
 
-void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *out)
+void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
 {
-    incr(ctx);        // update the input offset (the last block may not be full)
-    pad(ctx);         // pad the last block with zeroes
-    compress(ctx, 1); // compress the last block
-    FOR (i, 0, ctx->hash_size) {
-        out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xff;
+    blake2b_incr(ctx);        // update the input offset
+    blake2b_compress(ctx, 1); // compress the last block
+    size_t nb_words = ctx->hash_size / 8;
+    FOR (i, 0, nb_words) {
+        store64_le(hash + i*8, ctx->hash[i]);
+    }
+    FOR (i, nb_words * 8, ctx->hash_size) {
+        hash[i] = (ctx->hash[i / 8] >> (8 * (i % 8))) & 0xff;
    }
 }
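The streaming interface is unchanged by the word-at-a-time update loop: splitting the input across updates yields the same digest as a single call. A sketch:

    #include <stdint.h>
    #include "monocypher.h"

    void blake2b_split_hash(const uint8_t *part1, size_t size1,
                            const uint8_t *part2, size_t size2,
                            uint8_t hash[64])
    {
        crypto_blake2b_ctx ctx;
        crypto_blake2b_init(&ctx);        /* unkeyed, 64 byte digest */
        crypto_blake2b_update(&ctx, part1, size1);
        crypto_blake2b_update(&ctx, part2, size2);
        crypto_blake2b_final(&ctx, hash); /* equals hashing part1 || part2 */
    }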
-void crypto_blake2b_general(u8 *out, size_t out_size,
-                            const u8 *key, size_t key_size,
-                            const u8 *in, size_t in_size)
+void crypto_blake2b_general(u8 *hash   , size_t hash_size,
+                            const u8 *key    , size_t key_size,
+                            const u8 *message, size_t message_size)
 {
     crypto_blake2b_ctx ctx;
-    crypto_blake2b_general_init(&ctx, out_size, key, key_size);
-    crypto_blake2b_update(&ctx, in, in_size);
-    crypto_blake2b_final(&ctx, out);
+    crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
+    crypto_blake2b_update(&ctx, message, message_size);
+    crypto_blake2b_final(&ctx, hash);
 }
 
-void crypto_blake2b(u8 out[64], const u8 *in, size_t in_size)
+void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
 {
-    crypto_blake2b_general(out, 64, 0, 0, in, in_size);
+    crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
 }
 
@@ -481,38 +610,43 @@
 ////////////////
 // references to R, Z, Q etc. come from the spec
-typedef struct { u64 a[128]; } block; // 1024 octets
+// Argon2 operates on 1024 byte blocks.
+typedef struct { u64 a[128]; } block;
 
 static u32 min(u32 a, u32 b) { return a <= b ? a : b; }
 
 // updates a blake2 hash with a 32 bit word, little endian.
-sv blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
+static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
 {
     u8 buf[4];
     store32_le(buf, input);
     crypto_blake2b_update(ctx, buf, 4);
 }
 
-sv load_block(block *b, const u8 bytes[1024])
+static void load_block(block *b, const u8 bytes[1024])
 {
-    FOR (i, 0, 128) { b->a[i] = load64_le(bytes + i*8); }
+    FOR (i, 0, 128) {
+        b->a[i] = load64_le(bytes + i*8);
+    }
 }
 
-sv store_block(u8 bytes[1024], const block *b)
+static void store_block(u8 bytes[1024], const block *b)
 {
-    FOR (i, 0, 128) { store64_le(bytes + i*8, b->a[i]); }
+    FOR (i, 0, 128) {
+        store64_le(bytes + i*8, b->a[i]);
+    }
 }
 
-// type of copy_block() and xor_block()
-typedef void (*copy_fun) (block*, const block*);
-sv copy_block(block *o, const block *in) { FOR (i, 0, 128) o->a[i] =  in->a[i]; }
-sv xor_block (block *o, const block *in) { FOR (i, 0, 128) o->a[i] ^= in->a[i]; }
+static void copy_block(block *o,const block*in){FOR(i,0,128) o->a[i] = in->a[i];}
+static void  xor_block(block *o,const block*in){FOR(i,0,128) o->a[i]^= in->a[i];}
 
 // Hash with a virtually unlimited digest size.
 // Doesn't extract more entropy than the base hash function.
 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
-sv extended_hash(u8 *digest, u32 digest_size,
-                 const u8 *input , u32 input_size)
+// (One could use a stream cipher with a seed hash as the key, but
+// this would introduce another dependency, and another point of failure.)
+static void extended_hash(u8 *digest, u32 digest_size,
                          const u8 *input , u32 input_size)
 {
     crypto_blake2b_ctx ctx;
     crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0);
@@ -554,7 +688,7 @@ sv extended_hash(u8 *digest, u32 digest_size,
     G(v2, v7, v8, v13);  G(v3, v4, v9, v14)
 
 // Core of the compression function G. Computes Z from R in place.
-sv g_rounds(block *work_block)
+static void g_rounds(block *work_block)
 {
     // column rounds (work_block = Q)
     for (int i = 0; i < 128; i += 16) {
@@ -581,31 +715,31 @@
 }
 
 // The compression function G (copy version for the first pass)
-sv g_copy(block *result, const block *x, const block *y)
+static void g_copy(block *result, const block *x, const block *y)
 {
     block tmp;
-    copy_block(&tmp, x);      // tmp    = X
-    xor_block (&tmp, y);      // tmp    = X ^ Y = R
-    copy_block(result, &tmp); // result = R
-    g_rounds(&tmp);           // tmp    = Z
-    xor_block(result, &tmp);  // result = R ^ Z
+    copy_block(&tmp  , x   ); // tmp    = X
+    xor_block (&tmp  , y   ); // tmp    = X ^ Y = R
+    copy_block(result, &tmp); // result = R        (only difference with g_xor)
+    g_rounds  (&tmp);         // tmp    = Z
+    xor_block (result, &tmp); // result = R ^ Z
 }
 
 // The compression function G (xor version for subsequent passes)
-sv g_xor(block *result, const block *x, const block *y)
+static void g_xor(block *result, const block *x, const block *y)
 {
     block tmp;
-    copy_block(&tmp, x);      // tmp    = X
-    xor_block (&tmp, y);      // tmp    = X ^ Y = R
-    xor_block(result, &tmp);  // result = R ^ old
-    g_rounds(&tmp);           // tmp    = Z
-    xor_block(result, &tmp);  // result = R ^ old ^ Z
+    copy_block(&tmp  , x   ); // tmp    = X
+    xor_block (&tmp  , y   ); // tmp    = X ^ Y = R
+    xor_block (result, &tmp); // result = R ^ old  (only difference with g_copy)
+    g_rounds  (&tmp);         // tmp    = Z
+    xor_block (result, &tmp); // result = R ^ old ^ Z
 }
 
 // unary version of the compression function.
 // The missing argument is implied zero.
 // Does the transformation in place.
-sv unary_g(block *work_block)
+static void unary_g(block *work_block)
 {
     // work_block == R
     block tmp;
@@ -614,14 +748,25 @@
     xor_block(work_block, &tmp); // work_block = Z ^ R
 }
 
+// Argon2i uses a kind of stream cipher to determine which reference
+// block it will take to synthesise the next block. This context holds
+// that stream's state. (It's very similar to Chacha20. The block b
+// is analogous to Chacha's own pool)
 typedef struct {
     block b;
-    u32 pass_number; u32 slice_number;
-    u32 nb_blocks; u32 nb_iterations;
-    u32 ctr; u32 offset;
+    u32 pass_number;
+    u32 slice_number;
+    u32 nb_blocks;
+    u32 nb_iterations;
+    u32 ctr;
+    u32 offset;
 } gidx_ctx;
 
-sv gidx_refresh(gidx_ctx *ctx)
+// The block in the context will determine array indices. To avoid
+// timing attacks, it only depends on public information. No looking
+// at a previous block to seed the next. This makes offline attacks
+// easier, but timing attacks are the bigger threat in many settings.
+static void gidx_refresh(gidx_ctx *ctx)
 {
     // seed the begining of the block...
     ctx->b.a[0] = ctx->pass_number;
@@ -639,9 +784,9 @@
     unary_g(&(ctx->b));
 }
 
-sv gidx_init(gidx_ctx *ctx,
-             u32 pass_number, u32 slice_number,
-             u32 nb_blocks, u32 nb_iterations)
+static void gidx_init(gidx_ctx *ctx,
+                      u32 pass_number, u32 slice_number,
+                      u32 nb_blocks, u32 nb_iterations)
 {
     ctx->pass_number   = pass_number;
     ctx->slice_number  = slice_number;
@@ -650,7 +795,7 @@
     ctx->ctr = 0;
 
     // Offset from the begining of the segment. For the first slice
-    // of the firs pass, we start at the *third* block, so the offset
+    // of the first pass, we start at the *third* block, so the offset
     // starts at 2, not 0.
     if (pass_number != 0 || slice_number != 0) {
         ctx->offset = 0;
@@ -698,7 +843,7 @@ static u32 gidx_next(gidx_ctx *ctx)
 }
 
 // Main algorithm
-void crypto_argon2i(u8 *tag, u32 tag_size,
+void crypto_argon2i(u8 *hash, u32 hash_size,
                     void *work_area, u32 nb_blocks, u32 nb_iterations,
                     const u8 *password, u32 password_size,
                     const u8 *salt, u32 salt_size,
@@ -712,7 +857,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
     crypto_blake2b_init(&ctx);
 
     blake_update_32     (&ctx, 1            ); // p: number of threads
-    blake_update_32     (&ctx, tag_size     );
+    blake_update_32     (&ctx, hash_size    );
     blake_update_32     (&ctx, nb_blocks    );
     blake_update_32     (&ctx, nb_iterations);
     blake_update_32     (&ctx, 0x13         ); // v: version number
@@ -750,7 +895,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
     // fill (then re-fill) the rest of the blocks
     FOR (pass_number, 0, nb_iterations) {
-        int first_pass = pass_number == 0;
+        int first_pass  = pass_number == 0;
 
         FOR (segment, 0, 4) {
             gidx_ctx ctx;
@@ -775,29 +920,29 @@
             }
         }
     }
-    // hash the very last block with H' into the output tag
+    // hash the very last block with H' into the output hash
     u8 final_block[1024];
     store_block(final_block, blocks + (nb_blocks - 1));
-    extended_hash(tag, tag_size, final_block, 1024);
+    extended_hash(hash, hash_size, final_block, 1024);
 }
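crypto_argon2i expects a caller-supplied work area of nb_blocks kilobytes. A usage sketch; the 100 MiB / 3 iteration parameters are illustrative only and should be tuned to the target hardware:

    #include <stdint.h>
    #include <stdlib.h>
    #include "monocypher.h"

    int argon2i_example(uint8_t hash[32], const uint8_t *password,
                        uint32_t password_size, const uint8_t salt[16])
    {
        const uint32_t nb_blocks     = 102400; /* 100 MiB; one block is 1024 bytes */
        const uint32_t nb_iterations = 3;
        void *work_area = malloc(nb_blocks * 1024);
        if (work_area == NULL) { return -1; }
        crypto_argon2i(hash, 32, work_area, nb_blocks, nb_iterations,
                       password, password_size, salt, 16,
                       0, 0, 0, 0);            /* no key, no additional data */
        free(work_area);
        return 0;
    }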
 ////////////////////////////////////
 /// Arithmetic modulo 2^255 - 19 ///
 ////////////////////////////////////
 // Taken from Supercop's ref10 implementation.
-// A bit bigger than TweetNaCl, about 8 times faster.
+// A bit bigger than TweetNaCl, over 4 times faster.
 
 // field element
 typedef i32 fe[10];
 
-sv fe_0   (fe h) {            FOR (i, 0, 10) h[i] = 0;          }
-sv fe_1   (fe h) { h[0] = 1;  FOR (i, 1, 10) h[i] = 0;          }
-sv fe_neg (fe h, const fe f)             { FOR (i, 0, 10) h[i] = -f[i];       }
-sv fe_add (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] + g[i]; }
-sv fe_sub (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] - g[i]; }
-sv fe_copy(fe h, const fe f)             { FOR (i, 0, 10) h[i] = f[i];        }
+static void fe_0   (fe h) {           FOR(i,0,10) h[i] = 0;             }
+static void fe_1   (fe h) { h[0] = 1; FOR(i,1,10) h[i] = 0;             }
+static void fe_neg (fe h,const fe f)            {FOR(i,0,10) h[i] = -f[i];      }
+static void fe_add (fe h,const fe f,const fe g) {FOR(i,0,10) h[i] = f[i] + g[i];}
+static void fe_sub (fe h,const fe f,const fe g) {FOR(i,0,10) h[i] = f[i] - g[i];}
+static void fe_copy(fe h,const fe f)            {FOR(i,0,10) h[i] = f[i];       }
 
-sv fe_cswap(fe f, fe g, int b)
+static void fe_cswap(fe f, fe g, int b)
 {
     FOR (i, 0, 10) {
         i32 x = (f[i] ^ g[i]) & -b;
@@ -806,14 +951,7 @@
     }
 }
 
-static u32 load24_le(const u8 s[3])
-{
-    return (u32)s[0]
-        | ((u32)s[1] << 8)
-        | ((u32)s[2] << 16);
-}
-
-sv fe_carry(fe h, i64 t[10])
+static void fe_carry(fe h, i64 t[10])
 {
     i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
     c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= c9 * (1 << 25);
@@ -829,7 +967,7 @@
     FOR (i, 0, 10) { h[i] = t[i]; }
 }
 
-sv fe_frombytes(fe h, const u8 s[32])
+static void fe_frombytes(fe h, const u8 s[32])
 {
     i64 t[10]; // intermediate result (may overflow 32 bits)
     t[0] = load32_le(s);
@@ -845,16 +983,18 @@
     fe_carry(h, t);
 }
 
-sv fe_mul_small(fe h, const fe f, i32 g)
+static void fe_mul_small(fe h, const fe f, i32 g)
 {
     i64 t[10];
-    FOR(i, 0, 10) { t[i] = f[i] * (i64) g; }
+    FOR(i, 0, 10) {
+        t[i] = f[i] * (i64) g;
+    }
     fe_carry(h, t);
 }
-sv fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }
-sv fe_mul973324(fe h, const fe f) { fe_mul_small(h, f, 973324); }
+static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }
+static void fe_mul973324(fe h, const fe f) { fe_mul_small(h, f, 973324); }
 
-sv fe_mul(fe h, const fe f, const fe g)
+static void fe_mul(fe h, const fe f, const fe g)
 {
     // Everything is unrolled and put in temporary variables.
     // We could roll the loop, but that would make curve25519 twice as slow.
@@ -909,7 +1049,7 @@
 }
 
 // we could use fe_mul() for this, but this is significantly faster
-sv fe_sq(fe h, const fe f)
+static void fe_sq(fe h, const fe f)
 {
     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
@@ -943,7 +1083,7 @@
 }
 
 // This could be simplified, but it would be slower
-sv fe_invert(fe out, const fe z)
+static void fe_invert(fe out, const fe z)
 {
     fe t0, t1, t2, t3;
     fe_sq(t0, z );
@@ -980,11 +1120,12 @@ void fe_pow22523(fe out, const fe z)
     fe_sq(t0, t0);  FOR (i, 1, 2) fe_sq(t0, t0);  fe_mul(out, t0, z);
 }
 
-sv fe_tobytes(u8 s[32], const fe h)
+static void fe_tobytes(u8 s[32], const fe h)
 {
     i32 t[10];
-    FOR (i, 0, 10) { t[i] = h[i]; }
-
+    FOR (i, 0, 10) {
+        t[i] = h[i];
+    }
     i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
     FOR (i, 0, 5) {
         q += t[2*i  ]; q >>= 26;
@@ -1032,14 +1173,15 @@ static int fe_isnonzero(const fe f)
 ///////////////
 /// X-25519 /// Taken from Supercop's ref10 implementation.
 ///////////////
-sv trim_scalar(u8 s[32])
+static void trim_scalar(u8 s[32])
 {
     s[ 0] &= 248;
     s[31] &= 127;
     s[31] |= 64;
 }
 
-sv x25519_ladder(const fe x1, fe x2, fe z2, fe x3, fe z3, const u8 scalar[32])
+static void x25519_ladder(const fe x1, fe x2, fe z2, fe x3, fe z3,
+                          const u8 scalar[32])
 {
     // Montgomery ladder
     // In projective coordinates, to avoid divisons: x = X / Z
@@ -1066,6 +1208,7 @@
         fe_add(t0, t0, z3);  fe_mul(z3, x1, z2);  fe_mul(z2, t1, t0);
     }
     // last swap is necessary to compensate for the xor trick
+    // Note: after this swap, P3 == P2 + P1.
     fe_cswap(x2, x3, swap);
     fe_cswap(z2, z3, swap);
 }
@@ -1075,14 +1218,18 @@ int crypto_x25519(u8 shared_secret [32],
                   const u8 their_public_key[32])
 {
     // computes the scalar product
-    fe x1, x2, z2, x3, z3;
+    fe x1;
     fe_frombytes(x1, their_public_key);
 
     // restrict the possible scalar values
-    u8 e[32]; FOR (i, 0, 32) { e[i] = your_secret_key[i]; }
+    u8 e[32];
+    FOR (i, 0, 32) {
+        e[i] = your_secret_key[i];
+    }
     trim_scalar(e);
 
     // computes the actual scalar product (the result is in x2 and z2)
+    fe x2, z2, x3, z3;
     x25519_ladder(x1, x2, z2, x3, z3, e);
 
     // normalises the coordinates: x == X / Z
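crypto_x25519 and crypto_x25519_public_key (declared in monocypher.h) are the raw Diffie-Hellman primitives. A sketch of a complete exchange with placeholder secret keys; treating the nonzero return of crypto_x25519 as an error is an assumption based on the 1.0 API, where it flags an all-zero shared secret:

    #include <stdint.h>
    #include "monocypher.h"

    int x25519_exchange(void)
    {
        uint8_t alice_sk[32] = {1};  /* placeholders; real keys are random */
        uint8_t bob_sk  [32] = {2};
        uint8_t alice_pk[32], bob_pk[32];
        crypto_x25519_public_key(alice_pk, alice_sk);
        crypto_x25519_public_key(bob_pk  , bob_sk);

        uint8_t shared_a[32], shared_b[32];
        int status = 0;
        status |= crypto_x25519(shared_a, alice_sk, bob_pk);
        status |= crypto_x25519(shared_b, bob_sk  , alice_pk);
        /* shared_a == shared_b; nonzero status is assumed to flag a
           degenerate (all-zero) shared secret */
        return status;
    }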
@@ -1107,11 +1254,11 @@ void crypto_x25519_public_key(u8 public_key[32],
 ///////////////
 
 // Point in a twisted Edwards curve,
-// in extended projective coordinates
+// in extended projective coordinates.
 // x = X/Z, y = Y/Z, T = XY/Z
 typedef struct { fe X; fe Y; fe Z; fe T; } ge;
 
-sv ge_from_xy(ge *p, const fe x, const fe y)
+static void ge_from_xy(ge *p, const fe x, const fe y)
 {
     FOR (i, 0, 10) {
         p->X[i] = x[i];
@@ -1121,7 +1268,7 @@
     fe_mul(p->T, x, y);
 }
 
-sv ge_tobytes(u8 s[32], const ge *h)
+static void ge_tobytes(u8 s[32], const ge *h)
 {
     fe recip, x, y;
     fe_invert(recip, h->Z);
@@ -1135,12 +1282,12 @@
 static int ge_frombytes_neg(ge *h, const u8 s[32])
 {
     static const fe d = {
-        -10913610,13857413,-15372611,6949391,114729,
-        -8787816,-6275908,-3247719,-18696448,-12055116
+        -10913610, 13857413, -15372611, 6949391, 114729,
+        -8787816, -6275908, -3247719, -18696448, -12055116
     } ;
     static const fe sqrtm1 = {
-        -32595792,-7943725,9377950,3500415,12389472,
-        -272473,-25146209,-2005654,326686,11406482
+        -32595792, -7943725, 9377950, 3500415, 12389472,
+        -272473, -25146209, -2005654, 326686, 11406482
     } ;
     fe u, v, v3, vxx, check;
     fe_frombytes(h->Y, s);
@@ -1176,14 +1323,12 @@
     return 0;
 }
 
-// for point additon
-static const fe D2 = { // - 2 * 121665 / 121666
-    0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052,
-    0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67
-};
-
-sv ge_add(ge *s, const ge *p, const ge *q)
+static void ge_add(ge *s, const ge *p, const ge *q)
 {
+    static const fe D2 = { // - 2 * 121665 / 121666
+        0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052,
+        0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67
+    };
     fe a, b, c, d, e, f, g, h;
     // A = (Y1-X1) * (Y2-X2)
     // B = (Y1+X1) * (Y2+X2)
@@ -1201,7 +1346,11 @@
     fe_mul(s->T, e, h);  // T3 = E * H
 }
 
-sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32])
+// Performing the scalar multiplication directly in Twisted Edwards
+// space would be simpler, but also slower. So we do it in Montgomery
+// space instead. The sign of the Y coordinate however gets lost in
+// translation, so we use a dirty trick to recover it.
+static void ge_scalarmult(ge *p, const ge *q, const u8 scalar[32])
 {
     // sqrt(-486664)
     static const fe K = { 54885894, 25242303, 55597453,  9067496, 51808079,
@@ -1218,7 +1367,9 @@
     // montgomery scalarmult
     x25519_ladder(x1, x2, z2, x3, z3, scalar);
 
-    // recover the y1 coordinate (Katsuyuki Okeya & Kouichi Sakurai, 2001)
+    // Recover the y coordinate (Katsuyuki Okeya & Kouichi Sakurai, 2001)
+    // Note the shameless reuse of x1: (x1, y1, z1) will correspond to
+    // what was originally (x2, z2).
     fe_mul(t1, x1, z2);  // t1 = x1 * z2
     fe_add(t2, x2, t1);  // t2 = x2 + t1
     fe_sub(t3, x2, t1);  // t3 = x2 − t1
@@ -1249,7 +1400,7 @@
     fe_mul(p->T, x1, t1);
 }
 
-sv ge_scalarmult_base(ge *p, const u8 scalar[32])
+static void ge_scalarmult_base(ge *p, const u8 scalar[32])
 {
     // Calls the general ge_scalarmult() with the base point.
     // Other implementations use a precomputed table, but it
@@ -1265,7 +1416,7 @@
     ge_scalarmult(p, &base_point, scalar);
 }
 
-sv modL(u8 *r, i64 x[64])
+static void modL(u8 *r, i64 x[64])
 {
     static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
                                0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
@@ -1287,23 +1438,28 @@
         carry = x[i] >> 8;
         x[i] &= 255;
     }
-    FOR(i, 0, 32) { x[i] -= carry * L[i]; }
+    FOR(i, 0, 32) {
+        x[i] -= carry * L[i];
+    }
     FOR(i, 0, 32) {
         x[i+1] += x[i] >> 8;
         r[i  ]  = x[i] & 255;
     }
 }
 
-sv reduce(u8 r[64])
+static void reduce(u8 r[64])
 {
     i64 x[64];
-    FOR(i, 0, 64) x[i] = (u64) r[i];
-    FOR(i, 0, 64) r[i] = 0;
+    FOR(i, 0, 64) {
+        x[i] = (u64) r[i];
+        r[i] = 0;
+    }
     modL(r, x);
 }
 
 // hashes R || A || M, reduces it modulo L
-sv hash_ram(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size)
+static void hash_ram(u8 k[64], const u8 R[32], const u8 A[32],
+                     const u8 *M, size_t M_size)
 {
     HASH_CTX ctx;
     HASH_INIT  (&ctx);
@@ -1362,9 +1518,9 @@ void crypto_sign(u8 signature[64],
     hash_ram(h_ram, signature, pk, message, message_size);
 
     i64 s[64]; // s = r + h_ram * a
-    FOR(i,  0, 32) s[i] = (u64) r[i];
-    FOR(i, 32, 64) s[i] = 0;
-    FOR(i,  0, 32) {
+    FOR(i,  0, 32) { s[i] = (u64) r[i]; }
+    FOR(i, 32, 64) { s[i] = 0;          }
+    FOR(i,  0, 32) {
         FOR(j, 0, 32) {
             s[i+j] += h_ram[i] * (u64) a[j];
         }
@@ -1378,11 +1534,13 @@
 {
     ge A, p, sB, diff;
     u8 h_ram[64], R_check[32];
-    if (ge_frombytes_neg(&A, public_key)) { return -1; }   // -A
+    if (ge_frombytes_neg(&A, public_key)) {                // -A
+        return -1;
+    }
     hash_ram(h_ram, signature, public_key, message, message_size);
-    ge_scalarmult(&p, &A, h_ram);                          // p = -A*h_ram
+    ge_scalarmult(&p, &A, h_ram);                          // p = -A*h_ram
     ge_scalarmult_base(&sB, signature + 32);
-    ge_add(&diff, &p, &sB);                                // diff = s - A*h_ram
+    ge_add(&diff, &p, &sB);                                // diff = s - A*h_ram
     ge_tobytes(R_check, &diff);
     return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? OK : fail
 }
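The signing entry points keep their shape through the renames. A sign/verify round trip; the secret key is a placeholder, and crypto_check returns 0 only for a valid signature:

    #include <stdint.h>
    #include "monocypher.h"

    int sign_example(const uint8_t *message, size_t message_size)
    {
        uint8_t sk[32] = {0};  /* placeholder; use 32 random bytes */
        uint8_t pk[32], signature[64];
        crypto_sign_public_key(pk, sk);
        crypto_sign(signature, sk, pk, message, message_size); /* pk may be 0 */
        return crypto_check(signature, pk, message, message_size);
    }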
@@ -1404,9 +1562,9 @@ int crypto_key_exchange(u8 shared_key[32],
 ////////////////////////////////
 /// Authenticated encryption ///
 ////////////////////////////////
-sv authenticate2(u8 mac[16] , const u8 auth_key[32],
-                 const u8 *t1, size_t size1,
-                 const u8 *t2, size_t size2)
+static void authenticate2(u8 mac[16] , const u8 auth_key[32],
+                          const u8 *t1, size_t size1,
+                          const u8 *t2, size_t size2)
 {
     crypto_poly1305_ctx a_ctx;
     crypto_poly1305_init  (&a_ctx, auth_key);
@@ -1416,52 +1574,54 @@
 }
 
 void crypto_aead_lock(u8        mac[16],
-                      u8       *ciphertext,
+                      u8       *cipher_text,
                       const u8  key[32],
                       const u8  nonce[24],
-                      const u8 *ad       , size_t ad_size,
-                      const u8 *plaintext, size_t text_size)
+                      const u8 *ad        , size_t ad_size,
+                      const u8 *plain_text, size_t text_size)
 {   // encrypt then mac
     u8 auth_key[32];
     crypto_chacha_ctx e_ctx;
     crypto_chacha20_x_init (&e_ctx, key, nonce);
     crypto_chacha20_stream (&e_ctx, auth_key, 32);
-    crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size);
-    authenticate2(mac, auth_key, ad, ad_size, ciphertext, text_size);
+    crypto_chacha20_encrypt(&e_ctx, cipher_text, plain_text, text_size);
+    authenticate2(mac, auth_key, ad, ad_size, cipher_text, text_size);
 }
 
-int crypto_aead_unlock(u8 *plaintext,
+int crypto_aead_unlock(u8 *plain_text,
                        const u8  key[32],
                        const u8  nonce[24],
                        const u8  mac[16],
-                       const u8 *ad        , size_t ad_size,
-                       const u8 *ciphertext, size_t text_size)
+                       const u8 *ad         , size_t ad_size,
+                       const u8 *cipher_text, size_t text_size)
 {
     u8 auth_key[32], real_mac[16];
     crypto_chacha_ctx e_ctx;
     crypto_chacha20_x_init(&e_ctx, key, nonce);
     crypto_chacha20_stream(&e_ctx, auth_key, 32);
-    authenticate2(real_mac, auth_key, ad, ad_size, ciphertext, text_size);
-    if (crypto_memcmp(real_mac, mac, 16)) { return -1; } // reject forgeries
-    crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size);
+    authenticate2(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
+    if (crypto_memcmp(real_mac, mac, 16)) {
+        return -1; // reject forgeries
+    }
+    crypto_chacha20_encrypt(&e_ctx, plain_text, cipher_text, text_size);
     return 0;
 }
 
 void crypto_lock(u8 mac[16],
-                 u8 *ciphertext,
+                 u8 *cipher_text,
                  const u8 key[32],
                  const u8 nonce[24],
-                 const u8 *plaintext, size_t text_size)
+                 const u8 *plain_text, size_t text_size)
 {
-    crypto_aead_lock(mac, ciphertext, key, nonce, 0, 0, plaintext, text_size);
+    crypto_aead_lock(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
 }
 
-int crypto_unlock(u8 *plaintext,
+int crypto_unlock(u8 *plain_text,
                   const u8 key[32],
                   const u8 nonce[24],
                   const u8 mac[16],
-                  const u8 *ciphertext, size_t text_size)
+                  const u8 *cipher_text, size_t text_size)
 {
-    return crypto_aead_unlock(plaintext, key, nonce, mac, 0, 0,
-                              ciphertext, text_size);
+    return crypto_aead_unlock(plain_text, key, nonce, mac, 0, 0,
+                              cipher_text, text_size);
 }
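crypto_lock and crypto_unlock stay the high-level entry points. A round-trip sketch with placeholder key and nonce (the 24 byte XChacha20 nonce is large enough to draw at random):

    #include <stdint.h>
    #include "monocypher.h"

    int lock_example(void)
    {
        uint8_t key[32]   = {0};  /* placeholder; 32 random secret bytes */
        uint8_t nonce[24] = {0};  /* placeholder; unique per message     */
        uint8_t plain[12] = "attack at 9";
        uint8_t cipher[sizeof plain];
        uint8_t mac[16];

        crypto_lock(mac, cipher, key, nonce, plain, sizeof plain);

        uint8_t decrypted[sizeof plain];
        /* authenticates first: returns -1 on forgery, before decrypting */
        return crypto_unlock(decrypted, key, nonce, mac, cipher, sizeof cipher);
    }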
diff --git a/libs/monocypher/monocypher.h b/libs/monocypher/monocypher.h
@@ -15,10 +15,13 @@ int crypto_zerocmp(const uint8_t *p, size_t n);
 ////////////////
 /// Chacha20 ///
 ////////////////
+
+// Chacha context. Do not rely on its contents or its size,
+// they may change without notice.
 typedef struct {
-    uint32_t input[16];       // current input, unencrypted
-    uint8_t  random_pool[64]; // last input, encrypted
-    uint8_t  pool_index;      // pointer to random_pool
+    uint32_t input[16]; // current input, unencrypted
+    uint32_t pool [16]; // last input, encrypted
+    size_t   pool_idx;  // pointer to random_pool
 } crypto_chacha_ctx;
 
 void crypto_chacha20_H(uint8_t       out[32],
@@ -38,7 +41,7 @@ void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, uint64_t ctr);
 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
                              uint8_t           *cipher_text,
                              const uint8_t     *plain_text,
-                             size_t             message_size);
+                             size_t             text_size);
 
 void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
                             uint8_t *stream, size_t size);
 
@@ -46,62 +49,69 @@
 /////////////////
 /// Poly 1305 ///
 /////////////////
+
+// Poly 1305 context. Do not rely on its contents or its size, they
+// may change without notice.
 typedef struct {
-    uint32_t r[4];
-    uint32_t h[5];
-    uint32_t c[5];
-    uint32_t pad[5];
-    size_t   c_index;
+    uint32_t r[4];   // constant multiplier (from the secret key)
+    uint32_t h[5];   // accumulated hash
+    uint32_t c[5];   // chunk of the message
+    uint32_t pad[4]; // random number added at the end (from the secret key)
+    size_t   c_idx;  // How many bytes are there in the chunk.
 } crypto_poly1305_ctx;
 
 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]);
 
 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
-                            const uint8_t *msg, size_t msg_size);
+                            const uint8_t *message, size_t message_size);
 
 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, uint8_t mac[16]);
 
 void crypto_poly1305_auth(uint8_t mac[16],
-                          const uint8_t *msg, size_t msg_size,
+                          const uint8_t *message, size_t message_size,
                           const uint8_t key[32]);
 
 ////////////////
 /// Blake2 b ///
 ////////////////
+
+// Blake2b context. Do not rely on its contents or its size, they
+// may change without notice.
 typedef struct {
     uint64_t hash[8];
     uint64_t input_offset[2];
-    uint8_t  buffer[128];
-    size_t   buffer_idx;
+    uint64_t input[16];
+    size_t   input_idx;
     size_t   hash_size;
 } crypto_blake2b_ctx;
 
-void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,
+void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
                                  const uint8_t *key, size_t key_size);
 
 void crypto_blake2b_init(crypto_blake2b_ctx *ctx);
 
 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
-                           const uint8_t *in, size_t in_size);
+                           const uint8_t *message, size_t message_size);
 
-void crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out);
+void crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *hash);
 
-void crypto_blake2b_general(uint8_t *out, size_t out_size,       // digest
-                            const uint8_t *key, size_t key_size, // optional
-                            const uint8_t *in , size_t in_size);
+void crypto_blake2b_general(uint8_t *hash   , size_t hash_size,
+                            const uint8_t *key    , size_t key_size, // optional
+                            const uint8_t *message, size_t message_size);
 
-void crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t in_size);
+void crypto_blake2b(uint8_t hash[64],
+                    const uint8_t *message, size_t message_size);
 
 ////////////////
 /// Argon2 i ///
 ////////////////
-void crypto_argon2i(uint8_t *tag, uint32_t tag_size,     // >= 4
+void crypto_argon2i(uint8_t *hash, uint32_t hash_size,   // >= 4
                     void *work_area, uint32_t nb_blocks, // >= 8
-                    uint32_t nb_iterations,
+                    uint32_t nb_iterations,              // >= 1
                     const uint8_t *password, uint32_t password_size,
                     const uint8_t *salt, uint32_t salt_size, // >= 8
-                    const uint8_t *key, uint32_t key_size,
-                    const uint8_t *ad, uint32_t ad_size);
+                    const uint8_t *key, uint32_t key_size,   // optional
+                    const uint8_t *ad, uint32_t ad_size);    // optional
 
 ///////////////
 /// X-25519 ///
@@ -120,12 +130,12 @@ void crypto_x25519_public_key(uint8_t public_key[32],
 void crypto_sign_public_key(uint8_t public_key[32],
                             const uint8_t secret_key[32]);
 
-void crypto_sign(uint8_t signature[64],
+void crypto_sign(uint8_t        signature [64],
                  const uint8_t  secret_key[32],
                  const uint8_t  public_key[32], // optional, may be 0
                  const uint8_t *message, size_t message_size);
 
-int crypto_check(const uint8_t signature[64],
+int crypto_check(const uint8_t  signature [64],
                  const uint8_t  public_key[32],
                  const uint8_t *message, size_t message_size);
 
@@ -140,29 +150,29 @@ int crypto_key_exchange(uint8_t shared_key [32],
 /// Authenticated encryption ///
 ////////////////////////////////
 void crypto_aead_lock(uint8_t mac[16],
-                      uint8_t *ciphertext,
+                      uint8_t *cipher_text,
                       const uint8_t key[32],
                       const uint8_t nonce[24],
-                      const uint8_t *ad       , size_t ad_size,
-                      const uint8_t *plaintext, size_t text_size);
+                      const uint8_t *ad        , size_t ad_size,
+                      const uint8_t *plain_text, size_t text_size);
 
-int crypto_aead_unlock(uint8_t *plaintext,
+int crypto_aead_unlock(uint8_t *plain_text,
                        const uint8_t key[32],
                        const uint8_t nonce[24],
                        const uint8_t mac[16],
-                       const uint8_t *ad        , size_t ad_size,
-                       const uint8_t *ciphertext, size_t text_size);
+                       const uint8_t *ad         , size_t ad_size,
+                       const uint8_t *cipher_text, size_t text_size);
 
 void crypto_lock(uint8_t mac[16],
-                 uint8_t *ciphertext,
+                 uint8_t *cipher_text,
                  const uint8_t key[32],
                  const uint8_t nonce[24],
-                 const uint8_t *plaintext, size_t text_size);
+                 const uint8_t *plain_text, size_t text_size);
 
-int crypto_unlock(uint8_t *plaintext,
+int crypto_unlock(uint8_t *plain_text,
                   const uint8_t key[32],
                   const uint8_t nonce[24],
                   const uint8_t mac[16],
-                  const uint8_t *ciphertext, size_t text_size);
+                  const uint8_t *cipher_text, size_t text_size);
 
 #endif // MONOCYPHER_H