medfall

A super great game engine

commit 23fc51d7b9948ed05ec4e7df670bbe3e7092253e
parent 2307e09c817f8fdb7f7a7a9b38f1e7c6b8bbd79a
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Sun Apr  2 23:52:23 +0300

Add monocypher

Diffstat:
 libs/monocypher.mk            |    6 ++++++
 libs/monocypher/README        |  107 ++++++++++++++++++++++++++++++++++++++++++++
 libs/monocypher/monocypher.cc | 1366 ++++++++++++++++++++++++++++++++++++++++++++
 libs/monocypher/monocypher.h  |  167 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1646 insertions(+), 0 deletions(-)
diff --git a/libs/monocypher.mk b/libs/monocypher.mk @@ -0,0 +1,6 @@ +libmonocypher.a: libs/monocypher/monocypher.o + +clean: clean-monocypher + +clean-monocypher: + rm -f libs/monocypher/*.o diff --git a/libs/monocypher/README b/libs/monocypher/README @@ -0,0 +1,107 @@ +Authors +------- + +Packaged by Loup Vaillant. + +- Chacha20: Loup Vaillant, implemented from spec. +- Poly1305: Loup Vaillant, implemented from spec. +- Blake2b: derived from https://tools.ietf.org/html/rfc7693 +- Argon2i: Loup Vaillant, implemented from spec. +- X25519: taken from SUPERCOP ref10. +- ed25519: adapted http://tweetnacl.cr.yp.to/ for ref10 arithmetic. +- High-level constructions: Loup Vaillant, implemented from specs and + first principles + +Licence +------- + +For everything *but* Blake2b: + + Copying and distribution of the code, with or without modification, + are permitted in any medium without royalty. This code is offered + as-is, without any warranty. + +--- + +For the Blake2b code: + + Copyright (c) 2015 IETF Trust and the persons identified as authors + of the code. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + - Neither the name of Internet Society, IETF or IETF Trust, nor the + names of specific contributors, may be used to endorse or promote + products derived from this software without specific prior written + permission. + + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + +Current status +-------------- + +0.2 Interfaces should stabilise. Needs external review. + + +Test suite +---------- + + $ make + $ ./test + +It should display a nice printout of all the tests, all starting with +"OK". If you see "FAILURE" anywhere, something has gone very wrong +somewhere. + +*Do not* use Monocypher without having run the test suite at least +once. + + +Integration to your project +--------------------------- + +Just copy monocypher.c and monocypher.h. + +Compile as C99, C11, C++98, C++11, C++14, and C++17. (Tested with +gcc 5.4.0 and clang 2.8.0 on GNU/Linux.) + + +Customisation +------------- + +If you want to use ed25519 with the official SHA-512 hash instead of +the default Blake2b, do as the test suite does: + +- Compile monocypher.c with option -DED25519_SHA512, or modify the + relevant preprocessor directives at the beginning of monocypher.c. 
+ +- Link the final program with a suitable SHA-512 implementation. You + can use the sha512.c and sha512.h files provided here. + +Note that even though the default hash (Blake2b) is not widely used, +it doesn't prevent you from upgrading to faster implementations if you +need to. The Donna implementations of ed25519 for instance can use a +custom hash. diff --git a/libs/monocypher/monocypher.cc b/libs/monocypher/monocypher.cc @@ -0,0 +1,1366 @@ +#include "monocypher.h" + +///////////////// +/// Utilities /// +///////////////// + +// By default, signatures use blake2b. SHA-512 is provided as an +// option for full ed25519 compatibility (a must for test vectors). +// Compile with option -DED25519_SHA512 to use with sha512 If you do +// so, you must provide the "sha512" header with suitable functions. +#ifdef ED25519_SHA512 + #include "sha512.h" + #define HASH crypto_sha512 +#else + #define HASH crypto_blake2b +#endif +#define COMBINE1(x, y) x ## y +#define COMBINE2(x, y) COMBINE1(x, y) +#define HASH_CTX COMBINE2(HASH, _ctx) +#define HASH_INIT COMBINE2(HASH, _init) +#define HASH_UPDATE COMBINE2(HASH, _update) +#define HASH_FINAL COMBINE2(HASH, _final) + +#define FOR(i, start, end) for (size_t i = start; i < end; i++) +#define sv static void +typedef int8_t i8; +typedef uint8_t u8; +typedef uint32_t u32; +typedef int32_t i32; +typedef int64_t i64; +typedef uint64_t u64; + +static u32 load32_le(const u8 s[4]) +{ + return (u32)s[0] + | ((u32)s[1] << 8) + | ((u32)s[2] << 16) + | ((u32)s[3] << 24); +} + +static u64 load64_le(const u8 s[8]) +{ + return (u64)s[0] + | ((u64)s[1] << 8) + | ((u64)s[2] << 16) + | ((u64)s[3] << 24) + | ((u64)s[4] << 32) + | ((u64)s[5] << 40) + | ((u64)s[6] << 48) + | ((u64)s[7] << 56); +} + +sv store32_le(u8 output[4], u32 input) +{ + output[0] = input & 0xff; + output[1] = (input >> 8) & 0xff; + output[2] = (input >> 16) & 0xff; + output[3] = (input >> 24) & 0xff; +} + +sv store64_le(u8 output[8], u64 input) +{ + output[0] = input & 0xff; + output[1] = (input >> 8) & 0xff; + output[2] = (input >> 16) & 0xff; + output[3] = (input >> 24) & 0xff; + output[4] = (input >> 32) & 0xff; + output[5] = (input >> 40) & 0xff; + output[6] = (input >> 48) & 0xff; + output[7] = (input >> 56) & 0xff; +} + +static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); } +static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); } + +int crypto_memcmp(const u8 *p1, const u8 *p2, size_t n) +{ + unsigned diff = 0; + FOR (i, 0, n) { diff |= (p1[i] ^ p2[i]); } + return (1 & ((diff - 1) >> 8)) - 1; +} + +int crypto_zerocmp(const u8 *p, size_t n) +{ + unsigned diff = 0; + FOR (i, 0, n) { diff |= p[i]; } + return (1 & ((diff - 1) >> 8)) - 1; +} + +///////////////// +/// Chacha 20 /// +///////////////// +#define QUARTERROUND(a, b, c, d) \ + a += b; d ^= a; d = rotl32(d, 16); \ + c += d; b ^= c; b = rotl32(b, 12); \ + a += b; d ^= a; d = rotl32(d, 8); \ + c += d; b ^= c; b = rotl32(b, 7) + +sv chacha20_rounds(u32 out[16], const u32 in[16]) +{ + FOR (i, 0, 16) { out[i] = in[i]; } + FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. 
+ QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0 + QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1 + QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2 + QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3 + QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1 + QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2 + QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3 + QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4 + } +} + +sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32]) +{ + // constant + ctx->input[0] = load32_le((u8*)"expa"); + ctx->input[1] = load32_le((u8*)"nd 3"); + ctx->input[2] = load32_le((u8*)"2-by"); + ctx->input[3] = load32_le((u8*)"te k"); + // key + FOR (i, 0, 8) { + ctx->input[i+4] = load32_le(key + i*4); + } + // pool index (the random pool starts empty) + ctx->pool_index = 64; +} + +void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16]) +{ + crypto_chacha_ctx ctx; + chacha20_init_key(&ctx, key); + FOR (i, 0, 4) { + ctx.input[i+12] = load32_le(in + i*4); + } + u32 buffer[16]; + chacha20_rounds(buffer, ctx.input); + // prevents reversal of the rounds by revealing only half of the buffer. + FOR (i, 0, 4) { + store32_le(out + i*4, buffer[i ]); // constant + store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce + } +} + +void crypto_chacha20_init(crypto_chacha_ctx *ctx, + const u8 key[32], + const u8 nonce[8]) +{ + chacha20_init_key(ctx, key ); // key + ctx->input[12] = 0; // counter + ctx->input[13] = 0; // counter + ctx->input[14] = load32_le(nonce + 0); // nonce + ctx->input[15] = load32_le(nonce + 4); // nonce +} + +void crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, + const u8 key[32], + const u8 nonce[24]) +{ + u8 derived_key[32]; + crypto_chacha20_H(derived_key, key, nonce); + crypto_chacha20_init(ctx, derived_key, nonce + 16); +} + +void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, + u8 *cipher_text, + const u8 *plain_text, + size_t message_size) +{ + FOR (i, 0, message_size) { + // refill the pool if empty + if (ctx->pool_index == 64) { + // fill the pool + u32 buffer[16]; + chacha20_rounds(buffer, ctx->input); + FOR (j, 0, 16) { + store32_le(ctx->random_pool + j*4, buffer[j] + ctx->input[j]); + } + // update the counters + ctx->pool_index = 0; + ctx->input[12]++; + if (!ctx->input[12]) + ctx->input[13]++; + } + // use the pool for encryption (or random stream) + cipher_text[i] = + (plain_text == 0 ? 0 : plain_text[i]) // ignore null plaintext + ^ ctx->random_pool[ctx->pool_index]; + ctx->pool_index++; + } +} + +void crypto_chacha20_stream(crypto_chacha_ctx *ctx, + u8 *cipher_text, + size_t message_size) +{ + crypto_chacha20_encrypt(ctx, cipher_text, 0, message_size); +} + + +///////////////// +/// Poly 1305 /// +///////////////// + +// h = (h + c) * r +// preconditions: +// ctx->h <= 7_ffffffff_ffffffff_ffffffff_ffffffff +// ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff +// ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff +// Postcondition: +// ctx->h <= 4_87ffffe4_8fffffe2_97ffffe0_9ffffffa +sv poly_block(crypto_poly1305_ctx *ctx) +{ + // s = h + c, without carry propagation + const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe + const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe + const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe + const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe + const u64 s4 = ctx->h[4] + (u64)ctx->c[4]; // s4 <= 00000004 + + // Local all the things! 
+ const u32 r0 = ctx->r[0]; // r0 <= 0fffffff + const u32 r1 = ctx->r[1]; // r1 <= 0ffffffc + const u32 r2 = ctx->r[2]; // r2 <= 0ffffffc + const u32 r3 = ctx->r[3]; // r3 <= 0ffffffc + const u32 rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits... + const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // * 5 trick + const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // * 5 trick + const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // * 5 trick + + // (h + c) * r, without carry propagation + const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 + s4*rr0;//<=97ffffe007fffff8 + const u64 x1 = s0*r1 + s1*r0 + s2*rr3 + s3*rr2 + s4*rr1;//<=8fffffe20ffffff6 + const u64 x2 = s0*r2 + s1*r1 + s2*r0 + s3*rr3 + s4*rr2;//<=87ffffe417fffff4 + const u64 x3 = s0*r3 + s1*r2 + s2*r1 + s3*r0 + s4*rr3;//<=7fffffe61ffffff2 + const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits //<=0000000000000018 + + // partial reduction modulo 2^130 - 5 + const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffffe + const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff); + const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32); + const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32); + const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32); + const u64 u4 = (u3 >> 32) + (u5 & 3); + + // Update the hash + ctx->h[0] = u0 & 0xffffffff; // u0 <= 1_9ffffffa + ctx->h[1] = u1 & 0xffffffff; // u1 <= 1_97ffffe0 + ctx->h[2] = u2 & 0xffffffff; // u2 <= 1_8fffffe2 + ctx->h[3] = u3 & 0xffffffff; // u3 <= 1_87ffffe4 + ctx->h[4] = u4; // u4 <= 4 +} + +// (re-)initializes the input counter and input buffer +sv poly_clear_c(crypto_poly1305_ctx *ctx) +{ + FOR (i, 0, 4) { ctx->c[i] = 0; } + ctx->c_index = 0; +} + +void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) +{ + // constant init + FOR (i, 0, 5) { ctx->h [i] = 0; } // initial hash: zero + ctx->c [4] = 1; // add 2^130 to every input block + ctx->pad[4] = 0; // poly_add() compatibility + poly_clear_c(ctx); + // load r and pad (r has some of its bits cleared) + /**/ ctx->r [0] = load32_le(key ) & 0x0fffffff; + FOR (i, 1, 4) { ctx->r [i] = load32_le(key + i*4) & 0x0ffffffc; } + FOR (i, 0, 4) { ctx->pad[i] = load32_le(key + i*4 + 16); } +} + +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const u8 *msg, size_t msg_size) +{ + FOR (i, 0, msg_size) { + if (ctx->c_index == 16) { + poly_block(ctx); + poly_clear_c(ctx); + } + // feed the input buffer + ctx->c[ctx->c_index / 4] |= msg[i] << ((ctx->c_index % 4) * 8); + ctx->c_index++; + } +} + +void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16]) +{ + // Process the last block (if any) + if (ctx->c_index != 0) { + // move the final 1 according to remaining input length + // (We may add less than 2^130 to the last input block) + ctx->c[4] = 0; + ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8); + // one last hash update + poly_block(ctx); + } + + // check if we should subtract 2^130-5 by performing the + // corresponding carry propagation. + u64 u = 5; + u += ctx->h[0]; u >>= 32; + u += ctx->h[1]; u >>= 32; + u += ctx->h[2]; u >>= 32; + u += ctx->h[3]; u >>= 32; + u += ctx->h[4]; u >>= 2; + // now u indicates how many times we should subtract 2^130-5 (0 or 1) + + // store h + pad, minus 2^130-5 if u tells us to. 
+ u *= 5; + u += (i64)(ctx->h[0]) + ctx->pad[0]; store32_le(mac , u); u >>= 32; + u += (i64)(ctx->h[1]) + ctx->pad[1]; store32_le(mac + 4, u); u >>= 32; + u += (i64)(ctx->h[2]) + ctx->pad[2]; store32_le(mac + 8, u); u >>= 32; + u += (i64)(ctx->h[3]) + ctx->pad[3]; store32_le(mac + 12, u); +} + +void crypto_poly1305_auth(u8 mac[16], const u8 *msg, + size_t msg_size, const u8 key[32]) +{ + crypto_poly1305_ctx ctx; + crypto_poly1305_init (&ctx, key); + crypto_poly1305_update(&ctx, msg, msg_size); + crypto_poly1305_final (&ctx, mac); +} + +//////////////// +/// Blake2 b /// (taken from the reference +//////////////// implentation in RFC 7693) + +// Initialization Vector. +static const u64 blake2b_iv[8] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 +}; + +// increment a 128-bit "word". +sv incr(u64 x[2], u64 y) +{ + x[0] += y; // increment the low word + if (x[0] < y) { x[1]++; } // handle overflow +} + +sv blake2b_compress(crypto_blake2b_ctx *ctx, int last_block) +{ + static const u8 sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + + // init work variables (before shuffling them) + u64 v[16]; + FOR (i, 0, 8) { + v[i ] = ctx->hash[i]; + v[i + 8] = blake2b_iv[i]; + } + v[12] ^= ctx->input_size[0]; // low 64 bits of offset + v[13] ^= ctx->input_size[1]; // high 64 bits + if (last_block) { v[14] = ~v[14]; } + + // load the input buffer + u64 m[16]; + FOR (i ,0, 16) { m[i] = load64_le(&ctx->buf[i * 8]); } + + // shuffle the work variables with the 12 rounds + FOR (i, 0, 12) { +#define B2B_G(a, b, c, d, x, y) \ + v[a] += v[b] + x; v[d] ^= v[a]; v[d] = rotr64(v[d], 32); \ + v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 24); \ + v[a] += v[b] + y; v[d] ^= v[a]; v[d] = rotr64(v[d], 16); \ + v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 63) + + B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); + B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); + B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); + B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); + B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); + B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); + B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); + B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); + } + // accumulate the work variables into the hash + FOR (i, 0, 8) { ctx->hash[i] ^= v[i] ^ v[i+8]; } +} + +void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size, + const u8 *key, size_t key_size) +{ + // Initial hash == initialization vector... 
+ FOR (i, 0, 8) { ctx->hash[i] = blake2b_iv[i]; } + ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size; // ...mostly + + ctx->input_size[0] = 0; // input count low word + ctx->input_size[1] = 0; // input count high word + ctx->c = 0; // pointer within buffer + ctx->output_size = out_size; // size of the final hash + + // If there's a key, put it in the first block, then pad with zeroes + if (key_size > 0) { + FOR (i, 0 , key_size) { ctx->buf[i] = key[i]; } + FOR (i, key_size, 128 ) { ctx->buf[i] = 0; } + ctx->c = 128; // mark the block as used + } +} + +void crypto_blake2b_init(crypto_blake2b_ctx *ctx) +{ + crypto_blake2b_general_init(ctx, 64, 0, 0); +} + +void crypto_blake2b_update(crypto_blake2b_ctx *ctx, const u8 *in, size_t in_size) +{ + FOR (i, 0, in_size) { + // If the buffer is full, increment the counters and + // add (compress) the current buffer to the hash + if (ctx->c == 128) { + ctx->c = 0; + incr(ctx->input_size, 128); + blake2b_compress(ctx, 0); // not last time -> 0 + } + // By now the buffer is not full. We add one input byte. + ctx->buf[ctx->c] = in[i]; + ctx->c++; + } +} + +void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *out) +{ + // update input size, pad then compress the buffer + incr(ctx->input_size, ctx->c); + FOR (i, ctx->c, 128) { ctx->buf[i] = 0; } + blake2b_compress(ctx, 1); // last time -> 1 + + // copy the hash in the output (little endian of course) + FOR (i, 0, ctx->output_size) { + out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xff; + } +} + +void crypto_blake2b_general(u8 *out, size_t out_size, + const u8 *key, size_t key_size, + const u8 *in, size_t in_size) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_general_init(&ctx, out_size, key, key_size); + crypto_blake2b_update(&ctx, in, in_size); + crypto_blake2b_final(&ctx, out); +} + +void crypto_blake2b(u8 out[64], const u8 *in, size_t in_size) +{ + crypto_blake2b_general(out, 64, 0, 0, in, in_size); +} + + +//////////////// +/// Argon2 i /// +//////////////// +// references to R, Z, Q etc. come from the spec + +typedef struct { u64 a[128]; } block; // 1024 octets + +static u32 min(u32 a, u32 b) { return a <= b ? a : b; } + +// updates a blake2 hash with a 32 bit word, little endian. +sv blake_update_32(crypto_blake2b_ctx *ctx, u32 input) +{ + u8 buf[4]; + store32_le(buf, input); + crypto_blake2b_update(ctx, buf, 4); +} + +sv load_block(block *b, const u8 bytes[1024]) +{ + FOR (i, 0, 128) { b->a[i] = load64_le(bytes + i*8); } +} + +sv store_block(u8 bytes[1024], const block *b) +{ + FOR (i, 0, 128) { store64_le(bytes + i*8, b->a[i]); } +} + +sv copy_block(block *o, const block *in) { FOR (i, 0, 128) o->a[i] = in->a[i]; } +sv xor_block(block *o, const block *in) { FOR (i, 0, 128) o->a[i] ^= in->a[i]; } + +// Hash with a virtually unlimited digest size. +// Doesn't extract more entropy than the base hash function. +// Mainly used for filling a whole kilobyte block with pseudo-random bytes. +sv extended_hash(u8 *digest, u32 digest_size, + const u8 *input , u32 input_size) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0); + blake_update_32 (&ctx, digest_size); + crypto_blake2b_update (&ctx, input, input_size); + crypto_blake2b_final (&ctx, digest); + + if (digest_size > 64) { + // the conversion to u64 avoids integer overflow on + // ludicrously big hash sizes. + u32 r = (((u64)digest_size + 31) / 32) - 2; + u32 i = 1; + u32 in = 0; + u32 out = 32; + while (i < r) { + // Input and output overlap. 
This is intentional + crypto_blake2b(digest + out, digest + in, 64); + i += 1; + in += 32; + out += 32; + } + crypto_blake2b_general(digest + out, digest_size - (32 * r), + 0, 0, // no key + digest + in , 64); + } +} + +#define LSB(x) ((x) & 0xffffffff) +#define G(a, b, c, d) \ + a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ + c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ + a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ + c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) +#define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15) \ + G(v0, v4, v8, v12); G(v1, v5, v9, v13); \ + G(v2, v6, v10, v14); G(v3, v7, v11, v15); \ + G(v0, v5, v10, v15); G(v1, v6, v11, v12); \ + G(v2, v7, v8, v13); G(v3, v4, v9, v14) + +// Core of the compression function G. Computes Z from R in place. +sv g_rounds(block *work_block) +{ + // column rounds (work_block = Q) + for (int i = 0; i < 128; i += 16) { + ROUND(work_block->a[i ], work_block->a[i + 1], + work_block->a[i + 2], work_block->a[i + 3], + work_block->a[i + 4], work_block->a[i + 5], + work_block->a[i + 6], work_block->a[i + 7], + work_block->a[i + 8], work_block->a[i + 9], + work_block->a[i + 10], work_block->a[i + 11], + work_block->a[i + 12], work_block->a[i + 13], + work_block->a[i + 14], work_block->a[i + 15]); + } + // row rounds (work_block = Z) + for (int i = 0; i < 16; i += 2) { + ROUND(work_block->a[i ], work_block->a[i + 1], + work_block->a[i + 16], work_block->a[i + 17], + work_block->a[i + 32], work_block->a[i + 33], + work_block->a[i + 48], work_block->a[i + 49], + work_block->a[i + 64], work_block->a[i + 65], + work_block->a[i + 80], work_block->a[i + 81], + work_block->a[i + 96], work_block->a[i + 97], + work_block->a[i + 112], work_block->a[i + 113]); + } +} + +// The compression function G +// may overwrite result completely (xcopy == copy_block), +// or XOR result with the old block (xcopy == xor_block) +sv binary_g(block *result, const block *x, const block *y, + void (*xcopy) (block*, const block*)) +{ + block tmp; + copy_block(&tmp, x); // tmp = X + xor_block (&tmp, y); // tmp = X ^ Y = R + xcopy(result, &tmp); // result = R (or R ^ old) + g_rounds(&tmp); // tmp = Z + xor_block(result, &tmp); // result = R ^ Z (or R ^ old ^ Z) +} + +// unary version of the compression function. +// The missing argument is implied zero. +// Does the transformation in place. +sv unary_g(block *work_block) +{ + // work_block == R + block tmp; + copy_block(&tmp, work_block); // tmp = R + g_rounds(work_block); // work_block = Z + xor_block(work_block, &tmp); // work_block = Z ^ R +} + +typedef struct { + block b; + u32 pass_number; u32 slice_number; + u32 nb_blocks; u32 nb_iterations; + u32 ctr; u32 offset; +} gidx_ctx; + +sv gidx_refresh(gidx_ctx *ctx) +{ + // seed the begining of the block... + ctx->b.a[0] = ctx->pass_number; + ctx->b.a[1] = 0; // lane number (we have only one) + ctx->b.a[2] = ctx->slice_number; + ctx->b.a[3] = ctx->nb_blocks; + ctx->b.a[4] = ctx->nb_iterations; + ctx->b.a[5] = 1; // type: Argon2i + ctx->b.a[6] = ctx->ctr; + FOR (i, 7, 128) { ctx->b.a[i] = 0; } // ...then zero the rest out + + // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero) + // (G "square" function), to get cheap pseudo-random numbers. 
+ unary_g(&(ctx->b)); + unary_g(&(ctx->b)); +} + +sv gidx_init(gidx_ctx *ctx, + u32 pass_number, u32 slice_number, + u32 nb_blocks, u32 nb_iterations) +{ + ctx->pass_number = pass_number; + ctx->slice_number = slice_number; + ctx->nb_blocks = nb_blocks; + ctx->nb_iterations = nb_iterations; + ctx->ctr = 0; + + // Offset from the begining of the segment. For the first slice + // of the firs pass, we start at the *third* block, so the offset + // starts at 2, not 0. + if (pass_number != 0 || slice_number != 0) { + ctx->offset = 0; + } else { + ctx->offset = 2; + ctx->ctr++; // Compensates for missed lazy creation + gidx_refresh(ctx); // at the start of gidx_next() + } +} + +static u32 gidx_next(gidx_ctx *ctx) +{ + // lazily creates the offset block we need + if (ctx->offset % 128 == 0) { + ctx->ctr++; + gidx_refresh(ctx); + } + u32 index = ctx->offset % 128; // save index for current call + u32 offset = ctx->offset; // save offset for current call + ctx->offset++; // update offset for next call + + // Computes the area size. + // Pass 0 : all already finished segments plus already constructed + // blocks in this segment + // Pass 1+: 3 last segments plus already constructed + // blocks in this segment. THE SPEC SUGGESTS OTHERWISE. + // I CONFORM TO THE REFERENCE IMPLEMENTATION. + int first_pass = ctx->pass_number == 0; + u32 slice_size = ctx->nb_blocks / 4; + u32 area_size = ((first_pass ? ctx->slice_number : 3) + * slice_size + offset - 1); + + // Computes the starting position of the reference area. + // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE + // NEXT SEGMENT, NOT THE NEXT BLOCK. + u32 next_slice = (ctx->slice_number == 3 + ? 0 + : (ctx->slice_number + 1) * slice_size); + u32 start_pos = first_pass ? 0 : next_slice; + + // Generate offset from J1 (no need for J2, there's only one lane) + u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number + u64 x = (j1 * j1) >> 32; + u64 y = (area_size * x) >> 32; + u64 z = area_size - 1 - y; + return (start_pos + z) % ctx->nb_blocks; +} + +// Main algorithm +void crypto_argon2i(u8 *tag, u32 tag_size, + void *work_area, u32 nb_blocks, u32 nb_iterations, + const u8 *password, u32 password_size, + const u8 *salt, u32 salt_size, + const u8 *key, u32 key_size, + const u8 *ad, u32 ad_size) +{ + // work area seen as blocks (must be suitably aligned) + block *blocks = (block*)work_area; + { + crypto_blake2b_ctx ctx; + crypto_blake2b_init(&ctx); + + blake_update_32 (&ctx, 1 ); // p: number of threads + blake_update_32 (&ctx, tag_size ); + blake_update_32 (&ctx, nb_blocks ); + blake_update_32 (&ctx, nb_iterations); + blake_update_32 (&ctx, 0x13 ); // v: version number + blake_update_32 (&ctx, 1 ); // y: Argon2i + blake_update_32 (&ctx, password_size); + crypto_blake2b_update(&ctx, password, password_size); + blake_update_32 (&ctx, salt_size); + crypto_blake2b_update(&ctx, salt, salt_size); + blake_update_32 (&ctx, key_size); + crypto_blake2b_update(&ctx, key, key_size); + blake_update_32 (&ctx, ad_size); + crypto_blake2b_update(&ctx, ad, ad_size); + + u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes + crypto_blake2b_final(&ctx, initial_hash); + + // fill first 2 blocks + block tmp_block; + u8 hash_area[1024]; + store32_le(initial_hash + 64, 0); // first additional word + store32_le(initial_hash + 68, 0); // second additional word + extended_hash(hash_area, 1024, initial_hash, 72); + load_block(&tmp_block, hash_area); + copy_block(blocks, &tmp_block); + + store32_le(initial_hash + 64, 1); // slight modification + 
extended_hash(hash_area, 1024, initial_hash, 72); + load_block(&tmp_block, hash_area); + copy_block(blocks + 1, &tmp_block); + } + + // Actual number of blocks + nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread) + const u32 segment_size = nb_blocks / 4; + + // fill (then re-fill) the rest of the blocks + FOR (pass_number, 0, nb_iterations) { + int first_pass = pass_number == 0; + // Simple copy on pass 0, XOR instead of overwrite on subsequent passes + void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block; + + FOR (segment, 0, 4) { + gidx_ctx ctx; + gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations); + + // On the first segment of the first pass, + // blocks 0 and 1 are already filled. + // We use the offset to skip them. + u32 offset = first_pass && segment == 0 ? 2 : 0; + // current, reference, and previous are block indices + FOR (current, + segment * segment_size + offset, + (segment + 1) * segment_size) { + u32 previous = current == 0 ? nb_blocks - 1 : current - 1; + u32 reference = gidx_next(&ctx); + binary_g(blocks + current, + blocks + previous, + blocks + reference, + xcopy); + } + } + } + // hash the very last block with H' into the output tag + u8 final_block[1024]; + store_block(final_block, blocks + (nb_blocks - 1)); + extended_hash(tag, tag_size, final_block, 1024); +} + +//////////////////////////////////// +/// Arithmetic modulo 2^255 - 19 /// +//////////////////////////////////// +// Taken from Supercop's ref10 implementation. +// A bit bigger than TweetNaCl, over 6 times faster. + +// field element +typedef i32 fe[10]; + +sv fe_0 (fe h) { FOR (i, 0, 10) h[i] = 0; } +sv fe_1 (fe h) { h[0] = 1; FOR (i, 1, 10) h[i] = 0; } +sv fe_neg (fe h, const fe f) { FOR (i, 0, 10) h[i] = -f[i]; } +sv fe_add (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] + g[i]; } +sv fe_sub (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] - g[i]; } +sv fe_copy(fe h, const fe f ) { FOR (i, 0, 10) h[i] = f[i]; } + +sv fe_cswap(fe f, fe g, u32 b) +{ + FOR (i, 0, 10) { + i32 x = (f[i] ^ g[i]) & -b; + f[i] = f[i] ^ x; + g[i] = g[i] ^ x; + } +} + +static u32 load24_le(const u8 s[3]) +{ + return (u32)s[0] + | ((u32)s[1] << 8) + | ((u32)s[2] << 16); +} + +sv fe_carry(fe h, i64 t[10]) +{ + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; + c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= (u64)c9 << 25; + c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1; t[1] -= (u64)c1 << 25; + c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3; t[3] -= (u64)c3 << 25; + c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5; t[5] -= (u64)c5 << 25; + c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7; t[7] -= (u64)c7 << 25; + c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0; t[0] -= (u64)c0 << 26; + c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2; t[2] -= (u64)c2 << 26; + c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4; t[4] -= (u64)c4 << 26; + c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6; t[6] -= (u64)c6 << 26; + c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8; t[8] -= (u64)c8 << 26; + FOR (i, 0, 10) { h[i] = t[i]; } +} + +sv fe_frombytes(fe h, const u8 s[32]) +{ + i64 t[10]; // intermediate result (may overflow 32 bits) + t[0] = load32_le(s); + t[1] = load24_le(s + 4) << 6; + t[2] = load24_le(s + 7) << 5; + t[3] = load24_le(s + 10) << 3; + t[4] = load24_le(s + 13) << 2; + t[5] = load32_le(s + 16); + t[6] = load24_le(s + 20) << 7; + t[7] = load24_le(s + 23) << 5; + t[8] = load24_le(s + 26) << 4; + t[9] = (load24_le(s + 29) & 8388607) << 2; + fe_carry(h, t); +} + +sv fe_mul121666(fe 
h, const fe f) +{ + i64 t[10]; + FOR(i, 0, 10) { t[i] = f[i] * (i64) 121666; } + fe_carry(h, t); +} + +sv fe_mul(fe h, const fe f, const fe g) +{ + // Everything is unrolled and put in temporary variables. + // We could roll the loop, but that would make curve25519 twice as slow. + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4]; + i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9]; + i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2; + i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19; + i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19; + i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19; + + i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6 + + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1; + i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7 + + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2; + i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8 + + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3; + i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9 + + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4; + i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0 + + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5; + i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1 + + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6; + i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2 + + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7; + i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3 + + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8; + i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4 + + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9; + i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 + + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0; + + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= (u64)c0 << 26; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= (u64)c4 << 26; + c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= (u64)c1 << 25; + c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= (u64)c5 << 25; + c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= (u64)c2 << 26; + c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= (u64)c6 << 26; + c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= (u64)c3 << 25; + c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= (u64)c7 << 25; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= (u64)c4 << 26; + c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= (u64)c8 << 26; + c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= (u64)c9 << 25; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= (u64)c0 << 26; + + h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; + h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; +} + +sv fe_sq(fe h, const fe f) { fe_mul(h, f, f); } + +// power to a power of 2 minus a small number. +// Timing depends on pow_2 and minus, so they shall not be secret. 
+sv fe_power(fe out, const fe z, int pow_2, u8 minus) +{ + minus--; + fe c; fe_copy(c, z); + for (int i = pow_2 - 2; i >= 0; i--) { + fe_sq(c, c); + if (i >= 8 || !((minus >> i) & 1)) + fe_mul(c, c, z); + } + fe_copy(out, c); +} +sv fe_invert (fe out, const fe z) { fe_power(out, z, 255, 21); } +sv fe_pow22523(fe out, const fe z) { fe_power(out, z, 252, 3); } + +sv fe_tobytes(u8 s[32], const fe h) +{ + i32 t[10]; + FOR (i, 0, 10) { t[i] = h[i]; } + + i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25; + FOR (i, 0, 5) { + q += t[2*i ]; q >>= 26; + q += t[2*i+1]; q >>= 25; + } + t[0] += 19 * q; + + i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= (u64)c0 << 26; + i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= (u64)c1 << 25; + i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= (u64)c2 << 26; + i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= (u64)c3 << 25; + i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= (u64)c4 << 26; + i32 c5 = t[5] >> 25; t[6] += c5; t[5] -= (u64)c5 << 25; + i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= (u64)c6 << 26; + i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= (u64)c7 << 25; + i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= (u64)c8 << 26; + i32 c9 = t[9] >> 25; t[9] -= (u64)c9 << 25; + + store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26)); + store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19)); + store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13)); + store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6)); + store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25)); + store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19)); + store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12)); + store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6)); +} + +// Parity check. Returns 0 if even, 1 if odd +static int fe_isnegative(const fe f) +{ + u8 s[32]; + fe_tobytes(s, f); + return s[0] & 1; +} + +static int fe_isnonzero(const fe f) +{ + u8 s[32]; + fe_tobytes(s, f); + return crypto_zerocmp(s, 32); +} + +/////////////// +/// X-25519 /// Taken from Supercop's ref10 implementation. 
+/////////////// + +sv trim_scalar(u8 s[32]) +{ + s[ 0] &= 248; + s[31] &= 127; + s[31] |= 64; +} + +int crypto_x25519(u8 shared_secret [32], + const u8 your_secret_key [32], + const u8 their_public_key[32]) +{ + // computes the scalar product + fe x1, x2, z2, x3, z3; + fe_frombytes(x1, their_public_key); + + // restrict the possible scalar values + u8 e[32]; FOR (i, 0, 32) { e[i] = your_secret_key[i]; } + trim_scalar(e); + + // Montgomery ladder + // In projective coordinates, to avoid divisons: x = X / Z + // We don't care about the y coordinate, it's only 1 bit of information + fe_1(x2); fe_0(z2); // "zero" point + fe_copy(x3, x1); fe_1(z3); // "one" point + int swap = 0; + for (int pos = 254; pos >= 0; --pos) { + // constant time conditional swap before ladder step + int b = (e[pos / 8] >> (pos & 7)) & 1; + swap ^= b; // xor trick avoids swapping at the end of the loop + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + swap = b; // anticipates one last swap after the loop + + // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) + // with differential addition + fe t0, t1; + fe_sub(t0, x3, z3); fe_sub(t1, x2, z2); fe_add(x2, x2, z2); + fe_add(z2, x3, z3); fe_mul(z3, t0, x2); fe_mul(z2, z2, t1); + fe_sq (t0, t1 ); fe_sq (t1, x2 ); fe_add(x3, z3, z2); + fe_sub(z2, z3, z2); fe_mul(x2, t1, t0); fe_sub(t1, t1, t0); + fe_sq (z2, z2 ); fe_mul121666(z3, t1); fe_sq (x3, x3 ); + fe_add(t0, t0, z3); fe_mul(z3, x1, z2); fe_mul(z2, t1, t0); + } + // last swap is necessary to compensate for the xor trick + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + + // normalises the coordinates: x == X / Z + fe_invert(z2, z2); + fe_mul(x2, x2, z2); + fe_tobytes(shared_secret, x2); + + // Returns -1 if the input is all zero + // (happens with some malicious public keys) + return -1 - crypto_zerocmp(shared_secret, 32); +} + +void crypto_x25519_public_key(u8 public_key[32], + const u8 secret_key[32]) +{ + static const u8 base_point[32] = {9}; + crypto_x25519(public_key, secret_key, base_point); +} + +/////////////// +/// Ed25519 /// +/////////////// + +// Point in a twisted Edwards curve, +// in extended projective coordinates +// x = X/Z, y = Y/Z, T = XY/Z +typedef struct { fe X; fe Y; fe Z; fe T; } ge; + +sv ge_from_xy(ge *p, const fe x, const fe y) +{ + FOR (i, 0, 10) { + p->X[i] = x[i]; + p->Y[i] = y[i]; + } + fe_1 (p->Z); + fe_mul(p->T, x, y); +} + +sv ge_cswap(ge *p, ge *q, u32 b) +{ + fe_cswap(p->X, q->X, b); + fe_cswap(p->Y, q->Y, b); + fe_cswap(p->Z, q->Z, b); + fe_cswap(p->T, q->T, b); +} + +sv ge_tobytes(u8 s[32], const ge *h) +{ + fe recip, x, y; + fe_invert(recip, h->Z); + fe_mul(x, h->X, recip); + fe_mul(y, h->Y, recip); + fe_tobytes(s, y); + s[31] ^= fe_isnegative(x) << 7; +} + +// Variable time! s must not be secret! 
+static int ge_frombytes_neg(ge *h, const u8 s[32]) +{ + static const fe d = { + -10913610,13857413,-15372611,6949391,114729, + -8787816,-6275908,-3247719,-18696448,-12055116 + } ; + static const fe sqrtm1 = { + -32595792,-7943725,9377950,3500415,12389472, + -272473,-25146209,-2005654,326686,11406482 + } ; + fe u, v, v3, vxx, check; + fe_frombytes(h->Y, s); + fe_1(h->Z); + fe_sq(u, h->Y); // y^2 + fe_mul(v, u, d); + fe_sub(u, u, h->Z); // u = y^2-1 + fe_add(v, v, h->Z); // v = dy^2+1 + + fe_sq(v3, v); + fe_mul(v3, v3, v); // v3 = v^3 + fe_sq(h->X, v3); + fe_mul(h->X, h->X, v); + fe_mul(h->X, h->X, u); // x = uv^7 + + fe_pow22523(h->X, h->X); // x = (uv^7)^((q-5)/8) + fe_mul(h->X, h->X, v3); + fe_mul(h->X, h->X, u); // x = uv^3(uv^7)^((q-5)/8) + + fe_sq(vxx, h->X); + fe_mul(vxx, vxx, v); + fe_sub(check, vxx, u); // vx^2-u + if (fe_isnonzero(check)) { + fe_add(check, vxx, u); // vx^2+u + if (fe_isnonzero(check)) return -1; + fe_mul(h->X, h->X, sqrtm1); + } + + if (fe_isnegative(h->X) == (s[31] >> 7)) + fe_neg(h->X, h->X); + + fe_mul(h->T, h->X, h->Y); + return 0; +} + +// for point additon +static const fe D2 = { // - 2 * 121665 / 121666 + 0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052, + 0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67 +}; + +sv ge_add(ge *s, const ge *p, const ge *q) +{ + fe a, b, c, d, e, f, g, h; + // A = (Y1-X1) * (Y2-X2) + // B = (Y1+X1) * (Y2+X2) + fe_sub(a, p->Y, p->X); fe_sub(h, q->Y, q->X); fe_mul(a, a, h); + fe_add(b, p->X, p->Y); fe_add(h, q->X, q->Y); fe_mul(b, b, h); + fe_mul(c, p->T, q->T); fe_mul(c, c, D2 ); // C = T1 * k * T2 + fe_add(d, p->Z, p->Z); fe_mul(d, d, q->Z); // D = Z1 * 2 * Z2 + fe_sub(e, b, a); // E = B - A + fe_sub(f, d, c); // F = D - C + fe_add(g, d, c); // G = D + C + fe_add(h, b, a); // H = B + A + fe_mul(s->X, e, f); // X3 = E * F + fe_mul(s->Y, g, h); // Y3 = G * H + fe_mul(s->Z, f, g); // Z3 = F * G + fe_mul(s->T, e, h); // T3 = E * H +} + +sv ge_double(ge *s, const ge *p) { ge_add(s, p, p); } + +sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) +{ + // Simple Montgomery ladder, with straight double and add. + ge t; + fe_0(p->X); fe_copy(t.X, q->X); + fe_1(p->Y); fe_copy(t.Y, q->Y); + fe_1(p->Z); fe_copy(t.Z, q->Z); + fe_0(p->T); fe_copy(t.T, q->T); + int swap = 0; + for (int i = 255; i >= 0; i--) { + int b = (scalar[i/8] >> (i & 7)) & 1; + swap ^= b; // xor trick avoids unnecessary swaps + ge_cswap(p, &t, swap); + swap = b; + ge_add(&t, &t, p); + ge_double(p, p); + } + // one last swap makes up for the xor trick + ge_cswap(p, &t, swap); +} + +sv ge_scalarmult_base(ge *p, const u8 scalar[32]) +{ + // Calls the general ge_scalarmult() with the base point. + // Other implementations use a precomputed table, but it + // takes way too much code. 
+ static const fe X = { + 0x325d51a, 0x18b5823, 0x0f6592a, 0x104a92d, 0x1a4b31d, + 0x1d6dc5c, 0x27118fe, 0x07fd814, 0x13cd6e5, 0x085a4db}; + static const fe Y = { + 0x2666658, 0x1999999, 0x0cccccc, 0x1333333, 0x1999999, + 0x0666666, 0x3333333, 0x0cccccc, 0x2666666, 0x1999999}; + ge base_point; + ge_from_xy(&base_point, X, Y); + ge_scalarmult(p, &base_point, scalar); +} + +sv modL(u8 *r, i64 x[64]) +{ + static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, + 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; + for (unsigned i = 63; i >= 32; i--) { + i64 carry = 0; + FOR (j, i-32, i-12) { + x[j] += carry - 16 * x[i] * L[j - (i - 32)]; + carry = (x[j] + 128) >> 8; + x[j] -= (u64)carry << 8; + } + x[i-12] += carry; + x[i] = 0; + } + i64 carry = 0; + FOR(i, 0, 32) { + x[i] += carry - (x[31] >> 4) * L[i]; + carry = x[i] >> 8; + x[i] &= 255; + } + FOR(i, 0, 32) { x[i] -= carry * L[i]; } + FOR(i, 0, 32) { + x[i+1] += x[i] >> 8; + r[i ] = x[i] & 255; + } +} + +sv reduce(u8 r[64]) +{ + i64 x[64]; + FOR(i, 0, 64) x[i] = (u64) r[i]; + FOR(i, 0, 64) r[i] = 0; + modL(r, x); +} + +// hashes R || A || M, reduces it modulo L +sv hash_ram(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) +{ + HASH_CTX ctx; + HASH_INIT (&ctx); + HASH_UPDATE(&ctx, R , 32 ); + HASH_UPDATE(&ctx, A , 32 ); + HASH_UPDATE(&ctx, M , M_size); + HASH_FINAL (&ctx, k); + reduce(k); +} + +void crypto_sign_public_key(u8 public_key[32], + const u8 secret_key[32]) +{ + u8 a[64]; + HASH(a, secret_key, 32); + trim_scalar(a); + ge A; + ge_scalarmult_base(&A, a); + ge_tobytes(public_key, &A); +} + +void crypto_sign(u8 signature[64], + const u8 secret_key[32], + const u8 public_key[32], + const u8 *message, size_t message_size) +{ + u8 a[64], *prefix = a + 32; + HASH(a, secret_key, 32); + trim_scalar(a); + + u8 pk_buf[32]; + const u8 *pk = public_key; + if (public_key == 0) { + crypto_sign_public_key(pk_buf, secret_key); + pk = pk_buf; + } + + // Constructs the "random" nonce from the secret key and message. + // An actual random number would work just fine, and would save us + // the trouble of hashing the message twice. If we did that + // however, the user could fuck it up and reuse the nonce. + u8 r[64]; + HASH_CTX ctx; + HASH_INIT (&ctx); + HASH_UPDATE(&ctx, prefix , 32 ); + HASH_UPDATE(&ctx, message, message_size); + HASH_FINAL (&ctx, r); + reduce(r); + + // first half of the signature = "random" nonce times basepoint + ge R; + ge_scalarmult_base(&R, r); + ge_tobytes(signature, &R); + + u8 h_ram[64]; + hash_ram(h_ram, signature, pk, message, message_size); + + i64 s[64]; // s = r + h_ram * a + FOR(i, 0, 32) s[i] = (u64) r[i]; + FOR(i, 32, 64) s[i] = 0; + FOR(i, 0, 32) { + FOR(j, 0, 32) { + s[i+j] += h_ram[i] * (u64) a[j]; + } + } + modL(signature + 32, s); // second half of the signature = s +} + +int crypto_check(const u8 signature[64], + const u8 public_key[32], + const u8 *message, size_t message_size) +{ + ge A, p, sB, diff; + u8 h_ram[64], R_check[32]; + if (ge_frombytes_neg(&A, public_key)) return -1; // -A + hash_ram(h_ram, signature, public_key, message, message_size); + ge_scalarmult(&p, &A, h_ram); // p = -A*h_ram + ge_scalarmult_base(&sB, signature + 32); + ge_add(&diff, &p, &sB); // diff = s - A*h_ram + ge_tobytes(R_check, &diff); + return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? 
OK : fail +} + +//////////////////// +/// Key exchange /// +//////////////////// +int crypto_key_exchange(u8 shared_key[32], + const u8 your_secret_key [32], + const u8 their_public_key[32]) +{ + static const u8 zero[16] = {0}; + u8 shared_secret[32]; + int status = crypto_x25519(shared_secret, your_secret_key, their_public_key); + crypto_chacha20_H(shared_key, shared_secret, zero); + return status; +} + +//////////////////////////////// +/// Authenticated encryption /// +//////////////////////////////// +sv authenticate2(u8 mac[16] , const u8 auth_key[32], + const u8 *t1, size_t size1, + const u8 *t2, size_t size2) +{ + crypto_poly1305_ctx a_ctx; + crypto_poly1305_init (&a_ctx, auth_key); + crypto_poly1305_update(&a_ctx, t1, size1); + crypto_poly1305_update(&a_ctx, t2, size2); + crypto_poly1305_final (&a_ctx, mac); +} + +void crypto_aead_lock(u8 mac[16], + u8 *ciphertext, + const u8 key[32], + const u8 nonce[24], + const u8 *ad , size_t ad_size, + const u8 *plaintext, size_t text_size) +{ // encrypt then mac + u8 auth_key[32]; + crypto_chacha_ctx e_ctx; + crypto_chacha20_Xinit (&e_ctx, key, nonce); + crypto_chacha20_stream (&e_ctx, auth_key, 32); + crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size); + authenticate2(mac, auth_key, ad, ad_size, ciphertext, text_size); +} + +int crypto_aead_unlock(u8 *plaintext, + const u8 key[32], + const u8 nonce[24], + const u8 mac[16], + const u8 *ad , size_t ad_size, + const u8 *ciphertext, size_t text_size) +{ + u8 auth_key[32], real_mac[16]; + crypto_chacha_ctx e_ctx; + crypto_chacha20_Xinit (&e_ctx, key, nonce); + crypto_chacha20_stream(&e_ctx, auth_key, 32); + authenticate2(real_mac, auth_key, ad, ad_size, ciphertext, text_size); + if (crypto_memcmp(real_mac, mac, 16)) return -1; // reject forgeries + crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size); + return 0; +} + +void crypto_lock(u8 *box, + const u8 key[32], + const u8 nonce[24], + const u8 *plaintext, + size_t text_size) +{ + crypto_aead_lock(box, box + 16, key, nonce, 0, 0, plaintext, text_size); +} + +int crypto_unlock(u8 *plaintext, + const u8 key[32], + const u8 nonce[24], + const u8 *box, size_t box_size) +{ + return crypto_aead_unlock(plaintext, key, nonce, box, 0, 0, + box + 16, box_size -16); +} diff --git a/libs/monocypher/monocypher.h b/libs/monocypher/monocypher.h @@ -0,0 +1,167 @@ +#ifndef MONOCYPHER_H +#define MONOCYPHER_H + +#include <inttypes.h> +#include <stddef.h> + +// Constant time equality verification +// returns 0 if it matches, -1 otherwise. +int crypto_memcmp(const uint8_t *p1, const uint8_t *p2, size_t n); + +// constant time zero comparison. +// returns 0 if the input is all zero, -1 otherwise. 
+int crypto_zerocmp(const uint8_t *p, size_t n); + +//////////////// +/// Chacha20 /// +//////////////// +typedef struct { + uint32_t input[16]; // current input, unencrypted + uint8_t random_pool[64]; // last input, encrypted + uint8_t pool_index; // pointer to random_pool +} crypto_chacha_ctx; + +void crypto_chacha20_H(uint8_t out[32], + const uint8_t key[32], + const uint8_t in [16]); + +void crypto_chacha20_init(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[8]); + +void crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[24]); + +void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, + uint8_t *cipher_text, + const uint8_t *plain_text, + size_t message_size); + +void crypto_chacha20_stream(crypto_chacha_ctx *ctx, + uint8_t *cipher_text, + size_t message_size); + +///////////////// +/// Poly 1305 /// +///////////////// +typedef struct { + uint32_t r[4]; + uint32_t h[5]; + uint32_t c[5]; + uint32_t pad[5]; + size_t c_index; +} crypto_poly1305_ctx; + +void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]); + +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const uint8_t *msg, size_t msg_size); + +void crypto_poly1305_final(crypto_poly1305_ctx *ctx, uint8_t mac[16]); + +void crypto_poly1305_auth(uint8_t mac[16], + const uint8_t *msg, size_t msg_size, + const uint8_t key[32]); + +//////////////// +/// Blake2 b /// +//////////////// +typedef struct { + uint8_t buf[128]; // input buffer + uint64_t hash[8]; // chained state + uint64_t input_size[2]; // total number of bytes + uint8_t c; // pointer for buf[] + uint8_t output_size; // digest size +} crypto_blake2b_ctx; + +void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size, + const uint8_t *key, size_t key_size); + +void crypto_blake2b_init(crypto_blake2b_ctx *ctx); + +void crypto_blake2b_update(crypto_blake2b_ctx *ctx, + const uint8_t *in, size_t in_size); + +void crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out); + +void crypto_blake2b_general(uint8_t *out, size_t out_size, // digest + const uint8_t *key, size_t key_size, // optional + const uint8_t *in , size_t in_size); + +void crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t in_size); + +//////////////// +/// Argon2 i /// +//////////////// +void crypto_argon2i(uint8_t *tag, uint32_t tag_size, // >= 4 + void *work_area, uint32_t nb_blocks, // >= 8 + uint32_t nb_iterations, + const uint8_t *password, uint32_t password_size, + const uint8_t *salt, uint32_t salt_size, // >= 8 + const uint8_t *key, uint32_t key_size, + const uint8_t *ad, uint32_t ad_size); + +/////////////// +/// X-25519 /// +/////////////// +int crypto_x25519(uint8_t shared_secret [32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32]); + +void crypto_x25519_public_key(uint8_t public_key[32], + const uint8_t secret_key[32]); + + +/////////////// +/// Ed25519 /// +/////////////// +void crypto_sign_public_key(uint8_t public_key[32], + const uint8_t secret_key[32]); + +void crypto_sign(uint8_t signature[64], + const uint8_t secret_key[32], + const uint8_t public_key[32], // optional, may be 0 + const uint8_t *message, size_t message_size); + +int crypto_check(const uint8_t signature[64], + const uint8_t public_key[32], + const uint8_t *message, size_t message_size); + +//////////////////// +/// Key exchange /// +//////////////////// +int crypto_key_exchange(uint8_t shared_key [32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32]); + 
+//////////////////////////////// +/// Authenticated encryption /// +//////////////////////////////// +void crypto_aead_lock(uint8_t mac[16], + uint8_t *ciphertext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *ad , size_t ad_size, + const uint8_t *plaintext, size_t text_size); + +int crypto_aead_unlock(uint8_t *plaintext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t mac[16], + const uint8_t *ad , size_t ad_size, + const uint8_t *ciphertext, size_t text_size); + +void crypto_lock(uint8_t *box, // text_size + 16 + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size); + +int crypto_unlock(uint8_t *plaintext, // box_size - 16 + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *box, + size_t box_size); + +#endif // MONOCYPHER_H
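Usage sketch: a minimal example of the high-level crypto_lock()/crypto_unlock() API declared in monocypher.h above, assuming the program is linked against the libmonocypher.a target added in libs/monocypher.mk and that the include path below matches the vendored location. The key and nonce literals are placeholders for illustration only; real code must use a randomly generated key (or one derived via crypto_key_exchange()) and a unique nonce per message.

    #include <stdio.h>
    #include "libs/monocypher/monocypher.h"

    int main()
    {
        uint8_t key  [32] = { 1 };   // placeholder; use a real random key
        uint8_t nonce[24] = { 2 };   // placeholder; never reuse with the same key

        const uint8_t msg[] = "hello medfall";
        uint8_t box[16 + sizeof(msg)];               // 16-byte MAC followed by the ciphertext
        crypto_lock(box, key, nonce, msg, sizeof(msg));

        uint8_t out[sizeof(msg)];
        if (crypto_unlock(out, key, nonce, box, sizeof(box)) != 0) {
            fprintf(stderr, "rejected: corrupted or forged box\n");
            return 1;
        }
        printf("%s\n", (const char *)out);           // prints "hello medfall"
        return 0;
    }

crypto_unlock() verifies the MAC in constant time before decrypting, so a non-zero return means the box must be discarded without looking at the plaintext buffer.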