medfall

A super great game engine

commit 824c7244d139e2dc8680ce5c22c24415ae730b2e
parent ac2727f3fc9b64f4f3ce9799e1ea60c72f96f002
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Tue Jul 25 19:21:28 +0300

Update monocypher to 1.0

Diffstat:
libs/monocypher/LICENCE.md    |   6 ++++++
libs/monocypher/README        | 107 -------------------------------------------------------------------------------
libs/monocypher/README.md     | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
libs/monocypher/monocypher.cc | 595 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------
libs/monocypher/monocypher.h  |  39 ++++++++++++++++++++-------------------
5 files changed, 491 insertions(+), 373 deletions(-)
diff --git a/libs/monocypher/LICENCE.md b/libs/monocypher/LICENCE.md
@@ -0,0 +1,6 @@
+Copying and distribution of the code, with or without modification,
+are permitted in any medium without royalty. This code is offered
+as-is, without any warranty.
+
+(Simply put, do whatever you want with the code.
+ I'm not responsible if anything goes wrong.)
diff --git a/libs/monocypher/README b/libs/monocypher/README
@@ -1,107 +0,0 @@
-Authors
--------
-
-Packaged by Loup Vaillant.
-
-- Chacha20: Loup Vaillant, implemented from spec.
-- Poly1305: Loup Vaillant, implemented from spec.
-- Blake2b:  derived from https://tools.ietf.org/html/rfc7693
-- Argon2i:  Loup Vaillant, implemented from spec.
-- X25519:   taken from SUPERCOP ref10.
-- ed25519:  adapted http://tweetnacl.cr.yp.to/ for ref10 arithmetic.
-- High-level constructions: Loup Vaillant, implemented from specs and
-  first principles
-
-Licence
--------
-
-For everything *but* Blake2b:
-
-  Copying and distribution of the code, with or without modification,
-  are permitted in any medium without royalty. This code is offered
-  as-is, without any warranty.
-
----
-
-For the Blake2b code:
-
-  Copyright (c) 2015 IETF Trust and the persons identified as authors
-  of the code. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-  - Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  - Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in
-    the documentation and/or other materials provided with the
-    distribution.
-
-  - Neither the name of Internet Society, IETF or IETF Trust, nor the
-    names of specific contributors, may be used to endorse or promote
-    products derived from this software without specific prior written
-    permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-  CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES,
-  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
-  THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-  SUCH DAMAGE.
-
-
-Current status
---------------
-
-0.2 Interfaces should stabilise. Needs external review.
-
-
-Test suite
-----------
-
-    $ make
-    $ ./test
-
-It should display a nice printout of all the tests, all starting with
-"OK". If you see "FAILURE" anywhere, something has gone very wrong
-somewhere.
-
-*Do not* use Monocypher without having run the test suite at least
-once.
-
-
-Integration to your project
----------------------------
-
-Just copy monocypher.c and monocypher.h.
-
-Compile as C99, C11, C++98, C++11, C++14, and C++17. (Tested with
-gcc 5.4.0 and clang 2.8.0 on GNU/Linux.)
-
-
-Customisation
--------------
-
-If you want to use ed25519 with the official SHA-512 hash instead of
-the default Blake2b, do as the test suite does:
-
-- Compile monocypher.c with option -DED25519_SHA512, or modify the
-  relevant preprocessor directives at the beginning of monocypher.c.
-
-- Link the final program with a suitable SHA-512 implementation. You
-  can use the sha512.c and sha512.h files provided here.
-
-Note that even though the default hash (Blake2b) is not widely used,
-it doesn't prevent you from upgrading to faster implementations if you
-need to. The Donna implementations of ed25519 for instance can use a
-custom hash.
diff --git a/libs/monocypher/README.md b/libs/monocypher/README.md
@@ -0,0 +1,117 @@
+Monocypher
+----------
+
+Monocypher is an easy to use, easy to deploy, auditable crypto library
+inspired by [libsodium][] and [TweetNaCl][], written in portable C.
+
+It means to eat libsodium's lunch.
+
+[Official site.](http://loup-vaillant.fr/projects/monocypher/)
+
+[libsodium]: http://libsodium.org
+[TweetNaCl]: http://tweetnacl.cr.yp.to/
+
+Installation
+------------
+
+just copy `src/monocypher.h` and `src/monocypher.c` into your project.
+
+They compile as C99, C11, C++98, C++11, C++14, and C++17. (Tested with
+gcc 5.4.0 and clang 2.8.0 on GNU/Linux.)
+
+Test suite
+----------
+
+    $ make all
+    $ ./test.sh
+
+It should display a nice printout of all the tests, all starting with
+"OK". If you see "FAILURE" anywhere, something has gone very wrong
+somewhere. Note: the fuzz tests depend on libsodium. Install it
+before you run them.
+
+To run only the self contained tests, run
+
+    $ make vectors properties
+    $ ./vectors
+    $ ./properties
+
+To run only the edDSA fuzz tests (compares Monocypher with
+[ed25519-donna][donna]), run
+
+    $ make donna
+    $ ./donna
+
+*Do not* use Monocypher without running the self contained tests at
+ least once.
+
+[donna]: https://github.com/floodyberry/ed25519-donna
+
+
+### More serious testing
+
+The makefile may be modified to activate sanitising. Just run the
+previous tests under the various sanitisers.
+
+You can also use Valgrind:
+
+    $ valgrind ./vectors
+    $ valgrind ./properties
+    $ valgrind ./donna
+    $ valgrind ./sodium
+
+### Serious auditing
+
+The code may be analysed more formally with [Frama-c][] and the
+[TIS interpreter][TIS]. To analyse the code with Frama-c, run:
+
+    $ make formal-analysis
+    $ ./frama-c.sh
+
+This will have Frama-c parse, and analyse the code, then launch a GUI.
+You must have Frama-c installed. See `frama-c.sh` for the recommended
+settings. To run the code under the TIS interpreter, run
+
+    $ make formal-analysis
+    $ cd formal-analysis
+    $ tis-interpreter.sh *.c
+
+(Note: `tis-interpreter.sh` is part of TIS. If it is not in your
+path, adjust the command accordingly.)
+
+[Frama-c]:http://frama-c.com/
+[TIS]: http://trust-in-soft.com/tis-interpreter/
+
+
+Speed benchmark
+---------------
+
+    $ make speed
+    $ ./speed
+
+It should tell you how well Monocypher fares against libsodium.
+Results may vary between platforms. Requires the POSIX
+`clock_gettime()` function, which is generally disabled when using
+-std=C99 for strict C compliance. (See the makefile to modify it.)
+
+To make sure the benchmark is fair, compile libsodium with suitable
+optimisation levels. (My first benchmarks made libsodium look really
+bad with Argon2i).
+
+
+Customisation
+-------------
+
+If you want to use ed25519 with the official SHA-512 hash instead of
+the default Blake2b, do as the test suite does:
+
+- Compile monocypher.c with option -DED25519_SHA512, or modify the
+  relevant preprocessor directives at the beginning of monocypher.c.
+
+- Link the final program with a suitable SHA-512 implementation. You
+  can use the sha512.c and sha512.h files provided here.
+
+Note that even though the default hash (Blake2b) is not "standard",
+you can still upgrade to faster implementations if you really need to.
+The Donna implementations of ed25519 for instance can use a custom
+hash —one test does just that.
diff --git a/libs/monocypher/monocypher.cc b/libs/monocypher/monocypher.cc
@@ -4,9 +4,9 @@
 /// Utilities ///
 /////////////////
-// By default, signatures use blake2b. SHA-512 is provided as an
-// option for full ed25519 compatibility (a must for test vectors).
-// Compile with option -DED25519_SHA512 to use with sha512 If you do
+// By default, EdDSA signatures use blake2b. SHA-512 is provided as
+// an option for full ed25519 compatibility (a must for test vectors).
+// Compile with option -DED25519_SHA512 to use with sha512. If you do
 // so, you must provide the "sha512" header with suitable functions.
 #ifdef ED25519_SHA512
     #include "sha512.h"
@@ -21,9 +21,8 @@
 #define HASH_UPDATE COMBINE2(HASH, _update)
 #define HASH_FINAL  COMBINE2(HASH, _final)

-#define FOR(i, start, end) for (size_t i = start; i < end; i++)
+#define FOR(i, start, end) for (size_t (i) = (start); (i) < (end); (i)++)
 #define sv static void
-typedef  int8_t   i8;
 typedef uint8_t   u8;
 typedef uint32_t u32;
 typedef  int32_t i32;
@@ -98,17 +97,26 @@ int crypto_zerocmp(const u8 *p, size_t n)

 sv chacha20_rounds(u32 out[16], const u32 in[16])
 {
-    FOR (i, 0, 16) { out[i] = in[i]; }
+    // The temporary variables make Chacha20 10% faster.
+    u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
+    u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
+    u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
+    u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];
     FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
-        QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0
-        QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1
-        QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2
-        QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3
-        QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1
-        QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2
-        QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3
-        QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4
+        QUARTERROUND(t0, t4, t8 , t12); // column 0
+        QUARTERROUND(t1, t5, t9 , t13); // column 1
+        QUARTERROUND(t2, t6, t10, t14); // column 2
+        QUARTERROUND(t3, t7, t11, t15); // column 3
+        QUARTERROUND(t0, t5, t10, t15); // diagonal 0
+        QUARTERROUND(t1, t6, t11, t12); // diagonal 1
+        QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
+        QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
     }
+    out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
+    out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
+    out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
+    out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
 }

 sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
@@ -122,8 +130,6 @@ sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
     FOR (i, 0, 8) {
         ctx->input[i+4] = load32_le(key + i*4);
     }
-    // pool index (the random pool starts empty)
-    ctx->pool_index = 64;
 }

 void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16])
@@ -147,21 +153,27 @@ void crypto_chacha20_init(crypto_chacha_ctx *ctx,
                           const u8           nonce[8])
 {
     chacha20_init_key(ctx, key );           // key
-    ctx->input[12] = 0;                     // counter
-    ctx->input[13] = 0;                     // counter
+    crypto_chacha20_set_ctr(ctx, 0);        // counter
     ctx->input[14] = load32_le(nonce + 0);  // nonce
     ctx->input[15] = load32_le(nonce + 4);  // nonce
 }

-void crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
-                           const u8           key[32],
-                           const u8           nonce[24])
+void crypto_chacha20_x_init(crypto_chacha_ctx *ctx,
+                            const u8           key[32],
+                            const u8           nonce[24])
 {
     u8 derived_key[32];
     crypto_chacha20_H(derived_key, key, nonce);
     crypto_chacha20_init(ctx, derived_key, nonce + 16);
 }

+void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, u64 ctr)
+{
+    ctx->input[12] = ctr & 0xffffffff;
+    ctx->input[13] = ctr >> 32;
+    ctx->pool_index = 64; // The random pool (re)starts empty
+}
+
 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
                              u8                *cipher_text,
                              const u8          *plain_text,
@@ -179,8 +191,7 @@ void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
             // update the counters
             ctx->pool_index = 0;
             ctx->input[12]++;
-            if (!ctx->input[12])
-                ctx->input[13]++;
+            if (ctx->input[12] == 0) { ctx->input[13]++; }
         }
         // use the pool for encryption (or random stream)
         cipher_text[i] =
@@ -191,10 +202,9 @@ void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
 }

 void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
-                            u8 *cipher_text,
-                            size_t message_size)
+                            uint8_t *stream, size_t size)
 {
-    crypto_chacha20_encrypt(ctx, cipher_text, 0, message_size);
+    crypto_chacha20_encrypt(ctx, stream, 0, size);
 }

@@ -280,7 +290,7 @@ void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
             poly_clear_c(ctx);
         }
         // feed the input buffer
-        ctx->c[ctx->c_index / 4] |= msg[i] << ((ctx->c_index % 4) * 8);
+        ctx->c[ctx->c_index / 4] |= (u32)msg[i] << ((ctx->c_index % 4) * 8);
         ctx->c_index++;
     }
 }

@@ -292,7 +302,7 @@ void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
         // move the final 1 according to remaining input length
         // (We may add less than 2^130 to the last input block)
         ctx->c[4] = 0;
-        ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8);
+        ctx->c[ctx->c_index / 4] |= (u32)1 << ((ctx->c_index % 4) * 8);
         // one last hash update
         poly_block(ctx);
     }
@@ -315,8 +325,8 @@ void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
     u += (i64)(ctx->h[3]) + ctx->pad[3];  store32_le(mac + 12, u);
 }

-void crypto_poly1305_auth(u8 mac[16], const u8 *msg,
-                          size_t     msg_size, const u8 key[32])
+void crypto_poly1305_auth(u8 mac[16], const u8 *msg,
+                          size_t msg_size, const u8 key[32])
 {
     crypto_poly1305_ctx ctx;
     crypto_poly1305_init  (&ctx, key);
@@ -325,93 +335,101 @@ void crypto_poly1305_auth(u8 mac[16], const u8 *msg,
 }

 ////////////////
-/// Blake2 b /// (taken from the reference
-////////////////  implentation in RFC 7693)
-
-// Initialization Vector.
-static const u64 blake2b_iv[8] = {
+/// Blake2 b ///
+////////////////
+static const u64 iv[8] = {
     0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
     0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 };

-// increment a 128-bit "word".
-sv incr(u64 x[2], u64 y)
+// increment the input offset
+sv incr(crypto_blake2b_ctx *ctx)
 {
-    x[0] += y;                 // increment the low word
-    if (x[0] < y) { x[1]++; }  // handle overflow
+    u64   *x = ctx->input_offset;
+    size_t y = ctx->buffer_idx;
+    x[0] += y;                 // increment low word
+    if (x[0] < y) { x[1]++; }  // carry overflow to high word
 }

-sv blake2b_compress(crypto_blake2b_ctx *ctx, int last_block)
+// pad the buffer with zeroes
+sv pad(crypto_blake2b_ctx *ctx)
+{
+    FOR (i, ctx->buffer_idx, 128) { ctx->buffer[i] = 0; }
+    ctx->buffer_idx = 128; // mark the buffer as filled
+}
+
+sv compress(crypto_blake2b_ctx *ctx, int is_last_block)
 {
     static const u8 sigma[12][16] = {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
     };

-    // init work variables (before shuffling them)
+    // load input buffer
+    u64 input[16];
+    FOR(i, 0, 16) { input[i] = load64_le(ctx->buffer + i*8); }
+
+    // init work vector
     u64 v[16];
     FOR (i, 0, 8) {
-        v[i    ] = ctx->hash[i];
-        v[i + 8] = blake2b_iv[i];
+        v[i  ] = ctx->hash[i];
+        v[i+8] = iv[i];
     }
-    v[12] ^= ctx->input_size[0]; // low 64 bits of offset
-    v[13] ^= ctx->input_size[1]; // high 64 bits
-    if (last_block) { v[14] = ~v[14]; }
+    v[12] ^= ctx->input_offset[0];
+    v[13] ^= ctx->input_offset[1];
+    if (is_last_block) { v[14] = ~v[14]; }

-    // load the input buffer
-    u64 m[16];
-    FOR (i ,0, 16) { m[i] = load64_le(&ctx->buf[i * 8]); }
-
-    // shuffle the work variables with the 12 rounds
+    // mangle work vector
     FOR (i, 0, 12) {
-#define B2B_G(a, b, c, d, x, y)                                    \
-        v[a] += v[b] + x;  v[d] ^= v[a];  v[d] = rotr64(v[d], 32); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 24); \
-        v[a] += v[b] + y;  v[d] ^= v[a];  v[d] = rotr64(v[d], 16); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 63)
-
-        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+#define BLAKE2_G(v, a, b, c, d, x, y)                       \
+        v[a] += v[b] + x;  v[d] = rotr64(v[d] ^ v[a], 32);  \
+        v[c] += v[d];      v[b] = rotr64(v[b] ^ v[c], 24);  \
+        v[a] += v[b] + y;  v[d] = rotr64(v[d] ^ v[a], 16);  \
+        v[c] += v[d];      v[b] = rotr64(v[b] ^ v[c], 63);  \
+
+        BLAKE2_G(v, 0, 4,  8, 12, input[sigma[i][ 0]], input[sigma[i][ 1]]);
+        BLAKE2_G(v, 1, 5,  9, 13, input[sigma[i][ 2]], input[sigma[i][ 3]]);
+        BLAKE2_G(v, 2, 6, 10, 14, input[sigma[i][ 4]], input[sigma[i][ 5]]);
+        BLAKE2_G(v, 3, 7, 11, 15, input[sigma[i][ 6]], input[sigma[i][ 7]]);
+        BLAKE2_G(v, 0, 5, 10, 15, input[sigma[i][ 8]], input[sigma[i][ 9]]);
+        BLAKE2_G(v, 1, 6, 11, 12, input[sigma[i][10]], input[sigma[i][11]]);
+        BLAKE2_G(v, 2, 7,  8, 13, input[sigma[i][12]], input[sigma[i][13]]);
+        BLAKE2_G(v, 3, 4,  9, 14, input[sigma[i][14]], input[sigma[i][15]]);
     }

-    // accumulate the work variables into the hash
+    // update hash
     FOR (i, 0, 8) { ctx->hash[i] ^= v[i] ^ v[i+8]; }
+
+    // mark buffer as empty
+    ctx->buffer_idx = 0;
 }

 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,
                                  const u8 *key, size_t key_size)
 {
-    // Initial hash == initialization vector...
-    FOR (i, 0, 8) { ctx->hash[i] = blake2b_iv[i]; }
-    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size; // ...mostly
+    // initial hash
+    FOR (i, 0, 8) { ctx->hash[i] = iv[i]; }
+    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size;

-    ctx->input_size[0] = 0;        // input count low word
-    ctx->input_size[1] = 0;        // input count high word
-    ctx->c             = 0;        // pointer within buffer
-    ctx->output_size   = out_size; // size of the final hash
+    ctx->input_offset[0] = 0;        // begining of the input, no offset
+    ctx->input_offset[1] = 0;        // begining of the input, no offset
+    ctx->buffer_idx      = 0;        // buffer is empty
+    ctx->hash_size       = out_size; // remember the hash size we want

-    // If there's a key, put it in the first block, then pad with zeroes
+    // if there is a key, the first block is that key
     if (key_size > 0) {
-        FOR (i, 0       , key_size) { ctx->buf[i] = key[i]; }
-        FOR (i, key_size, 128     ) { ctx->buf[i] = 0;      }
-        ctx->c = 128; // mark the block as used
+        crypto_blake2b_update(ctx, key, key_size);
+        pad(ctx);
     }
 }

@@ -423,28 +441,21 @@ void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
 void crypto_blake2b_update(crypto_blake2b_ctx *ctx, const u8 *in, size_t in_size)
 {
     FOR (i, 0, in_size) {
-        // If the buffer is full, increment the counters and
-        // add (compress) the current buffer to the hash
-        if (ctx->c == 128) {
-            ctx->c = 0;
-            incr(ctx->input_size, 128);
-            blake2b_compress(ctx, 0); // not last time -> 0
+        if (ctx->buffer_idx == 128) { // If buffer is full,
+            incr(ctx);                // update the input offset
+            compress(ctx, 0);         // compress the (not last) block
         }
-        // By now the buffer is not full. We add one input byte.
-        ctx->buf[ctx->c] = in[i];
-        ctx->c++;
+        ctx->buffer[ctx->buffer_idx] = in[i];
+        ctx->buffer_idx++;
     }
 }

 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *out)
 {
-    // update input size, pad then compress the buffer
-    incr(ctx->input_size, ctx->c);
-    FOR (i, ctx->c, 128) { ctx->buf[i] = 0; }
-    blake2b_compress(ctx, 1); // last time -> 1
-
-    // copy the hash in the output (little endian of course)
-    FOR (i, 0, ctx->output_size) {
+    incr(ctx);        // update the input offset (the last block may not be full)
+    pad(ctx);         // pad the last block with zeroes
+    compress(ctx, 1); // compress the last block
+    FOR (i, 0, ctx->hash_size) {
         out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xff;
     }
 }

@@ -492,6 +503,8 @@ sv store_block(u8 bytes[1024], const block *b)
     FOR (i, 0, 128) { store64_le(bytes + i*8, b->a[i]); }
 }

+// type of copy_block() and xor_block()
+typedef void (*copy_fun) (block*, const block*);
 sv copy_block(block *o, const block *in) { FOR (i, 0, 128) o->a[i] =  in->a[i]; }
 sv xor_block (block *o, const block *in) { FOR (i, 0, 128) o->a[i] ^= in->a[i]; }

@@ -567,18 +580,26 @@ sv g_rounds(block *work_block)
     }
 }

-// The compression function G
-// may overwrite result completely (xcopy == copy_block),
-// or XOR result with the old block (xcopy == xor_block)
-sv binary_g(block *result, const block *x, const block *y,
-            void (*xcopy) (block*, const block*))
+// The compression function G (copy version for the first pass)
+sv g_copy(block *result, const block *x, const block *y)
 {
     block tmp;
     copy_block(&tmp, x);     // tmp    = X
     xor_block (&tmp, y);     // tmp    = X ^ Y = R
-    xcopy(result, &tmp);     // result = R (or R ^ old)
+    copy_block(result, &tmp);// result = R
     g_rounds(&tmp);          // tmp    = Z
-    xor_block(result, &tmp); // result = R ^ Z (or R ^ old ^ Z)
+    xor_block(result, &tmp); // result = R ^ Z
+}
+
+// The compression function G (xor version for subsequent passes)
+sv g_xor(block *result, const block *x, const block *y)
+{
+    block tmp;
+    copy_block(&tmp, x);     // tmp    = X
+    xor_block (&tmp, y);     // tmp    = X ^ Y = R
+    xor_block(result, &tmp); // result = R ^ old
+    g_rounds(&tmp);          // tmp    = Z
+    xor_block(result, &tmp); // result = R ^ old ^ Z
 }

 // unary version of the compression function.
@@ -657,24 +678,22 @@ static u32 gidx_next(gidx_ctx *ctx)
     // Pass 1+: 3 last segments plus already constructed
     //          blocks in this segment. THE SPEC SUGGESTS OTHERWISE.
     //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
-    int first_pass = ctx->pass_number == 0;
-    u32 slice_size = ctx->nb_blocks / 4;
-    u32 area_size  = ((first_pass ? ctx->slice_number : 3)
-                      * slice_size + offset - 1);
+    int first_pass  = ctx->pass_number == 0;
+    u32 slice_size  = ctx->nb_blocks / 4;
+    u32 nb_segments = first_pass ? ctx->slice_number : 3;
+    u32 area_size   = nb_segments * slice_size + offset - 1;

     // Computes the starting position of the reference area.
     // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
     // NEXT SEGMENT, NOT THE NEXT BLOCK.
-    u32 next_slice = (ctx->slice_number == 3
-                      ? 0
-                      : (ctx->slice_number + 1) * slice_size);
+    u32 next_slice = ((ctx->slice_number + 1) % 4) * slice_size;
     u32 start_pos  = first_pass ? 0 : next_slice;

     // Generate offset from J1 (no need for J2, there's only one lane)
     u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number
     u64 x  = (j1 * j1)       >> 32;
     u64 y  = (area_size * x) >> 32;
-    u64 z  = area_size - 1 - y;
+    u64 z  = (area_size - 1) - y;

     return (start_pos + z) % ctx->nb_blocks;
 }

@@ -732,8 +751,6 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
     // fill (then re-fill) the rest of the blocks
     FOR (pass_number, 0, nb_iterations) {
         int first_pass = pass_number == 0;
-        // Simple copy on pass 0, XOR instead of overwrite on subsequent passes
-        void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block;

         FOR (segment, 0, 4) {
             gidx_ctx ctx;
@@ -742,17 +759,19 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
             // On the first segment of the first pass,
             // blocks 0 and 1 are already filled.
             // We use the offset to skip them.
-            u32 offset = first_pass && segment == 0 ? 2 : 0;
-            // current, reference, and previous are block indices
-            FOR (current,
-                 segment * segment_size + offset,
-                 (segment + 1) * segment_size) {
-                u32 previous  = current == 0 ? nb_blocks - 1 : current - 1;
-                u32 reference = gidx_next(&ctx);
-                binary_g(blocks + current,
-                         blocks + previous,
-                         blocks + reference,
-                         xcopy);
+            u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
+            u32 segment_start = segment * segment_size + start_offset;
+            u32 segment_end   = (segment + 1) * segment_size;
+            FOR (current_block, segment_start, segment_end) {
+                u32 reference_block = gidx_next(&ctx);
+                u32 previous_block  = current_block == 0
+                                    ? nb_blocks - 1
+                                    : current_block - 1;
+                block *c = blocks + current_block;
+                block *p = blocks + previous_block;
+                block *r = blocks + reference_block;
+                if (first_pass) { g_copy(c, p, r); }
+                else            { g_xor (c, p, r); }
             }
         }
     }

@@ -766,7 +785,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
 ////////////////////////////////////
 /// Arithmetic modulo 2^255 - 19 ///
 ////////////////////////////////////
 // Taken from Supercop's ref10 implementation.
-// A bit bigger than TweetNaCl, over 6 times faster.
+// A bit bigger than TweetNaCl, about 8 times faster.
 // field element
 typedef i32 fe[10];

@@ -778,7 +797,7 @@ sv fe_add (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] + g[i]; }
 sv fe_sub (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] - g[i]; }
 sv fe_copy(fe h, const fe f            ) { FOR (i, 0, 10) h[i] = f[i];        }

-sv fe_cswap(fe f, fe g, u32 b)
+sv fe_cswap(fe f, fe g, int b)
 {
     FOR (i, 0, 10) {
         i32 x = (f[i] ^ g[i]) & -b;
@@ -797,16 +816,16 @@ static u32 load24_le(const u8 s[3])
 sv fe_carry(fe h, i64 t[10])
 {
     i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
-    c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= (u64)c9 << 25;
-    c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1;      t[1] -= (u64)c1 << 25;
-    c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3;      t[3] -= (u64)c3 << 25;
-    c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5;      t[5] -= (u64)c5 << 25;
-    c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7;      t[7] -= (u64)c7 << 25;
-    c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0;      t[0] -= (u64)c0 << 26;
-    c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2;      t[2] -= (u64)c2 << 26;
-    c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4;      t[4] -= (u64)c4 << 26;
-    c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6;      t[6] -= (u64)c6 << 26;
-    c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8;      t[8] -= (u64)c8 << 26;
+    c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= c9 * (1 << 25);
+    c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1;      t[1] -= c1 * (1 << 25);
+    c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3;      t[3] -= c3 * (1 << 25);
+    c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5;      t[5] -= c5 * (1 << 25);
+    c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7;      t[7] -= c7 * (1 << 25);
+    c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0;      t[0] -= c0 * (1 << 26);
+    c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2;      t[2] -= c2 * (1 << 26);
+    c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4;      t[4] -= c4 * (1 << 26);
+    c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6;      t[6] -= c6 * (1 << 26);
+    c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8;      t[8] -= c8 * (1 << 26);
     FOR (i, 0, 10) { h[i] = t[i]; }
 }

@@ -826,12 +845,14 @@ sv fe_frombytes(fe h, const u8 s[32])
     fe_carry(h, t);
 }

-sv fe_mul121666(fe h, const fe f)
+sv fe_mul_small(fe h, const fe f, i32 g)
 {
     i64 t[10];
-    FOR(i, 0, 10) { t[i] = f[i] * (i64) 121666; }
+    FOR(i, 0, 10) { t[i] = f[i] * (i64) g; }
     fe_carry(h, t);
 }
+sv fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }
+sv fe_mul973324(fe h, const fe f) { fe_mul_small(h, f, 973324); }

 sv fe_mul(fe h, const fe f, const fe g)
 {
@@ -867,41 +888,97 @@ sv fe_mul(fe h, const fe f, const fe g)
     i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 +
              f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;

-    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
-    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= (u64)c0 << 26;
-    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= (u64)c4 << 26;
-    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= (u64)c1 << 25;
-    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= (u64)c5 << 25;
-    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= (u64)c2 << 26;
-    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= (u64)c6 << 26;
-    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= (u64)c3 << 25;
-    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= (u64)c7 << 25;
-    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= (u64)c4 << 26;
-    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= (u64)c8 << 26;
-    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= (u64)c9 << 25;
-    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= (u64)c0 << 26;
-
-    h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4;
-    h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9;
-}
-
-sv fe_sq(fe h, const fe f) { fe_mul(h, f, f); }
-
-// power to a power of 2 minus a small number.
-// Timing depends on pow_2 and minus, so they shall not be secret.
-sv fe_power(fe out, const fe z, int pow_2, u8 minus)
-{
-    minus--;
-    fe c; fe_copy(c, z);
-    for (int i = pow_2 - 2; i >= 0; i--) {
-        fe_sq(c, c);
-        if (i >= 8 || !((minus >> i) & 1))
-            fe_mul(c, c, z);
-    }
-    fe_copy(out, c);
+#define CARRY                                                             \
+    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                           \
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 * (1 << 26); \
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 * (1 << 26); \
+    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= c1 * (1 << 25); \
+    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= c5 * (1 << 25); \
+    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= c2 * (1 << 26); \
+    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= c6 * (1 << 26); \
+    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= c3 * (1 << 25); \
+    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= c7 * (1 << 25); \
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 * (1 << 26); \
+    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= c8 * (1 << 26); \
+    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 * (1 << 26); \
+    h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4;                \
+    h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9;                \
+
+    CARRY;
+}
+
+// we could use fe_mul() for this, but this is significantly faster
+sv fe_sq(fe h, const fe f)
+{
+    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
+    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
+    i32 f0_2  = f0*2;  i32 f1_2  = f1*2;  i32 f2_2 = f2*2;  i32 f3_2 = f3*2;
+    i32 f4_2  = f4*2;  i32 f5_2  = f5*2;  i32 f6_2 = f6*2;  i32 f7_2 = f7*2;
+    i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
+    i32 f8_19 = f8*19; i32 f9_38 = f9*38;
+
+    i64 h0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 +
+             f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
+    i64 h1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19 +
+             f4  *(i64)f7_38 + f5_2*(i64)f6_19;
+    i64 h2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38 +
+             f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
+    i64 h3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38 +
+             f5_2*(i64)f8_19 + f6  *(i64)f7_38;
+    i64 h4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2    +
+             f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
+    i64 h5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3    +
+             f6  *(i64)f9_38 + f7_2*(i64)f8_19;
+    i64 h6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4    +
+             f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
+    i64 h7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5    +
+             f3_2*(i64)f4    + f8  *(i64)f9_38;
+    i64 h8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6    +
+             f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
+    i64 h9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7    +
+             f3_2*(i64)f6    + f4  *(i64)f5_2;
+
+    CARRY;
+}
+
+// This could be simplified, but it would be slower
+sv fe_invert(fe out, const fe z)
+{
+    fe t0, t1, t2, t3;
+    fe_sq(t0, z );
+    fe_sq(t1, t0);
+    fe_sq(t1, t1);
+    fe_mul(t1,  z, t1);
+    fe_mul(t0, t0, t1);
+    fe_sq(t2, t0);                                fe_mul(t1 , t1, t2);
+    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
+    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
+    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
+    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
+    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
+}
+
+// This could be simplified, but it would be slower
+void fe_pow22523(fe out, const fe z)
+{
+    fe t0, t1, t2;
+    fe_sq(t0, z);
+    fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1);
+    fe_mul(t0, t0, t1);
+    fe_sq(t0, t0);                                fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t1, t1, t0);
+    fe_sq(t2, t1); FOR (i, 1,  20) fe_sq(t2, t2); fe_mul(t1, t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t1, t1, t0);
+    fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t0, t0); FOR (i, 1,   2) fe_sq(t0, t0); fe_mul(out, t0, z);
+}

-sv fe_invert  (fe out, const fe z) { fe_power(out, z, 255, 21); }
-sv fe_pow22523(fe out, const fe z) { fe_power(out, z, 252,  3); }

 sv fe_tobytes(u8 s[32], const fe h)
 {
@@ -915,16 +992,16 @@ sv fe_tobytes(u8 s[32], const fe h)
     }
     t[0] += 19 * q;

-    i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= (u64)c0 << 26;
-    i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= (u64)c1 << 25;
-    i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= (u64)c2 << 26;
-    i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= (u64)c3 << 25;
-    i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= (u64)c4 << 26;
-    i32 c5 = t[5] >> 25; t[6] += c5; t[5] -= (u64)c5 << 25;
-    i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= (u64)c6 << 26;
-    i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= (u64)c7 << 25;
-    i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= (u64)c8 << 26;
-    i32 c9 = t[9] >> 25;             t[9] -= (u64)c9 << 25;
+    i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= c0 * (1 << 26);
+    i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= c1 * (1 << 25);
+    i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= c2 * (1 << 26);
+    i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= c3 * (1 << 25);
+    i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= c4 * (1 << 26);
+    i32 c5 = t[5] >> 25; t[6] += c5; t[5] -= c5 * (1 << 25);
+    i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= c6 * (1 << 26);
+    i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= c7 * (1 << 25);
+    i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= c8 * (1 << 26);
+    i32 c9 = t[9] >> 25;             t[9] -= c9 * (1 << 25);

     store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
     store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
@@ -962,18 +1039,8 @@ sv trim_scalar(u8 s[32])
     s[31] |= 64;
 }

-int crypto_x25519(u8       shared_secret   [32],
-                  const u8 your_secret_key [32],
-                  const u8 their_public_key[32])
+sv x25519_ladder(const fe x1, fe x2, fe z2, fe x3, fe z3, const u8 scalar[32])
 {
-    // computes the scalar product
-    fe x1, x2, z2, x3, z3;
-    fe_frombytes(x1, their_public_key);
-
-    // restrict the possible scalar values
-    u8 e[32]; FOR (i, 0, 32) { e[i] = your_secret_key[i]; }
-    trim_scalar(e);
-
     // Montgomery ladder
     // In projective coordinates, to avoid divisons: x = X / Z
     // We don't care about the y coordinate, it's only 1 bit of information
@@ -982,7 +1049,7 @@ int crypto_x25519(u8 shared_secret [32],
     int swap = 0;
     for (int pos = 254; pos >= 0; --pos) {
         // constant time conditional swap before ladder step
-        int b = (e[pos / 8] >> (pos & 7)) & 1;
+        int b = (scalar[pos / 8] >> (pos & 7)) & 1;
         swap ^= b; // xor trick avoids swapping at the end of the loop
         fe_cswap(x2, x3, swap);
         fe_cswap(z2, z3, swap);
@@ -1001,6 +1068,22 @@ int crypto_x25519(u8 shared_secret [32],
     // last swap is necessary to compensate for the xor trick
     fe_cswap(x2, x3, swap);
     fe_cswap(z2, z3, swap);
+}
+
+int crypto_x25519(u8       shared_secret   [32],
+                  const u8 your_secret_key [32],
+                  const u8 their_public_key[32])
+{
+    // computes the scalar product
+    fe x1, x2, z2, x3, z3;
+    fe_frombytes(x1, their_public_key);
+
+    // restrict the possible scalar values
+    u8 e[32]; FOR (i, 0, 32) { e[i] = your_secret_key[i]; }
+    trim_scalar(e);
+
+    // computes the actual scalar product (the result is in x2 and z2)
+    x25519_ladder(x1, x2, z2, x3, z3, e);

     // normalises the coordinates: x == X / Z
     fe_invert(z2, z2);
@@ -1038,14 +1121,6 @@ sv ge_from_xy(ge *p, const fe x, const fe y)
     fe_mul(p->T, x, y);
 }

-sv ge_cswap(ge *p, ge *q, u32 b)
-{
-    fe_cswap(p->X, q->X, b);
-    fe_cswap(p->Y, q->Y, b);
-    fe_cswap(p->Z, q->Z, b);
-    fe_cswap(p->T, q->T, b);
-}
-
 sv ge_tobytes(u8 s[32], const ge *h)
 {
     fe recip, x, y;
@@ -1094,9 +1169,9 @@ static int ge_frombytes_neg(ge *h, const u8 s[32])
         fe_mul(h->X, h->X, sqrtm1);
     }

-    if (fe_isnegative(h->X) == (s[31] >> 7))
+    if (fe_isnegative(h->X) == (s[31] >> 7)) {
         fe_neg(h->X, h->X);
-
+    }
     fe_mul(h->T, h->X, h->Y);
     return 0;
 }

@@ -1126,27 +1201,52 @@ sv ge_add(ge *s, const ge *p, const ge *q)
     fe_mul(s->T, e, h);  // T3 = E * H
 }

-sv ge_double(ge *s, const ge *p) { ge_add(s, p, p); }
-
 sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32])
 {
-    // Simple Montgomery ladder, with straight double and add.
-    ge t;
-    fe_0(p->X);  fe_copy(t.X, q->X);
-    fe_1(p->Y);  fe_copy(t.Y, q->Y);
-    fe_1(p->Z);  fe_copy(t.Z, q->Z);
-    fe_0(p->T);  fe_copy(t.T, q->T);
-    int swap = 0;
-    for (int i = 255; i >= 0; i--) {
-        int b = (scalar[i/8] >> (i & 7)) & 1;
-        swap ^= b; // xor trick avoids unnecessary swaps
-        ge_cswap(p, &t, swap);
-        swap = b;
-        ge_add(&t, &t, p);
-        ge_double(p, p);
-    }
-    // one last swap makes up for the xor trick
-    ge_cswap(p, &t, swap);
+    // sqrt(-486664)
+    static const fe K = { 54885894, 25242303, 55597453,  9067496, 51808079,
+                          33312638, 25456129, 14121551, 54921728,  3972023 };
+
+    // convert q to montgomery format
+    fe x1, y1, z1, x2, z2, x3, z3, t1, t2, t3, t4;
+    fe_sub(z1, q->Z, q->Y);  fe_mul(z1, z1, q->X);  fe_invert(z1, z1);
+    fe_add(t1, q->Z, q->Y);
+    fe_mul(x1, q->X, t1 );   fe_mul(x1, x1, z1);
+    fe_mul(y1, q->Z, t1 );   fe_mul(y1, y1, z1);  fe_mul(y1, K, y1);
+    fe_1(z1); // implied in the ladder, needed to convert back.
+
+    // montgomery scalarmult
+    x25519_ladder(x1, x2, z2, x3, z3, scalar);
+
+    // recover the y1 coordinate (Katsuyuki Okeya & Kouichi Sakurai, 2001)
+    fe_mul(t1, x1, z2);  // t1 = x1 * z2
+    fe_add(t2, x2, t1);  // t2 = x2 + t1
+    fe_sub(t3, x2, t1);  // t3 = x2 − t1
+    fe_sq (t3, t3);      // t3 = t3^2
+    fe_mul(t3, t3, x3);  // t3 = t3 * x3
+    fe_mul973324(t1, z2);// t1 = 2a * z2
+    fe_add(t2, t2, t1);  // t2 = t2 + t1
+    fe_mul(t4, x1, x2);  // t4 = x1 * x2
+    fe_add(t4, t4, z2);  // t4 = t4 + z2
+    fe_mul(t2, t2, t4);  // t2 = t2 * t4
+    fe_mul(t1, t1, z2);  // t1 = t1 * z2
+    fe_sub(t2, t2, t1);  // t2 = t2 − t1
+    fe_mul(t2, t2, z3);  // t2 = t2 * z3
+    fe_add(t1, y1, y1);  // t1 = y1 + y1
+    fe_mul(t1, t1, z2);  // t1 = t1 * z2
+    fe_mul(t1, t1, z3);  // t1 = t1 * z3
+    fe_mul(x1, t1, x2);  // x1 = t1 * x2
+    fe_sub(y1, t2, t3);  // y1 = t2 − t3
+    fe_mul(z1, t1, z2);  // z1 = t1 * z2
+
+    // convert back to twisted edwards
+    fe_sub(t1 , x1, z1);
+    fe_add(t2 , x1, z1);
+    fe_mul(x1 , K , x1);
+    fe_mul(p->X, x1, t2);
+    fe_mul(p->Y, y1, t1);
+    fe_mul(p->Z, y1, t2);
+    fe_mul(p->T, x1, t1);
 }

 sv ge_scalarmult_base(ge *p, const u8 scalar[32])
@@ -1176,7 +1276,7 @@ sv modL(u8 *r, i64 x[64])
         FOR (j, i-32, i-12) {
             x[j] += carry - 16 * x[i] * L[j - (i - 32)];
             carry = (x[j] + 128) >> 8;
-            x[j] -= (u64)carry << 8;
+            x[j] -= carry * (1 << 8);
         }
         x[i-12] += carry;
         x[i] = 0;
@@ -1278,11 +1378,11 @@ int crypto_check(const u8 signature[64],
 {
     ge A, p, sB, diff;
     u8 h_ram[64], R_check[32];
-    if (ge_frombytes_neg(&A, public_key)) return -1;     // -A
+    if (ge_frombytes_neg(&A, public_key)) { return -1; } // -A
     hash_ram(h_ram, signature, public_key, message, message_size);
-    ge_scalarmult(&p, &A, h_ram);              // p = -A*h_ram
+    ge_scalarmult(&p, &A, h_ram);                        // p = -A*h_ram
     ge_scalarmult_base(&sB, signature + 32);
-    ge_add(&diff, &p, &sB);                    // diff = s - A*h_ram
+    ge_add(&diff, &p, &sB);                              // diff = s - A*h_ram
     ge_tobytes(R_check, &diff);
     return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? OK : fail
 }

@@ -1324,7 +1424,7 @@ void crypto_aead_lock(u8 mac[16],
 {   // encrypt then mac
     u8 auth_key[32];
     crypto_chacha_ctx e_ctx;
-    crypto_chacha20_Xinit  (&e_ctx, key, nonce);
+    crypto_chacha20_x_init (&e_ctx, key, nonce);
     crypto_chacha20_stream (&e_ctx, auth_key, 32);
    crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size);
     authenticate2(mac, auth_key, ad, ad_size, ciphertext, text_size);
@@ -1339,28 +1439,29 @@ int crypto_aead_unlock(u8 *plaintext,
 {
     u8 auth_key[32], real_mac[16];
     crypto_chacha_ctx e_ctx;
-    crypto_chacha20_Xinit (&e_ctx, key, nonce);
+    crypto_chacha20_x_init(&e_ctx, key, nonce);
     crypto_chacha20_stream(&e_ctx, auth_key, 32);
     authenticate2(real_mac, auth_key, ad, ad_size, ciphertext, text_size);
-    if (crypto_memcmp(real_mac, mac, 16)) return -1;     // reject forgeries
+    if (crypto_memcmp(real_mac, mac, 16)) { return -1; } // reject forgeries
     crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size);
     return 0;
 }

-void crypto_lock(u8 *box,
+void crypto_lock(u8 mac[16],
+                 u8 *ciphertext,
                  const u8 key[32],
                  const u8 nonce[24],
-                 const u8 *plaintext,
-                 size_t text_size)
+                 const u8 *plaintext, size_t text_size)
 {
-    crypto_aead_lock(box, box + 16, key, nonce, 0, 0, plaintext, text_size);
+    crypto_aead_lock(mac, ciphertext, key, nonce, 0, 0, plaintext, text_size);
 }

 int crypto_unlock(u8 *plaintext,
                   const u8 key[32],
                   const u8 nonce[24],
-                  const u8 *box, size_t box_size)
+                  const u8 mac[16],
+                  const u8 *ciphertext, size_t text_size)
 {
-    return crypto_aead_unlock(plaintext, key, nonce, box, 0, 0,
-                              box + 16, box_size -16);
+    return crypto_aead_unlock(plaintext, key, nonce, mac, 0, 0,
+                              ciphertext, text_size);
 }
diff --git a/libs/monocypher/monocypher.h b/libs/monocypher/monocypher.h
@@ -29,9 +29,11 @@ void crypto_chacha20_init(crypto_chacha_ctx *ctx,
                           const uint8_t      key[32],
                           const uint8_t      nonce[8]);

-void crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
-                           const uint8_t      key[32],
-                           const uint8_t      nonce[24]);
+void crypto_chacha20_x_init(crypto_chacha_ctx *ctx,
+                            const uint8_t      key[32],
+                            const uint8_t      nonce[24]);
+
+void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, uint64_t ctr);

 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
                              uint8_t           *cipher_text,
@@ -39,8 +41,7 @@ void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
                              size_t             message_size);

 void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
-                            uint8_t *cipher_text,
-                            size_t message_size);
+                            uint8_t *stream, size_t size);

 /////////////////
 /// Poly 1305 ///
@@ -68,11 +69,11 @@ void crypto_poly1305_auth(uint8_t mac[16],
 /// Blake2 b ///
 ////////////////
 typedef struct {
-    uint8_t  buf[128];       // input buffer
-    uint64_t hash[8];        // chained state
-    uint64_t input_size[2];  // total number of bytes
-    uint8_t  c;              // pointer for buf[]
-    uint8_t  output_size;    // digest size
+    uint64_t hash[8];
+    uint64_t input_offset[2];
+    uint8_t  buffer[128];
+    size_t   buffer_idx;
+    size_t   hash_size;
 } crypto_blake2b_ctx;

 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,
@@ -113,9 +114,9 @@ void crypto_x25519_public_key(uint8_t public_key[32],
                               const uint8_t secret_key[32]);


-///////////////
-/// Ed25519 ///
-///////////////
+/////////////
+/// EdDSA ///
+/////////////
 void crypto_sign_public_key(uint8_t public_key[32],
                             const uint8_t secret_key[32]);

@@ -152,16 +153,16 @@ int crypto_aead_unlock(uint8_t *plaintext,
                        const uint8_t *ad        , size_t ad_size,
                        const uint8_t *ciphertext, size_t text_size);

-void crypto_lock(uint8_t *box,                    // text_size + 16
+void crypto_lock(uint8_t mac[16],
+                 uint8_t *ciphertext,
                  const uint8_t  key[32],
                 const uint8_t  nonce[24],
-                 const uint8_t *plaintext,
-                 size_t         text_size);
+                 const uint8_t *plaintext, size_t text_size);

-int crypto_unlock(uint8_t *plaintext,             // box_size - 16
+int crypto_unlock(uint8_t *plaintext,
                   const uint8_t  key[32],
                   const uint8_t  nonce[24],
-                  const uint8_t *box,
-                  size_t         box_size);
+                  const uint8_t  mac[16],
+                  const uint8_t *ciphertext, size_t text_size);

 #endif // MONOCYPHER_H
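
The header diff above replaces the old combined "box" layout (16-byte MAC
followed by the ciphertext, text_size + 16 bytes in all) with a detached-MAC
interface for crypto_lock() and crypto_unlock(). Below is a minimal sketch of
the new calling convention against the updated monocypher.h; the all-zero key
and nonce are placeholders, and a real caller needs a secret key and a unique
nonce per message:

    #include <stdio.h>
    #include "monocypher.h"

    int main(void)
    {
        uint8_t key  [32] = {0};  // placeholder: use a real secret key
        uint8_t nonce[24] = {0};  // placeholder: never reuse a nonce per key

        const uint8_t plaintext[] = "attack at dawn";  // includes the '\0'
        uint8_t ciphertext[sizeof plaintext];
        uint8_t mac[16];  // authentication tag, now separate from the ciphertext

        crypto_lock(mac, ciphertext, key, nonce, plaintext, sizeof plaintext);

        uint8_t decrypted[sizeof plaintext];
        if (crypto_unlock(decrypted, key, nonce, mac,
                          ciphertext, sizeof ciphertext) != 0) {
            fprintf(stderr, "corrupted or forged message\n");
            return 1;
        }
        printf("%s\n", decrypted);
        return 0;
    }

The ciphertext buffer is now exactly text_size bytes; callers who want the old
packed layout can still store the MAC and the ciphertext back to back
themselves.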
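
Two smaller interface changes also appear above: crypto_chacha20_Xinit() is
renamed to crypto_chacha20_x_init(), and the new crypto_chacha20_set_ctr()
positions the keystream at an arbitrary 64-byte block (it also empties the
internal pool, so the next output byte comes from the block the counter
names). A hedged sketch of random access into a long XChacha20-encrypted
stream follows; decrypt_at() is an illustrative helper, not part of the
library:

    #include "monocypher.h"

    // Decrypt bytes [offset, offset + length) of a long message without
    // generating all the keystream that precedes them.
    static void decrypt_at(uint8_t *out, const uint8_t *cipher, size_t length,
                           uint64_t offset,
                           const uint8_t key[32], const uint8_t nonce[24])
    {
        crypto_chacha_ctx ctx;
        crypto_chacha20_x_init (&ctx, key, nonce);   // was crypto_chacha20_Xinit
        crypto_chacha20_set_ctr(&ctx, offset / 64);  // one counter step = 64 bytes

        // Throw away the keystream between the block boundary and `offset`.
        uint8_t skipped[64];
        crypto_chacha20_stream(&ctx, skipped, (size_t)(offset % 64));

        // Decryption is the same operation as encryption.
        crypto_chacha20_encrypt(&ctx, out, cipher, length);
    }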