monocypher.cc (58201B)
1 #include "monocypher.h" 2 3 ///////////////// 4 /// Utilities /// 5 ///////////////// 6 7 // By default, EdDSA signatures use blake2b. SHA-512 is provided as 8 // an option for full ed25519 compatibility (a must for test vectors). 9 // Compile with option -DED25519_SHA512 to use with sha512. If you do 10 // so, you must provide the "sha512" header with suitable functions. 11 #ifdef ED25519_SHA512 12 #include "sha512.h" 13 #define HASH crypto_sha512 14 #else 15 #define HASH crypto_blake2b 16 #endif 17 #define COMBINE1(x, y) x ## y 18 #define COMBINE2(x, y) COMBINE1(x, y) 19 #define HASH_CTX COMBINE2(HASH, _ctx) 20 #define HASH_INIT COMBINE2(HASH, _init) 21 #define HASH_UPDATE COMBINE2(HASH, _update) 22 #define HASH_FINAL COMBINE2(HASH, _final) 23 24 #define FOR(i, start, end) for (size_t (i) = (start); (i) < (end); (i)++) 25 typedef uint8_t u8; 26 typedef uint32_t u32; 27 typedef int32_t i32; 28 typedef int64_t i64; 29 typedef uint64_t u64; 30 31 static u32 load24_le(const u8 s[3]) 32 { 33 return (u32)s[0] 34 | ((u32)s[1] << 8) 35 | ((u32)s[2] << 16); 36 } 37 38 static u32 load32_le(const u8 s[4]) 39 { 40 return (u32)s[0] 41 | ((u32)s[1] << 8) 42 | ((u32)s[2] << 16) 43 | ((u32)s[3] << 24); 44 } 45 46 static u64 load64_le(const u8 s[8]) 47 { 48 return (u64)s[0] 49 | ((u64)s[1] << 8) 50 | ((u64)s[2] << 16) 51 | ((u64)s[3] << 24) 52 | ((u64)s[4] << 32) 53 | ((u64)s[5] << 40) 54 | ((u64)s[6] << 48) 55 | ((u64)s[7] << 56); 56 } 57 58 static void store32_le(u8 out[4], u32 in) 59 { 60 out[0] = in & 0xff; 61 out[1] = (in >> 8) & 0xff; 62 out[2] = (in >> 16) & 0xff; 63 out[3] = (in >> 24) & 0xff; 64 } 65 66 static void store64_le(u8 out[8], u64 in) 67 { 68 out[0] = in & 0xff; 69 out[1] = (in >> 8) & 0xff; 70 out[2] = (in >> 16) & 0xff; 71 out[3] = (in >> 24) & 0xff; 72 out[4] = (in >> 32) & 0xff; 73 out[5] = (in >> 40) & 0xff; 74 out[6] = (in >> 48) & 0xff; 75 out[7] = (in >> 56) & 0xff; 76 } 77 78 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); } 79 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); } 80 81 int crypto_memcmp(const u8 *p1, const u8 *p2, size_t n) 82 { 83 unsigned diff = 0; 84 FOR (i, 0, n) { 85 diff |= (p1[i] ^ p2[i]); 86 } 87 return (1 & ((diff - 1) >> 8)) - 1; 88 } 89 90 int crypto_zerocmp(const u8 *p, size_t n) 91 { 92 unsigned diff = 0; 93 FOR (i, 0, n) { 94 diff |= p[i]; 95 } 96 return (1 & ((diff - 1) >> 8)) - 1; 97 } 98 99 ///////////////// 100 /// Chacha 20 /// 101 ///////////////// 102 #define QUARTERROUND(a, b, c, d) \ 103 a += b; d ^= a; d = rotl32(d, 16); \ 104 c += d; b ^= c; b = rotl32(b, 12); \ 105 a += b; d ^= a; d = rotl32(d, 8); \ 106 c += d; b ^= c; b = rotl32(b, 7) 107 108 static void chacha20_rounds(u32 out[16], const u32 in[16]) 109 { 110 // The temporary variables make Chacha20 10% faster. 111 u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3]; 112 u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7]; 113 u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11]; 114 u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15]; 115 116 FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. 
117 QUARTERROUND(t0, t4, t8 , t12); // column 0 118 QUARTERROUND(t1, t5, t9 , t13); // column 1 119 QUARTERROUND(t2, t6, t10, t14); // column 2 120 QUARTERROUND(t3, t7, t11, t15); // column 3 121 QUARTERROUND(t0, t5, t10, t15); // diagonal 0 122 QUARTERROUND(t1, t6, t11, t12); // diagonal 1 123 QUARTERROUND(t2, t7, t8 , t13); // diagonal 2 124 QUARTERROUND(t3, t4, t9 , t14); // diagonal 3 125 } 126 out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3; 127 out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7; 128 out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11; 129 out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; 130 } 131 132 static void chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32]) 133 { 134 // constant 135 ctx->input[0] = load32_le((u8*)"expa"); 136 ctx->input[1] = load32_le((u8*)"nd 3"); 137 ctx->input[2] = load32_le((u8*)"2-by"); 138 ctx->input[3] = load32_le((u8*)"te k"); 139 // key 140 FOR (i, 0, 8) { 141 ctx->input[i+4] = load32_le(key + i*4); 142 } 143 } 144 145 static u8 chacha20_pool_byte(crypto_chacha_ctx *ctx) 146 { 147 u32 pool_word = ctx->pool[ctx->pool_idx / 4]; 148 u8 pool_byte = pool_word >> (8*(ctx->pool_idx % 4)); 149 ctx->pool_idx++; 150 return pool_byte; 151 } 152 153 // Fill the pool if needed, update the counters 154 static void chacha20_refill_pool(crypto_chacha_ctx *ctx) 155 { 156 if (ctx->pool_idx == 64) { 157 chacha20_rounds(ctx->pool, ctx->input); 158 FOR (j, 0, 16) { 159 ctx->pool[j] += ctx->input[j]; 160 } 161 ctx->pool_idx = 0; 162 ctx->input[12]++; 163 if (ctx->input[12] == 0) { 164 ctx->input[13]++; 165 } 166 } 167 } 168 169 void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16]) 170 { 171 crypto_chacha_ctx ctx; 172 chacha20_init_key(&ctx, key); 173 FOR (i, 0, 4) { 174 ctx.input[i+12] = load32_le(in + i*4); 175 } 176 u32 buffer[16]; 177 chacha20_rounds(buffer, ctx.input); 178 // prevents reversal of the rounds by revealing only half of the buffer. 
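// (This is the HChaCha20 construction: only the constant words 0-3 and the
// input words 12-15 are written out below. Their initial values are public
// anyway, so skipping the usual feed-forward addition gives nothing away.)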
179 FOR (i, 0, 4) { 180 store32_le(out + i*4, buffer[i ]); // constant 181 store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce 182 } 183 } 184 185 void crypto_chacha20_init(crypto_chacha_ctx *ctx, 186 const u8 key[32], 187 const u8 nonce[8]) 188 { 189 chacha20_init_key (ctx, key); // key 190 crypto_chacha20_set_ctr(ctx, 0 ); // counter 191 ctx->input[14] = load32_le(nonce + 0); // nonce 192 ctx->input[15] = load32_le(nonce + 4); // nonce 193 } 194 195 void crypto_chacha20_x_init(crypto_chacha_ctx *ctx, 196 const u8 key[32], 197 const u8 nonce[24]) 198 { 199 u8 derived_key[32]; 200 crypto_chacha20_H(derived_key, key, nonce); 201 crypto_chacha20_init(ctx, derived_key, nonce + 16); 202 } 203 204 void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, u64 ctr) 205 { 206 ctx->input[12] = ctr & 0xffffffff; 207 ctx->input[13] = ctr >> 32; 208 ctx->pool_idx = 64; // The random pool (re)starts empty 209 } 210 211 void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, 212 u8 *cipher_text, 213 const u8 *plain_text, 214 size_t text_size) 215 { 216 // Align ourselves with 4 byte words 217 while (ctx->pool_idx % 4 != 0 && text_size > 0) { 218 u8 stream = chacha20_pool_byte(ctx); 219 u8 plain = 0; 220 if (plain_text != 0) { 221 plain = *plain_text; 222 plain_text++; 223 } 224 *cipher_text = stream ^ plain; 225 text_size--; 226 cipher_text++; 227 } 228 // Main processing by 4 byte chunks 229 size_t nb_words = text_size / 4; 230 size_t remainder = text_size % 4; 231 FOR (i, 0, nb_words) { 232 chacha20_refill_pool(ctx); 233 u32 txt = 0; 234 if (plain_text) { 235 txt = load32_le(plain_text); 236 plain_text += 4; 237 } 238 store32_le(cipher_text, ctx->pool[ctx->pool_idx / 4] ^ txt); 239 cipher_text += 4; 240 ctx->pool_idx += 4; 241 } 242 // Remaining input, byte by byte 243 FOR (i, 0, remainder) { 244 chacha20_refill_pool(ctx); 245 u8 stream = chacha20_pool_byte(ctx); 246 u8 plain = 0; 247 if (plain_text != 0) { 248 plain = *plain_text; 249 plain_text++; 250 } 251 *cipher_text = stream ^ plain; 252 cipher_text++; 253 } 254 } 255 256 void crypto_chacha20_stream(crypto_chacha_ctx *ctx, 257 uint8_t *stream, size_t size) 258 { 259 crypto_chacha20_encrypt(ctx, stream, 0, size); 260 } 261 262 263 ///////////////// 264 /// Poly 1305 /// 265 ///////////////// 266 267 // h = (h + c) * r 268 // preconditions: 269 // ctx->h <= 7_ffffffff_ffffffff_ffffffff_ffffffff 270 // ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff 271 // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff 272 // Postcondition: 273 // ctx->h <= 4_87ffffe4_8fffffe2_97ffffe0_9ffffffa 274 static void poly_block(crypto_poly1305_ctx *ctx) 275 { 276 // s = h + c, without carry propagation 277 const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe 278 const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe 279 const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe 280 const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe 281 const u64 s4 = ctx->h[4] + (u64)ctx->c[4]; // s4 <= 00000004 282 283 // Local all the things! 284 const u32 r0 = ctx->r[0]; // r0 <= 0fffffff 285 const u32 r1 = ctx->r[1]; // r1 <= 0ffffffc 286 const u32 r2 = ctx->r[2]; // r2 <= 0ffffffc 287 const u32 r3 = ctx->r[3]; // r3 <= 0ffffffc 288 const u32 rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits... 
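// Reduction modulo 2^130-5 works by folding overflow back multiplied by 5,
// since 2^130 == 5 (mod 2^130-5). For r1..r3 the two low bits are cleared by
// clamping, so (ri >> 2) + ri is exactly (ri >> 2) * 5; r0 keeps its low two
// bits, hence the "lose 2 bits" above and the "recover 2 bits" below.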
289 const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5 290 const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5 291 const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5 292 293 // (h + c) * r, without carry propagation 294 const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 + s4*rr0;//<=97ffffe007fffff8 295 const u64 x1 = s0*r1 + s1*r0 + s2*rr3 + s3*rr2 + s4*rr1;//<=8fffffe20ffffff6 296 const u64 x2 = s0*r2 + s1*r1 + s2*r0 + s3*rr3 + s4*rr2;//<=87ffffe417fffff4 297 const u64 x3 = s0*r3 + s1*r2 + s2*r1 + s3*r0 + s4*rr3;//<=7fffffe61ffffff2 298 const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits //<=0000000000000018 299 300 // partial reduction modulo 2^130 - 5 301 const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffffe 302 const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff); 303 const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32); 304 const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32); 305 const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32); 306 const u64 u4 = (u3 >> 32) + (u5 & 3); 307 308 // Update the hash 309 ctx->h[0] = u0 & 0xffffffff; // u0 <= 1_9ffffffa 310 ctx->h[1] = u1 & 0xffffffff; // u1 <= 1_97ffffe0 311 ctx->h[2] = u2 & 0xffffffff; // u2 <= 1_8fffffe2 312 ctx->h[3] = u3 & 0xffffffff; // u3 <= 1_87ffffe4 313 ctx->h[4] = u4; // u4 <= 4 314 } 315 316 // (re-)initializes the input counter and input buffer 317 static void poly_clear_c(crypto_poly1305_ctx *ctx) 318 { 319 FOR (i, 0, 4) { 320 ctx->c[i] = 0; 321 } 322 ctx->c_idx = 0; 323 } 324 325 static void poly_end_block(crypto_poly1305_ctx *ctx) 326 { 327 if (ctx->c_idx == 16) { 328 poly_block(ctx); 329 poly_clear_c(ctx); 330 } 331 } 332 333 static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input) 334 { 335 size_t word = ctx->c_idx / 4; 336 size_t byte = ctx->c_idx % 4; 337 ctx->c[word] |= (u32)input << (byte * 8); 338 ctx->c_idx++; 339 } 340 341 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) 342 { 343 // Initial hash is zero 344 FOR (i, 0, 5) { 345 ctx->h [i] = 0; 346 } 347 // add 2^130 to every input block 348 ctx->c [4] = 1; 349 poly_clear_c(ctx); 350 // load r and pad (r has some of its bits cleared) 351 FOR (i, 0, 1) { ctx->r [0] = load32_le(key ) & 0x0fffffff; } 352 FOR (i, 1, 4) { ctx->r [i] = load32_le(key + i*4 ) & 0x0ffffffc; } 353 FOR (i, 0, 4) { ctx->pad[i] = load32_le(key + i*4 + 16); } 354 } 355 356 void crypto_poly1305_update(crypto_poly1305_ctx *ctx, 357 const u8 *message, size_t message_size) 358 { 359 // Align ourselves with 4 byte words 360 while (ctx->c_idx % 4 != 0 && message_size > 0) { 361 poly_take_input(ctx, *message); 362 message++; 363 message_size--; 364 } 365 366 // Process the input 4 bytes at a time 367 size_t nb_words = message_size / 4; 368 size_t remainder = message_size % 4; 369 FOR (i, 0, nb_words) { 370 poly_end_block(ctx); 371 ctx->c[ctx->c_idx / 4] = load32_le(message); 372 message += 4; 373 ctx->c_idx += 4; 374 } 375 376 // Input the remaining bytes 377 if (remainder != 0) { 378 poly_end_block(ctx); 379 } 380 FOR (i, 0, remainder) { 381 poly_take_input(ctx, message[i]); 382 } 383 } 384 385 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16]) 386 { 387 // Process the last block (if any) 388 if (ctx->c_idx != 0) { 389 // move the final 1 according to remaining input length 390 // (We may add less than 2^130 to the last input block) 391 ctx->c[4] = 0; 392 poly_take_input(ctx, 1); 393 // one last hash update 394 poly_block(ctx); 395 } 396 397 // check if we should subtract 2^130-5
by performing the 398 // corresponding carry propagation. 399 u64 u = 5; 400 u += ctx->h[0]; u >>= 32; 401 u += ctx->h[1]; u >>= 32; 402 u += ctx->h[2]; u >>= 32; 403 u += ctx->h[3]; u >>= 32; 404 u += ctx->h[4]; u >>= 2; 405 // now u indicates how many times we should subtract 2^130-5 (0 or 1) 406 407 // store h + pad, minus 2^130-5 if u tells us to. 408 u *= 5; 409 u += (i64)(ctx->h[0]) + ctx->pad[0]; store32_le(mac , u); u >>= 32; 410 u += (i64)(ctx->h[1]) + ctx->pad[1]; store32_le(mac + 4, u); u >>= 32; 411 u += (i64)(ctx->h[2]) + ctx->pad[2]; store32_le(mac + 8, u); u >>= 32; 412 u += (i64)(ctx->h[3]) + ctx->pad[3]; store32_le(mac + 12, u); 413 } 414 415 void crypto_poly1305_auth(u8 mac[16], const u8 *message, 416 size_t message_size, const u8 key[32]) 417 { 418 crypto_poly1305_ctx ctx; 419 crypto_poly1305_init (&ctx, key); 420 crypto_poly1305_update(&ctx, message, message_size); 421 crypto_poly1305_final (&ctx, mac); 422 } 423 424 //////////////// 425 /// Blake2 b /// 426 //////////////// 427 static const u64 iv[8] = { 428 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 429 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 430 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 431 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, 432 }; 433 434 // increment the input offset 435 static void blake2b_incr(crypto_blake2b_ctx *ctx) 436 { 437 u64 *x = ctx->input_offset; 438 size_t y = ctx->input_idx; 439 x[0] += y; 440 if (x[0] < y) { 441 x[1]++; 442 } 443 } 444 445 static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input) 446 { 447 size_t word = ctx->input_idx / 8; 448 size_t byte = ctx->input_idx % 8; 449 ctx->input[word] |= (u64)input << (byte * 8); 450 ctx->input_idx++; 451 } 452 453 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) 454 { 455 static const u8 sigma[12][16] = { 456 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 457 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 458 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, 459 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, 460 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, 461 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, 462 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, 463 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, 464 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, 465 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, 466 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 467 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 468 }; 469 470 // init work vector 471 u64 v[16]; 472 FOR (i, 0, 8) { 473 v[i ] = ctx->hash[i]; 474 v[i+8] = iv[i]; 475 } 476 v[12] ^= ctx->input_offset[0]; 477 v[13] ^= ctx->input_offset[1]; 478 if (is_last_block) { 479 v[14] = ~v[14]; 480 } 481 482 // mangle work vector 483 uint64_t *input = ctx->input; 484 FOR (i, 0, 12) { 485 #define BLAKE2_G(v, a, b, c, d, x, y) \ 486 v[a] += v[b] + x; v[d] = rotr64(v[d] ^ v[a], 32); \ 487 v[c] += v[d]; v[b] = rotr64(v[b] ^ v[c], 24); \ 488 v[a] += v[b] + y; v[d] = rotr64(v[d] ^ v[a], 16); \ 489 v[c] += v[d]; v[b] = rotr64(v[b] ^ v[c], 63); \ 490 491 BLAKE2_G(v, 0, 4, 8, 12, input[sigma[i][ 0]], input[sigma[i][ 1]]); 492 BLAKE2_G(v, 1, 5, 9, 13, input[sigma[i][ 2]], input[sigma[i][ 3]]); 493 BLAKE2_G(v, 2, 6, 10, 14, input[sigma[i][ 4]], input[sigma[i][ 5]]); 494 BLAKE2_G(v, 3, 7, 11, 15, input[sigma[i][ 6]], input[sigma[i][ 7]]); 495 BLAKE2_G(v, 0, 5, 10, 15, input[sigma[i][ 8]], input[sigma[i][ 9]]); 496 BLAKE2_G(v, 1, 6, 11, 12, input[sigma[i][10]], 
input[sigma[i][11]]); 497 BLAKE2_G(v, 2, 7, 8, 13, input[sigma[i][12]], input[sigma[i][13]]); 498 BLAKE2_G(v, 3, 4, 9, 14, input[sigma[i][14]], input[sigma[i][15]]); 499 } 500 // update hash 501 FOR (i, 0, 8) { 502 ctx->hash[i] ^= v[i] ^ v[i+8]; 503 } 504 } 505 506 static void blake2b_reset_input(crypto_blake2b_ctx *ctx) 507 { 508 FOR(i, 0, 16) { 509 ctx->input[i] = 0; 510 } 511 ctx->input_idx = 0; 512 } 513 514 static void blake2b_end_block(crypto_blake2b_ctx *ctx) 515 { 516 if (ctx->input_idx == 128) { // If buffer is full, 517 blake2b_incr(ctx); // update the input offset 518 blake2b_compress(ctx, 0); // and compress the (not last) block 519 blake2b_reset_input(ctx); 520 } 521 } 522 523 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size, 524 const u8 *key, size_t key_size) 525 { 526 // initial hash 527 FOR (i, 0, 8) { 528 ctx->hash[i] = iv[i]; 529 } 530 ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size; 531 532 ctx->input_offset[0] = 0; // beginning of the input, no offset 533 ctx->input_offset[1] = 0; // beginning of the input, no offset 534 ctx->input_idx = 0; // buffer is empty 535 ctx->hash_size = hash_size; // remember the hash size we want 536 blake2b_reset_input(ctx); // clear the input buffer 537 538 // if there is a key, the first block is that key 539 if (key_size > 0) { 540 crypto_blake2b_update(ctx, key, key_size); 541 ctx->input_idx = 128; 542 } 543 } 544 545 void crypto_blake2b_init(crypto_blake2b_ctx *ctx) 546 { 547 crypto_blake2b_general_init(ctx, 64, 0, 0); 548 } 549 550 void crypto_blake2b_update(crypto_blake2b_ctx *ctx, 551 const u8 *message, size_t message_size) 552 { 553 // Align ourselves with 8 byte words 554 while (ctx->input_idx % 8 != 0 && message_size > 0) { 555 blake2b_set_input(ctx, *message); 556 message++; 557 message_size--; 558 } 559 560 // Process the input 8 bytes at a time 561 size_t nb_words = message_size / 8; 562 size_t remainder = message_size % 8; 563 FOR (i, 0, nb_words) { 564 blake2b_end_block(ctx); 565 ctx->input[ctx->input_idx / 8] = load64_le(message); 566 message += 8; 567 ctx->input_idx += 8; 568 } 569 570 // Load the remainder 571 if (remainder != 0) { 572 blake2b_end_block(ctx); 573 } 574 FOR (i, 0, remainder) { 575 blake2b_set_input(ctx, message[i]); 576 } 577 } 578 579 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash) 580 { 581 blake2b_incr(ctx); // update the input offset 582 blake2b_compress(ctx, 1); // compress the last block 583 size_t nb_words = ctx->hash_size / 8; 584 FOR (i, 0, nb_words) { 585 store64_le(hash + i*8, ctx->hash[i]); 586 } 587 FOR (i, nb_words * 8, ctx->hash_size) { 588 hash[i] = (ctx->hash[i / 8] >> (8 * (i % 8))) & 0xff; 589 } 590 } 591 592 void crypto_blake2b_general(u8 *hash , size_t hash_size, 593 const u8 *key , size_t key_size, 594 const u8 *message, size_t message_size) 595 { 596 crypto_blake2b_ctx ctx; 597 crypto_blake2b_general_init(&ctx, hash_size, key, key_size); 598 crypto_blake2b_update(&ctx, message, message_size); 599 crypto_blake2b_final(&ctx, hash); 600 } 601 602 void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size) 603 { 604 crypto_blake2b_general(hash, 64, 0, 0, message, message_size); 605 } 606 607 608 //////////////// 609 /// Argon2 i /// 610 //////////////// 611 // references to R, Z, Q etc. come from the spec 612 613 // Argon2 operates on 1024 byte blocks. 614 typedef struct { u64 a[128]; } block; 615 616 static u32 min(u32 a, u32 b) { return a <= b ? a : b; } 617 618 // updates a blake2 hash with a 32 bit word, little endian.
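// (Argon2's initial hash absorbs every parameter, and every input length,
// as such a word.)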
619 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input) 620 { 621 u8 buf[4]; 622 store32_le(buf, input); 623 crypto_blake2b_update(ctx, buf, 4); 624 } 625 626 static void load_block(block *b, const u8 bytes[1024]) 627 { 628 FOR (i, 0, 128) { 629 b->a[i] = load64_le(bytes + i*8); 630 } 631 } 632 633 static void store_block(u8 bytes[1024], const block *b) 634 { 635 FOR (i, 0, 128) { 636 store64_le(bytes + i*8, b->a[i]); 637 } 638 } 639 640 static void copy_block(block *o,const block*in){FOR(i,0,128) o->a[i] = in->a[i];} 641 static void xor_block(block *o,const block*in){FOR(i,0,128) o->a[i]^= in->a[i];} 642 643 // Hash with a virtually unlimited digest size. 644 // Doesn't extract more entropy than the base hash function. 645 // Mainly used for filling a whole kilobyte block with pseudo-random bytes. 646 // (One could use a stream cipher with a seed hash as the key, but 647 // this would introduce another dependency and point of failure.) 648 static void extended_hash(u8 *digest, u32 digest_size, 649 const u8 *input , u32 input_size) 650 { 651 crypto_blake2b_ctx ctx; 652 crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0); 653 blake_update_32 (&ctx, digest_size); 654 crypto_blake2b_update (&ctx, input, input_size); 655 crypto_blake2b_final (&ctx, digest); 656 657 if (digest_size > 64) { 658 // the conversion to u64 avoids integer overflow on 659 // ludicrously big hash sizes. 660 u32 r = (((u64)digest_size + 31) / 32) - 2; 661 u32 i = 1; 662 u32 in = 0; 663 u32 out = 32; 664 while (i < r) { 665 // Input and output overlap. This is intentional 666 crypto_blake2b(digest + out, digest + in, 64); 667 i += 1; 668 in += 32; 669 out += 32; 670 } 671 crypto_blake2b_general(digest + out, digest_size - (32 * r), 672 0, 0, // no key 673 digest + in , 64); 674 } 675 } 676 677 #define LSB(x) ((x) & 0xffffffff) 678 #define G(a, b, c, d) \ 679 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ 680 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ 681 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ 682 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) 683 #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ 684 v8, v9, v10, v11, v12, v13, v14, v15) \ 685 G(v0, v4, v8, v12); G(v1, v5, v9, v13); \ 686 G(v2, v6, v10, v14); G(v3, v7, v11, v15); \ 687 G(v0, v5, v10, v15); G(v1, v6, v11, v12); \ 688 G(v2, v7, v8, v13); G(v3, v4, v9, v14) 689 690 // Core of the compression function G. Computes Z from R in place.
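// Each ROUND below is a Blake2b round whose additions are replaced by
// a + b + 2 * LSB(a) * LSB(b). It is applied to eight groups of 16
// consecutive words, then to eight interleaved groups, spreading changes
// across the whole 1024 byte block.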
691 static void g_rounds(block *work_block) 692 { 693 // column rounds (work_block = Q) 694 for (int i = 0; i < 128; i += 16) { 695 ROUND(work_block->a[i ], work_block->a[i + 1], 696 work_block->a[i + 2], work_block->a[i + 3], 697 work_block->a[i + 4], work_block->a[i + 5], 698 work_block->a[i + 6], work_block->a[i + 7], 699 work_block->a[i + 8], work_block->a[i + 9], 700 work_block->a[i + 10], work_block->a[i + 11], 701 work_block->a[i + 12], work_block->a[i + 13], 702 work_block->a[i + 14], work_block->a[i + 15]); 703 } 704 // row rounds (work_block = Z) 705 for (int i = 0; i < 16; i += 2) { 706 ROUND(work_block->a[i ], work_block->a[i + 1], 707 work_block->a[i + 16], work_block->a[i + 17], 708 work_block->a[i + 32], work_block->a[i + 33], 709 work_block->a[i + 48], work_block->a[i + 49], 710 work_block->a[i + 64], work_block->a[i + 65], 711 work_block->a[i + 80], work_block->a[i + 81], 712 work_block->a[i + 96], work_block->a[i + 97], 713 work_block->a[i + 112], work_block->a[i + 113]); 714 } 715 } 716 717 // The compression function G (copy version for the first pass) 718 static void g_copy(block *result, const block *x, const block *y) 719 { 720 block tmp; 721 copy_block(&tmp , x ); // tmp = X 722 xor_block (&tmp , y ); // tmp = X ^ Y = R 723 copy_block(result, &tmp); // result = R (only difference with g_xor) 724 g_rounds (&tmp); // tmp = Z 725 xor_block (result, &tmp); // result = R ^ Z 726 } 727 728 // The compression function G (xor version for subsequent passes) 729 static void g_xor(block *result, const block *x, const block *y) 730 { 731 block tmp; 732 copy_block(&tmp , x ); // tmp = X 733 xor_block (&tmp , y ); // tmp = X ^ Y = R 734 xor_block (result, &tmp); // result = R ^ old (only difference with g_copy) 735 g_rounds (&tmp); // tmp = Z 736 xor_block (result, &tmp); // result = R ^ old ^ Z 737 } 738 739 // unary version of the compression function. 740 // The missing argument is implied zero. 741 // Does the transformation in place. 742 static void unary_g(block *work_block) 743 { 744 // work_block == R 745 block tmp; 746 copy_block(&tmp, work_block); // tmp = R 747 g_rounds(work_block); // work_block = Z 748 xor_block(work_block, &tmp); // work_block = Z ^ R 749 } 750 751 // Argon2i uses a kind of stream cipher to determine which reference 752 // block it will take to synthesise the next block. This context holds 753 // that stream's state. (It's very similar to Chacha20. The block b 754 // is analogous to Chacha's own pool) 755 typedef struct { 756 block b; 757 u32 pass_number; 758 u32 slice_number; 759 u32 nb_blocks; 760 u32 nb_iterations; 761 u32 ctr; 762 u32 offset; 763 } gidx_ctx; 764 765 // The block in the context will determine array indices. To avoid 766 // timing attacks, it only depends on public information. No looking 767 // at a previous block to seed the next. This makes offline attacks 768 // easier, but timing attacks are the bigger threat in many settings. 769 static void gidx_refresh(gidx_ctx *ctx) 770 { 771 // seed the beginning of the block... 772 ctx->b.a[0] = ctx->pass_number; 773 ctx->b.a[1] = 0; // lane number (we have only one) 774 ctx->b.a[2] = ctx->slice_number; 775 ctx->b.a[3] = ctx->nb_blocks; 776 ctx->b.a[4] = ctx->nb_iterations; 777 ctx->b.a[5] = 1; // type: Argon2i 778 ctx->b.a[6] = ctx->ctr; 779 FOR (i, 7, 128) { ctx->b.a[i] = 0; } // ...then zero the rest out 780 781 // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero) 782 // (G "square" function), to get cheap pseudo-random numbers.
783 unary_g(&(ctx->b)); 784 unary_g(&(ctx->b)); 785 } 786 787 static void gidx_init(gidx_ctx *ctx, 788 u32 pass_number, u32 slice_number, 789 u32 nb_blocks, u32 nb_iterations) 790 { 791 ctx->pass_number = pass_number; 792 ctx->slice_number = slice_number; 793 ctx->nb_blocks = nb_blocks; 794 ctx->nb_iterations = nb_iterations; 795 ctx->ctr = 0; 796 797 // Offset from the beginning of the segment. For the first slice 798 // of the first pass, we start at the *third* block, so the offset 799 // starts at 2, not 0. 800 if (pass_number != 0 || slice_number != 0) { 801 ctx->offset = 0; 802 } else { 803 ctx->offset = 2; 804 ctx->ctr++; // Compensates for missed lazy creation 805 gidx_refresh(ctx); // at the start of gidx_next() 806 } 807 } 808 809 static u32 gidx_next(gidx_ctx *ctx) 810 { 811 // lazily creates the offset block we need 812 if (ctx->offset % 128 == 0) { 813 ctx->ctr++; 814 gidx_refresh(ctx); 815 } 816 u32 index = ctx->offset % 128; // save index for current call 817 u32 offset = ctx->offset; // save offset for current call 818 ctx->offset++; // update offset for next call 819 820 // Computes the area size. 821 // Pass 0 : all already finished segments plus already constructed 822 // blocks in this segment 823 // Pass 1+: 3 last segments plus already constructed 824 // blocks in this segment. THE SPEC SUGGESTS OTHERWISE. 825 // I CONFORM TO THE REFERENCE IMPLEMENTATION. 826 int first_pass = ctx->pass_number == 0; 827 u32 slice_size = ctx->nb_blocks / 4; 828 u32 nb_segments = first_pass ? ctx->slice_number : 3; 829 u32 area_size = nb_segments * slice_size + offset - 1; 830 831 // Computes the starting position of the reference area. 832 // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE 833 // NEXT SEGMENT, NOT THE NEXT BLOCK. 834 u32 next_slice = ((ctx->slice_number + 1) % 4) * slice_size; 835 u32 start_pos = first_pass ?
0 : next_slice; 836 837 // Generate offset from J1 (no need for J2, there's only one lane) 838 u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number 839 u64 x = (j1 * j1) >> 32; 840 u64 y = (area_size * x) >> 32; 841 u64 z = (area_size - 1) - y; 842 return (start_pos + z) % ctx->nb_blocks; 843 } 844 845 // Main algorithm 846 void crypto_argon2i(u8 *hash, u32 hash_size, 847 void *work_area, u32 nb_blocks, u32 nb_iterations, 848 const u8 *password, u32 password_size, 849 const u8 *salt, u32 salt_size, 850 const u8 *key, u32 key_size, 851 const u8 *ad, u32 ad_size) 852 { 853 // work area seen as blocks (must be suitably aligned) 854 block *blocks = (block*)work_area; 855 { 856 crypto_blake2b_ctx ctx; 857 crypto_blake2b_init(&ctx); 858 859 blake_update_32 (&ctx, 1 ); // p: number of threads 860 blake_update_32 (&ctx, hash_size ); 861 blake_update_32 (&ctx, nb_blocks ); 862 blake_update_32 (&ctx, nb_iterations); 863 blake_update_32 (&ctx, 0x13 ); // v: version number 864 blake_update_32 (&ctx, 1 ); // y: Argon2i 865 blake_update_32 (&ctx, password_size); 866 crypto_blake2b_update(&ctx, password, password_size); 867 blake_update_32 (&ctx, salt_size); 868 crypto_blake2b_update(&ctx, salt, salt_size); 869 blake_update_32 (&ctx, key_size); 870 crypto_blake2b_update(&ctx, key, key_size); 871 blake_update_32 (&ctx, ad_size); 872 crypto_blake2b_update(&ctx, ad, ad_size); 873 874 u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes 875 crypto_blake2b_final(&ctx, initial_hash); 876 877 // fill first 2 blocks 878 block tmp_block; 879 u8 hash_area[1024]; 880 store32_le(initial_hash + 64, 0); // first additional word 881 store32_le(initial_hash + 68, 0); // second additional word 882 extended_hash(hash_area, 1024, initial_hash, 72); 883 load_block(&tmp_block, hash_area); 884 copy_block(blocks, &tmp_block); 885 886 store32_le(initial_hash + 64, 1); // slight modification 887 extended_hash(hash_area, 1024, initial_hash, 72); 888 load_block(&tmp_block, hash_area); 889 copy_block(blocks + 1, &tmp_block); 890 } 891 892 // Actual number of blocks 893 nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread) 894 const u32 segment_size = nb_blocks / 4; 895 896 // fill (then re-fill) the rest of the blocks 897 FOR (pass_number, 0, nb_iterations) { 898 int first_pass = pass_number == 0; 899 900 FOR (segment, 0, 4) { 901 gidx_ctx ctx; 902 gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations); 903 904 // On the first segment of the first pass, 905 // blocks 0 and 1 are already filled. 906 // We use the offset to skip them. 907 u32 start_offset = first_pass && segment == 0 ? 2 : 0; 908 u32 segment_start = segment * segment_size + start_offset; 909 u32 segment_end = (segment + 1) * segment_size; 910 FOR (current_block, segment_start, segment_end) { 911 u32 reference_block = gidx_next(&ctx); 912 u32 previous_block = current_block == 0 913 ? nb_blocks - 1 914 : current_block - 1; 915 block *c = blocks + current_block; 916 block *p = blocks + previous_block; 917 block *r = blocks + reference_block; 918 if (first_pass) { g_copy(c, p, r); } 919 else { g_xor (c, p, r); } 920 } 921 } 922 } 923 // hash the very last block with H' into the output hash 924 u8 final_block[1024]; 925 store_block(final_block, blocks + (nb_blocks - 1)); 926 extended_hash(hash, hash_size, final_block, 1024); 927 } 928 929 //////////////////////////////////// 930 /// Arithmetic modulo 2^255 - 19 /// 931 //////////////////////////////////// 932 // Taken from Supercop's ref10 implementation. 
933 // A bit bigger than TweetNaCl, over 4 times faster. 934 935 // field element 936 typedef i32 fe[10]; 937 938 static void fe_0 (fe h) { FOR(i,0,10) h[i] = 0; } 939 static void fe_1 (fe h) { h[0] = 1; FOR(i,1,10) h[i] = 0; } 940 static void fe_neg (fe h,const fe f) {FOR(i,0,10) h[i] = -f[i]; } 941 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];} 942 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];} 943 static void fe_copy(fe h,const fe f) {FOR(i,0,10) h[i] = f[i]; } 944 945 static void fe_cswap(fe f, fe g, int b) 946 { 947 FOR (i, 0, 10) { 948 i32 x = (f[i] ^ g[i]) & -b; 949 f[i] = f[i] ^ x; 950 g[i] = g[i] ^ x; 951 } 952 } 953 954 static void fe_carry(fe h, i64 t[10]) 955 { 956 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; 957 c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= c9 * (1 << 25); 958 c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1; t[1] -= c1 * (1 << 25); 959 c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3; t[3] -= c3 * (1 << 25); 960 c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5; t[5] -= c5 * (1 << 25); 961 c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7; t[7] -= c7 * (1 << 25); 962 c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0; t[0] -= c0 * (1 << 26); 963 c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2; t[2] -= c2 * (1 << 26); 964 c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4; t[4] -= c4 * (1 << 26); 965 c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6; t[6] -= c6 * (1 << 26); 966 c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8; t[8] -= c8 * (1 << 26); 967 FOR (i, 0, 10) { h[i] = t[i]; } 968 } 969 970 static void fe_frombytes(fe h, const u8 s[32]) 971 { 972 i64 t[10]; // intermediate result (may overflow 32 bits) 973 t[0] = load32_le(s); 974 t[1] = load24_le(s + 4) << 6; 975 t[2] = load24_le(s + 7) << 5; 976 t[3] = load24_le(s + 10) << 3; 977 t[4] = load24_le(s + 13) << 2; 978 t[5] = load32_le(s + 16); 979 t[6] = load24_le(s + 20) << 7; 980 t[7] = load24_le(s + 23) << 5; 981 t[8] = load24_le(s + 26) << 4; 982 t[9] = (load24_le(s + 29) & 8388607) << 2; 983 fe_carry(h, t); 984 } 985 986 static void fe_mul_small(fe h, const fe f, i32 g) 987 { 988 i64 t[10]; 989 FOR(i, 0, 10) { 990 t[i] = f[i] * (i64) g; 991 } 992 fe_carry(h, t); 993 } 994 static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); } 995 static void fe_mul973324(fe h, const fe f) { fe_mul_small(h, f, 973324); } 996 997 static void fe_mul(fe h, const fe f, const fe g) 998 { 999 // Everything is unrolled and put in temporary variables. 1000 // We could roll the loop, but that would make curve25519 twice as slow. 
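// A field element is ten limbs in radix 2^25.5: even limbs hold 26 bits,
// odd limbs 25, so f = f0 + f1*2^26 + f2*2^51 + ... + f9*2^230.
// The *19 factors below fold the high product limbs back down, because
// 2^255 == 19 (mod 2^255-19); the *2 factors on odd limbs account for the
// extra bit that appears when two odd-position limbs are multiplied.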
1001 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; 1002 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; 1003 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4]; 1004 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9]; 1005 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2; 1006 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19; 1007 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19; 1008 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19; 1009 1010 i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6 1011 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1; 1012 i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7 1013 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2; 1014 i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8 1015 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3; 1016 i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9 1017 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4; 1018 i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0 1019 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5; 1020 i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1 1021 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6; 1022 i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2 1023 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7; 1024 i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3 1025 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8; 1026 i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4 1027 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9; 1028 i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 1029 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0; 1030 1031 #define CARRY \ 1032 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \ 1033 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \ 1034 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \ 1035 c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 * (1 << 25); \ 1036 c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 * (1 << 25); \ 1037 c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 * (1 << 26); \ 1038 c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 * (1 << 26); \ 1039 c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 * (1 << 25); \ 1040 c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 * (1 << 25); \ 1041 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \ 1042 c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 * (1 << 26); \ 1043 c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \ 1044 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \ 1045 h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; \ 1046 h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; \ 1047 1048 CARRY; 1049 } 1050 1051 // we could use fe_mul() for this, but this is significantly faster 1052 static void fe_sq(fe h, const fe f) 1053 { 1054 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; 1055 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; 1056 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; 1057 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; 
i32 f7_2 = f7*2; 1058 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; 1059 i32 f8_19 = f8*19; i32 f9_38 = f9*38; 1060 1061 i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 1062 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; 1063 i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 1064 + f4 *(i64)f7_38 + f5_2*(i64)f6_19; 1065 i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 1066 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; 1067 i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 1068 + f5_2*(i64)f8_19 + f6 *(i64)f7_38; 1069 i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 1070 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; 1071 i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 1072 + f6 *(i64)f9_38 + f7_2*(i64)f8_19; 1073 i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 1074 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; 1075 i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 1076 + f3_2*(i64)f4 + f8 *(i64)f9_38; 1077 i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 1078 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; 1079 i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 1080 + f3_2*(i64)f6 + f4 *(i64)f5_2; 1081 1082 CARRY; 1083 } 1084 1085 // This could be simplified, but it would be slower 1086 static void fe_invert(fe out, const fe z) 1087 { 1088 fe t0, t1, t2, t3; 1089 fe_sq(t0, z ); 1090 fe_sq(t1, t0); 1091 fe_sq(t1, t1); 1092 fe_mul(t1, z, t1); 1093 fe_mul(t0, t0, t1); 1094 fe_sq(t2, t0); fe_mul(t1 , t1, t2); 1095 fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1096 fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); 1097 fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); 1098 fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1099 fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); 1100 fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); 1101 fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); 1102 fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); 1103 } 1104 1105 // This could be simplified, but it would be slower 1106 void fe_pow22523(fe out, const fe z) 1107 { 1108 fe t0, t1, t2; 1109 fe_sq(t0, z); 1110 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); 1111 fe_mul(t0, t0, t1); 1112 fe_sq(t0, t0); fe_mul(t0, t1, t0); 1113 fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1114 fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); 1115 fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); 1116 fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1117 fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); 1118 fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); 1119 fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); 1120 fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); 1121 } 1122 1123 static void fe_tobytes(u8 s[32], const fe h) 1124 { 1125 i32 t[10]; 1126 FOR (i, 0, 10) { 1127 t[i] = h[i]; 1128 } 1129 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25; 1130 FOR (i, 0, 5) { 1131 q += t[2*i ]; q >>= 26; 1132 q += t[2*i+1]; q >>= 25; 1133 } 1134 t[0] += 19 * q; 1135 1136 i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= c0 * (1 << 26); 1137 i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= c1 * (1 << 25); 1138 i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= c2 * (1 << 26); 1139 i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= c3 * (1 << 25); 1140 i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= c4 * (1 << 26); 1141 i32 c5 = t[5] >> 25; t[6] 
+= c5; t[5] -= c5 * (1 << 25); 1142 i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= c6 * (1 << 26); 1143 i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= c7 * (1 << 25); 1144 i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= c8 * (1 << 26); 1145 i32 c9 = t[9] >> 25; t[9] -= c9 * (1 << 25); 1146 1147 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26)); 1148 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19)); 1149 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13)); 1150 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6)); 1151 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25)); 1152 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19)); 1153 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12)); 1154 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6)); 1155 } 1156 1157 // Parity check. Returns 0 if even, 1 if odd 1158 static int fe_isnegative(const fe f) 1159 { 1160 u8 s[32]; 1161 fe_tobytes(s, f); 1162 return s[0] & 1; 1163 } 1164 1165 static int fe_isnonzero(const fe f) 1166 { 1167 u8 s[32]; 1168 fe_tobytes(s, f); 1169 return crypto_zerocmp(s, 32); 1170 } 1171 1172 /////////////// 1173 /// X-25519 /// Taken from Supercop's ref10 implementation. 1174 /////////////// 1175 1176 static void trim_scalar(u8 s[32]) 1177 { 1178 s[ 0] &= 248; 1179 s[31] &= 127; 1180 s[31] |= 64; 1181 } 1182 1183 static void x25519_ladder(const fe x1, fe x2, fe z2, fe x3, fe z3, 1184 const u8 scalar[32]) 1185 { 1186 // Montgomery ladder 1187 // In projective coordinates, to avoid divisions: x = X / Z 1188 // We don't care about the y coordinate, it's only 1 bit of information 1189 fe_1(x2); fe_0(z2); // "zero" point 1190 fe_copy(x3, x1); fe_1(z3); // "one" point 1191 int swap = 0; 1192 for (int pos = 254; pos >= 0; --pos) { 1193 // constant time conditional swap before ladder step 1194 int b = (scalar[pos / 8] >> (pos & 7)) & 1; 1195 swap ^= b; // xor trick avoids swapping at the end of the loop 1196 fe_cswap(x2, x3, swap); 1197 fe_cswap(z2, z3, swap); 1198 swap = b; // anticipates one last swap after the loop 1199 1200 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) 1201 // with differential addition 1202 fe t0, t1; 1203 fe_sub(t0, x3, z3); fe_sub(t1, x2, z2); fe_add(x2, x2, z2); 1204 fe_add(z2, x3, z3); fe_mul(z3, t0, x2); fe_mul(z2, z2, t1); 1205 fe_sq (t0, t1 ); fe_sq (t1, x2 ); fe_add(x3, z3, z2); 1206 fe_sub(z2, z3, z2); fe_mul(x2, t1, t0); fe_sub(t1, t1, t0); 1207 fe_sq (z2, z2 ); fe_mul121666(z3, t1); fe_sq (x3, x3 ); 1208 fe_add(t0, t0, z3); fe_mul(z3, x1, z2); fe_mul(z2, t1, t0); 1209 } 1210 // last swap is necessary to compensate for the xor trick 1211 // Note: after this swap, P3 == P2 + P1.
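// (ge_scalarmult() below relies on that: from P1, P2, and P2 + P1 it can
// recover the y coordinate that the ladder drops.)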
1212 fe_cswap(x2, x3, swap); 1213 fe_cswap(z2, z3, swap); 1214 } 1215 1216 int crypto_x25519(u8 shared_secret [32], 1217 const u8 your_secret_key [32], 1218 const u8 their_public_key[32]) 1219 { 1220 // computes the scalar product 1221 fe x1; 1222 fe_frombytes(x1, their_public_key); 1223 1224 // restrict the possible scalar values 1225 u8 e[32]; 1226 FOR (i, 0, 32) { 1227 e[i] = your_secret_key[i]; 1228 } 1229 trim_scalar(e); 1230 1231 // computes the actual scalar product (the result is in x2 and z2) 1232 fe x2, z2, x3, z3; 1233 x25519_ladder(x1, x2, z2, x3, z3, e); 1234 1235 // normalises the coordinates: x == X / Z 1236 fe_invert(z2, z2); 1237 fe_mul(x2, x2, z2); 1238 fe_tobytes(shared_secret, x2); 1239 1240 // Returns -1 if the input is all zero 1241 // (happens with some malicious public keys) 1242 return -1 - crypto_zerocmp(shared_secret, 32); 1243 } 1244 1245 void crypto_x25519_public_key(u8 public_key[32], 1246 const u8 secret_key[32]) 1247 { 1248 static const u8 base_point[32] = {9}; 1249 crypto_x25519(public_key, secret_key, base_point); 1250 } 1251 1252 /////////////// 1253 /// Ed25519 /// 1254 /////////////// 1255 1256 // Point in a twisted Edwards curve, 1257 // in extended projective coordinates. 1258 // x = X/Z, y = Y/Z, T = XY/Z 1259 typedef struct { fe X; fe Y; fe Z; fe T; } ge; 1260 1261 static void ge_from_xy(ge *p, const fe x, const fe y) 1262 { 1263 FOR (i, 0, 10) { 1264 p->X[i] = x[i]; 1265 p->Y[i] = y[i]; 1266 } 1267 fe_1 (p->Z); 1268 fe_mul(p->T, x, y); 1269 } 1270 1271 static void ge_tobytes(u8 s[32], const ge *h) 1272 { 1273 fe recip, x, y; 1274 fe_invert(recip, h->Z); 1275 fe_mul(x, h->X, recip); 1276 fe_mul(y, h->Y, recip); 1277 fe_tobytes(s, y); 1278 s[31] ^= fe_isnegative(x) << 7; 1279 } 1280 1281 // Variable time! s must not be secret! 
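// It is only used to decode public keys during signature verification,
// which are public anyway. Note that it returns the negated point -A;
// crypto_check() below relies on that.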
1282 static int ge_frombytes_neg(ge *h, const u8 s[32]) 1283 { 1284 static const fe d = { 1285 -10913610, 13857413, -15372611, 6949391, 114729, 1286 -8787816, -6275908, -3247719, -18696448, -12055116 1287 } ; 1288 static const fe sqrtm1 = { 1289 -32595792, -7943725, 9377950, 3500415, 12389472, 1290 -272473, -25146209, -2005654, 326686, 11406482 1291 } ; 1292 fe u, v, v3, vxx, check; 1293 fe_frombytes(h->Y, s); 1294 fe_1(h->Z); 1295 fe_sq(u, h->Y); // y^2 1296 fe_mul(v, u, d); 1297 fe_sub(u, u, h->Z); // u = y^2-1 1298 fe_add(v, v, h->Z); // v = dy^2+1 1299 1300 fe_sq(v3, v); 1301 fe_mul(v3, v3, v); // v3 = v^3 1302 fe_sq(h->X, v3); 1303 fe_mul(h->X, h->X, v); 1304 fe_mul(h->X, h->X, u); // x = uv^7 1305 1306 fe_pow22523(h->X, h->X); // x = (uv^7)^((q-5)/8) 1307 fe_mul(h->X, h->X, v3); 1308 fe_mul(h->X, h->X, u); // x = uv^3(uv^7)^((q-5)/8) 1309 1310 fe_sq(vxx, h->X); 1311 fe_mul(vxx, vxx, v); 1312 fe_sub(check, vxx, u); // vx^2-u 1313 if (fe_isnonzero(check)) { 1314 fe_add(check, vxx, u); // vx^2+u 1315 if (fe_isnonzero(check)) return -1; 1316 fe_mul(h->X, h->X, sqrtm1); 1317 } 1318 1319 if (fe_isnegative(h->X) == (s[31] >> 7)) { 1320 fe_neg(h->X, h->X); 1321 } 1322 fe_mul(h->T, h->X, h->Y); 1323 return 0; 1324 } 1325 1326 static void ge_add(ge *s, const ge *p, const ge *q) 1327 { 1328 static const fe D2 = { // - 2 * 121665 / 121666 1329 0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052, 1330 0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67 1331 }; 1332 fe a, b, c, d, e, f, g, h; 1333 // A = (Y1-X1) * (Y2-X2) 1334 // B = (Y1+X1) * (Y2+X2) 1335 fe_sub(a, p->Y, p->X); fe_sub(h, q->Y, q->X); fe_mul(a, a, h); 1336 fe_add(b, p->X, p->Y); fe_add(h, q->X, q->Y); fe_mul(b, b, h); 1337 fe_mul(c, p->T, q->T); fe_mul(c, c, D2 ); // C = T1 * k * T2 1338 fe_add(d, p->Z, p->Z); fe_mul(d, d, q->Z); // D = Z1 * 2 * Z2 1339 fe_sub(e, b, a); // E = B - A 1340 fe_sub(f, d, c); // F = D - C 1341 fe_add(g, d, c); // G = D + C 1342 fe_add(h, b, a); // H = B + A 1343 fe_mul(s->X, e, f); // X3 = E * F 1344 fe_mul(s->Y, g, h); // Y3 = G * H 1345 fe_mul(s->Z, f, g); // Z3 = F * G 1346 fe_mul(s->T, e, h); // T3 = E * H 1347 } 1348 1349 // Performing the scalar multiplication directly in Twisted Edwards 1350 // space would be simpler, but also slower. So we do it in Montgomery 1351 // space instead. The sign of the Y coordinate however gets lost in 1352 // translation, so we use a dirty trick to recover it. 1353 static void ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) 1354 { 1355 // sqrt(-486664) 1356 static const fe K = { 54885894, 25242303, 55597453, 9067496, 51808079, 1357 33312638, 25456129, 14121551, 54921728, 3972023 }; 1358 1359 // convert q to montgomery format 1360 fe x1, y1, z1, x2, z2, x3, z3, t1, t2, t3, t4; 1361 fe_sub(z1, q->Z, q->Y); fe_mul(z1, z1, q->X); fe_invert(z1, z1); 1362 fe_add(t1, q->Z, q->Y); 1363 fe_mul(x1, q->X, t1 ); fe_mul(x1, x1, z1); 1364 fe_mul(y1, q->Z, t1 ); fe_mul(y1, y1, z1); fe_mul(y1, K, y1); 1365 fe_1(z1); // implied in the ladder, needed to convert back. 1366 1367 // montgomery scalarmult 1368 x25519_ladder(x1, x2, z2, x3, z3, scalar); 1369 1370 // Recover the y coordinate (Katsuyuki Okeya & Kouichi Sakurai, 2001) 1371 // Note the shameless reuse of x1: (x1, y1, z1) will correspond to 1372 // what was originally (x2, z2).
1373 fe_mul(t1, x1, z2); // t1 = x1 * z2 1374 fe_add(t2, x2, t1); // t2 = x2 + t1 1375 fe_sub(t3, x2, t1); // t3 = x2 - t1 1376 fe_sq (t3, t3); // t3 = t3^2 1377 fe_mul(t3, t3, x3); // t3 = t3 * x3 1378 fe_mul973324(t1, z2);// t1 = 2a * z2 1379 fe_add(t2, t2, t1); // t2 = t2 + t1 1380 fe_mul(t4, x1, x2); // t4 = x1 * x2 1381 fe_add(t4, t4, z2); // t4 = t4 + z2 1382 fe_mul(t2, t2, t4); // t2 = t2 * t4 1383 fe_mul(t1, t1, z2); // t1 = t1 * z2 1384 fe_sub(t2, t2, t1); // t2 = t2 - t1 1385 fe_mul(t2, t2, z3); // t2 = t2 * z3 1386 fe_add(t1, y1, y1); // t1 = y1 + y1 1387 fe_mul(t1, t1, z2); // t1 = t1 * z2 1388 fe_mul(t1, t1, z3); // t1 = t1 * z3 1389 fe_mul(x1, t1, x2); // x1 = t1 * x2 1390 fe_sub(y1, t2, t3); // y1 = t2 - t3 1391 fe_mul(z1, t1, z2); // z1 = t1 * z2 1392 1393 // convert back to twisted edwards 1394 fe_sub(t1 , x1, z1); 1395 fe_add(t2 , x1, z1); 1396 fe_mul(x1 , K , x1); 1397 fe_mul(p->X, x1, t2); 1398 fe_mul(p->Y, y1, t1); 1399 fe_mul(p->Z, y1, t2); 1400 fe_mul(p->T, x1, t1); 1401 } 1402 1403 static void ge_scalarmult_base(ge *p, const u8 scalar[32]) 1404 { 1405 // Calls the general ge_scalarmult() with the base point. 1406 // Other implementations use a precomputed table, but it 1407 // takes way too much code. 1408 static const fe X = { 1409 0x325d51a, 0x18b5823, 0x0f6592a, 0x104a92d, 0x1a4b31d, 1410 0x1d6dc5c, 0x27118fe, 0x07fd814, 0x13cd6e5, 0x085a4db}; 1411 static const fe Y = { 1412 0x2666658, 0x1999999, 0x0cccccc, 0x1333333, 0x1999999, 1413 0x0666666, 0x3333333, 0x0cccccc, 0x2666666, 0x1999999}; 1414 ge base_point; 1415 ge_from_xy(&base_point, X, Y); 1416 ge_scalarmult(p, &base_point, scalar); 1417 } 1418 1419 static void modL(u8 *r, i64 x[64]) 1420 { 1421 static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 1422 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, 1423 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1424 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; 1425 for (unsigned i = 63; i >= 32; i--) { 1426 i64 carry = 0; 1427 FOR (j, i-32, i-12) { 1428 x[j] += carry - 16 * x[i] * L[j - (i - 32)]; 1429 carry = (x[j] + 128) >> 8; 1430 x[j] -= carry * (1 << 8); 1431 } 1432 x[i-12] += carry; 1433 x[i] = 0; 1434 } 1435 i64 carry = 0; 1436 FOR(i, 0, 32) { 1437 x[i] += carry - (x[31] >> 4) * L[i]; 1438 carry = x[i] >> 8; 1439 x[i] &= 255; 1440 } 1441 FOR(i, 0, 32) { 1442 x[i] -= carry * L[i]; 1443 } 1444 FOR(i, 0, 32) { 1445 x[i+1] += x[i] >> 8; 1446 r[i ] = x[i] & 255; 1447 } 1448 } 1449 1450 static void reduce(u8 r[64]) 1451 { 1452 i64 x[64]; 1453 FOR(i, 0, 64) { 1454 x[i] = (u64) r[i]; 1455 r[i] = 0; 1456 } 1457 modL(r, x); 1458 } 1459 1460 // hashes R || A || M, reduces it modulo L 1461 static void hash_ram(u8 k[64], const u8 R[32], const u8 A[32], 1462 const u8 *M, size_t M_size) 1463 { 1464 HASH_CTX ctx; 1465 HASH_INIT (&ctx); 1466 HASH_UPDATE(&ctx, R , 32 ); 1467 HASH_UPDATE(&ctx, A , 32 ); 1468 HASH_UPDATE(&ctx, M , M_size); 1469 HASH_FINAL (&ctx, k); 1470 reduce(k); 1471 } 1472 1473 void crypto_sign_public_key(u8 public_key[32], 1474 const u8 secret_key[32]) 1475 { 1476 u8 a[64]; 1477 HASH(a, secret_key, 32); 1478 trim_scalar(a); 1479 ge A; 1480 ge_scalarmult_base(&A, a); 1481 ge_tobytes(public_key, &A); 1482 } 1483 1484 void crypto_sign(u8 signature[64], 1485 const u8 secret_key[32], 1486 const u8 public_key[32], 1487 const u8 *message, size_t message_size) 1488 { 1489 u8 a[64], *prefix = a + 32; 1490 HASH(a, secret_key, 32); 1491 trim_scalar(a); 1492 1493 u8 pk_buf[32]; 1494 const u8 *pk = public_key; 1495 if (public_key == 0) { 1496
crypto_sign_public_key(pk_buf, secret_key); 1497 pk = pk_buf; 1498 } 1499 1500 // Constructs the "random" nonce from the secret key and message. 1501 // An actual random number would work just fine, and would save us 1502 // the trouble of hashing the message twice. If we did that 1503 // however, the user could fuck it up and reuse the nonce. 1504 u8 r[64]; 1505 HASH_CTX ctx; 1506 HASH_INIT (&ctx); 1507 HASH_UPDATE(&ctx, prefix , 32 ); 1508 HASH_UPDATE(&ctx, message, message_size); 1509 HASH_FINAL (&ctx, r); 1510 reduce(r); 1511 1512 // first half of the signature = "random" nonce times basepoint 1513 ge R; 1514 ge_scalarmult_base(&R, r); 1515 ge_tobytes(signature, &R); 1516 1517 u8 h_ram[64]; 1518 hash_ram(h_ram, signature, pk, message, message_size); 1519 1520 i64 s[64]; // s = r + h_ram * a 1521 FOR(i, 0, 32) { s[i] = (u64) r[i]; } 1522 FOR(i, 32, 64) { s[i] = 0; } 1523 FOR(i, 0, 32) { 1524 FOR(j, 0, 32) { 1525 s[i+j] += h_ram[i] * (u64) a[j]; 1526 } 1527 } 1528 modL(signature + 32, s); // second half of the signature = s 1529 } 1530 1531 int crypto_check(const u8 signature[64], 1532 const u8 public_key[32], 1533 const u8 *message, size_t message_size) 1534 { 1535 ge A, p, sB, diff; 1536 u8 h_ram[64], R_check[32]; 1537 if (ge_frombytes_neg(&A, public_key)) { // -A 1538 return -1; 1539 } 1540 hash_ram(h_ram, signature, public_key, message, message_size); 1541 ge_scalarmult(&p, &A, h_ram); // p = -A*h_ram 1542 ge_scalarmult_base(&sB, signature + 32); 1543 ge_add(&diff, &p, &sB); // diff = s - A*h_ram 1544 ge_tobytes(R_check, &diff); 1545 return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? OK : fail 1546 } 1547 1548 //////////////////// 1549 /// Key exchange /// 1550 //////////////////// 1551 int crypto_key_exchange(u8 shared_key[32], 1552 const u8 your_secret_key [32], 1553 const u8 their_public_key[32]) 1554 { 1555 static const u8 zero[16] = {0}; 1556 u8 shared_secret[32]; 1557 int status = crypto_x25519(shared_secret, your_secret_key, their_public_key); 1558 crypto_chacha20_H(shared_key, shared_secret, zero); 1559 return status; 1560 } 1561 1562 //////////////////////////////// 1563 /// Authenticated encryption /// 1564 //////////////////////////////// 1565 static void authenticate2(u8 mac[16] , const u8 auth_key[32], 1566 const u8 *t1, size_t size1, 1567 const u8 *t2, size_t size2) 1568 { 1569 crypto_poly1305_ctx a_ctx; 1570 crypto_poly1305_init (&a_ctx, auth_key); 1571 crypto_poly1305_update(&a_ctx, t1, size1); 1572 crypto_poly1305_update(&a_ctx, t2, size2); 1573 crypto_poly1305_final (&a_ctx, mac); 1574 } 1575 1576 void crypto_aead_lock(u8 mac[16], 1577 u8 *cipher_text, 1578 const u8 key[32], 1579 const u8 nonce[24], 1580 const u8 *ad , size_t ad_size, 1581 const u8 *plain_text, size_t text_size) 1582 { // encrypt then mac 1583 u8 auth_key[32]; 1584 crypto_chacha_ctx e_ctx; 1585 crypto_chacha20_x_init (&e_ctx, key, nonce); 1586 crypto_chacha20_stream (&e_ctx, auth_key, 32); 1587 crypto_chacha20_encrypt(&e_ctx, cipher_text, plain_text, text_size); 1588 authenticate2(mac, auth_key, ad, ad_size, cipher_text, text_size); 1589 } 1590 1591 int crypto_aead_unlock(u8 *plain_text, 1592 const u8 key[32], 1593 const u8 nonce[24], 1594 const u8 mac[16], 1595 const u8 *ad , size_t ad_size, 1596 const u8 *cipher_text, size_t text_size) 1597 { 1598 u8 auth_key[32], real_mac[16]; 1599 crypto_chacha_ctx e_ctx; 1600 crypto_chacha20_x_init(&e_ctx, key, nonce); 1601 crypto_chacha20_stream(&e_ctx, auth_key, 32); 1602 authenticate2(real_mac, auth_key, ad, ad_size, cipher_text, text_size); 
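// Encrypt-then-MAC: the tag is checked over ad and cipher_text, in
// constant time, before anything gets decrypted.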
1603 if (crypto_memcmp(real_mac, mac, 16)) { 1604 return -1; // reject forgeries 1605 } 1606 crypto_chacha20_encrypt(&e_ctx, plain_text, cipher_text, text_size); 1607 return 0; 1608 } 1609 1610 void crypto_lock(u8 mac[16], 1611 u8 *cipher_text, 1612 const u8 key[32], 1613 const u8 nonce[24], 1614 const u8 *plain_text, size_t text_size) 1615 { 1616 crypto_aead_lock(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size); 1617 } 1618 1619 int crypto_unlock(u8 *plain_text, 1620 const u8 key[32], 1621 const u8 nonce[24], 1622 const u8 mac[16], 1623 const u8 *cipher_text, size_t text_size) 1624 { 1625 return crypto_aead_unlock(plain_text, key, nonce, mac, 0, 0, 1626 cipher_text, text_size); 1627 }
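////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (not part of the library). Buffer names and the
// my_random_bytes() RNG below are hypothetical placeholders: Monocypher
// never generates randomness itself, the caller must bring an OS RNG.
//
//     u8 sk[32], pk[32], shared_key[32];
//     my_random_bytes(sk, 32);                  // hypothetical OS RNG
//     crypto_x25519_public_key(pk, sk);
//     if (crypto_key_exchange(shared_key, sk, their_pk) != 0) {
//         // their_pk gave an all-zero shared secret (low order point): reject
//     }
//
//     u8 nonce[24], mac[16];
//     my_random_bytes(nonce, 24);               // 24 byte nonces may be random
//     crypto_lock(mac, cipher_text, shared_key, nonce, plain_text, text_size);
//     if (crypto_unlock(plain_text, shared_key, nonce, mac,
//                       cipher_text, text_size)) {
//         // corrupted or forged message: ignore it
//     }
//
//     u8 sign_sk[32], sign_pk[32], sig[64];
//     my_random_bytes(sign_sk, 32);
//     crypto_sign_public_key(sign_pk, sign_sk);
//     crypto_sign(sig, sign_sk, sign_pk, msg, msg_size);
//     if (crypto_check(sig, sign_pk, msg, msg_size)) {
//         // invalid signature: reject
//     }
////////////////////////////////////////////////////////////////////////////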