On Thu, Nov 15, 2012 at 02:52:11PM +0400, Solar Designer wrote:
> Test on FX-8120:
>
> user@bull:~/scrypt/escrypt/escrypt-1$ time ./tests | md5sum
> 4455b1ce0529e7f877de53f24ff78bec -
>
> real 0m3.428s
> user 0m2.856s
> sys 0m0.540s
> user@bull:~/scrypt/escrypt/escrypt-1$ cd ../escrypt-5
> user@bull:~/scrypt/escrypt/escrypt-5$ time ./tests | md5sum
> 4455b1ce0529e7f877de53f24ff78bec -
>
> real 0m2.732s
> user 0m2.184s
> sys 0m0.512s

New speed:

user@bull:~/scrypt/escrypt/escrypt-14$ time ./tests | md5sum
4455b1ce0529e7f877de53f24ff78bec -

real 0m2.479s
user 0m1.852s
sys 0m0.584s

blkcpy() and blkxor() are now gone.

New code revision is attached.

Alexander
--- escrypt-1/crypto_scrypt-sse.c 2010-01-16 20:48:20 +0000 +++ escrypt-14/crypto_scrypt-sse.c 2012-11-16 06:33:34 +0000 @@ -1,5 +1,6 @@ /*- * Copyright 2009 Colin Percival + * Copyright 2012 Solar Designer * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,6 +33,10 @@ #include <sys/mman.h> #include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif + #include <errno.h> #include <stdint.h> #include <stdlib.h> @@ -42,130 +47,166 @@ #include "crypto_scrypt.h" -static void blkcpy(void *, void *, size_t); -static void blkxor(void *, void *, size_t); -static void salsa20_8(__m128i *); -static void blockmix_salsa8(__m128i *, __m128i *, __m128i *, size_t); -static uint64_t integerify(void *, size_t); -static void smix(uint8_t *, size_t, uint64_t, void *, void *); +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ -static void -blkcpy(void * dest, void * src, size_t len) -{ - __m128i * D = dest; - __m128i * S = src; - size_t L = len / 16; +#ifdef __XOP__ +#define SALSA20_8_HEAD \ + __m128i Bin[4]; \ + __m128i X0, X1, X2, X3; \ + size_t i; +#define XRA(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define SALSA20_8_HEAD \ + __m128i Bin[4]; \ + __m128i X0, X1, X2, X3; \ + __m128i T; \ size_t i; +#define XRA(out, in1, in2, s) \ + T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); +#endif + +#define SALSA20_8_TAIL \ + for (i = 0; i < 8; i += 2) { \ + /* Operate on "columns". */ \ + XRA(X1, X0, X3, 7); \ + XRA(X2, X1, X0, 9); \ + XRA(X3, X2, X1, 13); \ + XRA(X0, X3, X2, 18); \ +\ + /* Rearrange data. */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ +\ + /* Operate on "rows". 
*/ \ + XRA(X3, X0, X1, 7); \ + XRA(X2, X3, X0, 9); \ + XRA(X1, X2, X3, 13); \ + XRA(X0, X1, X2, 18); \ +\ + /* Rearrange data. */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); \ + } \ +\ + Bout[0] = _mm_add_epi32(Bin[0], X0); \ + Bout[1] = _mm_add_epi32(Bin[1], X1); \ + Bout[2] = _mm_add_epi32(Bin[2], X2); \ + Bout[3] = _mm_add_epi32(Bin[3], X3); - for (i = 0; i < L; i++) - D[i] = S[i]; +static inline void +salsa20_8_xor(__m128i Bin1[4], __m128i Bin2[4], __m128i Bout[4]) +{ + SALSA20_8_HEAD + + X0 = Bin[0] = _mm_xor_si128(Bin1[0], Bin2[0]); + X1 = Bin[1] = _mm_xor_si128(Bin1[1], Bin2[1]); + X2 = Bin[2] = _mm_xor_si128(Bin1[2], Bin2[2]); + X3 = Bin[3] = _mm_xor_si128(Bin1[3], Bin2[3]); + + SALSA20_8_TAIL } -static void -blkxor(void * dest, void * src, size_t len) +static inline void +salsa20_8_xor2(__m128i Bin1[4], __m128i Bin2[4], __m128i Bin3[4], + __m128i Bout[4]) { - __m128i * D = dest; - __m128i * S = src; - size_t L = len / 16; - size_t i; + SALSA20_8_HEAD + + X0 = Bin[0] = _mm_xor_si128(_mm_xor_si128(Bin1[0], Bin2[0]), Bin3[0]); + X1 = Bin[1] = _mm_xor_si128(_mm_xor_si128(Bin1[1], Bin2[1]), Bin3[1]); + X2 = Bin[2] = _mm_xor_si128(_mm_xor_si128(Bin1[2], Bin2[2]), Bin3[2]); + X3 = Bin[3] = _mm_xor_si128(_mm_xor_si128(Bin1[3], Bin2[3]), Bin3[3]); - for (i = 0; i < L; i++) - D[i] = _mm_xor_si128(D[i], S[i]); + SALSA20_8_TAIL } -/** - * salsa20_8(B): - * Apply the salsa20/8 core to the provided block. 
- */ -static void -salsa20_8(__m128i B[4]) +static inline void +salsa20_8_xor3(__m128i Bin1[4], __m128i Bin2[4], __m128i Bin3[4], + __m128i Bin4[4], __m128i Bout[4]) { - __m128i X0, X1, X2, X3; - __m128i T; - size_t i; + SALSA20_8_HEAD + + X0 = Bin[0] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( + Bin1[0], Bin2[0]), Bin3[0]), Bin4[0]); + X1 = Bin[1] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( + Bin1[1], Bin2[1]), Bin3[1]), Bin4[1]); + X2 = Bin[2] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( + Bin1[2], Bin2[2]), Bin3[2]), Bin4[2]); + X3 = Bin[3] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( + Bin1[3], Bin2[3]), Bin3[3]), Bin4[3]); - X0 = B[0]; - X1 = B[1]; - X2 = B[2]; - X3 = B[3]; - - for (i = 0; i < 8; i += 2) { - /* Operate on "columns". */ - T = _mm_add_epi32(X0, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X1, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X3, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); - - /* Rearrange data. */ - X1 = _mm_shuffle_epi32(X1, 0x93); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x39); - - /* Operate on "rows". */ - T = _mm_add_epi32(X0, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X3, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X1, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); - - /* Rearrange data. 
*/ - X1 = _mm_shuffle_epi32(X1, 0x39); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x93); - } - - B[0] = _mm_add_epi32(B[0], X0); - B[1] = _mm_add_epi32(B[1], X1); - B[2] = _mm_add_epi32(B[2], X2); - B[3] = _mm_add_epi32(B[3], X3); + SALSA20_8_TAIL } +#undef SALSA20_8_HEAD +#undef XRA +#undef SALSA20_8_TAIL + /** * blockmix_salsa8(Bin, Bout, X, r): * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. The - * temporary space X must be 64 bytes. + * bytes in length; the output Bout must also be the same size. */ -static void -blockmix_salsa8(__m128i * Bin, __m128i * Bout, __m128i * X, size_t r) +static inline void +blockmix_salsa8(__m128i * Bin, __m128i * Bout, size_t r) { + __m128i * X, * Y; size_t i; /* 1: X <-- B_{2r - 1} */ - blkcpy(X, &Bin[8 * r - 4], 64); + X = &Bin[8 * r - 4]; /* 2: for i = 0 to 2r - 1 do */ for (i = 0; i < r; i++) { /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8], 64); - salsa20_8(X); + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + salsa20_8_xor(X, &Bin[i * 8], Y = &Bout[i * 4]); + /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4], X, 64); + salsa20_8_xor(Y, &Bin[i * 8 + 4], X = &Bout[(r + i) * 4]); + } +} +static inline void +blockmix_salsa8_xor(__m128i * Bin1, __m128i * Bin2, __m128i * Bout, size_t r) +{ + __m128i * X, * Y; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + salsa20_8_xor3(&Bin1[8 * r - 4], &Bin2[8 * r - 4], Bin1, Bin2, + Y = Bout); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; ;) { /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8 + 4], 64); - salsa20_8(X); + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + salsa20_8_xor2(Y, &Bin1[i * 8 + 4], &Bin2[i * 8 + 4], + X = &Bout[(r + i) * 4]); + + if (++i >= r) + break; + /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[(r + i) * 4], X, 64); + salsa20_8_xor2(X, &Bin1[i * 8], &Bin2[i * 8], + Y = &Bout[i * 4]); } } @@ -173,7 +214,7 @@ blockmix_salsa8(__m128i * Bin, __m128i * * integerify(B, r): * Return the result of parsing B_{2r-1} as a little-endian integer. */ -static uint64_t +static inline uint64_t integerify(void * B, size_t r) { uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64); @@ -192,14 +233,13 @@ integerify(void * B, size_t r) static void smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY) { - __m128i * X = XY; - __m128i * Y = (void *)((uintptr_t)(XY) + 128 * r); - __m128i * Z = (void *)((uintptr_t)(XY) + 256 * r); - uint32_t * X32 = (void *)X; + __m128i * X = V, * Y; + uint32_t * X32 = V; uint64_t i, j; size_t k; /* 1: X <-- B */ + /* 3: V_i <-- X */ for (k = 0; k < 2 * r; k++) { for (i = 0; i < 16; i++) { X32[k * 16 + i] = @@ -208,36 +248,48 @@ smix(uint8_t * B, size_t r, uint64_t N, } /* 2: for i = 0 to N - 1 do */ - for (i = 0; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r); - + for (i = 1; i < N - 1; i += 2) { /* 4: X <-- H(X) */ - blockmix_salsa8(X, Y, Z, r); - /* 3: V_i <-- X */ - blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r), - Y, 128 * r); + Y = (void *)((uintptr_t)(V) + i * 128 * r); + blockmix_salsa8(X, Y, r); /* 4: X <-- H(X) */ - blockmix_salsa8(Y, X, Z, r); + /* 3: V_i <-- X */ + X = (void *)((uintptr_t)(V) + (i + 1) * 128 * r); + blockmix_salsa8(Y, X, r); } + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = (void *)((uintptr_t)(V) + i * 128 * r); + blockmix_salsa8(X, Y, r); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = XY; + blockmix_salsa8(Y, X, r); + + X32 = XY; + Y = (void *)((uintptr_t)(XY) + 128 * r); + /* 6: for i = 0 to N - 1 
do */ for (i = 0; i < N; i += 2) { + __m128i * V_i; + /* 7: j <-- Integerify(X) mod N */ j = integerify(X, r) & (N - 1); /* 8: X <-- H(X \xor V_j) */ - blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r); - blockmix_salsa8(X, Y, Z, r); + V_i = (void *)((uintptr_t)(V) + j * 128 * r); + blockmix_salsa8_xor(X, V_i, Y, r); /* 7: j <-- Integerify(X) mod N */ j = integerify(Y, r) & (N - 1); /* 8: X <-- H(X \xor V_j) */ - blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r); - blockmix_salsa8(Y, X, Z, r); + V_i = (void *)((uintptr_t)(V) + j * 128 * r); + blockmix_salsa8_xor(Y, V_i, X, r); } /* 10: B' <-- X */ @@ -298,7 +350,7 @@ crypto_scrypt(const uint8_t * passwd, si if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0) goto err0; B = (uint8_t *)(B0); - if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0) + if ((errno = posix_memalign(&XY0, 64, 256 * r)) != 0) goto err1; XY = (uint32_t *)(XY0); #ifndef MAP_ANON @@ -310,7 +362,7 @@ crypto_scrypt(const uint8_t * passwd, si if ((B0 = malloc(128 * r * p + 63)) == NULL) goto err0; B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63)); - if ((XY0 = malloc(256 * r + 64 + 63)) == NULL) + if ((XY0 = malloc(256 * r + 63)) == NULL) goto err1; XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63)); #ifndef MAP_ANON
Attachment:
escrypt-0.0.14.tar.gz
Description: Binary data