Colin, all -

The attached patch for crypto_scrypt-sse.c (against its revision in scrypt-1.1.6.tgz) speeds it up by 30% on AMD Bulldozer (tested on FX-8120) in -march=native or -mxop builds, and by 5% to 10% on Intel CPUs (tested on Xeon E5649), when run on the official test vectors. I exclude "system" time from the comparison since it'll vary between systems and since it can be out of the loop with some uses of scrypt.

Test on FX-8120:

user@bull:~/scrypt/escrypt/escrypt-1$ time ./tests | md5sum
4455b1ce0529e7f877de53f24ff78bec -
real 0m3.428s
user 0m2.856s
sys 0m0.540s
user@bull:~/scrypt/escrypt/escrypt-1$ cd ../escrypt-5
user@bull:~/scrypt/escrypt/escrypt-5$ time ./tests | md5sum
4455b1ce0529e7f877de53f24ff78bec -
real 0m2.732s
user 0m2.184s
sys 0m0.512s

escrypt-1 uses the original crypto_scrypt-sse.c, escrypt-5 uses the revised crypto_scrypt-sse.c. I've also attached a tarball with the two source trees - it's tiny.

Please let me know if I should add a copyright statement, although maybe these changes are too minor to be subject to copyright.

Thanks,
Alexander
--- escrypt-1/crypto_scrypt-sse.c 2010-01-16 20:48:20 +0000 +++ escrypt-5/crypto_scrypt-sse.c 2012-11-15 10:19:42 +0000 @@ -32,6 +32,10 @@ #include <sys/mman.h> #include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif + #include <errno.h> #include <stdint.h> #include <stdlib.h> @@ -42,14 +46,7 @@ #include "crypto_scrypt.h" -static void blkcpy(void *, void *, size_t); -static void blkxor(void *, void *, size_t); -static void salsa20_8(__m128i *); -static void blockmix_salsa8(__m128i *, __m128i *, __m128i *, size_t); -static uint64_t integerify(void *, size_t); -static void smix(uint8_t *, size_t, uint64_t, void *, void *); - -static void +static inline void blkcpy(void * dest, void * src, size_t len) { __m128i * D = dest; @@ -57,11 +54,15 @@ blkcpy(void * dest, void * src, size_t l size_t L = len / 16; size_t i; - for (i = 0; i < L; i++) + for (i = 0; i < L; i += 4) { D[i] = S[i]; + D[i + 1] = S[i + 1]; + D[i + 2] = S[i + 2]; + D[i + 3] = S[i + 3]; + } } -static void +static inline void blkxor(void * dest, void * src, size_t len) { __m128i * D = dest; @@ -69,40 +70,47 @@ blkxor(void * dest, void * src, size_t l size_t L = len / 16; size_t i; - for (i = 0; i < L; i++) + for (i = 0; i < L; i += 4) { D[i] = _mm_xor_si128(D[i], S[i]); + D[i + 1] = _mm_xor_si128(D[i + 1], S[i + 1]); + D[i + 2] = _mm_xor_si128(D[i + 2], S[i + 2]); + D[i + 3] = _mm_xor_si128(D[i + 3], S[i + 3]); + } } /** * salsa20_8(B): * Apply the salsa20/8 core to the provided block. 
*/ -static void -salsa20_8(__m128i B[4]) +static inline void +salsa20_8_xor(__m128i Bin1[4], __m128i Bin2[4], __m128i Bout[4]) { __m128i X0, X1, X2, X3; +#ifndef __XOP__ __m128i T; +#endif size_t i; - X0 = B[0]; - X1 = B[1]; - X2 = B[2]; - X3 = B[3]; + X0 = Bin1[0] = _mm_xor_si128(Bin1[0], Bin2[0]); + X1 = Bin1[1] = _mm_xor_si128(Bin1[1], Bin2[1]); + X2 = Bin1[2] = _mm_xor_si128(Bin1[2], Bin2[2]); + X3 = Bin1[3] = _mm_xor_si128(Bin1[3], Bin2[3]); for (i = 0; i < 8; i += 2) { +#ifdef __XOP__ +#define XRA(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define XRA(out, in1, in2, s) \ + T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); +#endif /* Operate on "columns". */ - T = _mm_add_epi32(X0, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X1, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X3, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); + XRA(X1, X0, X3, 7); + XRA(X2, X1, X0, 9); + XRA(X3, X2, X1, 13); + XRA(X0, X3, X2, 18); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x93); @@ -110,18 +118,10 @@ salsa20_8(__m128i B[4]) X3 = _mm_shuffle_epi32(X3, 0x39); /* Operate on "rows". 
*/ - T = _mm_add_epi32(X0, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X3, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X1, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); + XRA(X3, X0, X1, 7); + XRA(X2, X3, X0, 9); + XRA(X1, X2, X3, 13); + XRA(X0, X1, X2, 18); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x39); @@ -129,10 +129,10 @@ salsa20_8(__m128i B[4]) X3 = _mm_shuffle_epi32(X3, 0x93); } - B[0] = _mm_add_epi32(B[0], X0); - B[1] = _mm_add_epi32(B[1], X1); - B[2] = _mm_add_epi32(B[2], X2); - B[3] = _mm_add_epi32(B[3], X3); + Bout[0] = Bin1[0] = _mm_add_epi32(Bin1[0], X0); + Bout[1] = Bin1[1] = _mm_add_epi32(Bin1[1], X1); + Bout[2] = Bin1[2] = _mm_add_epi32(Bin1[2], X2); + Bout[3] = Bin1[3] = _mm_add_epi32(Bin1[3], X3); } /** @@ -152,20 +152,14 @@ blockmix_salsa8(__m128i * Bin, __m128i * /* 2: for i = 0 to 2r - 1 do */ for (i = 0; i < r; i++) { /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8], 64); - salsa20_8(X); - /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4], X, 64); + salsa20_8_xor(X, &Bin[i * 8], &Bout[i * 4]); /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8 + 4], 64); - salsa20_8(X); - /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[(r + i) * 4], X, 64); + salsa20_8_xor(X, &Bin[i * 8 + 4], &Bout[(r + i) * 4]); } } @@ -173,7 +167,7 @@ blockmix_salsa8(__m128i * Bin, __m128i * * integerify(B, r): * Return the result of parsing B_{2r-1} as a little-endian integer. */ -static uint64_t +static inline uint64_t integerify(void * B, size_t r) { uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
Attachment:
escrypt-0.0.5.tar.gz
Description: Binary data