On Thu, Nov 15, 2012 at 02:52:11PM +0400, Solar Designer wrote:
> Test on FX-8120:
>
> user@bull:~/scrypt/escrypt/escrypt-1$ time ./tests | md5sum
> 4455b1ce0529e7f877de53f24ff78bec -
>
> real 0m3.428s
> user 0m2.856s
> sys 0m0.540s
> user@bull:~/scrypt/escrypt/escrypt-1$ cd ../escrypt-5
> user@bull:~/scrypt/escrypt/escrypt-5$ time ./tests | md5sum
> 4455b1ce0529e7f877de53f24ff78bec -
>
> real 0m2.732s
> user 0m2.184s
> sys 0m0.512s

New speed:

user@bull:~/scrypt/escrypt/escrypt-14$ time ./tests | md5sum
4455b1ce0529e7f877de53f24ff78bec -

real 0m2.479s
user 0m1.852s
sys 0m0.584s

blkcpy() and blkxor() are now gone.

New code revision is attached.

Alexander
--- escrypt-1/crypto_scrypt-sse.c 2010-01-16 20:48:20 +0000
+++ escrypt-14/crypto_scrypt-sse.c 2012-11-16 06:33:34 +0000
@@ -1,5 +1,6 @@
/*-
* Copyright 2009 Colin Percival
+ * Copyright 2012 Solar Designer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -32,6 +33,10 @@
#include <sys/mman.h>
#include <emmintrin.h>
+#ifdef __XOP__
+#include <x86intrin.h>
+#endif
+
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
@@ -42,130 +47,166 @@
#include "crypto_scrypt.h"
-static void blkcpy(void *, void *, size_t);
-static void blkxor(void *, void *, size_t);
-static void salsa20_8(__m128i *);
-static void blockmix_salsa8(__m128i *, __m128i *, __m128i *, size_t);
-static uint64_t integerify(void *, size_t);
-static void smix(uint8_t *, size_t, uint64_t, void *, void *);
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the provided block.
+ */
-static void
-blkcpy(void * dest, void * src, size_t len)
-{
- __m128i * D = dest;
- __m128i * S = src;
- size_t L = len / 16;
+#ifdef __XOP__
+#define SALSA20_8_HEAD \
+ __m128i Bin[4]; \
+ __m128i X0, X1, X2, X3; \
+ size_t i;
+#define XRA(out, in1, in2, s) \
+ out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
+#else
+#define SALSA20_8_HEAD \
+ __m128i Bin[4]; \
+ __m128i X0, X1, X2, X3; \
+ __m128i T; \
size_t i;
+#define XRA(out, in1, in2, s) \
+ T = _mm_add_epi32(in1, in2); \
+ out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \
+ out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s));
+#endif
+
+#define SALSA20_8_TAIL \
+ for (i = 0; i < 8; i += 2) { \
+ /* Operate on "columns". */ \
+ XRA(X1, X0, X3, 7); \
+ XRA(X2, X1, X0, 9); \
+ XRA(X3, X2, X1, 13); \
+ XRA(X0, X3, X2, 18); \
+\
+ /* Rearrange data. */ \
+ X1 = _mm_shuffle_epi32(X1, 0x93); \
+ X2 = _mm_shuffle_epi32(X2, 0x4E); \
+ X3 = _mm_shuffle_epi32(X3, 0x39); \
+\
+ /* Operate on "rows". */ \
+ XRA(X3, X0, X1, 7); \
+ XRA(X2, X3, X0, 9); \
+ XRA(X1, X2, X3, 13); \
+ XRA(X0, X1, X2, 18); \
+\
+ /* Rearrange data. */ \
+ X1 = _mm_shuffle_epi32(X1, 0x39); \
+ X2 = _mm_shuffle_epi32(X2, 0x4E); \
+ X3 = _mm_shuffle_epi32(X3, 0x93); \
+ } \
+\
+ Bout[0] = _mm_add_epi32(Bin[0], X0); \
+ Bout[1] = _mm_add_epi32(Bin[1], X1); \
+ Bout[2] = _mm_add_epi32(Bin[2], X2); \
+ Bout[3] = _mm_add_epi32(Bin[3], X3);
- for (i = 0; i < L; i++)
- D[i] = S[i];
+static inline void
+salsa20_8_xor(__m128i Bin1[4], __m128i Bin2[4], __m128i Bout[4])
+{
+ SALSA20_8_HEAD
+
+ X0 = Bin[0] = _mm_xor_si128(Bin1[0], Bin2[0]);
+ X1 = Bin[1] = _mm_xor_si128(Bin1[1], Bin2[1]);
+ X2 = Bin[2] = _mm_xor_si128(Bin1[2], Bin2[2]);
+ X3 = Bin[3] = _mm_xor_si128(Bin1[3], Bin2[3]);
+
+ SALSA20_8_TAIL
}
-static void
-blkxor(void * dest, void * src, size_t len)
+static inline void
+salsa20_8_xor2(__m128i Bin1[4], __m128i Bin2[4], __m128i Bin3[4],
+ __m128i Bout[4])
{
- __m128i * D = dest;
- __m128i * S = src;
- size_t L = len / 16;
- size_t i;
+ SALSA20_8_HEAD
+
+ X0 = Bin[0] = _mm_xor_si128(_mm_xor_si128(Bin1[0], Bin2[0]), Bin3[0]);
+ X1 = Bin[1] = _mm_xor_si128(_mm_xor_si128(Bin1[1], Bin2[1]), Bin3[1]);
+ X2 = Bin[2] = _mm_xor_si128(_mm_xor_si128(Bin1[2], Bin2[2]), Bin3[2]);
+ X3 = Bin[3] = _mm_xor_si128(_mm_xor_si128(Bin1[3], Bin2[3]), Bin3[3]);
- for (i = 0; i < L; i++)
- D[i] = _mm_xor_si128(D[i], S[i]);
+ SALSA20_8_TAIL
}
-/**
- * salsa20_8(B):
- * Apply the salsa20/8 core to the provided block.
- */
-static void
-salsa20_8(__m128i B[4])
+static inline void
+salsa20_8_xor3(__m128i Bin1[4], __m128i Bin2[4], __m128i Bin3[4],
+ __m128i Bin4[4], __m128i Bout[4])
{
- __m128i X0, X1, X2, X3;
- __m128i T;
- size_t i;
+ SALSA20_8_HEAD
+
+ X0 = Bin[0] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(
+ Bin1[0], Bin2[0]), Bin3[0]), Bin4[0]);
+ X1 = Bin[1] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(
+ Bin1[1], Bin2[1]), Bin3[1]), Bin4[1]);
+ X2 = Bin[2] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(
+ Bin1[2], Bin2[2]), Bin3[2]), Bin4[2]);
+ X3 = Bin[3] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(
+ Bin1[3], Bin2[3]), Bin3[3]), Bin4[3]);
- X0 = B[0];
- X1 = B[1];
- X2 = B[2];
- X3 = B[3];
-
- for (i = 0; i < 8; i += 2) {
- /* Operate on "columns". */
- T = _mm_add_epi32(X0, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X1, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X3, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
- /* Rearrange data. */
- X1 = _mm_shuffle_epi32(X1, 0x93);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x39);
-
- /* Operate on "rows". */
- T = _mm_add_epi32(X0, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X3, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X1, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
- /* Rearrange data. */
- X1 = _mm_shuffle_epi32(X1, 0x39);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x93);
- }
-
- B[0] = _mm_add_epi32(B[0], X0);
- B[1] = _mm_add_epi32(B[1], X1);
- B[2] = _mm_add_epi32(B[2], X2);
- B[3] = _mm_add_epi32(B[3], X3);
+ SALSA20_8_TAIL
}
+#undef SALSA20_8_HEAD
+#undef XRA
+#undef SALSA20_8_TAIL
+
/**
* blockmix_salsa8(Bin, Bout, X, r):
* Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size. The
- * temporary space X must be 64 bytes.
+ * bytes in length; the output Bout must also be the same size.
*/
-static void
-blockmix_salsa8(__m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
+static inline void
+blockmix_salsa8(__m128i * Bin, __m128i * Bout, size_t r)
{
+ __m128i * X, * Y;
size_t i;
/* 1: X <-- B_{2r - 1} */
- blkcpy(X, &Bin[8 * r - 4], 64);
+ X = &Bin[8 * r - 4];
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < r; i++) {
/* 3: X <-- H(X \xor B_i) */
- blkxor(X, &Bin[i * 8], 64);
- salsa20_8(X);
+ /* 4: Y_i <-- X */
+ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+ salsa20_8_xor(X, &Bin[i * 8], Y = &Bout[i * 4]);
+ /* 3: X <-- H(X \xor B_i) */
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
- blkcpy(&Bout[i * 4], X, 64);
+ salsa20_8_xor(Y, &Bin[i * 8 + 4], X = &Bout[(r + i) * 4]);
+ }
+}
+static inline void
+blockmix_salsa8_xor(__m128i * Bin1, __m128i * Bin2, __m128i * Bout, size_t r)
+{
+ __m128i * X, * Y;
+ size_t i;
+
+ /* 1: X <-- B_{2r - 1} */
+ /* 3: X <-- H(X \xor B_i) */
+ /* 4: Y_i <-- X */
+ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+ salsa20_8_xor3(&Bin1[8 * r - 4], &Bin2[8 * r - 4], Bin1, Bin2,
+ Y = Bout);
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; ;) {
/* 3: X <-- H(X \xor B_i) */
- blkxor(X, &Bin[i * 8 + 4], 64);
- salsa20_8(X);
+ /* 4: Y_i <-- X */
+ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+ salsa20_8_xor2(Y, &Bin1[i * 8 + 4], &Bin2[i * 8 + 4],
+ X = &Bout[(r + i) * 4]);
+
+ if (++i >= r)
+ break;
+ /* 3: X <-- H(X \xor B_i) */
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
- blkcpy(&Bout[(r + i) * 4], X, 64);
+ salsa20_8_xor2(X, &Bin1[i * 8], &Bin2[i * 8],
+ Y = &Bout[i * 4]);
}
}
@@ -173,7 +214,7 @@ blockmix_salsa8(__m128i * Bin, __m128i *
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
-static uint64_t
+static inline uint64_t
integerify(void * B, size_t r)
{
uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
@@ -192,14 +233,13 @@ integerify(void * B, size_t r)
static void
smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
{
- __m128i * X = XY;
- __m128i * Y = (void *)((uintptr_t)(XY) + 128 * r);
- __m128i * Z = (void *)((uintptr_t)(XY) + 256 * r);
- uint32_t * X32 = (void *)X;
+ __m128i * X = V, * Y;
+ uint32_t * X32 = V;
uint64_t i, j;
size_t k;
/* 1: X <-- B */
+ /* 3: V_i <-- X */
for (k = 0; k < 2 * r; k++) {
for (i = 0; i < 16; i++) {
X32[k * 16 + i] =
@@ -208,36 +248,48 @@ smix(uint8_t * B, size_t r, uint64_t N,
}
/* 2: for i = 0 to N - 1 do */
- for (i = 0; i < N; i += 2) {
- /* 3: V_i <-- X */
- blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);
-
+ for (i = 1; i < N - 1; i += 2) {
/* 4: X <-- H(X) */
- blockmix_salsa8(X, Y, Z, r);
-
/* 3: V_i <-- X */
- blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
- Y, 128 * r);
+ Y = (void *)((uintptr_t)(V) + i * 128 * r);
+ blockmix_salsa8(X, Y, r);
/* 4: X <-- H(X) */
- blockmix_salsa8(Y, X, Z, r);
+ /* 3: V_i <-- X */
+ X = (void *)((uintptr_t)(V) + (i + 1) * 128 * r);
+ blockmix_salsa8(Y, X, r);
}
+ /* 4: X <-- H(X) */
+ /* 3: V_i <-- X */
+ Y = (void *)((uintptr_t)(V) + i * 128 * r);
+ blockmix_salsa8(X, Y, r);
+
+ /* 4: X <-- H(X) */
+ /* 3: V_i <-- X */
+ X = XY;
+ blockmix_salsa8(Y, X, r);
+
+ X32 = XY;
+ Y = (void *)((uintptr_t)(XY) + 128 * r);
+
/* 6: for i = 0 to N - 1 do */
for (i = 0; i < N; i += 2) {
+ __m128i * V_i;
+
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
- blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
- blockmix_salsa8(X, Y, Z, r);
+ V_i = (void *)((uintptr_t)(V) + j * 128 * r);
+ blockmix_salsa8_xor(X, V_i, Y, r);
/* 7: j <-- Integerify(X) mod N */
j = integerify(Y, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
- blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
- blockmix_salsa8(Y, X, Z, r);
+ V_i = (void *)((uintptr_t)(V) + j * 128 * r);
+ blockmix_salsa8_xor(Y, V_i, X, r);
}
/* 10: B' <-- X */
@@ -298,7 +350,7 @@ crypto_scrypt(const uint8_t * passwd, si
if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0)
goto err0;
B = (uint8_t *)(B0);
- if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0)
+ if ((errno = posix_memalign(&XY0, 64, 256 * r)) != 0)
goto err1;
XY = (uint32_t *)(XY0);
#ifndef MAP_ANON
@@ -310,7 +362,7 @@ crypto_scrypt(const uint8_t * passwd, si
if ((B0 = malloc(128 * r * p + 63)) == NULL)
goto err0;
B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
- if ((XY0 = malloc(256 * r + 64 + 63)) == NULL)
+ if ((XY0 = malloc(256 * r + 63)) == NULL)
goto err1;
XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
#ifndef MAP_ANON
Attachment:
escrypt-0.0.14.tar.gz
Description: Binary data