[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: crypto_scrypt-sse.c speedup



Colin -

On Fri, Nov 16, 2012 at 02:56:04PM +0400, Solar Designer wrote:
> I ended up replacing the X array with two pointers, X and Y, which point
> to Bin and Bout array elements.  This avoids having to save a copy of X.

I revised this some further.  The patch is attached.  The source file is
now smaller than in my previous revisions - it is now almost the same
size as original in scrypt-1.1.6.  I think I'll leave this at that (and
proceed to try out other changes that are not code optimizations).

Please consider applying this patch.

Thanks,

Alexander
--- escrypt-1/crypto_scrypt-sse.c	2010-01-16 20:48:20 +0000
+++ escrypt-19/crypto_scrypt-sse.c	2012-11-17 03:16:15 +0000
@@ -1,5 +1,6 @@
 /*-
  * Copyright 2009 Colin Percival
+ * Copyright 2012 Alexander Peslyak
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,6 +33,10 @@
 #include <sys/mman.h>
 
 #include <emmintrin.h>
+#ifdef __XOP__
+#include <x86intrin.h>
+#endif
+
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -42,138 +47,134 @@
 
 #include "crypto_scrypt.h"
 
-static void blkcpy(void *, void *, size_t);
-static void blkxor(void *, void *, size_t);
-static void salsa20_8(__m128i *);
-static void blockmix_salsa8(__m128i *, __m128i *, __m128i *, size_t);
-static uint64_t integerify(void *, size_t);
-static void smix(uint8_t *, size_t, uint64_t, void *, void *);
-
-static void
-blkcpy(void * dest, void * src, size_t len)
-{
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
-	size_t i;
-
-	for (i = 0; i < L; i++)
-		D[i] = S[i];
-}
+#ifdef __XOP__
+#define XRA(out, in1, in2, s) \
+	out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
+#else
+#define XRA(out, in1, in2, s) \
+	{ \
+		__m128i T = _mm_add_epi32(in1, in2); \
+		out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \
+		out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \
+	}
+#endif
 
-static void
-blkxor(void * dest, void * src, size_t len)
-{
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
-	size_t i;
+#define SALSA20_2ROUNDS \
+	/* Operate on "columns". */ \
+	XRA(X1, X0, X3, 7); \
+	XRA(X2, X1, X0, 9); \
+	XRA(X3, X2, X1, 13); \
+	XRA(X0, X3, X2, 18); \
+\
+	/* Rearrange data. */ \
+	X1 = _mm_shuffle_epi32(X1, 0x93); \
+	X2 = _mm_shuffle_epi32(X2, 0x4E); \
+	X3 = _mm_shuffle_epi32(X3, 0x39); \
+\
+	/* Operate on "rows". */ \
+	XRA(X3, X0, X1, 7); \
+	XRA(X2, X3, X0, 9); \
+	XRA(X1, X2, X3, 13); \
+	XRA(X0, X1, X2, 18); \
+\
+	/* Rearrange data. */ \
+	X1 = _mm_shuffle_epi32(X1, 0x39); \
+	X2 = _mm_shuffle_epi32(X2, 0x4E); \
+	X3 = _mm_shuffle_epi32(X3, 0x93);
 
-	for (i = 0; i < L; i++)
-		D[i] = _mm_xor_si128(D[i], S[i]);
-}
+/**
+ * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ in.
+ */
+#define SALSA20_8_XOR(in, out) \
+	{ \
+		__m128i Y0 = X0 = _mm_xor_si128(X0, (in)[0]); \
+		__m128i Y1 = X1 = _mm_xor_si128(X1, (in)[1]); \
+		__m128i Y2 = X2 = _mm_xor_si128(X2, (in)[2]); \
+		__m128i Y3 = X3 = _mm_xor_si128(X3, (in)[3]); \
+		SALSA20_2ROUNDS; \
+		SALSA20_2ROUNDS; \
+		SALSA20_2ROUNDS; \
+		SALSA20_2ROUNDS; \
+		(out)[0] = X0 = _mm_add_epi32(X0, Y0); \
+		(out)[1] = X1 = _mm_add_epi32(X1, Y1); \
+		(out)[2] = X2 = _mm_add_epi32(X2, Y2); \
+		(out)[3] = X3 = _mm_add_epi32(X3, Y3); \
+	}
 
 /**
- * salsa20_8(B):
- * Apply the salsa20/8 core to the provided block.
+ * blockmix_salsa8(Bin, Bout, r):
+ * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
+ * bytes in length; the output Bout must also be the same size.
  */
-static void
-salsa20_8(__m128i B[4])
+static inline void
+blockmix_salsa8(__m128i * Bin, __m128i * Bout, size_t r)
 {
 	__m128i X0, X1, X2, X3;
-	__m128i T;
 	size_t i;
 
-	X0 = B[0];
-	X1 = B[1];
-	X2 = B[2];
-	X3 = B[3];
-
-	for (i = 0; i < 8; i += 2) {
-		/* Operate on "columns". */
-		T = _mm_add_epi32(X0, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X1, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X3, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x93);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x39);
-
-		/* Operate on "rows". */
-		T = _mm_add_epi32(X0, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X3, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X1, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x39);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x93);
-	}
-
-	B[0] = _mm_add_epi32(B[0], X0);
-	B[1] = _mm_add_epi32(B[1], X1);
-	B[2] = _mm_add_epi32(B[2], X2);
-	B[3] = _mm_add_epi32(B[3], X3);
+	/* 1: X <-- B_{2r - 1} */
+	X0 = Bin[8 * r - 4];
+	X1 = Bin[8 * r - 3];
+	X2 = Bin[8 * r - 2];
+	X3 = Bin[8 * r - 1];
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < r; i++) {
+		/* 3: X <-- H(X \xor B_i) */
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		SALSA20_8_XOR(&Bin[i * 8], &Bout[i * 4])
+
+		/* 3: X <-- H(X \xor B_i) */
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		SALSA20_8_XOR(&Bin[i * 8 + 4], &Bout[(r + i) * 4])
+	}
 }
 
-/**
- * blockmix_salsa8(Bin, Bout, X, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.  The
- * temporary space X must be 64 bytes.
- */
-static void
-blockmix_salsa8(__m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
+static inline void
+blockmix_salsa8_xor(__m128i * Bin1, __m128i * Bin2, __m128i * Bout, size_t r)
 {
+	__m128i X0, X1, X2, X3;
 	size_t i;
 
 	/* 1: X <-- B_{2r - 1} */
-	blkcpy(X, &Bin[8 * r - 4], 64);
+	X0 = _mm_xor_si128(Bin1[8 * r - 4], Bin2[8 * r - 4]);
+	X1 = _mm_xor_si128(Bin1[8 * r - 3], Bin2[8 * r - 3]);
+	X2 = _mm_xor_si128(Bin1[8 * r - 2], Bin2[8 * r - 2]);
+	X3 = _mm_xor_si128(Bin1[8 * r - 1], Bin2[8 * r - 1]);
 
 	/* 2: for i = 0 to 2r - 1 do */
 	for (i = 0; i < r; i++) {
 		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8], 64);
-		salsa20_8(X);
-
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[i * 4], X, 64);
+		X0 = _mm_xor_si128(X0, Bin1[i * 8]);
+		X1 = _mm_xor_si128(X1, Bin1[i * 8 + 1]);
+		X2 = _mm_xor_si128(X2, Bin1[i * 8 + 2]);
+		X3 = _mm_xor_si128(X3, Bin1[i * 8 + 3]);
+		SALSA20_8_XOR(&Bin2[i * 8], &Bout[i * 4])
 
 		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8 + 4], 64);
-		salsa20_8(X);
-
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[(r + i) * 4], X, 64);
+		X0 = _mm_xor_si128(X0, Bin1[i * 8 + 4]);
+		X1 = _mm_xor_si128(X1, Bin1[i * 8 + 5]);
+		X2 = _mm_xor_si128(X2, Bin1[i * 8 + 6]);
+		X3 = _mm_xor_si128(X3, Bin1[i * 8 + 7]);
+		SALSA20_8_XOR(&Bin2[i * 8 + 4], &Bout[(r + i) * 4])
 	}
 }
 
+#undef XRA
+#undef SALSA20_2ROUNDS
+#undef SALSA20_8_XOR
+
 /**
  * integerify(B, r):
  * Return the result of parsing B_{2r-1} as a little-endian integer.
  */
-static uint64_t
+static inline uint64_t
 integerify(void * B, size_t r)
 {
 	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
@@ -192,14 +193,13 @@ integerify(void * B, size_t r)
 static void
 smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
 {
-	__m128i * X = XY;
-	__m128i * Y = (void *)((uintptr_t)(XY) + 128 * r);
-	__m128i * Z = (void *)((uintptr_t)(XY) + 256 * r);
-	uint32_t * X32 = (void *)X;
+	__m128i * X = V, * Y;
+	uint32_t * X32 = V;
 	uint64_t i, j;
 	size_t k;
 
 	/* 1: X <-- B */
+	/* 3: V_i <-- X */
 	for (k = 0; k < 2 * r; k++) {
 		for (i = 0; i < 16; i++) {
 			X32[k * 16 + i] =
@@ -208,36 +208,48 @@ smix(uint8_t * B, size_t r, uint64_t N, 
 	}
 
 	/* 2: for i = 0 to N - 1 do */
-	for (i = 0; i < N; i += 2) {
-		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);
-
+	for (i = 1; i < N - 1; i += 2) {
 		/* 4: X <-- H(X) */
-		blockmix_salsa8(X, Y, Z, r);
-
 		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
-		    Y, 128 * r);
+		Y = (void *)((uintptr_t)(V) + i * 128 * r);
+		blockmix_salsa8(X, Y, r);
 
 		/* 4: X <-- H(X) */
-		blockmix_salsa8(Y, X, Z, r);
+		/* 3: V_i <-- X */
+		X = (void *)((uintptr_t)(V) + (i + 1) * 128 * r);
+		blockmix_salsa8(Y, X, r);
 	}
 
+	/* 4: X <-- H(X) */
+	/* 3: V_i <-- X */
+	Y = (void *)((uintptr_t)(V) + i * 128 * r);
+	blockmix_salsa8(X, Y, r);
+
+	/* 4: X <-- H(X) */
+	/* 3: V_i <-- X */
+	X = XY;
+	blockmix_salsa8(Y, X, r);
+
+	X32 = XY;
+	Y = (void *)((uintptr_t)(XY) + 128 * r);
+
 	/* 6: for i = 0 to N - 1 do */
 	for (i = 0; i < N; i += 2) {
+		__m128i * V_j;
+
 		/* 7: j <-- Integerify(X) mod N */
 		j = integerify(X, r) & (N - 1);
 
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
-		blockmix_salsa8(X, Y, Z, r);
+		V_j = (void *)((uintptr_t)(V) + j * 128 * r);
+		blockmix_salsa8_xor(X, V_j, Y, r);
 
 		/* 7: j <-- Integerify(X) mod N */
 		j = integerify(Y, r) & (N - 1);
 
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
-		blockmix_salsa8(Y, X, Z, r);
+		V_j = (void *)((uintptr_t)(V) + j * 128 * r);
+		blockmix_salsa8_xor(Y, V_j, X, r);
 	}
 
 	/* 10: B' <-- X */
@@ -298,7 +310,7 @@ crypto_scrypt(const uint8_t * passwd, si
 	if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0)
 		goto err0;
 	B = (uint8_t *)(B0);
-	if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0)
+	if ((errno = posix_memalign(&XY0, 64, 256 * r)) != 0)
 		goto err1;
 	XY = (uint32_t *)(XY0);
 #ifndef MAP_ANON
@@ -310,7 +322,7 @@ crypto_scrypt(const uint8_t * passwd, si
 	if ((B0 = malloc(128 * r * p + 63)) == NULL)
 		goto err0;
 	B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
-	if ((XY0 = malloc(256 * r + 64 + 63)) == NULL)
+	if ((XY0 = malloc(256 * r + 63)) == NULL)
 		goto err1;
 	XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
 #ifndef MAP_ANON

Attachment: escrypt-0.0.19.tar.gz
Description: Binary data