
Re: scrypt Integerify



On Sat, Nov 17, 2012 at 09:17:05AM +0400, Solar Designer wrote:
> Curiously, the scrypt paper does not appear to fully define Integerify().
> The code uses:
> 
> static inline uint64_t
> integerify(void * B, size_t r)
> {
> 	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
> 
> 	return (((uint64_t)(X[13]) << 32) + X[0]);
> }
> 
> However, this also works fine (on little-endian), except for very large
> values of N, where it'd differ:
> 
> static inline uint64_t
> integerify(void * B, size_t r)
> {
> 	return *(uint64_t *)((uintptr_t)(B) + (2 * r - 1) * 64);
> }
> 
> Obviously, this is slightly smaller and faster code.

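For the record, why the two variants behave the same in practice: smix()
only ever uses j = integerify(X, r) & (N - 1), and both versions put
B_{2r-1}'s first 32-bit word in the low half, so the masked results agree
for all N <= 2^32 (only the high words differ: X[13] vs. X[1]).  A
standalone sketch demonstrating this (little-endian x86 assumed, fill-in
values made up, and the same type-punned load as above):

#include <stdint.h>
#include <stdio.h>

/* Original: high word from X[13], low word from X[0]. */
static uint64_t
integerify_orig(void * B, size_t r)
{
	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);

	return (((uint64_t)(X[13]) << 32) + X[0]);
}

/* Proposed: one 64-bit load, picking up X[1] as the high word. */
static uint64_t
integerify_new(void * B, size_t r)
{
	return *(uint64_t *)((uintptr_t)(B) + (2 * r - 1) * 64);
}

int
main(void)
{
	/* r = 1: one 128-byte block; B_{2r-1} starts at byte 64 */
	uint32_t B[32];
	uint64_t N = 1024;	/* any power of 2 up to 2^32 */
	size_t i;

	for (i = 0; i < 32; i++)	/* arbitrary contents */
		B[i] = (uint32_t)(i * 0x9e3779b9u);

	/* high words differ, but the masked values used for j agree */
	printf("%d\n", (int)((integerify_orig(B, 1) & (N - 1)) ==
	    (integerify_new(B, 1) & (N - 1))));

	return 0;
}
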
The attached patch implements this using _mm_cvtsi128_si64(X0) right in
the last loop iteration of BlockMix.  The first and last loop iterations
are specialized such that the instructions can get intermixed across the
two (inlined) BlockMix invocations in the callers.  The instruction
corresponding to _mm_cvtsi128_si64(X0) may then get moved up or down in
the code as the compiler deems best for instruction scheduling.

This avoids the two 32-bit memory (cache) reads, instead reusing data
that we already have in a register.
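
To illustrate the trick out of context (the helper name here is mine, not
in the patch; x86-64, where _mm_cvtsi128_si64() is available):

#include <emmintrin.h>
#include <stdint.h>

/*
 * After the last BlockMix iteration, X0 holds the first four 32-bit
 * words of B_{2r-1}, so Integerify() is just the low 64 bits of that
 * register.
 */
static inline uint64_t
integerify_from_reg(__m128i X0)
{
	/* MOVQ: low 64 bits of an XMM register into a GPR -- no load */
	return (uint64_t)_mm_cvtsi128_si64(X0);
}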

Alexander
diff -up escrypt-1/crypto_scrypt-sse.c escrypt-21/crypto_scrypt-sse.c
--- escrypt-1/crypto_scrypt-sse.c	2010-01-16 20:48:20 +0000
+++ escrypt-21/crypto_scrypt-sse.c	2012-11-17 06:08:09 +0000
@@ -1,5 +1,6 @@
 /*-
  * Copyright 2009 Colin Percival
+ * Copyright 2012 Alexander Peslyak
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,6 +33,10 @@
 #include <sys/mman.h>
 
 #include <emmintrin.h>
+#ifdef __XOP__
+#include <x86intrin.h>
+#endif
+
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -42,143 +47,168 @@
 
 #include "crypto_scrypt.h"
 
-static void blkcpy(void *, void *, size_t);
-static void blkxor(void *, void *, size_t);
-static void salsa20_8(__m128i *);
-static void blockmix_salsa8(__m128i *, __m128i *, __m128i *, size_t);
-static uint64_t integerify(void *, size_t);
-static void smix(uint8_t *, size_t, uint64_t, void *, void *);
-
-static void
-blkcpy(void * dest, void * src, size_t len)
-{
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
-	size_t i;
-
-	for (i = 0; i < L; i++)
-		D[i] = S[i];
-}
+#ifdef __XOP__
+#define XRA(out, in1, in2, s) \
+	out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
+#else
+#define XRA(out, in1, in2, s) \
+	{ \
+		__m128i T = _mm_add_epi32(in1, in2); \
+		out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \
+		out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \
+	}
+#endif
 
-static void
-blkxor(void * dest, void * src, size_t len)
-{
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
-	size_t i;
+#define SALSA20_2ROUNDS \
+	/* Operate on "columns". */ \
+	XRA(X1, X0, X3, 7); \
+	XRA(X2, X1, X0, 9); \
+	XRA(X3, X2, X1, 13); \
+	XRA(X0, X3, X2, 18); \
+\
+	/* Rearrange data. */ \
+	X1 = _mm_shuffle_epi32(X1, 0x93); \
+	X2 = _mm_shuffle_epi32(X2, 0x4E); \
+	X3 = _mm_shuffle_epi32(X3, 0x39); \
+\
+	/* Operate on "rows". */ \
+	XRA(X3, X0, X1, 7); \
+	XRA(X2, X3, X0, 9); \
+	XRA(X1, X2, X3, 13); \
+	XRA(X0, X1, X2, 18); \
+\
+	/* Rearrange data. */ \
+	X1 = _mm_shuffle_epi32(X1, 0x39); \
+	X2 = _mm_shuffle_epi32(X2, 0x4E); \
+	X3 = _mm_shuffle_epi32(X3, 0x93);
 
-	for (i = 0; i < L; i++)
-		D[i] = _mm_xor_si128(D[i], S[i]);
-}
+/**
+ * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ in.
+ */
+#define SALSA20_8_XOR(in, out) \
+	{ \
+		__m128i Y0 = X0 = _mm_xor_si128(X0, (in)[0]); \
+		__m128i Y1 = X1 = _mm_xor_si128(X1, (in)[1]); \
+		__m128i Y2 = X2 = _mm_xor_si128(X2, (in)[2]); \
+		__m128i Y3 = X3 = _mm_xor_si128(X3, (in)[3]); \
+		SALSA20_2ROUNDS \
+		SALSA20_2ROUNDS \
+		SALSA20_2ROUNDS \
+		SALSA20_2ROUNDS \
+		(out)[0] = X0 = _mm_add_epi32(X0, Y0); \
+		(out)[1] = X1 = _mm_add_epi32(X1, Y1); \
+		(out)[2] = X2 = _mm_add_epi32(X2, Y2); \
+		(out)[3] = X3 = _mm_add_epi32(X3, Y3); \
+	}
 
 /**
- * salsa20_8(B):
- * Apply the salsa20/8 core to the provided block.
+ * blockmix_salsa8(Bin, Bout, r):
+ * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
+ * bytes in length; the output Bout must also be the same size.
  */
-static void
-salsa20_8(__m128i B[4])
+static inline void
+blockmix_salsa8(__m128i * Bin, __m128i * Bout, size_t r)
 {
 	__m128i X0, X1, X2, X3;
-	__m128i T;
 	size_t i;
 
-	X0 = B[0];
-	X1 = B[1];
-	X2 = B[2];
-	X3 = B[3];
-
-	for (i = 0; i < 8; i += 2) {
-		/* Operate on "columns". */
-		T = _mm_add_epi32(X0, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X1, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X3, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x93);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x39);
-
-		/* Operate on "rows". */
-		T = _mm_add_epi32(X0, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X3, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X1, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
-
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x39);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x93);
-	}
-
-	B[0] = _mm_add_epi32(B[0], X0);
-	B[1] = _mm_add_epi32(B[1], X1);
-	B[2] = _mm_add_epi32(B[2], X2);
-	B[3] = _mm_add_epi32(B[3], X3);
+	/* 1: X <-- B_{2r - 1} */
+	X0 = Bin[8 * r - 4];
+	X1 = Bin[8 * r - 3];
+	X2 = Bin[8 * r - 2];
+	X3 = Bin[8 * r - 1];
+
+	/* 3: X <-- H(X \xor B_i) */
+	/* 4: Y_i <-- X */
+	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+	SALSA20_8_XOR(Bin, Bout)
+
+	/* 2: for i = 0 to 2r - 1 do */
+	r--;
+	for (i = 0; i < r;) {
+		/* 3: X <-- H(X \xor B_i) */
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		SALSA20_8_XOR(&Bin[i * 8 + 4], &Bout[(r + i) * 4 + 4])
+
+		i++;
+
+		/* 3: X <-- H(X \xor B_i) */
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		SALSA20_8_XOR(&Bin[i * 8], &Bout[i * 4])
+	}
+
+	/* 3: X <-- H(X \xor B_i) */
+	/* 4: Y_i <-- X */
+	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+	SALSA20_8_XOR(&Bin[i * 8 + 4], &Bout[(r + i) * 4 + 4])
 }
 
-/**
- * blockmix_salsa8(Bin, Bout, X, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.  The
- * temporary space X must be 64 bytes.
- */
-static void
-blockmix_salsa8(__m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
+#define XOR4(in) \
+	X0 = _mm_xor_si128(X0, (in)[0]); \
+	X1 = _mm_xor_si128(X1, (in)[1]); \
+	X2 = _mm_xor_si128(X2, (in)[2]); \
+	X3 = _mm_xor_si128(X3, (in)[3]);
+
+static inline uint64_t
+blockmix_salsa8_xor(__m128i * Bin1, __m128i * Bin2, __m128i * Bout, size_t r)
 {
+	__m128i X0, X1, X2, X3;
 	size_t i;
 
 	/* 1: X <-- B_{2r - 1} */
-	blkcpy(X, &Bin[8 * r - 4], 64);
+	X0 = _mm_xor_si128(Bin1[8 * r - 4], Bin2[8 * r - 4]);
+	X1 = _mm_xor_si128(Bin1[8 * r - 3], Bin2[8 * r - 3]);
+	X2 = _mm_xor_si128(Bin1[8 * r - 2], Bin2[8 * r - 2]);
+	X3 = _mm_xor_si128(Bin1[8 * r - 1], Bin2[8 * r - 1]);
+
+	/* 3: X <-- H(X \xor B_i) */
+	/* 4: Y_i <-- X */
+	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+	XOR4(Bin1)
+	SALSA20_8_XOR(Bin2, Bout)
 
 	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < r; i++) {
+	r--;
+	for (i = 0; i < r;) {
 		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8], 64);
-		salsa20_8(X);
-
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[i * 4], X, 64);
+		XOR4(&Bin1[i * 8 + 4])
+		SALSA20_8_XOR(&Bin2[i * 8 + 4], &Bout[(r + i) * 4 + 4])
 
-		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8 + 4], 64);
-		salsa20_8(X);
+		i++;
 
+		/* 3: X <-- H(X \xor B_i) */
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[(r + i) * 4], X, 64);
+		XOR4(&Bin1[i * 8])
+		SALSA20_8_XOR(&Bin2[i * 8], &Bout[i * 4])
 	}
+
+	/* 3: X <-- H(X \xor B_i) */
+	/* 4: Y_i <-- X */
+	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+	XOR4(&Bin1[i * 8 + 4])
+	SALSA20_8_XOR(&Bin2[i * 8 + 4], &Bout[(r + i) * 4 + 4])
+
+	return _mm_cvtsi128_si64(X0);
 }
 
+#undef XRA
+#undef SALSA20_2ROUNDS
+#undef SALSA20_8_XOR
+#undef XOR4
+
 /**
  * integerify(B, r):
  * Return the result of parsing B_{2r-1} as a little-endian integer.
  */
-static uint64_t
+static inline uint64_t
 integerify(void * B, size_t r)
 {
-	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
-
-	return (((uint64_t)(X[13]) << 32) + X[0]);
+	return *(uint64_t *)((uintptr_t)(B) + (2 * r - 1) * 64);
 }
 
 /**
@@ -192,14 +222,13 @@ integerify(void * B, size_t r)
 static void
 smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
 {
-	__m128i * X = XY;
-	__m128i * Y = (void *)((uintptr_t)(XY) + 128 * r);
-	__m128i * Z = (void *)((uintptr_t)(XY) + 256 * r);
-	uint32_t * X32 = (void *)X;
+	__m128i * X = V, * Y, * V_j;
+	uint32_t * X32 = V;
 	uint64_t i, j;
 	size_t k;
 
 	/* 1: X <-- B */
+	/* 3: V_i <-- X */
 	for (k = 0; k < 2 * r; k++) {
 		for (i = 0; i < 16; i++) {
 			X32[k * 16 + i] =
@@ -208,36 +237,46 @@ smix(uint8_t * B, size_t r, uint64_t N, 
 	}
 
 	/* 2: for i = 0 to N - 1 do */
-	for (i = 0; i < N; i += 2) {
-		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);
-
+	for (i = 1; i < N - 1; i += 2) {
 		/* 4: X <-- H(X) */
-		blockmix_salsa8(X, Y, Z, r);
-
 		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
-		    Y, 128 * r);
+		Y = (void *)((uintptr_t)(V) + i * 128 * r);
+		blockmix_salsa8(X, Y, r);
 
 		/* 4: X <-- H(X) */
-		blockmix_salsa8(Y, X, Z, r);
+		/* 3: V_i <-- X */
+		X = (void *)((uintptr_t)(V) + (i + 1) * 128 * r);
+		blockmix_salsa8(Y, X, r);
 	}
 
+	/* 4: X <-- H(X) */
+	/* 3: V_i <-- X */
+	Y = (void *)((uintptr_t)(V) + i * 128 * r);
+	blockmix_salsa8(X, Y, r);
+
+	/* 4: X <-- H(X) */
+	/* 3: V_i <-- X */
+	X = XY;
+	blockmix_salsa8(Y, X, r);
+
+	X32 = XY;
+	Y = (void *)((uintptr_t)(XY) + 128 * r);
+
+	/* 7: j <-- Integerify(X) mod N */
+	j = integerify(X, r) & (N - 1);
+	V_j = (void *)((uintptr_t)(V) + j * 128 * r);
+
 	/* 6: for i = 0 to N - 1 do */
 	for (i = 0; i < N; i += 2) {
-		/* 7: j <-- Integerify(X) mod N */
-		j = integerify(X, r) & (N - 1);
-
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
-		blockmix_salsa8(X, Y, Z, r);
-
 		/* 7: j <-- Integerify(X) mod N */
-		j = integerify(Y, r) & (N - 1);
+		j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1);
+		V_j = (void *)((uintptr_t)(V) + j * 128 * r);
 
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
-		blockmix_salsa8(Y, X, Z, r);
+		/* 7: j <-- Integerify(X) mod N */
+		j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1);
+		V_j = (void *)((uintptr_t)(V) + j * 128 * r);
 	}
 
 	/* 10: B' <-- X */
@@ -298,7 +337,7 @@ crypto_scrypt(const uint8_t * passwd, si
 	if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0)
 		goto err0;
 	B = (uint8_t *)(B0);
-	if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0)
+	if ((errno = posix_memalign(&XY0, 64, 256 * r)) != 0)
 		goto err1;
 	XY = (uint32_t *)(XY0);
 #ifndef MAP_ANON
@@ -310,7 +349,7 @@ crypto_scrypt(const uint8_t * passwd, si
 	if ((B0 = malloc(128 * r * p + 63)) == NULL)
 		goto err0;
 	B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
-	if ((XY0 = malloc(256 * r + 64 + 63)) == NULL)
+	if ((XY0 = malloc(256 * r + 63)) == NULL)
 		goto err1;
 	XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
 #ifndef MAP_ANON