crypto/rc4: faster amd64 implementation

XOR key into data 128 bits at a time instead of 64 bits and pipeline half of state loads. Rotate loop to allow single-register indexing for state[i]. On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkRC4_128 412 224 -45.63% BenchmarkRC4_1K 3179 1613 -49.26% BenchmarkRC4_8K 25223 12545 -50.26% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 310.51 570.42 1.84x BenchmarkRC4_1K 322.09 634.48 1.97x BenchmarkRC4_8K 320.97 645.32 2.01x For comparison, on the same machine, openssl 0.9.8r reports its rc4 speed as somewhat under 350 MB/s for both 1K and 8K (it is operating 64 bits at a time). On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkRC4_128 418 259 -38.04% BenchmarkRC4_1K 3200 1884 -41.12% BenchmarkRC4_8K 25173 14529 -42.28% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 306.04 492.48 1.61x BenchmarkRC4_1K 319.93 543.26 1.70x BenchmarkRC4_8K 321.61 557.20 1.73x For comparison, on the same machine, openssl 1.0.1 reports its rc4 speed as 587 MB/s for 1K and 601 MB/s for 8K. R=agl CC=golang-dev https://golang.org/cl/7865046
2024-09-25 05:20:13 -06:00 · 2013-03-21 16:38:57 -04:00 · 2013-03-21 16:38:57 -04:00 · b505ff6279
commit b505ff6279
parent d04ac4b0b7
7 changed files with 158 additions and 61 deletions
--- a/src/cmd/6l/optab.c
+++ b/src/cmd/6l/optab.c
@ -537,6 +537,11 @@ uchar	yextrw[] =
 	Yxr,	Yrl,	Zibm_r,	2,
 	0
 };
 uchar	yinsrw[] =
 {
 	Yml,	Yxr,	Zibm_r,	2,
 	0
 };
 uchar	yinsr[] =
 {
 	Ymm,	Yxr,	Zibm_r,	3,
@ -992,7 +997,7 @@ Optab optab[] =
 	{ APFRSQRT,	ymfp,	Px, 0x97 },
 	{ APFSUB,	ymfp,	Px, 0x9a },
 	{ APFSUBR,	ymfp,	Px, 0xaa },
-	{ APINSRW,	yextrw,	Pq, 0xc4,(00) },
+	{ APINSRW,	yinsrw,	Pq, 0xc4,(00) },
 	{ APINSRD,	yinsr,	Pq, 0x3a, 0x22, (00) },
 	{ APINSRQ,	yinsr,	Pq3, 0x3a, 0x22, (00) },
 	{ APMADDWL,	ymm,	Py, 0xf5,Pe,0xf5 },
--- a/src/pkg/crypto/rc4/rc4.go
+++ b/src/pkg/crypto/rc4/rc4.go
@ -13,7 +13,7 @@ import "strconv"
 // A Cipher is an instance of RC4 using a particular key.
 type Cipher struct {
-	s    [256]byte
+	s    [256]uint32
 	i, j uint8
 }
@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) {
 	}
 	var c Cipher
 	for i := 0; i < 256; i++ {
-		c.s[i] = uint8(i)
+		c.s[i] = uint32(i)
 	}
 	var j uint8 = 0
 	for i := 0; i < 256; i++ {
-		j += c.s[i] + key[i%k]
+		j += uint8(c.s[i]) + key[i%k]
 		c.s[i], c.s[j] = c.s[j], c.s[i]
 	}
 	return &c, nil
--- a/src/pkg/crypto/rc4/rc4_386.s
+++ b/src/pkg/crypto/rc4/rc4_386.s
@ -20,19 +20,19 @@ loop:
 	INCB AX
 	// j += c.s[i]
-	MOVBLZX (BP)(AX*1), DX
+	MOVBLZX (BP)(AX*4), DX
 	ADDB DX, BX
 	MOVBLZX BX, BX
 	// c.s[i], c.s[j] = c.s[j], c.s[i]
-	MOVBLZX (BP)(BX*1), CX
+	MOVBLZX (BP)(BX*4), CX
-	MOVB CX, (BP)(AX*1)
+	MOVB CX, (BP)(AX*4)
-	MOVB DX, (BP)(BX*1)
+	MOVB DX, (BP)(BX*4)
 	// *dst = *src ^ c.s[c.s[i]+c.s[j]]
 	ADDB DX, CX
 	MOVBLZX CX, CX
-	MOVB (BP)(CX*1), CX
+	MOVB (BP)(CX*4), CX
 	XORB (SI), CX
 	MOVBLZX CX, CX
 	MOVB CX, (DI)
--- a/src/pkg/crypto/rc4/rc4_amd64.s
+++ b/src/pkg/crypto/rc4/rc4_amd64.s
@ -1,11 +1,19 @@
 // Original source:
 //	http://www.zorinaq.com/papers/rc4-amd64.html
 //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
 // Local modifications:
 //
 // Transliterated from GNU to 6a assembly syntax by the Go authors.
 // The comments and spacing are from the original.
-
+//
 // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
 //
 // The original code accumulated 64 bits of key stream in an integer
 // register and then XOR'ed the key stream into the data 8 bytes at a time.
 // Modified to accumulate 128 bits of key stream into an XMM register
 // and then XOR the key stream into the data 16 bytes at a time.
 // Approximately doubles throughput.
 // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
 // but makes the code run 2.0x slower on Xeon.
@ -38,59 +46,123 @@ TEXT ·xorKeyStream(SB),7,$0
 	MOVQ	yp+40(FP),	AX
 	MOVBQZX	0(AX),		DX		// y = *yp
-	INCQ	CX				// x++
+	LEAQ	(SI)(BX*1),	R9		// limit = in+len
 	ANDQ	$255,		CX		// x &= 0xff
 	LEAQ	-8(BX)(SI*1),	BX		// rbx = in+len-8
 	MOVQ	BX,		R9		// tmp = in+len-8
 	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x]
 	CMPQ	BX,		SI		// cmp in with in+len-8
 	JLT	end				// jump if (in+len-8 < in)
-start:
+l1:	CMPQ	SI,		R9		// cmp in with in+len
-	ADDQ	$8,		SI		// increment in
+	JGE	finished			// jump if (in >= in+len)
 	ADDQ	$8,		DI		// increment out
-	// generate the next 8 bytes of the rc4 stream into R8
+	INCB	CX
 	MOVQ	$8,		R11		// byte counter
 l1:	ADDB	AX,		DX
 	EXTEND(DX)
 	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
 	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
 	ADDB	AX,		BX		// val = ty + tx
 	EXTEND(BX)
 	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
 	INCB	CX				// x++ (NEXT ROUND)
 	EXTEND(CX)
-	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
+	TESTL	$15,		CX
-	SHLQ	$8,		R8
+	JZ	wordloop
 	MOVB	(BP)(BX*1),	R8		// val = d[val]
 	DECQ	R11
 	JNZ	l1
-	// xor 8 bytes
+	MOVBLZX	(BP)(CX*4),	AX
 	BSWAPQ	R8
 	XORQ	-8(SI),		R8
 	CMPQ	SI,		R9		// cmp in+len-8 with in XXX
 	MOVQ	R8,		-8(DI)
 	JLE	start				// jump if (in <= in+len-8)
 end:
 	ADDQ	$8,		R9		// tmp = in+len
 	// handle the last bytes, one by one
 l2:	CMPQ	R9,		SI		// cmp in with in+len
 	JLE	finished			// jump if (in+len <= in)
 	ADDB	AX,		DX		// y += tx
 	EXTEND(DX)
-	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
+	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
-	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
+	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
 	ADDB	AX,		BX		// val = ty+tx
 	EXTEND(BX)
-	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
+	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
-	INCB	CX				// x++ (NEXT ROUND)
+	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
 	XORB	(SI),		R8		// xor 1 byte
 	MOVB	R8,		(DI)
 	INCQ	SI				// in++
 	INCQ	DI				// out++
 	JMP l1
 wordloop:
 	SUBQ	$16,		R9
 	CMPQ	SI,		R9
 	JGT	end
 start:
 	ADDQ	$16,		SI		// increment in
 	ADDQ	$16,		DI		// increment out
 	// Each KEYROUND generates one byte of key and
 	// inserts it into an XMM register at the given 16-bit index.
 	// The key state array is uint32 words only using the bottom
 	// byte of each word, so the 16-bit OR only copies 8 useful bits.
 	// We accumulate alternating bytes into X0 and X1, and then at
 	// the end we OR X1<<8 into X0 to produce the actual key.
 	//
 	// At the beginning of the loop, CX%16 == 0, so the 16 loads
 	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
 	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
 	// without fear of the byte computation CX+15 wrapping around.
 	//
 	// The first round needs R12[0], the second needs R12[1], and so on.
 	// We can avoid memory stalls by starting the load for round n+1
 	// before the end of round n, using the LOAD macro.
 	LEAQ	(BP)(CX*4),	R12
 #define KEYROUND(xmm, load, off, r1, r2, index) \
 	MOVBLZX	(BP)(DX*4),	R8; \
 	MOVB	r1,		(BP)(DX*4); \
 	load((off+1), r2); \
 	MOVB	R8,		(off*4)(R12); \
 	ADDB	r1,		R8; \
 	EXTEND(R8); \
 	PINSRW	$index, (BP)(R8*4), xmm
 #define LOAD(off, reg) \
 	MOVBLZX	(off*4)(R12),	reg; \
 	ADDB	reg,		DX; \
 	EXTEND(DX)
 #define SKIP(off, reg)
 	LOAD(0, AX)
 	KEYROUND(X0, LOAD, 0, AX, BX, 0)
 	KEYROUND(X1, LOAD, 1, BX, AX, 0)
 	KEYROUND(X0, LOAD, 2, AX, BX, 1)
 	KEYROUND(X1, LOAD, 3, BX, AX, 1)
 	KEYROUND(X0, LOAD, 4, AX, BX, 2)
 	KEYROUND(X1, LOAD, 5, BX, AX, 2)
 	KEYROUND(X0, LOAD, 6, AX, BX, 3)
 	KEYROUND(X1, LOAD, 7, BX, AX, 3)
 	KEYROUND(X0, LOAD, 8, AX, BX, 4)
 	KEYROUND(X1, LOAD, 9, BX, AX, 4)
 	KEYROUND(X0, LOAD, 10, AX, BX, 5)
 	KEYROUND(X1, LOAD, 11, BX, AX, 5)
 	KEYROUND(X0, LOAD, 12, AX, BX, 6)
 	KEYROUND(X1, LOAD, 13, BX, AX, 6)
 	KEYROUND(X0, LOAD, 14, AX, BX, 7)
 	KEYROUND(X1, SKIP, 15, BX, AX, 7)
 	ADDB	$16,		CX
 	PSLLQ	$8,		X1
 	PXOR	X1,		X0
 	MOVOU	-16(SI),	X2
 	PXOR	X0,		X2
 	MOVOU	X2,		-16(DI)
 	CMPQ	SI,		R9		// cmp in with in+len-16
 	JLE	start				// jump if (in <= in+len-16)
 end:
 	DECB	CX
 	ADDQ	$16,		R9		// tmp = in+len
 	// handle the last bytes, one by one
 l2:	CMPQ	SI,		R9		// cmp in with in+len
 	JGE	finished			// jump if (in >= in+len)
 	INCB	CX
 	EXTEND(CX)
-	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
+	MOVBLZX	(BP)(CX*4),	AX
-	MOVBLZX	(BP)(BX*1),	R8		// val = d[val]
+
 	ADDB	AX,		DX		// y += tx
 	EXTEND(DX)
 	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
 	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
 	ADDB	AX,		BX		// val = ty+tx
 	EXTEND(BX)
 	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
 	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
 	XORB	(SI),		R8		// xor 1 byte
 	MOVB	R8,		(DI)
 	INCQ	SI				// in++
@ -98,7 +170,6 @@ l2:	CMPQ	R9,		SI		// cmp in with in+len
 	JMP l2
 finished:
 	DECQ	CX				// x--
 	MOVQ	yp+40(FP),	BX
 	MOVB	DX, 0(BX)
 	MOVQ	xp+32(FP),	AX
--- a/src/pkg/crypto/rc4/rc4_arm.s
+++ b/src/pkg/crypto/rc4/rc4_arm.s
@ -31,19 +31,19 @@ loop:
 	// i += 1; j += state[i]
 	ADD $1, R(i)
 	AND $0xff, R(i)
-	MOVBU R(i)<<0(R(state)), R(t)
+	MOVBU R(i)<<2(R(state)), R(t)
 	ADD R(t), R(j)
 	AND $0xff, R(j)
 	// swap state[i] <-> state[j]
-	MOVBU R(j)<<0(R(state)), R(t2)
+	MOVBU R(j)<<2(R(state)), R(t2)
-	MOVB R(t2), R(i)<<0(R(state))
+	MOVB R(t2), R(i)<<2(R(state))
-	MOVB R(t), R(j)<<0(R(state))
+	MOVB R(t), R(j)<<2(R(state))
 	// dst[k] = src[k] ^ state[state[i] + state[j]]
 	ADD R(t2), R(t)
 	AND $0xff, R(t)
-	MOVBU R(t)<<0(R(state)), R(t)
+	MOVBU R(t)<<2(R(state)), R(t)
 	MOVBU R(k)<<0(R(src)), R(t2)
 	EOR R(t), R(t2)
 	MOVB R(t2), R(k)<<0(R(dst))
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@ -6,7 +6,7 @@
 package rc4
-func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
+func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
 // XORKeyStream sets dst to the result of XORing src with the key stream.
 // Dst and src may be the same slice but otherwise should not overlap.
--- a/src/pkg/crypto/rc4/rc4_test.go
+++ b/src/pkg/crypto/rc4/rc4_test.go
@ -5,6 +5,7 @@
 package rc4
 import (
 	"bytes"
 	"fmt"
 	"testing"
 )
@ -115,6 +116,26 @@ func TestGolden(t *testing.T) {
 	}
 }
 func TestBlock(t *testing.T) {
 	c1a, _ := NewCipher(golden[0].key)
 	c1b, _ := NewCipher(golden[1].key)
 	data1 := make([]byte, 1<<20)
 	for i := range data1 {
 		c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
 		c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
 	}
 	c2a, _ := NewCipher(golden[0].key)
 	c2b, _ := NewCipher(golden[1].key)
 	data2 := make([]byte, 1<<20)
 	c2a.XORKeyStream(data2, data2)
 	c2b.XORKeyStream(data2, data2)
 	if !bytes.Equal(data1, data2) {
 		t.Fatalf("bad block")
 	}
 }
 func benchmark(b *testing.B, size int64) {
 	buf := make([]byte, size)
 	c, err := NewCipher(golden[0].key)