1
0
mirror of https://github.com/golang/go synced 2024-11-12 07:00:21 -07:00

crypto/rc4: faster amd64 implementation

XOR key into data 128 bits at a time instead of 64 bits
and pipeline half of state loads. Rotate loop to allow
single-register indexing for state[i].

On a MacBookPro10,2 (Core i5):

benchmark           old ns/op    new ns/op    delta
BenchmarkRC4_128          412          224  -45.63%
BenchmarkRC4_1K          3179         1613  -49.26%
BenchmarkRC4_8K         25223        12545  -50.26%

benchmark            old MB/s     new MB/s  speedup
BenchmarkRC4_128       310.51       570.42    1.84x
BenchmarkRC4_1K        322.09       634.48    1.97x
BenchmarkRC4_8K        320.97       645.32    2.01x

For comparison, on the same machine, openssl 0.9.8r reports
its rc4 speed as somewhat under 350 MB/s for both 1K and 8K
(it is operating 64 bits at a time).

On an Intel Xeon E5520:

benchmark           old ns/op    new ns/op    delta
BenchmarkRC4_128          418          259  -38.04%
BenchmarkRC4_1K          3200         1884  -41.12%
BenchmarkRC4_8K         25173        14529  -42.28%

benchmark            old MB/s     new MB/s  speedup
BenchmarkRC4_128       306.04       492.48    1.61x
BenchmarkRC4_1K        319.93       543.26    1.70x
BenchmarkRC4_8K        321.61       557.20    1.73x

For comparison, on the same machine, openssl 1.0.1
reports its rc4 speed as 587 MB/s for 1K and 601 MB/s for 8K.

R=agl
CC=golang-dev
https://golang.org/cl/7865046
This commit is contained in:
Russ Cox 2013-03-21 16:38:57 -04:00
parent d04ac4b0b7
commit b505ff6279
7 changed files with 158 additions and 61 deletions

View File

@ -537,6 +537,11 @@ uchar yextrw[] =
Yxr, Yrl, Zibm_r, 2,
0
};
uchar yinsrw[] =
{
Yml, Yxr, Zibm_r, 2,
0
};
uchar yinsr[] =
{
Ymm, Yxr, Zibm_r, 3,
@ -992,7 +997,7 @@ Optab optab[] =
{ APFRSQRT, ymfp, Px, 0x97 },
{ APFSUB, ymfp, Px, 0x9a },
{ APFSUBR, ymfp, Px, 0xaa },
{ APINSRW, yextrw, Pq, 0xc4,(00) },
{ APINSRW, yinsrw, Pq, 0xc4,(00) },
{ APINSRD, yinsr, Pq, 0x3a, 0x22, (00) },
{ APINSRQ, yinsr, Pq3, 0x3a, 0x22, (00) },
{ APMADDWL, ymm, Py, 0xf5,Pe,0xf5 },

View File

@ -13,7 +13,7 @@ import "strconv"
// A Cipher is an instance of RC4 using a particular key.
type Cipher struct {
s [256]byte
s [256]uint32
i, j uint8
}
@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) {
}
var c Cipher
for i := 0; i < 256; i++ {
c.s[i] = uint8(i)
c.s[i] = uint32(i)
}
var j uint8 = 0
for i := 0; i < 256; i++ {
j += c.s[i] + key[i%k]
j += uint8(c.s[i]) + key[i%k]
c.s[i], c.s[j] = c.s[j], c.s[i]
}
return &c, nil

View File

@ -20,19 +20,19 @@ loop:
INCB AX
// j += c.s[i]
MOVBLZX (BP)(AX*1), DX
MOVBLZX (BP)(AX*4), DX
ADDB DX, BX
MOVBLZX BX, BX
// c.s[i], c.s[j] = c.s[j], c.s[i]
MOVBLZX (BP)(BX*1), CX
MOVB CX, (BP)(AX*1)
MOVB DX, (BP)(BX*1)
MOVBLZX (BP)(BX*4), CX
MOVB CX, (BP)(AX*4)
MOVB DX, (BP)(BX*4)
// *dst = *src ^ c.s[c.s[i]+c.s[j]]
ADDB DX, CX
MOVBLZX CX, CX
MOVB (BP)(CX*1), CX
MOVB (BP)(CX*4), CX
XORB (SI), CX
MOVBLZX CX, CX
MOVB CX, (DI)

View File

@ -1,11 +1,19 @@
// Original source:
// http://www.zorinaq.com/papers/rc4-amd64.html
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
@ -38,59 +46,123 @@ TEXT ·xorKeyStream(SB),7,$0
MOVQ yp+40(FP), AX
MOVBQZX 0(AX), DX // y = *yp
INCQ CX // x++
ANDQ $255, CX // x &= 0xff
LEAQ -8(BX)(SI*1), BX // rbx = in+len-8
MOVQ BX, R9 // tmp = in+len-8
MOVBLZX (BP)(CX*1), AX // tx = d[x]
CMPQ BX, SI // cmp in with in+len-8
JLT end // jump if (in+len-8 < in)
LEAQ (SI)(BX*1), R9 // limit = in+len
start:
ADDQ $8, SI // increment in
ADDQ $8, DI // increment out
// generate the next 8 bytes of the rc4 stream into R8
MOVQ $8, R11 // byte counter
l1: ADDB AX, DX
EXTEND(DX)
MOVBLZX (BP)(DX*1), BX // ty = d[y]
MOVB BX, (BP)(CX*1) // d[x] = ty
ADDB AX, BX // val = ty + tx
EXTEND(BX)
MOVB AX, (BP)(DX*1) // d[y] = tx
INCB CX // x++ (NEXT ROUND)
l1: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
MOVBLZX (BP)(CX*1), AX // tx = d[x] (NEXT ROUND)
SHLQ $8, R8
MOVB (BP)(BX*1), R8 // val = d[val]
DECQ R11
JNZ l1
TESTL $15, CX
JZ wordloop
// xor 8 bytes
BSWAPQ R8
XORQ -8(SI), R8
CMPQ SI, R9 // cmp in+len-8 with in XXX
MOVQ R8, -8(DI)
JLE start // jump if (in <= in+len-8)
MOVBLZX (BP)(CX*4), AX
end:
ADDQ $8, R9 // tmp = in+len
// handle the last bytes, one by one
l2: CMPQ R9, SI // cmp in with in+len
JLE finished // jump if (in+len <= in)
ADDB AX, DX // y += tx
EXTEND(DX)
MOVBLZX (BP)(DX*1), BX // ty = d[y]
MOVB BX, (BP)(CX*1) // d[x] = ty
MOVBLZX (BP)(DX*4), BX // ty = d[y]
MOVB BX, (BP)(CX*4) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
MOVB AX, (BP)(DX*1) // d[y] = tx
INCB CX // x++ (NEXT ROUND)
MOVB AX, (BP)(DX*4) // d[y] = tx
MOVBLZX (BP)(BX*4), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
INCQ DI // out++
JMP l1
wordloop:
SUBQ $16, R9
CMPQ SI, R9
JGT end
start:
ADDQ $16, SI // increment in
ADDQ $16, DI // increment out
// Each KEYROUND generates one byte of key and
// inserts it into an XMM register at the given 16-bit index.
// The key state array is uint32 words only using the bottom
// byte of each word, so the 16-bit OR only copies 8 useful bits.
// We accumulate alternating bytes into X0 and X1, and then at
// the end we OR X1<<8 into X0 to produce the actual key.
//
// At the beginning of the loop, CX%16 == 0, so the 16 loads
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
// without fear of the byte computation CX+15 wrapping around.
//
// The first round needs R12[0], the second needs R12[1], and so on.
// We can avoid memory stalls by starting the load for round n+1
// before the end of round n, using the LOAD macro.
LEAQ (BP)(CX*4), R12
#define KEYROUND(xmm, load, off, r1, r2, index) \
MOVBLZX (BP)(DX*4), R8; \
MOVB r1, (BP)(DX*4); \
load((off+1), r2); \
MOVB R8, (off*4)(R12); \
ADDB r1, R8; \
EXTEND(R8); \
PINSRW $index, (BP)(R8*4), xmm
#define LOAD(off, reg) \
MOVBLZX (off*4)(R12), reg; \
ADDB reg, DX; \
EXTEND(DX)
#define SKIP(off, reg)
LOAD(0, AX)
KEYROUND(X0, LOAD, 0, AX, BX, 0)
KEYROUND(X1, LOAD, 1, BX, AX, 0)
KEYROUND(X0, LOAD, 2, AX, BX, 1)
KEYROUND(X1, LOAD, 3, BX, AX, 1)
KEYROUND(X0, LOAD, 4, AX, BX, 2)
KEYROUND(X1, LOAD, 5, BX, AX, 2)
KEYROUND(X0, LOAD, 6, AX, BX, 3)
KEYROUND(X1, LOAD, 7, BX, AX, 3)
KEYROUND(X0, LOAD, 8, AX, BX, 4)
KEYROUND(X1, LOAD, 9, BX, AX, 4)
KEYROUND(X0, LOAD, 10, AX, BX, 5)
KEYROUND(X1, LOAD, 11, BX, AX, 5)
KEYROUND(X0, LOAD, 12, AX, BX, 6)
KEYROUND(X1, LOAD, 13, BX, AX, 6)
KEYROUND(X0, LOAD, 14, AX, BX, 7)
KEYROUND(X1, SKIP, 15, BX, AX, 7)
ADDB $16, CX
PSLLQ $8, X1
PXOR X1, X0
MOVOU -16(SI), X2
PXOR X0, X2
MOVOU X2, -16(DI)
CMPQ SI, R9 // cmp in with in+len-16
JLE start // jump if (in <= in+len-16)
end:
DECB CX
ADDQ $16, R9 // tmp = in+len
// handle the last bytes, one by one
l2: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
MOVBLZX (BP)(CX*1), AX // tx = d[x] (NEXT ROUND)
MOVBLZX (BP)(BX*1), R8 // val = d[val]
MOVBLZX (BP)(CX*4), AX
ADDB AX, DX // y += tx
EXTEND(DX)
MOVBLZX (BP)(DX*4), BX // ty = d[y]
MOVB BX, (BP)(CX*4) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
MOVB AX, (BP)(DX*4) // d[y] = tx
MOVBLZX (BP)(BX*4), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
@ -98,7 +170,6 @@ l2: CMPQ R9, SI // cmp in with in+len
JMP l2
finished:
DECQ CX // x--
MOVQ yp+40(FP), BX
MOVB DX, 0(BX)
MOVQ xp+32(FP), AX

View File

@ -31,19 +31,19 @@ loop:
// i += 1; j += state[i]
ADD $1, R(i)
AND $0xff, R(i)
MOVBU R(i)<<0(R(state)), R(t)
MOVBU R(i)<<2(R(state)), R(t)
ADD R(t), R(j)
AND $0xff, R(j)
// swap state[i] <-> state[j]
MOVBU R(j)<<0(R(state)), R(t2)
MOVB R(t2), R(i)<<0(R(state))
MOVB R(t), R(j)<<0(R(state))
MOVBU R(j)<<2(R(state)), R(t2)
MOVB R(t2), R(i)<<2(R(state))
MOVB R(t), R(j)<<2(R(state))
// dst[k] = src[k] ^ state[state[i] + state[j]]
ADD R(t2), R(t)
AND $0xff, R(t)
MOVBU R(t)<<0(R(state)), R(t)
MOVBU R(t)<<2(R(state)), R(t)
MOVBU R(k)<<0(R(src)), R(t2)
EOR R(t), R(t2)
MOVB R(t2), R(k)<<0(R(dst))

View File

@ -6,7 +6,7 @@
package rc4
func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src may be the same slice but otherwise should not overlap.

View File

@ -5,6 +5,7 @@
package rc4
import (
"bytes"
"fmt"
"testing"
)
@ -115,6 +116,26 @@ func TestGolden(t *testing.T) {
}
}
func TestBlock(t *testing.T) {
c1a, _ := NewCipher(golden[0].key)
c1b, _ := NewCipher(golden[1].key)
data1 := make([]byte, 1<<20)
for i := range data1 {
c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
}
c2a, _ := NewCipher(golden[0].key)
c2b, _ := NewCipher(golden[1].key)
data2 := make([]byte, 1<<20)
c2a.XORKeyStream(data2, data2)
c2b.XORKeyStream(data2, data2)
if !bytes.Equal(data1, data2) {
t.Fatalf("bad block")
}
}
func benchmark(b *testing.B, size int64) {
buf := make([]byte, size)
c, err := NewCipher(golden[0].key)