mirror of
https://github.com/golang/go
synced 2024-09-25 05:20:13 -06:00
crypto/rc4: faster amd64 implementation
XOR key into data 128 bits at a time instead of 64 bits and pipeline half of state loads. Rotate loop to allow single-register indexing for state[i].

On a MacBookPro10,2 (Core i5):

benchmark           old ns/op    new ns/op    delta
BenchmarkRC4_128          412          224  -45.63%
BenchmarkRC4_1K          3179         1613  -49.26%
BenchmarkRC4_8K         25223        12545  -50.26%

benchmark            old MB/s     new MB/s  speedup
BenchmarkRC4_128       310.51       570.42    1.84x
BenchmarkRC4_1K        322.09       634.48    1.97x
BenchmarkRC4_8K        320.97       645.32    2.01x

For comparison, on the same machine, openssl 0.9.8r reports its rc4 speed as somewhat under 350 MB/s for both 1K and 8K (it is operating 64 bits at a time).

On an Intel Xeon E5520:

benchmark           old ns/op    new ns/op    delta
BenchmarkRC4_128          418          259  -38.04%
BenchmarkRC4_1K          3200         1884  -41.12%
BenchmarkRC4_8K         25173        14529  -42.28%

benchmark            old MB/s     new MB/s  speedup
BenchmarkRC4_128       306.04       492.48    1.61x
BenchmarkRC4_1K        319.93       543.26    1.70x
BenchmarkRC4_8K        321.61       557.20    1.73x

For comparison, on the same machine, openssl 1.0.1 reports its rc4 speed as 587 MB/s for 1K and 601 MB/s for 8K.

R=agl
CC=golang-dev
https://golang.org/cl/7865046
This commit is contained in:
parent
d04ac4b0b7
commit
b505ff6279
@ -537,6 +537,11 @@ uchar yextrw[] =
|
|||||||
Yxr, Yrl, Zibm_r, 2,
|
Yxr, Yrl, Zibm_r, 2,
|
||||||
0
|
0
|
||||||
};
|
};
|
||||||
|
uchar yinsrw[] =
|
||||||
|
{
|
||||||
|
Yml, Yxr, Zibm_r, 2,
|
||||||
|
0
|
||||||
|
};
|
||||||
uchar yinsr[] =
|
uchar yinsr[] =
|
||||||
{
|
{
|
||||||
Ymm, Yxr, Zibm_r, 3,
|
Ymm, Yxr, Zibm_r, 3,
|
||||||
@ -992,7 +997,7 @@ Optab optab[] =
|
|||||||
{ APFRSQRT, ymfp, Px, 0x97 },
|
{ APFRSQRT, ymfp, Px, 0x97 },
|
||||||
{ APFSUB, ymfp, Px, 0x9a },
|
{ APFSUB, ymfp, Px, 0x9a },
|
||||||
{ APFSUBR, ymfp, Px, 0xaa },
|
{ APFSUBR, ymfp, Px, 0xaa },
|
||||||
{ APINSRW, yextrw, Pq, 0xc4,(00) },
|
{ APINSRW, yinsrw, Pq, 0xc4,(00) },
|
||||||
{ APINSRD, yinsr, Pq, 0x3a, 0x22, (00) },
|
{ APINSRD, yinsr, Pq, 0x3a, 0x22, (00) },
|
||||||
{ APINSRQ, yinsr, Pq3, 0x3a, 0x22, (00) },
|
{ APINSRQ, yinsr, Pq3, 0x3a, 0x22, (00) },
|
||||||
{ APMADDWL, ymm, Py, 0xf5,Pe,0xf5 },
|
{ APMADDWL, ymm, Py, 0xf5,Pe,0xf5 },
|
||||||
|
@ -13,7 +13,7 @@ import "strconv"
|
|||||||
|
|
||||||
// A Cipher is an instance of RC4 using a particular key.
|
// A Cipher is an instance of RC4 using a particular key.
|
||||||
type Cipher struct {
|
type Cipher struct {
|
||||||
s [256]byte
|
s [256]uint32
|
||||||
i, j uint8
|
i, j uint8
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) {
|
|||||||
}
|
}
|
||||||
var c Cipher
|
var c Cipher
|
||||||
for i := 0; i < 256; i++ {
|
for i := 0; i < 256; i++ {
|
||||||
c.s[i] = uint8(i)
|
c.s[i] = uint32(i)
|
||||||
}
|
}
|
||||||
var j uint8 = 0
|
var j uint8 = 0
|
||||||
for i := 0; i < 256; i++ {
|
for i := 0; i < 256; i++ {
|
||||||
j += c.s[i] + key[i%k]
|
j += uint8(c.s[i]) + key[i%k]
|
||||||
c.s[i], c.s[j] = c.s[j], c.s[i]
|
c.s[i], c.s[j] = c.s[j], c.s[i]
|
||||||
}
|
}
|
||||||
return &c, nil
|
return &c, nil
|
||||||
|
@ -20,19 +20,19 @@ loop:
|
|||||||
INCB AX
|
INCB AX
|
||||||
|
|
||||||
// j += c.s[i]
|
// j += c.s[i]
|
||||||
MOVBLZX (BP)(AX*1), DX
|
MOVBLZX (BP)(AX*4), DX
|
||||||
ADDB DX, BX
|
ADDB DX, BX
|
||||||
MOVBLZX BX, BX
|
MOVBLZX BX, BX
|
||||||
|
|
||||||
// c.s[i], c.s[j] = c.s[j], c.s[i]
|
// c.s[i], c.s[j] = c.s[j], c.s[i]
|
||||||
MOVBLZX (BP)(BX*1), CX
|
MOVBLZX (BP)(BX*4), CX
|
||||||
MOVB CX, (BP)(AX*1)
|
MOVB CX, (BP)(AX*4)
|
||||||
MOVB DX, (BP)(BX*1)
|
MOVB DX, (BP)(BX*4)
|
||||||
|
|
||||||
// *dst = *src ^ c.s[c.s[i]+c.s[j]]
|
// *dst = *src ^ c.s[c.s[i]+c.s[j]]
|
||||||
ADDB DX, CX
|
ADDB DX, CX
|
||||||
MOVBLZX CX, CX
|
MOVBLZX CX, CX
|
||||||
MOVB (BP)(CX*1), CX
|
MOVB (BP)(CX*4), CX
|
||||||
XORB (SI), CX
|
XORB (SI), CX
|
||||||
MOVBLZX CX, CX
|
MOVBLZX CX, CX
|
||||||
MOVB CX, (DI)
|
MOVB CX, (DI)
|
||||||
|
@ -1,11 +1,19 @@
|
|||||||
// Original source:
|
// Original source:
|
||||||
// http://www.zorinaq.com/papers/rc4-amd64.html
|
// http://www.zorinaq.com/papers/rc4-amd64.html
|
||||||
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
|
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
|
||||||
|
|
||||||
|
// Local modifications:
|
||||||
//
|
//
|
||||||
// Transliterated from GNU to 6a assembly syntax by the Go authors.
|
// Transliterated from GNU to 6a assembly syntax by the Go authors.
|
||||||
// The comments and spacing are from the original.
|
// The comments and spacing are from the original.
|
||||||
|
//
|
||||||
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
|
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
|
||||||
|
//
|
||||||
|
// The original code accumulated 64 bits of key stream in an integer
|
||||||
|
// register and then XOR'ed the key stream into the data 8 bytes at a time.
|
||||||
|
// Modified to accumulate 128 bits of key stream into an XMM register
|
||||||
|
// and then XOR the key stream into the data 16 bytes at a time.
|
||||||
|
// Approximately doubles throughput.
|
||||||
|
|
||||||
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
|
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
|
||||||
// but makes the code run 2.0x slower on Xeon.
|
// but makes the code run 2.0x slower on Xeon.
|
||||||
@ -38,59 +46,123 @@ TEXT ·xorKeyStream(SB),7,$0
|
|||||||
MOVQ yp+40(FP), AX
|
MOVQ yp+40(FP), AX
|
||||||
MOVBQZX 0(AX), DX // y = *yp
|
MOVBQZX 0(AX), DX // y = *yp
|
||||||
|
|
||||||
INCQ CX // x++
|
LEAQ (SI)(BX*1), R9 // limit = in+len
|
||||||
ANDQ $255, CX // x &= 0xff
|
|
||||||
LEAQ -8(BX)(SI*1), BX // rbx = in+len-8
|
|
||||||
MOVQ BX, R9 // tmp = in+len-8
|
|
||||||
MOVBLZX (BP)(CX*1), AX // tx = d[x]
|
|
||||||
CMPQ BX, SI // cmp in with in+len-8
|
|
||||||
JLT end // jump if (in+len-8 < in)
|
|
||||||
|
|
||||||
start:
|
l1: CMPQ SI, R9 // cmp in with in+len
|
||||||
ADDQ $8, SI // increment in
|
JGE finished // jump if (in >= in+len)
|
||||||
ADDQ $8, DI // increment out
|
|
||||||
|
|
||||||
// generate the next 8 bytes of the rc4 stream into R8
|
INCB CX
|
||||||
MOVQ $8, R11 // byte counter
|
|
||||||
l1: ADDB AX, DX
|
|
||||||
EXTEND(DX)
|
|
||||||
MOVBLZX (BP)(DX*1), BX // ty = d[y]
|
|
||||||
MOVB BX, (BP)(CX*1) // d[x] = ty
|
|
||||||
ADDB AX, BX // val = ty + tx
|
|
||||||
EXTEND(BX)
|
|
||||||
MOVB AX, (BP)(DX*1) // d[y] = tx
|
|
||||||
INCB CX // x++ (NEXT ROUND)
|
|
||||||
EXTEND(CX)
|
EXTEND(CX)
|
||||||
MOVBLZX (BP)(CX*1), AX // tx = d[x] (NEXT ROUND)
|
TESTL $15, CX
|
||||||
SHLQ $8, R8
|
JZ wordloop
|
||||||
MOVB (BP)(BX*1), R8 // val = d[val]
|
|
||||||
DECQ R11
|
|
||||||
JNZ l1
|
|
||||||
|
|
||||||
// xor 8 bytes
|
MOVBLZX (BP)(CX*4), AX
|
||||||
BSWAPQ R8
|
|
||||||
XORQ -8(SI), R8
|
|
||||||
CMPQ SI, R9 // cmp in+len-8 with in XXX
|
|
||||||
MOVQ R8, -8(DI)
|
|
||||||
JLE start // jump if (in <= in+len-8)
|
|
||||||
|
|
||||||
end:
|
|
||||||
ADDQ $8, R9 // tmp = in+len
|
|
||||||
|
|
||||||
// handle the last bytes, one by one
|
|
||||||
l2: CMPQ R9, SI // cmp in with in+len
|
|
||||||
JLE finished // jump if (in+len <= in)
|
|
||||||
ADDB AX, DX // y += tx
|
ADDB AX, DX // y += tx
|
||||||
EXTEND(DX)
|
EXTEND(DX)
|
||||||
MOVBLZX (BP)(DX*1), BX // ty = d[y]
|
MOVBLZX (BP)(DX*4), BX // ty = d[y]
|
||||||
MOVB BX, (BP)(CX*1) // d[x] = ty
|
MOVB BX, (BP)(CX*4) // d[x] = ty
|
||||||
ADDB AX, BX // val = ty+tx
|
ADDB AX, BX // val = ty+tx
|
||||||
EXTEND(BX)
|
EXTEND(BX)
|
||||||
MOVB AX, (BP)(DX*1) // d[y] = tx
|
MOVB AX, (BP)(DX*4) // d[y] = tx
|
||||||
INCB CX // x++ (NEXT ROUND)
|
MOVBLZX (BP)(BX*4), R8 // val = d[val]
|
||||||
|
XORB (SI), R8 // xor 1 byte
|
||||||
|
MOVB R8, (DI)
|
||||||
|
INCQ SI // in++
|
||||||
|
INCQ DI // out++
|
||||||
|
JMP l1
|
||||||
|
|
||||||
|
wordloop:
|
||||||
|
SUBQ $16, R9
|
||||||
|
CMPQ SI, R9
|
||||||
|
JGT end
|
||||||
|
|
||||||
|
start:
|
||||||
|
ADDQ $16, SI // increment in
|
||||||
|
ADDQ $16, DI // increment out
|
||||||
|
|
||||||
|
// Each KEYROUND generates one byte of key and
|
||||||
|
// inserts it into an XMM register at the given 16-bit index.
|
||||||
|
// The key state array is uint32 words only using the bottom
|
||||||
|
// byte of each word, so the 16-bit OR only copies 8 useful bits.
|
||||||
|
// We accumulate alternating bytes into X0 and X1, and then at
|
||||||
|
// the end we OR X1<<8 into X0 to produce the actual key.
|
||||||
|
//
|
||||||
|
// At the beginning of the loop, CX%16 == 0, so the 16 loads
|
||||||
|
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
|
||||||
|
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
|
||||||
|
// without fear of the byte computation CX+15 wrapping around.
|
||||||
|
//
|
||||||
|
// The first round needs R12[0], the second needs R12[1], and so on.
|
||||||
|
// We can avoid memory stalls by starting the load for round n+1
|
||||||
|
// before the end of round n, using the LOAD macro.
|
||||||
|
LEAQ (BP)(CX*4), R12
|
||||||
|
|
||||||
|
#define KEYROUND(xmm, load, off, r1, r2, index) \
|
||||||
|
MOVBLZX (BP)(DX*4), R8; \
|
||||||
|
MOVB r1, (BP)(DX*4); \
|
||||||
|
load((off+1), r2); \
|
||||||
|
MOVB R8, (off*4)(R12); \
|
||||||
|
ADDB r1, R8; \
|
||||||
|
EXTEND(R8); \
|
||||||
|
PINSRW $index, (BP)(R8*4), xmm
|
||||||
|
|
||||||
|
#define LOAD(off, reg) \
|
||||||
|
MOVBLZX (off*4)(R12), reg; \
|
||||||
|
ADDB reg, DX; \
|
||||||
|
EXTEND(DX)
|
||||||
|
|
||||||
|
#define SKIP(off, reg)
|
||||||
|
|
||||||
|
LOAD(0, AX)
|
||||||
|
KEYROUND(X0, LOAD, 0, AX, BX, 0)
|
||||||
|
KEYROUND(X1, LOAD, 1, BX, AX, 0)
|
||||||
|
KEYROUND(X0, LOAD, 2, AX, BX, 1)
|
||||||
|
KEYROUND(X1, LOAD, 3, BX, AX, 1)
|
||||||
|
KEYROUND(X0, LOAD, 4, AX, BX, 2)
|
||||||
|
KEYROUND(X1, LOAD, 5, BX, AX, 2)
|
||||||
|
KEYROUND(X0, LOAD, 6, AX, BX, 3)
|
||||||
|
KEYROUND(X1, LOAD, 7, BX, AX, 3)
|
||||||
|
KEYROUND(X0, LOAD, 8, AX, BX, 4)
|
||||||
|
KEYROUND(X1, LOAD, 9, BX, AX, 4)
|
||||||
|
KEYROUND(X0, LOAD, 10, AX, BX, 5)
|
||||||
|
KEYROUND(X1, LOAD, 11, BX, AX, 5)
|
||||||
|
KEYROUND(X0, LOAD, 12, AX, BX, 6)
|
||||||
|
KEYROUND(X1, LOAD, 13, BX, AX, 6)
|
||||||
|
KEYROUND(X0, LOAD, 14, AX, BX, 7)
|
||||||
|
KEYROUND(X1, SKIP, 15, BX, AX, 7)
|
||||||
|
|
||||||
|
ADDB $16, CX
|
||||||
|
|
||||||
|
PSLLQ $8, X1
|
||||||
|
PXOR X1, X0
|
||||||
|
MOVOU -16(SI), X2
|
||||||
|
PXOR X0, X2
|
||||||
|
MOVOU X2, -16(DI)
|
||||||
|
|
||||||
|
CMPQ SI, R9 // cmp in with in+len-16
|
||||||
|
JLE start // jump if (in <= in+len-16)
|
||||||
|
|
||||||
|
end:
|
||||||
|
DECB CX
|
||||||
|
ADDQ $16, R9 // tmp = in+len
|
||||||
|
|
||||||
|
// handle the last bytes, one by one
|
||||||
|
l2: CMPQ SI, R9 // cmp in with in+len
|
||||||
|
JGE finished // jump if (in >= in+len)
|
||||||
|
|
||||||
|
INCB CX
|
||||||
EXTEND(CX)
|
EXTEND(CX)
|
||||||
MOVBLZX (BP)(CX*1), AX // tx = d[x] (NEXT ROUND)
|
MOVBLZX (BP)(CX*4), AX
|
||||||
MOVBLZX (BP)(BX*1), R8 // val = d[val]
|
|
||||||
|
ADDB AX, DX // y += tx
|
||||||
|
EXTEND(DX)
|
||||||
|
MOVBLZX (BP)(DX*4), BX // ty = d[y]
|
||||||
|
MOVB BX, (BP)(CX*4) // d[x] = ty
|
||||||
|
ADDB AX, BX // val = ty+tx
|
||||||
|
EXTEND(BX)
|
||||||
|
MOVB AX, (BP)(DX*4) // d[y] = tx
|
||||||
|
MOVBLZX (BP)(BX*4), R8 // val = d[val]
|
||||||
XORB (SI), R8 // xor 1 byte
|
XORB (SI), R8 // xor 1 byte
|
||||||
MOVB R8, (DI)
|
MOVB R8, (DI)
|
||||||
INCQ SI // in++
|
INCQ SI // in++
|
||||||
@ -98,7 +170,6 @@ l2: CMPQ R9, SI // cmp in with in+len
|
|||||||
JMP l2
|
JMP l2
|
||||||
|
|
||||||
finished:
|
finished:
|
||||||
DECQ CX // x--
|
|
||||||
MOVQ yp+40(FP), BX
|
MOVQ yp+40(FP), BX
|
||||||
MOVB DX, 0(BX)
|
MOVB DX, 0(BX)
|
||||||
MOVQ xp+32(FP), AX
|
MOVQ xp+32(FP), AX
|
||||||
|
@ -31,19 +31,19 @@ loop:
|
|||||||
// i += 1; j += state[i]
|
// i += 1; j += state[i]
|
||||||
ADD $1, R(i)
|
ADD $1, R(i)
|
||||||
AND $0xff, R(i)
|
AND $0xff, R(i)
|
||||||
MOVBU R(i)<<0(R(state)), R(t)
|
MOVBU R(i)<<2(R(state)), R(t)
|
||||||
ADD R(t), R(j)
|
ADD R(t), R(j)
|
||||||
AND $0xff, R(j)
|
AND $0xff, R(j)
|
||||||
|
|
||||||
// swap state[i] <-> state[j]
|
// swap state[i] <-> state[j]
|
||||||
MOVBU R(j)<<0(R(state)), R(t2)
|
MOVBU R(j)<<2(R(state)), R(t2)
|
||||||
MOVB R(t2), R(i)<<0(R(state))
|
MOVB R(t2), R(i)<<2(R(state))
|
||||||
MOVB R(t), R(j)<<0(R(state))
|
MOVB R(t), R(j)<<2(R(state))
|
||||||
|
|
||||||
// dst[k] = src[k] ^ state[state[i] + state[j]]
|
// dst[k] = src[k] ^ state[state[i] + state[j]]
|
||||||
ADD R(t2), R(t)
|
ADD R(t2), R(t)
|
||||||
AND $0xff, R(t)
|
AND $0xff, R(t)
|
||||||
MOVBU R(t)<<0(R(state)), R(t)
|
MOVBU R(t)<<2(R(state)), R(t)
|
||||||
MOVBU R(k)<<0(R(src)), R(t2)
|
MOVBU R(k)<<0(R(src)), R(t2)
|
||||||
EOR R(t), R(t2)
|
EOR R(t), R(t2)
|
||||||
MOVB R(t2), R(k)<<0(R(dst))
|
MOVB R(t2), R(k)<<0(R(dst))
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
package rc4
|
package rc4
|
||||||
|
|
||||||
func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
|
func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
|
||||||
|
|
||||||
// XORKeyStream sets dst to the result of XORing src with the key stream.
|
// XORKeyStream sets dst to the result of XORing src with the key stream.
|
||||||
// Dst and src may be the same slice but otherwise should not overlap.
|
// Dst and src may be the same slice but otherwise should not overlap.
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
package rc4
|
package rc4
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
@ -115,6 +116,26 @@ func TestGolden(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBlock(t *testing.T) {
|
||||||
|
c1a, _ := NewCipher(golden[0].key)
|
||||||
|
c1b, _ := NewCipher(golden[1].key)
|
||||||
|
data1 := make([]byte, 1<<20)
|
||||||
|
for i := range data1 {
|
||||||
|
c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
|
||||||
|
c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
|
||||||
|
}
|
||||||
|
|
||||||
|
c2a, _ := NewCipher(golden[0].key)
|
||||||
|
c2b, _ := NewCipher(golden[1].key)
|
||||||
|
data2 := make([]byte, 1<<20)
|
||||||
|
c2a.XORKeyStream(data2, data2)
|
||||||
|
c2b.XORKeyStream(data2, data2)
|
||||||
|
|
||||||
|
if !bytes.Equal(data1, data2) {
|
||||||
|
t.Fatalf("bad block")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func benchmark(b *testing.B, size int64) {
|
func benchmark(b *testing.B, size int64) {
|
||||||
buf := make([]byte, size)
|
buf := make([]byte, size)
|
||||||
c, err := NewCipher(golden[0].key)
|
c, err := NewCipher(golden[0].key)
|
||||||
|
Loading…
Reference in New Issue
Block a user