mirror of
https://github.com/golang/go
synced 2024-11-19 13:44:52 -07:00
hash/crc32: add SSE4.2 support
Using the CRC32 instruction speeds up the Castagnoli computation by about 20x on a modern Intel CPU. R=rsc CC=golang-dev https://golang.org/cl/4650072
This commit is contained in:
parent
0f8678a747
commit
9f4c288c16
@ -169,6 +169,13 @@ assemble(char *file)
|
||||
struct
|
||||
{
|
||||
char *name;
|
||||
/*
|
||||
* type is the lexical type to return. It dictates what kind of
|
||||
* operands 6a allows to follow it (in a.y) as the possible operand
|
||||
* types are handled by a grammar. How do you know which LTYPE?
|
||||
* Either read a.y or think of an instruction that has the same
|
||||
* possible operands and look up what it takes.
|
||||
*/
|
||||
ushort type;
|
||||
ushort value;
|
||||
} itab[] =
|
||||
@ -985,6 +992,8 @@ struct
|
||||
"UNPCKLPS", LTYPE3, AUNPCKLPS,
|
||||
"XORPD", LTYPE3, AXORPD,
|
||||
"XORPS", LTYPE3, AXORPS,
|
||||
"CRC32B", LTYPE4, ACRC32B,
|
||||
"CRC32Q", LTYPE4, ACRC32Q,
|
||||
|
||||
0
|
||||
};
|
||||
|
@ -730,6 +730,8 @@ enum as
|
||||
ASWAPGS,
|
||||
|
||||
AMODE,
|
||||
ACRC32B,
|
||||
ACRC32Q,
|
||||
|
||||
ALAST
|
||||
};
|
||||
|
@ -222,6 +222,7 @@ enum
|
||||
Zxxx = 0,
|
||||
|
||||
Zlit,
|
||||
Zlitm_r,
|
||||
Z_rp,
|
||||
Zbr,
|
||||
Zcall,
|
||||
|
@ -529,7 +529,69 @@ uchar ymskb[] =
|
||||
Ymr, Yrl, Zm_r_xm, 1,
|
||||
0
|
||||
};
|
||||
uchar ycrc32l[] =
|
||||
{
|
||||
Yml, Yrl, Zlitm_r, 0,
|
||||
};
|
||||
|
||||
/*
|
||||
* You are doasm, holding in your hand a Prog* with p->as set to, say, ACRC32,
|
||||
* and p->from and p->to as operands (Adr*). The linker scans optab to find
|
||||
* the entry with the given p->as and then looks through the ytable for that
|
||||
* instruction (the second field in the optab struct) for a line whose first
|
||||
* two values match the Ytypes of the p->from and p->to operands. The function
|
||||
* oclass in span.c computes the specific Ytype of an operand and then the set
|
||||
* of more general Ytypes that it satisfies is implied by the ycover table, set
|
||||
* up in instinit. For example, oclass distinguishes the constants 0 and 1
|
||||
* from the more general 8-bit constants, but instinit says
|
||||
*
|
||||
* ycover[Yi0*Ymax + Ys32] = 1;
|
||||
* ycover[Yi1*Ymax + Ys32] = 1;
|
||||
* ycover[Yi8*Ymax + Ys32] = 1;
|
||||
*
|
||||
* which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
|
||||
* if that's what an instruction can handle.
|
||||
*
|
||||
* In parallel with the scan through the ytable for the appropriate line, there
|
||||
* is a z pointer that starts out pointing at the strange magic byte list in
|
||||
* the Optab struct. With each step past a non-matching ytable line, z
|
||||
* advances by the 4th entry in the line. When a matching line is found, that
|
||||
* z pointer has the extra data to use in laying down the instruction bytes.
|
||||
* The actual bytes laid down are a function of the 3rd entry in the line (that
|
||||
* is, the Ztype) and the z bytes.
|
||||
*
|
||||
* For example, let's look at AADDL. The optab line says:
|
||||
* { AADDL, yaddl, Px, 0x83,(00),0x05,0x81,(00),0x01,0x03 },
|
||||
*
|
||||
* and yaddl says
|
||||
* uchar yaddl[] =
|
||||
* {
|
||||
* Yi8, Yml, Zibo_m, 2,
|
||||
* Yi32, Yax, Zil_, 1,
|
||||
* Yi32, Yml, Zilo_m, 2,
|
||||
* Yrl, Yml, Zr_m, 1,
|
||||
* Yml, Yrl, Zm_r, 1,
|
||||
* 0
|
||||
* };
|
||||
*
|
||||
* so there are 5 possible types of ADDL instruction that can be laid down, and
|
||||
* possible states used to lay them down (Ztype and z pointer, assuming z
|
||||
* points at {0x83,(00),0x05,0x81,(00),0x01,0x03}) are:
|
||||
*
|
||||
* Yi8, Yml -> Zibo_m, z (0x83, 00)
|
||||
* Yi32, Yax -> Zil_, z+2 (0x05)
|
||||
* Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
|
||||
* Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
|
||||
* Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
|
||||
*
|
||||
* The Pconstant in the optab line controls the prefix bytes to emit. That's
|
||||
* relatively straightforward as this program goes.
|
||||
*
|
||||
* The switch on t[2] in doasm implements the various Z cases. Zibo_m, for
|
||||
* example, is an opcode byte (z[0]) then an asmando (which is some kind of
|
||||
* encoded addressing mode for the Yml arg), and then a single immediate byte.
|
||||
* Zilo_m is the same but a long (32-bit) immediate.
|
||||
*/
|
||||
Optab optab[] =
|
||||
/* as, ytab, andproto, opcode */
|
||||
{
|
||||
@ -1199,6 +1261,9 @@ Optab optab[] =
|
||||
{ AXADDQ, yrl_ml, Pw, 0x0f,0xc1 },
|
||||
{ AXADDW, yrl_ml, Pe, 0x0f,0xc1 },
|
||||
|
||||
{ ACRC32B, ycrc32l,Px, 0xf2,0x0f,0x38,0xf0,0},
|
||||
{ ACRC32Q, ycrc32l,Pw, 0xf2,0x0f,0x38,0xf1,0},
|
||||
|
||||
{ AEND },
|
||||
0
|
||||
};
|
||||
|
@ -1166,6 +1166,12 @@ found:
|
||||
*andptr++ = op;
|
||||
break;
|
||||
|
||||
case Zlitm_r:
|
||||
for(; op = o->op[z]; z++)
|
||||
*andptr++ = op;
|
||||
asmand(&p->from, &p->to);
|
||||
break;
|
||||
|
||||
case Zmb_r:
|
||||
bytereg(&p->from, &p->ft);
|
||||
/* fall through */
|
||||
|
@ -5,7 +5,16 @@
|
||||
include ../../../Make.inc
|
||||
|
||||
TARG=hash/crc32
|
||||
|
||||
ifeq ($(GOARCH), amd64)
|
||||
ARCH_GOFILES=crc32_amd64.go
|
||||
OFILES=crc32_amd64.6
|
||||
else
|
||||
ARCH_GOFILES=crc32_generic.go
|
||||
endif
|
||||
|
||||
GOFILES=\
|
||||
crc32.go\
|
||||
$(ARCH_GOFILES)
|
||||
|
||||
include ../../../Make.pkg
|
||||
|
@ -10,6 +10,7 @@ package crc32
|
||||
import (
|
||||
"hash"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// The size of a CRC-32 checksum in bytes.
|
||||
@ -35,8 +36,34 @@ const (
|
||||
// Table is a 256-word table representing the polynomial for efficient processing.
|
||||
type Table [256]uint32
|
||||
|
||||
// castagnoliTable points to a lazily initialized Table for the Castagnoli
|
||||
// polynomial. MakeTable will always return this value when asked to make a
|
||||
// Castagnoli table so we can compare against it to find when the caller is
|
||||
// using this polynomial.
|
||||
var castagnoliTable *Table
|
||||
var castagnoliOnce sync.Once
|
||||
|
||||
func castagnoliInit() {
|
||||
castagnoliTable = makeTable(Castagnoli)
|
||||
}
|
||||
|
||||
// IEEETable is the table for the IEEE polynomial.
|
||||
var IEEETable = makeTable(IEEE)
|
||||
|
||||
// MakeTable returns the Table constructed from the specified polynomial.
|
||||
func MakeTable(poly uint32) *Table {
|
||||
switch poly {
|
||||
case IEEE:
|
||||
return IEEETable
|
||||
case Castagnoli:
|
||||
castagnoliOnce.Do(castagnoliInit)
|
||||
return castagnoliTable
|
||||
}
|
||||
return makeTable(poly)
|
||||
}
|
||||
|
||||
// makeTable returns the Table constructed from the specified polynomial.
|
||||
func makeTable(poly uint32) *Table {
|
||||
t := new(Table)
|
||||
for i := 0; i < 256; i++ {
|
||||
crc := uint32(i)
|
||||
@ -52,9 +79,6 @@ func MakeTable(poly uint32) *Table {
|
||||
return t
|
||||
}
|
||||
|
||||
// IEEETable is the table for the IEEE polynomial.
|
||||
var IEEETable = MakeTable(IEEE)
|
||||
|
||||
// digest represents the partial evaluation of a checksum.
|
||||
type digest struct {
|
||||
crc uint32
|
||||
@ -83,11 +107,14 @@ func update(crc uint32, tab *Table, p []byte) uint32 {
|
||||
|
||||
// Update returns the result of adding the bytes in p to the crc.
|
||||
func Update(crc uint32, tab *Table, p []byte) uint32 {
|
||||
if tab == castagnoliTable {
|
||||
return updateCastagnoli(crc, p)
|
||||
}
|
||||
return update(crc, tab, p)
|
||||
}
|
||||
|
||||
func (d *digest) Write(p []byte) (n int, err os.Error) {
|
||||
d.crc = update(d.crc, d.tab, p)
|
||||
d.crc = Update(d.crc, d.tab, p)
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
@ -105,7 +132,7 @@ func (d *digest) Sum() []byte {
|
||||
|
||||
// Checksum returns the CRC-32 checksum of data
|
||||
// using the polynomial represented by the Table.
|
||||
func Checksum(data []byte, tab *Table) uint32 { return update(0, tab, data) }
|
||||
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
|
||||
|
||||
// ChecksumIEEE returns the CRC-32 checksum of data
|
||||
// using the IEEE polynomial.
|
||||
|
25
src/pkg/hash/crc32/crc32_amd64.go
Normal file
25
src/pkg/hash/crc32/crc32_amd64.go
Normal file
@ -0,0 +1,25 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package crc32
|
||||
|
||||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
// CRC.
|
||||
|
||||
// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2
|
||||
// support.
|
||||
func haveSSE42() bool
|
||||
|
||||
// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
|
||||
// instruction.
|
||||
func castagnoliSSE42(uint32, []byte) uint32
|
||||
|
||||
var sse42 = haveSSE42()
|
||||
|
||||
func updateCastagnoli(crc uint32, p []byte) uint32 {
|
||||
if sse42 {
|
||||
return castagnoliSSE42(crc, p)
|
||||
}
|
||||
return update(crc, castagnoliTable, p)
|
||||
}
|
62
src/pkg/hash/crc32/crc32_amd64.s
Normal file
62
src/pkg/hash/crc32/crc32_amd64.s
Normal file
@ -0,0 +1,62 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// func castagnoliSSE42(crc uint32, p []byte) uint32
|
||||
TEXT ·castagnoliSSE42(SB),7,$0
|
||||
MOVL crc+0(FP), AX // CRC value
|
||||
MOVQ p+8(FP), SI // data pointer
|
||||
MOVL p+16(FP), CX // len(p)
|
||||
|
||||
NOTL AX
|
||||
|
||||
/* If there's less than 8 bytes to process, we do it byte-by-byte. */
|
||||
CMPL CX, $8
|
||||
JL cleanup
|
||||
|
||||
/* Process individual bytes until the input is 8-byte aligned. */
|
||||
startup:
|
||||
MOVQ SI, BX
|
||||
ANDQ $7, BX
|
||||
JZ aligned
|
||||
|
||||
CRC32B (SI), AX
|
||||
DECL CX
|
||||
INCQ SI
|
||||
JMP startup
|
||||
|
||||
aligned:
|
||||
/* The input is now 8-byte aligned and we can process 8-byte chunks. */
|
||||
CMPL CX, $8
|
||||
JL cleanup
|
||||
|
||||
CRC32Q (SI), AX
|
||||
ADDQ $8, SI
|
||||
SUBQ $8, CX
|
||||
JMP aligned
|
||||
|
||||
cleanup:
|
||||
/* We may have some bytes left over that we process one at a time. */
|
||||
CMPL CX, $0
|
||||
JE done
|
||||
|
||||
CRC32B (SI), AX
|
||||
INCQ SI
|
||||
DECQ CX
|
||||
JMP cleanup
|
||||
|
||||
done:
|
||||
NOTL AX
|
||||
MOVL AX, ret+24(FP)
|
||||
RET
|
||||
|
||||
// func haveSSE42() bool
|
||||
TEXT ·haveSSE42(SB),7,$0
|
||||
XORQ AX, AX
|
||||
INCL AX
|
||||
CPUID
|
||||
SHRQ $20, CX
|
||||
ANDQ $1, CX
|
||||
MOVB CX, ret+0(FP)
|
||||
RET
|
||||
|
12
src/pkg/hash/crc32/crc32_generic.go
Normal file
12
src/pkg/hash/crc32/crc32_generic.go
Normal file
@ -0,0 +1,12 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package crc32
|
||||
|
||||
// The file contains the generic version of updateCastagnoli which just calls
|
||||
// the software implementation.
|
||||
|
||||
func updateCastagnoli(crc uint32, p []byte) uint32 {
|
||||
return update(crc, castagnoliTable, p)
|
||||
}
|
@ -10,53 +10,73 @@ import (
|
||||
)
|
||||
|
||||
type test struct {
|
||||
out uint32
|
||||
ieee, castagnoli uint32
|
||||
in string
|
||||
}
|
||||
|
||||
var golden = []test{
|
||||
{0x0, ""},
|
||||
{0xe8b7be43, "a"},
|
||||
{0x9e83486d, "ab"},
|
||||
{0x352441c2, "abc"},
|
||||
{0xed82cd11, "abcd"},
|
||||
{0x8587d865, "abcde"},
|
||||
{0x4b8e39ef, "abcdef"},
|
||||
{0x312a6aa6, "abcdefg"},
|
||||
{0xaeef2a50, "abcdefgh"},
|
||||
{0x8da988af, "abcdefghi"},
|
||||
{0x3981703a, "abcdefghij"},
|
||||
{0x6b9cdfe7, "Discard medicine more than two years old."},
|
||||
{0xc90ef73f, "He who has a shady past knows that nice guys finish last."},
|
||||
{0xb902341f, "I wouldn't marry him with a ten foot pole."},
|
||||
{0x42080e8, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
|
||||
{0x154c6d11, "The days of the digital watch are numbered. -Tom Stoppard"},
|
||||
{0x4c418325, "Nepal premier won't resign."},
|
||||
{0x33955150, "For every action there is an equal and opposite government program."},
|
||||
{0x26216a4b, "His money is twice tainted: 'taint yours and 'taint mine."},
|
||||
{0x1abbe45e, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
|
||||
{0xc89a94f7, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
|
||||
{0xab3abe14, "size: a.out: bad magic"},
|
||||
{0xbab102b6, "The major problem is with sendmail. -Mark Horton"},
|
||||
{0x999149d7, "Give me a rock, paper and scissors and I will move the world. CCFestoon"},
|
||||
{0x6d52a33c, "If the enemy is within range, then so are you."},
|
||||
{0x90631e8d, "It's well we cannot hear the screams/That we create in others' dreams."},
|
||||
{0x78309130, "You remind me of a TV show, but that's all right: I watch it anyway."},
|
||||
{0x7d0a377f, "C is as portable as Stonehedge!!"},
|
||||
{0x8c79fd79, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
|
||||
{0xa20b7167, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"},
|
||||
{0x8e0bb443, "How can you write a big system without C++? -Paul Glick"},
|
||||
{0x0, 0x0, ""},
|
||||
{0xe8b7be43, 0xc1d04330, "a"},
|
||||
{0x9e83486d, 0xe2a22936, "ab"},
|
||||
{0x352441c2, 0x364b3fb7, "abc"},
|
||||
{0xed82cd11, 0x92c80a31, "abcd"},
|
||||
{0x8587d865, 0xc450d697, "abcde"},
|
||||
{0x4b8e39ef, 0x53bceff1, "abcdef"},
|
||||
{0x312a6aa6, 0xe627f441, "abcdefg"},
|
||||
{0xaeef2a50, 0xa9421b7, "abcdefgh"},
|
||||
{0x8da988af, 0x2ddc99fc, "abcdefghi"},
|
||||
{0x3981703a, 0xe6599437, "abcdefghij"},
|
||||
{0x6b9cdfe7, 0xb2cc01fe, "Discard medicine more than two years old."},
|
||||
{0xc90ef73f, 0xe28207f, "He who has a shady past knows that nice guys finish last."},
|
||||
{0xb902341f, 0xbe93f964, "I wouldn't marry him with a ten foot pole."},
|
||||
{0x42080e8, 0x9e3be0c3, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
|
||||
{0x154c6d11, 0xf505ef04, "The days of the digital watch are numbered. -Tom Stoppard"},
|
||||
{0x4c418325, 0x85d3dc82, "Nepal premier won't resign."},
|
||||
{0x33955150, 0xc5142380, "For every action there is an equal and opposite government program."},
|
||||
{0x26216a4b, 0x75eb77dd, "His money is twice tainted: 'taint yours and 'taint mine."},
|
||||
{0x1abbe45e, 0x91ebe9f7, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
|
||||
{0xc89a94f7, 0xf0b1168e, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
|
||||
{0xab3abe14, 0x572b74e2, "size: a.out: bad magic"},
|
||||
{0xbab102b6, 0x8a58a6d5, "The major problem is with sendmail. -Mark Horton"},
|
||||
{0x999149d7, 0x9c426c50, "Give me a rock, paper and scissors and I will move the world. CCFestoon"},
|
||||
{0x6d52a33c, 0x735400a4, "If the enemy is within range, then so are you."},
|
||||
{0x90631e8d, 0xbec49c95, "It's well we cannot hear the screams/That we create in others' dreams."},
|
||||
{0x78309130, 0xa95a2079, "You remind me of a TV show, but that's all right: I watch it anyway."},
|
||||
{0x7d0a377f, 0xde2e65c5, "C is as portable as Stonehedge!!"},
|
||||
{0x8c79fd79, 0x297a88ed, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
|
||||
{0xa20b7167, 0x66ed1d8b, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"},
|
||||
{0x8e0bb443, 0xdcded527, "How can you write a big system without C++? -Paul Glick"},
|
||||
}
|
||||
|
||||
func TestGolden(t *testing.T) {
|
||||
for i := 0; i < len(golden); i++ {
|
||||
g := golden[i]
|
||||
c := NewIEEE()
|
||||
io.WriteString(c, g.in)
|
||||
s := c.Sum32()
|
||||
if s != g.out {
|
||||
t.Errorf("crc32(%s) = 0x%x want 0x%x", g.in, s, g.out)
|
||||
t.FailNow()
|
||||
castagnoliTab := MakeTable(Castagnoli)
|
||||
|
||||
for _, g := range golden {
|
||||
ieee := NewIEEE()
|
||||
io.WriteString(ieee, g.in)
|
||||
s := ieee.Sum32()
|
||||
if s != g.ieee {
|
||||
t.Errorf("IEEE(%s) = 0x%x want 0x%x", g.in, s, g.ieee)
|
||||
}
|
||||
|
||||
castagnoli := New(castagnoliTab)
|
||||
io.WriteString(castagnoli, g.in)
|
||||
s = castagnoli.Sum32()
|
||||
if s != g.castagnoli {
|
||||
t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
|
||||
}
|
||||
|
||||
if len(g.in) > 0 {
|
||||
// The SSE4.2 implementation of this has code to deal
|
||||
// with misaligned data so we ensure that we test that
|
||||
// too.
|
||||
castagnoli = New(castagnoliTab)
|
||||
io.WriteString(castagnoli, g.in[:1])
|
||||
io.WriteString(castagnoli, g.in[1:])
|
||||
s = castagnoli.Sum32()
|
||||
if s != g.castagnoli {
|
||||
t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -69,6 +89,7 @@ func BenchmarkCrc32KB(b *testing.B) {
|
||||
}
|
||||
c := NewIEEE()
|
||||
b.StartTimer()
|
||||
b.SetBytes(int64(len(data)))
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
c.Write(data)
|
||||
|
Loading…
Reference in New Issue
Block a user