diff --git a/src/cmd/6a/lex.c b/src/cmd/6a/lex.c index b4c7d0c2cb..784241bde5 100644 --- a/src/cmd/6a/lex.c +++ b/src/cmd/6a/lex.c @@ -169,6 +169,13 @@ assemble(char *file) struct { char *name; + /* + * type is the lexical type to return. It dictates what kind of + * operands 6a allows to follow it (in a.y) as the possible operand + * types are handled by a grammar. How do you know which LTYPE? + * Either read a.y or think of an instruction that has the same + * possible operands and look up what it takes. + */ ushort type; ushort value; } itab[] = @@ -985,6 +992,8 @@ struct "UNPCKLPS", LTYPE3, AUNPCKLPS, "XORPD", LTYPE3, AXORPD, "XORPS", LTYPE3, AXORPS, + "CRC32B", LTYPE4, ACRC32B, + "CRC32Q", LTYPE4, ACRC32Q, 0 }; diff --git a/src/cmd/6l/6.out.h b/src/cmd/6l/6.out.h index 709f82ccc1..24fede53c2 100644 --- a/src/cmd/6l/6.out.h +++ b/src/cmd/6l/6.out.h @@ -730,6 +730,8 @@ enum as ASWAPGS, AMODE, + ACRC32B, + ACRC32Q, ALAST }; diff --git a/src/cmd/6l/l.h b/src/cmd/6l/l.h index f4ee6aa923..c3f5949ed8 100644 --- a/src/cmd/6l/l.h +++ b/src/cmd/6l/l.h @@ -222,6 +222,7 @@ enum Zxxx = 0, Zlit, + Zlitm_r, Z_rp, Zbr, Zcall, diff --git a/src/cmd/6l/optab.c b/src/cmd/6l/optab.c index 6cc50313e3..928ad5d9ae 100644 --- a/src/cmd/6l/optab.c +++ b/src/cmd/6l/optab.c @@ -529,7 +529,69 @@ uchar ymskb[] = Ymr, Yrl, Zm_r_xm, 1, 0 }; +uchar ycrc32l[] = +{ + Yml, Yrl, Zlitm_r, 0, +}; +/* + * You are doasm, holding in your hand a Prog* with p->as set to, say, ACRC32, + * and p->from and p->to as operands (Adr*). The linker scans optab to find + * the entry with the given p->as and then looks through the ytable for that + * instruction (the second field in the optab struct) for a line whose first + * two values match the Ytypes of the p->from and p->to operands. The function + * oclass in span.c computes the specific Ytype of an operand and then the set + * of more general Ytypes that it satisfies is implied by the ycover table, set + * up in instinit. For example, oclass distinguishes the constants 0 and 1 + * from the more general 8-bit constants, but instinit says + * + * ycover[Yi0*Ymax + Ys32] = 1; + * ycover[Yi1*Ymax + Ys32] = 1; + * ycover[Yi8*Ymax + Ys32] = 1; + * + * which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32) + * if that's what an instruction can handle. + * + * In parallel with the scan through the ytable for the appropriate line, there + * is a z pointer that starts out pointing at the strange magic byte list in + * the Optab struct. With each step past a non-matching ytable line, z + * advances by the 4th entry in the line. When a matching line is found, that + * z pointer has the extra data to use in laying down the instruction bytes. + * The actual bytes laid down are a function of the 3rd entry in the line (that + * is, the Ztype) and the z bytes. + * + * For example, let's look at AADDL. The optab line says: + * { AADDL, yaddl, Px, 0x83,(00),0x05,0x81,(00),0x01,0x03 }, + * + * and yaddl says + * uchar yaddl[] = + * { + * Yi8, Yml, Zibo_m, 2, + * Yi32, Yax, Zil_, 1, + * Yi32, Yml, Zilo_m, 2, + * Yrl, Yml, Zr_m, 1, + * Yml, Yrl, Zm_r, 1, + * 0 + * }; + * + * so there are 5 possible types of ADDL instruction that can be laid down, and + * possible states used to lay them down (Ztype and z pointer, assuming z + * points at {0x83,(00),0x05,0x81,(00),0x01,0x03}) are: + * + * Yi8, Yml -> Zibo_m, z (0x83, 00) + * Yi32, Yax -> Zil_, z+2 (0x05) + * Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00) + * Yrl, Yml -> Zr_m, z+2+1+2 (0x01) + * Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03) + * + * The Pconstant in the optab line controls the prefix bytes to emit. That's + * relatively straightforward as this program goes. + * + * The switch on t[2] in doasm implements the various Z cases. Zibo_m, for + * example, is an opcode byte (z[0]) then an asmando (which is some kind of + * encoded addressing mode for the Yml arg), and then a single immediate byte. + * Zilo_m is the same but a long (32-bit) immediate. + */ Optab optab[] = /* as, ytab, andproto, opcode */ { @@ -1199,6 +1261,9 @@ Optab optab[] = { AXADDQ, yrl_ml, Pw, 0x0f,0xc1 }, { AXADDW, yrl_ml, Pe, 0x0f,0xc1 }, + { ACRC32B, ycrc32l,Px, 0xf2,0x0f,0x38,0xf0,0}, + { ACRC32Q, ycrc32l,Pw, 0xf2,0x0f,0x38,0xf1,0}, + { AEND }, 0 }; diff --git a/src/cmd/6l/span.c b/src/cmd/6l/span.c index 5251f19bba..426db255d9 100644 --- a/src/cmd/6l/span.c +++ b/src/cmd/6l/span.c @@ -1166,6 +1166,12 @@ found: *andptr++ = op; break; + case Zlitm_r: + for(; op = o->op[z]; z++) + *andptr++ = op; + asmand(&p->from, &p->to); + break; + case Zmb_r: bytereg(&p->from, &p->ft); /* fall through */ diff --git a/src/pkg/hash/crc32/Makefile b/src/pkg/hash/crc32/Makefile index 31b205185c..af8a64cf21 100644 --- a/src/pkg/hash/crc32/Makefile +++ b/src/pkg/hash/crc32/Makefile @@ -5,7 +5,16 @@ include ../../../Make.inc TARG=hash/crc32 + +ifeq ($(GOARCH), amd64) + ARCH_GOFILES=crc32_amd64.go + OFILES=crc32_amd64.6 +else + ARCH_GOFILES=crc32_generic.go +endif + GOFILES=\ crc32.go\ + $(ARCH_GOFILES) include ../../../Make.pkg diff --git a/src/pkg/hash/crc32/crc32.go b/src/pkg/hash/crc32/crc32.go index 88a4499716..0245b1ee8a 100644 --- a/src/pkg/hash/crc32/crc32.go +++ b/src/pkg/hash/crc32/crc32.go @@ -10,6 +10,7 @@ package crc32 import ( "hash" "os" + "sync" ) // The size of a CRC-32 checksum in bytes. @@ -35,8 +36,34 @@ const ( // Table is a 256-word table representing the polynomial for efficient processing. type Table [256]uint32 +// castagnoliTable points to a lazily initialized Table for the Castagnoli +// polynomial. MakeTable will always return this value when asked to make a +// Castagnoli table so we can compare against it to find when the caller is +// using this polynomial. +var castagnoliTable *Table +var castagnoliOnce sync.Once + +func castagnoliInit() { + castagnoliTable = makeTable(Castagnoli) +} + +// IEEETable is the table for the IEEE polynomial. +var IEEETable = makeTable(IEEE) + // MakeTable returns the Table constructed from the specified polynomial. func MakeTable(poly uint32) *Table { + switch poly { + case IEEE: + return IEEETable + case Castagnoli: + castagnoliOnce.Do(castagnoliInit) + return castagnoliTable + } + return makeTable(poly) +} + +// makeTable returns the Table constructed from the specified polynomial. +func makeTable(poly uint32) *Table { t := new(Table) for i := 0; i < 256; i++ { crc := uint32(i) @@ -52,9 +79,6 @@ func MakeTable(poly uint32) *Table { return t } -// IEEETable is the table for the IEEE polynomial. -var IEEETable = MakeTable(IEEE) - // digest represents the partial evaluation of a checksum. type digest struct { crc uint32 @@ -83,11 +107,14 @@ func update(crc uint32, tab *Table, p []byte) uint32 { // Update returns the result of adding the bytes in p to the crc. func Update(crc uint32, tab *Table, p []byte) uint32 { + if tab == castagnoliTable { + return updateCastagnoli(crc, p) + } return update(crc, tab, p) } func (d *digest) Write(p []byte) (n int, err os.Error) { - d.crc = update(d.crc, d.tab, p) + d.crc = Update(d.crc, d.tab, p) return len(p), nil } @@ -105,7 +132,7 @@ func (d *digest) Sum() []byte { // Checksum returns the CRC-32 checksum of data // using the polynomial represented by the Table. -func Checksum(data []byte, tab *Table) uint32 { return update(0, tab, data) } +func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } // ChecksumIEEE returns the CRC-32 checksum of data // using the IEEE polynomial. diff --git a/src/pkg/hash/crc32/crc32_amd64.go b/src/pkg/hash/crc32/crc32_amd64.go new file mode 100644 index 0000000000..83349bc6c2 --- /dev/null +++ b/src/pkg/hash/crc32/crc32_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crc32 + +// This file contains the code to call the SSE 4.2 version of the Castagnoli +// CRC. + +// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2 +// support. +func haveSSE42() bool + +// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// instruction. +func castagnoliSSE42(uint32, []byte) uint32 + +var sse42 = haveSSE42() + +func updateCastagnoli(crc uint32, p []byte) uint32 { + if sse42 { + return castagnoliSSE42(crc, p) + } + return update(crc, castagnoliTable, p) +} diff --git a/src/pkg/hash/crc32/crc32_amd64.s b/src/pkg/hash/crc32/crc32_amd64.s new file mode 100644 index 0000000000..a9e5317e1a --- /dev/null +++ b/src/pkg/hash/crc32/crc32_amd64.s @@ -0,0 +1,62 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// func castagnoliSSE42(crc uint32, p []byte) uint32 +TEXT ·castagnoliSSE42(SB),7,$0 + MOVL crc+0(FP), AX // CRC value + MOVQ p+8(FP), SI // data pointer + MOVL p+16(FP), CX // len(p) + + NOTL AX + + /* If there's less than 8 bytes to process, we do it byte-by-byte. */ + CMPL CX, $8 + JL cleanup + + /* Process individual bytes until the input is 8-byte aligned. */ +startup: + MOVQ SI, BX + ANDQ $7, BX + JZ aligned + + CRC32B (SI), AX + DECL CX + INCQ SI + JMP startup + +aligned: + /* The input is now 8-byte aligned and we can process 8-byte chunks. */ + CMPL CX, $8 + JL cleanup + + CRC32Q (SI), AX + ADDQ $8, SI + SUBQ $8, CX + JMP aligned + +cleanup: + /* We may have some bytes left over that we process one at a time. */ + CMPL CX, $0 + JE done + + CRC32B (SI), AX + INCQ SI + DECQ CX + JMP cleanup + +done: + NOTL AX + MOVL AX, ret+24(FP) + RET + +// func haveSSE42() bool +TEXT ·haveSSE42(SB),7,$0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $20, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + diff --git a/src/pkg/hash/crc32/crc32_generic.go b/src/pkg/hash/crc32/crc32_generic.go new file mode 100644 index 0000000000..27aabd903b --- /dev/null +++ b/src/pkg/hash/crc32/crc32_generic.go @@ -0,0 +1,12 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crc32 + +// The file contains the generic version of updateCastagnoli which just calls +// the software implementation. + +func updateCastagnoli(crc uint32, p []byte) uint32 { + return update(crc, castagnoliTable, p) +} diff --git a/src/pkg/hash/crc32/crc32_test.go b/src/pkg/hash/crc32/crc32_test.go index cf5743c992..7e82dd755e 100644 --- a/src/pkg/hash/crc32/crc32_test.go +++ b/src/pkg/hash/crc32/crc32_test.go @@ -10,53 +10,73 @@ import ( ) type test struct { - out uint32 - in string + ieee, castagnoli uint32 + in string } var golden = []test{ - {0x0, ""}, - {0xe8b7be43, "a"}, - {0x9e83486d, "ab"}, - {0x352441c2, "abc"}, - {0xed82cd11, "abcd"}, - {0x8587d865, "abcde"}, - {0x4b8e39ef, "abcdef"}, - {0x312a6aa6, "abcdefg"}, - {0xaeef2a50, "abcdefgh"}, - {0x8da988af, "abcdefghi"}, - {0x3981703a, "abcdefghij"}, - {0x6b9cdfe7, "Discard medicine more than two years old."}, - {0xc90ef73f, "He who has a shady past knows that nice guys finish last."}, - {0xb902341f, "I wouldn't marry him with a ten foot pole."}, - {0x42080e8, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"}, - {0x154c6d11, "The days of the digital watch are numbered. -Tom Stoppard"}, - {0x4c418325, "Nepal premier won't resign."}, - {0x33955150, "For every action there is an equal and opposite government program."}, - {0x26216a4b, "His money is twice tainted: 'taint yours and 'taint mine."}, - {0x1abbe45e, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"}, - {0xc89a94f7, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"}, - {0xab3abe14, "size: a.out: bad magic"}, - {0xbab102b6, "The major problem is with sendmail. -Mark Horton"}, - {0x999149d7, "Give me a rock, paper and scissors and I will move the world. CCFestoon"}, - {0x6d52a33c, "If the enemy is within range, then so are you."}, - {0x90631e8d, "It's well we cannot hear the screams/That we create in others' dreams."}, - {0x78309130, "You remind me of a TV show, but that's all right: I watch it anyway."}, - {0x7d0a377f, "C is as portable as Stonehedge!!"}, - {0x8c79fd79, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"}, - {0xa20b7167, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"}, - {0x8e0bb443, "How can you write a big system without C++? -Paul Glick"}, + {0x0, 0x0, ""}, + {0xe8b7be43, 0xc1d04330, "a"}, + {0x9e83486d, 0xe2a22936, "ab"}, + {0x352441c2, 0x364b3fb7, "abc"}, + {0xed82cd11, 0x92c80a31, "abcd"}, + {0x8587d865, 0xc450d697, "abcde"}, + {0x4b8e39ef, 0x53bceff1, "abcdef"}, + {0x312a6aa6, 0xe627f441, "abcdefg"}, + {0xaeef2a50, 0xa9421b7, "abcdefgh"}, + {0x8da988af, 0x2ddc99fc, "abcdefghi"}, + {0x3981703a, 0xe6599437, "abcdefghij"}, + {0x6b9cdfe7, 0xb2cc01fe, "Discard medicine more than two years old."}, + {0xc90ef73f, 0xe28207f, "He who has a shady past knows that nice guys finish last."}, + {0xb902341f, 0xbe93f964, "I wouldn't marry him with a ten foot pole."}, + {0x42080e8, 0x9e3be0c3, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"}, + {0x154c6d11, 0xf505ef04, "The days of the digital watch are numbered. -Tom Stoppard"}, + {0x4c418325, 0x85d3dc82, "Nepal premier won't resign."}, + {0x33955150, 0xc5142380, "For every action there is an equal and opposite government program."}, + {0x26216a4b, 0x75eb77dd, "His money is twice tainted: 'taint yours and 'taint mine."}, + {0x1abbe45e, 0x91ebe9f7, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"}, + {0xc89a94f7, 0xf0b1168e, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"}, + {0xab3abe14, 0x572b74e2, "size: a.out: bad magic"}, + {0xbab102b6, 0x8a58a6d5, "The major problem is with sendmail. -Mark Horton"}, + {0x999149d7, 0x9c426c50, "Give me a rock, paper and scissors and I will move the world. CCFestoon"}, + {0x6d52a33c, 0x735400a4, "If the enemy is within range, then so are you."}, + {0x90631e8d, 0xbec49c95, "It's well we cannot hear the screams/That we create in others' dreams."}, + {0x78309130, 0xa95a2079, "You remind me of a TV show, but that's all right: I watch it anyway."}, + {0x7d0a377f, 0xde2e65c5, "C is as portable as Stonehedge!!"}, + {0x8c79fd79, 0x297a88ed, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"}, + {0xa20b7167, 0x66ed1d8b, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"}, + {0x8e0bb443, 0xdcded527, "How can you write a big system without C++? -Paul Glick"}, } func TestGolden(t *testing.T) { - for i := 0; i < len(golden); i++ { - g := golden[i] - c := NewIEEE() - io.WriteString(c, g.in) - s := c.Sum32() - if s != g.out { - t.Errorf("crc32(%s) = 0x%x want 0x%x", g.in, s, g.out) - t.FailNow() + castagnoliTab := MakeTable(Castagnoli) + + for _, g := range golden { + ieee := NewIEEE() + io.WriteString(ieee, g.in) + s := ieee.Sum32() + if s != g.ieee { + t.Errorf("IEEE(%s) = 0x%x want 0x%x", g.in, s, g.ieee) + } + + castagnoli := New(castagnoliTab) + io.WriteString(castagnoli, g.in) + s = castagnoli.Sum32() + if s != g.castagnoli { + t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli) + } + + if len(g.in) > 0 { + // The SSE4.2 implementation of this has code to deal + // with misaligned data so we ensure that we test that + // too. + castagnoli = New(castagnoliTab) + io.WriteString(castagnoli, g.in[:1]) + io.WriteString(castagnoli, g.in[1:]) + s = castagnoli.Sum32() + if s != g.castagnoli { + t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli) + } } } } @@ -69,6 +89,7 @@ func BenchmarkCrc32KB(b *testing.B) { } c := NewIEEE() b.StartTimer() + b.SetBytes(int64(len(data))) for i := 0; i < b.N; i++ { c.Write(data)