diff --git a/src/pkg/compress/flate/huffman_bit_writer.go b/src/pkg/compress/flate/huffman_bit_writer.go index 5df4510a204..3981df5cba4 100644 --- a/src/pkg/compress/flate/huffman_bit_writer.go +++ b/src/pkg/compress/flate/huffman_bit_writer.go @@ -15,9 +15,6 @@ const ( // The largest offset code. offsetCodeCount = 30 - // The largest offset code in the extensions. - extendedOffsetCodeCount = 42 - // The special code used to mark the end of a block. endBlockMarker = 256 @@ -100,11 +97,11 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ w: w, literalFreq: make([]int32, maxLit), - offsetFreq: make([]int32, extendedOffsetCodeCount), - codegen: make([]uint8, maxLit+extendedOffsetCodeCount+1), + offsetFreq: make([]int32, offsetCodeCount), + codegen: make([]uint8, maxLit+offsetCodeCount+1), codegenFreq: make([]int32, codegenCodeCount), literalEncoding: newHuffmanEncoder(maxLit), - offsetEncoding: newHuffmanEncoder(extendedOffsetCodeCount), + offsetEncoding: newHuffmanEncoder(offsetCodeCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), } } @@ -290,13 +287,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } w.writeBits(firstBits, 3) w.writeBits(int32(numLiterals-257), 5) - if numOffsets > offsetCodeCount { - // Extended version of decompressor - w.writeBits(int32(offsetCodeCount+((numOffsets-(1+offsetCodeCount))>>3)), 5) - w.writeBits(int32((numOffsets-(1+offsetCodeCount))&0x7), 3) - } else { - w.writeBits(int32(numOffsets-1), 5) - } + w.writeBits(int32(numOffsets-1), 5) w.writeBits(int32(numCodegens-4), 4) for i := 0; i < numCodegens; i++ { @@ -368,24 +359,17 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { tokens = tokens[0 : n+1] tokens[n] = endBlockMarker - totalLength := -1 // Subtract 1 for endBlock. for _, t := range tokens { switch t.typ() { case literalType: w.literalFreq[t.literal()]++ - totalLength++ - break case matchType: length := t.length() offset := t.offset() - totalLength += int(length + 3) w.literalFreq[lengthCodesStart+lengthCode(length)]++ w.offsetFreq[offsetCode(offset)]++ - break } } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) // get the number of literals numLiterals := len(w.literalFreq) @@ -394,15 +378,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // get the number of offsets numOffsets := len(w.offsetFreq) - for numOffsets > 1 && w.offsetFreq[numOffsets-1] == 0 { + for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 { numOffsets-- } + if numOffsets == 0 { + // We haven't found a single match. If we want to go with the dynamic encoding, + // we should count at least one offset to be sure that the offset huffman tree could be encoded. + w.offsetFreq[0] = 1 + numOffsets = 1 + } + + w.literalEncoding.generate(w.literalFreq, 15) + w.offsetEncoding.generate(w.offsetFreq, 15) + storedBytes := 0 if input != nil { storedBytes = len(input) } var extraBits int64 - var storedSize int64 + var storedSize int64 = math.MaxInt64 if storedBytes <= maxStoreBlockSize && input != nil { storedSize = int64((storedBytes + 5) * 8) // We only bother calculating the costs of the extra bits required by @@ -417,34 +411,29 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // First four offset codes have extra size = 0. extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) } - } else { - storedSize = math.MaxInt32 } - // Figure out which generates smaller code, fixed Huffman, dynamic - // Huffman, or just storing the data. - var fixedSize int64 = math.MaxInt64 - if numOffsets <= offsetCodeCount { - fixedSize = int64(3) + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + - extraBits - } + // Figure out smallest code. + // Fixed Huffman baseline. + var size = int64(3) + + fixedLiteralEncoding.bitLength(w.literalFreq) + + fixedOffsetEncoding.bitLength(w.offsetFreq) + + extraBits + var literalEncoding = fixedLiteralEncoding + var offsetEncoding = fixedOffsetEncoding + + // Dynamic Huffman? + var numCodegens int + // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets) w.codegenEncoding.generate(w.codegenFreq, 7) - numCodegens := len(w.codegenFreq) + numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - extensionSummand := 0 - if numOffsets > offsetCodeCount { - extensionSummand = 3 - } dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + - // Following line is an extension. - int64(extensionSummand) + w.codegenEncoding.bitLength(w.codegenFreq) + int64(extraBits) + int64(w.codegenFreq[16]*2) + @@ -454,26 +443,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { w.literalEncoding.bitLength(w.literalFreq) + w.offsetEncoding.bitLength(w.offsetFreq) - if storedSize < fixedSize && storedSize < dynamicSize { - w.writeStoredHeader(storedBytes, eof) - w.writeBytes(input[0:storedBytes]) - return - } - var literalEncoding *huffmanEncoder - var offsetEncoding *huffmanEncoder - - if fixedSize <= dynamicSize { - w.writeFixedHeader(eof) - literalEncoding = fixedLiteralEncoding - offsetEncoding = fixedOffsetEncoding - } else { - // Write the header. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if dynamicSize < size { + size = dynamicSize literalEncoding = w.literalEncoding offsetEncoding = w.offsetEncoding } - // Write the tokens. + // Stored bytes? + if storedSize < size { + w.writeStoredHeader(storedBytes, eof) + w.writeBytes(input[0:storedBytes]) + return + } + + // Huffman. + if literalEncoding == fixedLiteralEncoding { + w.writeFixedHeader(eof) + } else { + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + } for _, t := range tokens { switch t.typ() { case literalType: diff --git a/src/pkg/compress/flate/huffman_code.go b/src/pkg/compress/flate/huffman_code.go index 6be605f0a59..7ed603a4f43 100644 --- a/src/pkg/compress/flate/huffman_code.go +++ b/src/pkg/compress/flate/huffman_code.go @@ -363,7 +363,12 @@ func (s literalNodeSorter) Less(i, j int) bool { func (s literalNodeSorter) Swap(i, j int) { s.a[i], s.a[j] = s.a[j], s.a[i] } func sortByFreq(a []literalNode) { - s := &literalNodeSorter{a, func(i, j int) bool { return a[i].freq < a[j].freq }} + s := &literalNodeSorter{a, func(i, j int) bool { + if a[i].freq == a[j].freq { + return a[i].literal < a[j].literal + } + return a[i].freq < a[j].freq + }} sort.Sort(s) } diff --git a/src/pkg/compress/flate/inflate.go b/src/pkg/compress/flate/inflate.go index 320b80d0699..64bbf24ff83 100644 --- a/src/pkg/compress/flate/inflate.go +++ b/src/pkg/compress/flate/inflate.go @@ -77,8 +77,6 @@ type huffmanDecoder struct { // Initialize Huffman decoding tables from array of code lengths. func (h *huffmanDecoder) init(bits []int) bool { - // TODO(rsc): Return false sometimes. - // Count number of codes of each length, // compute min and max length. var count [maxCodeLen + 1]int diff --git a/src/pkg/compress/zlib/writer_test.go b/src/pkg/compress/zlib/writer_test.go index a06689ee50f..32f05ab6856 100644 --- a/src/pkg/compress/zlib/writer_test.go +++ b/src/pkg/compress/zlib/writer_test.go @@ -6,6 +6,7 @@ package zlib import ( "bytes" + "fmt" "io" "io/ioutil" "os" @@ -17,15 +18,13 @@ var filenames = []string{ "../testdata/pi.txt", } +var data = []string{ + "test a reasonable sized string that can be compressed", +} + // Tests that compressing and then decompressing the given file at the given compression level and dictionary // yields equivalent bytes to the original file. func testFileLevelDict(t *testing.T, fn string, level int, d string) { - // Read dictionary, if given. - var dict []byte - if d != "" { - dict = []byte(d) - } - // Read the file, as golden output. golden, err := os.Open(fn) if err != nil { @@ -33,17 +32,25 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { return } defer golden.Close() - - // Read the file again, and push it through a pipe that compresses at the write end, and decompresses at the read end. - raw, err := os.Open(fn) - if err != nil { - t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err) + b0, err0 := ioutil.ReadAll(golden) + if err0 != nil { + t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0) return } + testLevelDict(t, fn, b0, level, d) +} + +func testLevelDict(t *testing.T, fn string, b0 []byte, level int, d string) { + // Make dictionary, if given. + var dict []byte + if d != "" { + dict = []byte(d) + } + + // Push data through a pipe that compresses at the write end, and decompresses at the read end. piper, pipew := io.Pipe() defer piper.Close() go func() { - defer raw.Close() defer pipew.Close() zlibw, err := NewWriterDict(pipew, level, dict) if err != nil { @@ -51,25 +58,14 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { return } defer zlibw.Close() - var b [1024]byte - for { - n, err0 := raw.Read(b[0:]) - if err0 != nil && err0 != os.EOF { - t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0) - return - } - _, err1 := zlibw.Write(b[0:n]) - if err1 == os.EPIPE { - // Fail, but do not report the error, as some other (presumably reportable) error broke the pipe. - return - } - if err1 != nil { - t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1) - return - } - if err0 == os.EOF { - break - } + _, err = zlibw.Write(b0) + if err == os.EPIPE { + // Fail, but do not report the error, as some other (presumably reported) error broke the pipe. + return + } + if err != nil { + t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err) + return } }() zlibr, err := NewReaderDict(piper, dict) @@ -79,13 +75,8 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { } defer zlibr.Close() - // Compare the two. - b0, err0 := ioutil.ReadAll(golden) + // Compare the decompressed data. b1, err1 := ioutil.ReadAll(zlibr) - if err0 != nil { - t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0) - return - } if err1 != nil { t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1) return @@ -103,6 +94,18 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { } func TestWriter(t *testing.T) { + for i, s := range data { + b := []byte(s) + tag := fmt.Sprintf("#%d", i) + testLevelDict(t, tag, b, DefaultCompression, "") + testLevelDict(t, tag, b, NoCompression, "") + for level := BestSpeed; level <= BestCompression; level++ { + testLevelDict(t, tag, b, level, "") + } + } +} + +func TestWriterBig(t *testing.T) { for _, fn := range filenames { testFileLevelDict(t, fn, DefaultCompression, "") testFileLevelDict(t, fn, NoCompression, "")