mirror of
https://github.com/golang/go
synced 2024-11-24 21:50:11 -07:00
compress/flate: fix Huffman tree bug
Incorporate refactoring and a regression test from https://golang.org/cl/4538090/
R=rsc, go.peter.90, imkrasin
CC=golang-dev, mirtchovski
https://golang.org/cl/4524070
This commit is contained in:
parent
e8c87a7ddd
commit
1b5d04c5ae
@ -15,9 +15,6 @@ const (
|
||||
// The largest offset code.
|
||||
offsetCodeCount = 30
|
||||
|
||||
// The largest offset code in the extensions.
|
||||
extendedOffsetCodeCount = 42
|
||||
|
||||
// The special code used to mark the end of a block.
|
||||
endBlockMarker = 256
|
||||
|
||||
@ -100,11 +97,11 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
|
||||
return &huffmanBitWriter{
|
||||
w: w,
|
||||
literalFreq: make([]int32, maxLit),
|
||||
offsetFreq: make([]int32, extendedOffsetCodeCount),
|
||||
codegen: make([]uint8, maxLit+extendedOffsetCodeCount+1),
|
||||
offsetFreq: make([]int32, offsetCodeCount),
|
||||
codegen: make([]uint8, maxLit+offsetCodeCount+1),
|
||||
codegenFreq: make([]int32, codegenCodeCount),
|
||||
literalEncoding: newHuffmanEncoder(maxLit),
|
||||
offsetEncoding: newHuffmanEncoder(extendedOffsetCodeCount),
|
||||
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
|
||||
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
|
||||
}
|
||||
}
|
||||
@ -290,13 +287,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n
|
||||
}
|
||||
w.writeBits(firstBits, 3)
|
||||
w.writeBits(int32(numLiterals-257), 5)
|
||||
if numOffsets > offsetCodeCount {
|
||||
// Extended version of decompressor
|
||||
w.writeBits(int32(offsetCodeCount+((numOffsets-(1+offsetCodeCount))>>3)), 5)
|
||||
w.writeBits(int32((numOffsets-(1+offsetCodeCount))&0x7), 3)
|
||||
} else {
|
||||
w.writeBits(int32(numOffsets-1), 5)
|
||||
}
|
||||
w.writeBits(int32(numOffsets-1), 5)
|
||||
w.writeBits(int32(numCodegens-4), 4)
|
||||
|
||||
for i := 0; i < numCodegens; i++ {
|
||||
@ -368,24 +359,17 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
|
||||
tokens = tokens[0 : n+1]
|
||||
tokens[n] = endBlockMarker
|
||||
|
||||
totalLength := -1 // Subtract 1 for endBlock.
|
||||
for _, t := range tokens {
|
||||
switch t.typ() {
|
||||
case literalType:
|
||||
w.literalFreq[t.literal()]++
|
||||
totalLength++
|
||||
break
|
||||
case matchType:
|
||||
length := t.length()
|
||||
offset := t.offset()
|
||||
totalLength += int(length + 3)
|
||||
w.literalFreq[lengthCodesStart+lengthCode(length)]++
|
||||
w.offsetFreq[offsetCode(offset)]++
|
||||
break
|
||||
}
|
||||
}
|
||||
w.literalEncoding.generate(w.literalFreq, 15)
|
||||
w.offsetEncoding.generate(w.offsetFreq, 15)
|
||||
|
||||
// get the number of literals
|
||||
numLiterals := len(w.literalFreq)
|
||||
@ -394,15 +378,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
|
||||
}
|
||||
// get the number of offsets
|
||||
numOffsets := len(w.offsetFreq)
|
||||
for numOffsets > 1 && w.offsetFreq[numOffsets-1] == 0 {
|
||||
for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
|
||||
numOffsets--
|
||||
}
|
||||
if numOffsets == 0 {
|
||||
// We haven't found a single match. If we want to go with the dynamic encoding,
|
||||
// we should count at least one offset to be sure that the offset huffman tree could be encoded.
|
||||
w.offsetFreq[0] = 1
|
||||
numOffsets = 1
|
||||
}
|
||||
|
||||
w.literalEncoding.generate(w.literalFreq, 15)
|
||||
w.offsetEncoding.generate(w.offsetFreq, 15)
|
||||
|
||||
storedBytes := 0
|
||||
if input != nil {
|
||||
storedBytes = len(input)
|
||||
}
|
||||
var extraBits int64
|
||||
var storedSize int64
|
||||
var storedSize int64 = math.MaxInt64
|
||||
if storedBytes <= maxStoreBlockSize && input != nil {
|
||||
storedSize = int64((storedBytes + 5) * 8)
|
||||
// We only bother calculating the costs of the extra bits required by
|
||||
@ -417,34 +411,29 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
|
||||
// First four offset codes have extra size = 0.
|
||||
extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
|
||||
}
|
||||
} else {
|
||||
storedSize = math.MaxInt32
|
||||
}
|
||||
|
||||
// Figure out which generates smaller code, fixed Huffman, dynamic
|
||||
// Huffman, or just storing the data.
|
||||
var fixedSize int64 = math.MaxInt64
|
||||
if numOffsets <= offsetCodeCount {
|
||||
fixedSize = int64(3) +
|
||||
fixedLiteralEncoding.bitLength(w.literalFreq) +
|
||||
fixedOffsetEncoding.bitLength(w.offsetFreq) +
|
||||
extraBits
|
||||
}
|
||||
// Figure out smallest code.
|
||||
// Fixed Huffman baseline.
|
||||
var size = int64(3) +
|
||||
fixedLiteralEncoding.bitLength(w.literalFreq) +
|
||||
fixedOffsetEncoding.bitLength(w.offsetFreq) +
|
||||
extraBits
|
||||
var literalEncoding = fixedLiteralEncoding
|
||||
var offsetEncoding = fixedOffsetEncoding
|
||||
|
||||
// Dynamic Huffman?
|
||||
var numCodegens int
|
||||
|
||||
// Generate codegen and codegenFrequencies, which indicates how to encode
|
||||
// the literalEncoding and the offsetEncoding.
|
||||
w.generateCodegen(numLiterals, numOffsets)
|
||||
w.codegenEncoding.generate(w.codegenFreq, 7)
|
||||
numCodegens := len(w.codegenFreq)
|
||||
numCodegens = len(w.codegenFreq)
|
||||
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
|
||||
numCodegens--
|
||||
}
|
||||
extensionSummand := 0
|
||||
if numOffsets > offsetCodeCount {
|
||||
extensionSummand = 3
|
||||
}
|
||||
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
|
||||
// Following line is an extension.
|
||||
int64(extensionSummand) +
|
||||
w.codegenEncoding.bitLength(w.codegenFreq) +
|
||||
int64(extraBits) +
|
||||
int64(w.codegenFreq[16]*2) +
|
||||
@ -454,26 +443,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
|
||||
w.literalEncoding.bitLength(w.literalFreq) +
|
||||
w.offsetEncoding.bitLength(w.offsetFreq)
|
||||
|
||||
if storedSize < fixedSize && storedSize < dynamicSize {
|
||||
w.writeStoredHeader(storedBytes, eof)
|
||||
w.writeBytes(input[0:storedBytes])
|
||||
return
|
||||
}
|
||||
var literalEncoding *huffmanEncoder
|
||||
var offsetEncoding *huffmanEncoder
|
||||
|
||||
if fixedSize <= dynamicSize {
|
||||
w.writeFixedHeader(eof)
|
||||
literalEncoding = fixedLiteralEncoding
|
||||
offsetEncoding = fixedOffsetEncoding
|
||||
} else {
|
||||
// Write the header.
|
||||
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
|
||||
if dynamicSize < size {
|
||||
size = dynamicSize
|
||||
literalEncoding = w.literalEncoding
|
||||
offsetEncoding = w.offsetEncoding
|
||||
}
|
||||
|
||||
// Write the tokens.
|
||||
// Stored bytes?
|
||||
if storedSize < size {
|
||||
w.writeStoredHeader(storedBytes, eof)
|
||||
w.writeBytes(input[0:storedBytes])
|
||||
return
|
||||
}
|
||||
|
||||
// Huffman.
|
||||
if literalEncoding == fixedLiteralEncoding {
|
||||
w.writeFixedHeader(eof)
|
||||
} else {
|
||||
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
|
||||
}
|
||||
for _, t := range tokens {
|
||||
switch t.typ() {
|
||||
case literalType:
|
||||
|
@ -363,7 +363,12 @@ func (s literalNodeSorter) Less(i, j int) bool {
|
||||
func (s literalNodeSorter) Swap(i, j int) { s.a[i], s.a[j] = s.a[j], s.a[i] }
|
||||
|
||||
func sortByFreq(a []literalNode) {
|
||||
s := &literalNodeSorter{a, func(i, j int) bool { return a[i].freq < a[j].freq }}
|
||||
s := &literalNodeSorter{a, func(i, j int) bool {
|
||||
if a[i].freq == a[j].freq {
|
||||
return a[i].literal < a[j].literal
|
||||
}
|
||||
return a[i].freq < a[j].freq
|
||||
}}
|
||||
sort.Sort(s)
|
||||
}
|
||||
|
||||
|
@ -77,8 +77,6 @@ type huffmanDecoder struct {
|
||||
|
||||
// Initialize Huffman decoding tables from array of code lengths.
|
||||
func (h *huffmanDecoder) init(bits []int) bool {
|
||||
// TODO(rsc): Return false sometimes.
|
||||
|
||||
// Count number of codes of each length,
|
||||
// compute min and max length.
|
||||
var count [maxCodeLen + 1]int
|
||||
|
@ -6,6 +6,7 @@ package zlib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
@ -17,15 +18,13 @@ var filenames = []string{
|
||||
"../testdata/pi.txt",
|
||||
}
|
||||
|
||||
var data = []string{
|
||||
"test a reasonable sized string that can be compressed",
|
||||
}
|
||||
|
||||
// Tests that compressing and then decompressing the given file at the given compression level and dictionary
|
||||
// yields equivalent bytes to the original file.
|
||||
func testFileLevelDict(t *testing.T, fn string, level int, d string) {
|
||||
// Read dictionary, if given.
|
||||
var dict []byte
|
||||
if d != "" {
|
||||
dict = []byte(d)
|
||||
}
|
||||
|
||||
// Read the file, as golden output.
|
||||
golden, err := os.Open(fn)
|
||||
if err != nil {
|
||||
@ -33,17 +32,25 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
|
||||
return
|
||||
}
|
||||
defer golden.Close()
|
||||
|
||||
// Read the file again, and push it through a pipe that compresses at the write end, and decompresses at the read end.
|
||||
raw, err := os.Open(fn)
|
||||
if err != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
|
||||
b0, err0 := ioutil.ReadAll(golden)
|
||||
if err0 != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
|
||||
return
|
||||
}
|
||||
testLevelDict(t, fn, b0, level, d)
|
||||
}
|
||||
|
||||
func testLevelDict(t *testing.T, fn string, b0 []byte, level int, d string) {
|
||||
// Make dictionary, if given.
|
||||
var dict []byte
|
||||
if d != "" {
|
||||
dict = []byte(d)
|
||||
}
|
||||
|
||||
// Push data through a pipe that compresses at the write end, and decompresses at the read end.
|
||||
piper, pipew := io.Pipe()
|
||||
defer piper.Close()
|
||||
go func() {
|
||||
defer raw.Close()
|
||||
defer pipew.Close()
|
||||
zlibw, err := NewWriterDict(pipew, level, dict)
|
||||
if err != nil {
|
||||
@ -51,25 +58,14 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
|
||||
return
|
||||
}
|
||||
defer zlibw.Close()
|
||||
var b [1024]byte
|
||||
for {
|
||||
n, err0 := raw.Read(b[0:])
|
||||
if err0 != nil && err0 != os.EOF {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
|
||||
return
|
||||
}
|
||||
_, err1 := zlibw.Write(b[0:n])
|
||||
if err1 == os.EPIPE {
|
||||
// Fail, but do not report the error, as some other (presumably reportable) error broke the pipe.
|
||||
return
|
||||
}
|
||||
if err1 != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
|
||||
return
|
||||
}
|
||||
if err0 == os.EOF {
|
||||
break
|
||||
}
|
||||
_, err = zlibw.Write(b0)
|
||||
if err == os.EPIPE {
|
||||
// Fail, but do not report the error, as some other (presumably reported) error broke the pipe.
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
|
||||
return
|
||||
}
|
||||
}()
|
||||
zlibr, err := NewReaderDict(piper, dict)
|
||||
@ -79,13 +75,8 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
|
||||
}
|
||||
defer zlibr.Close()
|
||||
|
||||
// Compare the two.
|
||||
b0, err0 := ioutil.ReadAll(golden)
|
||||
// Compare the decompressed data.
|
||||
b1, err1 := ioutil.ReadAll(zlibr)
|
||||
if err0 != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
|
||||
return
|
||||
}
|
||||
if err1 != nil {
|
||||
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
|
||||
return
|
||||
@ -103,6 +94,18 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
|
||||
}
|
||||
|
||||
func TestWriter(t *testing.T) {
|
||||
for i, s := range data {
|
||||
b := []byte(s)
|
||||
tag := fmt.Sprintf("#%d", i)
|
||||
testLevelDict(t, tag, b, DefaultCompression, "")
|
||||
testLevelDict(t, tag, b, NoCompression, "")
|
||||
for level := BestSpeed; level <= BestCompression; level++ {
|
||||
testLevelDict(t, tag, b, level, "")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriterBig(t *testing.T) {
|
||||
for _, fn := range filenames {
|
||||
testFileLevelDict(t, fn, DefaultCompression, "")
|
||||
testFileLevelDict(t, fn, NoCompression, "")
|
||||
|
Loading…
Reference in New Issue
Block a user