diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d0206e59cf9..c4adef9ff53 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,7 +6,6 @@ package flate import ( "io" - "math" ) const ( @@ -282,6 +281,46 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + numCodegens = len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + w.codegenEncoding.bitLength(w.codegenFreq[:]) + + int(w.codegenFreq[16])*2 + + int(w.codegenFreq[17])*3 + + int(w.codegenFreq[18])*7 + size = header + + litEnc.bitLength(w.literalFreq) + + offEnc.bitLength(w.offsetFreq) + + extraBits + + return size, numCodegens +} + +// fixedSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) fixedSize(extraBits int) int { + return 3 + + fixedLiteralEncoding.bitLength(w.literalFreq) + + fixedOffsetEncoding.bitLength(w.offsetFreq) + + extraBits +} + +// storedSize calculates the stored size, including header. +// The function returns the size in bits and whether the block +// fits inside a single block. +func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { + if in == nil { + return 0, false + } + if len(in) <= maxStoreBlockSize { + return (len(in) + 5) * 8, true + } + return 0, false +} + func (w *huffmanBitWriter) writeCode(c hcode) { if w.err != nil { return @@ -384,6 +423,11 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { w.writeBits(value, 3) } +// writeBlock will write a block of tokens with the smallest encoding. +// The original input can be supplied, and if the huffman encoded data +// is larger than the original bytes, the data will be written as a +// stored block. +// If the input is nil, the tokens will always be Huffman encoded. func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { if w.err != nil { return @@ -392,36 +436,28 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { tokens = append(tokens, endBlockMarker) numLiterals, numOffsets := w.indexTokens(tokens) - storedBytes := 0 - if input != nil { - storedBytes = len(input) - } - var extraBits int64 - var storedSize int64 = math.MaxInt64 - if storedBytes <= maxStoreBlockSize && input != nil { - storedSize = int64((storedBytes + 5) * 8) + var extraBits int + storedSize, storable := w.storedSize(input) + if storable { // We only bother calculating the costs of the extra bits required by // the length of offset fields (which will be the same for both fixed // and dynamic encoding), if we need to compare those two encodings // against stored encoding. for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { // First eight length codes have extra size = 0. - extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart]) + extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) } for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { // First four offset codes have extra size = 0. - extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) + extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) } } // Figure out smallest code. // Fixed Huffman baseline. - var size = int64(3) + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + - extraBits var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding + var size = w.fixedSize(extraBits) // Dynamic Huffman? var numCodegens int @@ -430,19 +466,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens = len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- - } - dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq[:]) + - extraBits + - int64(w.codegenFreq[16]*2) + - int64(w.codegenFreq[17]*3) + - int64(w.codegenFreq[18]*7) - dynamicSize := dynamicHeader + - w.literalEncoding.bitLength(w.literalFreq) + - w.offsetEncoding.bitLength(w.offsetFreq) + dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) if dynamicSize < size { size = dynamicSize @@ -451,9 +475,9 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storedSize < size { - w.writeStoredHeader(storedBytes, eof) - w.writeBytes(input[:storedBytes]) + if storable && storedSize < size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) return } @@ -466,12 +490,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // Write the tokens. w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) - } // writeBlockDynamic encodes a block using a dynamic Huffman table. // This should be used if the symbols used have a disproportionate // histogram distribution. +// If input is supplied and the compression savings are below 1/16th of the +// input size the block is stored. func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { if w.err != nil { return @@ -484,9 +509,13 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens := len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- + size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + + // Store bytes, if we don't get a reasonable improvement. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return } // Write Huffman table. @@ -611,29 +640,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens = len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- - } - headerSize := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq[:]) + - int64(w.codegenFreq[16]*2) + - int64(w.codegenFreq[17]*3) + - int64(w.codegenFreq[18]*7) - - // Includes EOB marker - size := headerSize + w.literalEncoding.bitLength(w.literalFreq) - - // Calculate stored size - var storedSize int64 = math.MaxInt64 - var storedBytes = len(input) - if storedBytes <= maxStoreBlockSize { - storedSize = int64(storedBytes+5) * 8 - } + size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) // Store bytes, if we don't get a reasonable improvement. - if storedSize < (size + size>>4) { - w.writeStoredHeader(storedBytes, eof) + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 20fb19090d6..bdcbd823b00 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -105,11 +105,11 @@ func generateFixedOffsetEncoding() *huffmanEncoder { var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() -func (h *huffmanEncoder) bitLength(freq []int32) int64 { - var total int64 +func (h *huffmanEncoder) bitLength(freq []int32) int { + var total int for i, f := range freq { if f != 0 { - total += int64(f) * int64(h.codes[i].len) + total += int(f) * int(h.codes[i].len) } } return total diff --git a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect index 0c24742fde2..09dc798ee37 100644 Binary files a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect and b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect differ