mirror of
https://github.com/golang/go
synced 2024-11-25 01:17:56 -07:00
image/jpeg: speed up decoding by inlining the clip function and
writing the idct result directly to the image buffer instead of storing it in an intermediate d.blocks field. Writing to d.blocks was necessary when decoding to an image.RGBA image, but now that we decode to a ycbcr.YCbCr we can write each component directly to the image buffer. Crude "time ./6.out" scores to decode a specific 2592x1944 JPEG 20 times show a 16% speed-up: BEFORE user 0m10.410s user 0m10.400s user 0m10.480s user 0m10.480s user 0m10.460s AFTER user 0m9.050s user 0m9.050s user 0m9.050s user 0m9.070s user 0m9.020s R=r CC=golang-dev https://golang.org/cl/4523052
This commit is contained in:
parent
499ad9448b
commit
4c1e1b815b
@ -2,6 +2,8 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package jpeg
|
||||||
|
|
||||||
// This is a Go translation of idct.c from
|
// This is a Go translation of idct.c from
|
||||||
//
|
//
|
||||||
// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
|
// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
|
||||||
@ -35,8 +37,6 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package jpeg
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
|
w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
|
||||||
w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
|
w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
|
||||||
@ -55,41 +55,45 @@ const (
|
|||||||
r2 = 181 // 256/sqrt(2)
|
r2 = 181 // 256/sqrt(2)
|
||||||
)
|
)
|
||||||
|
|
||||||
// 2-D Inverse Discrete Cosine Transformation, followed by a +128 level shift.
|
// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
|
||||||
|
// +128 level shift and a clip to [0, 255], writing the results to dst.
|
||||||
|
// stride is the number of elements between successive rows of dst.
|
||||||
//
|
//
|
||||||
// The input coefficients should already have been multiplied by the appropriate quantization table.
|
// The input coefficients should already have been multiplied by the
|
||||||
// We use fixed-point computation, with the number of bits for the fractional component varying over the
|
// appropriate quantization table. We use fixed-point computation, with the
|
||||||
// intermediate stages. The final values are expected to range within [0, 255], after a +128 level shift.
|
// number of bits for the fractional component varying over the intermediate
|
||||||
|
// stages.
|
||||||
//
|
//
|
||||||
// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the discrete W transform and
|
// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
|
||||||
// for the discrete Fourier transform", IEEE Trans. on ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
|
// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
|
||||||
func idct(b *block) {
|
// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
|
||||||
|
func idct(dst []byte, stride int, src *block) {
|
||||||
// Horizontal 1-D IDCT.
|
// Horizontal 1-D IDCT.
|
||||||
for y := 0; y < 8; y++ {
|
for y := 0; y < 8; y++ {
|
||||||
// If all the AC components are zero, then the IDCT is trivial.
|
// If all the AC components are zero, then the IDCT is trivial.
|
||||||
if b[y*8+1] == 0 && b[y*8+2] == 0 && b[y*8+3] == 0 &&
|
if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
|
||||||
b[y*8+4] == 0 && b[y*8+5] == 0 && b[y*8+6] == 0 && b[y*8+7] == 0 {
|
src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
|
||||||
dc := b[y*8+0] << 3
|
dc := src[y*8+0] << 3
|
||||||
b[y*8+0] = dc
|
src[y*8+0] = dc
|
||||||
b[y*8+1] = dc
|
src[y*8+1] = dc
|
||||||
b[y*8+2] = dc
|
src[y*8+2] = dc
|
||||||
b[y*8+3] = dc
|
src[y*8+3] = dc
|
||||||
b[y*8+4] = dc
|
src[y*8+4] = dc
|
||||||
b[y*8+5] = dc
|
src[y*8+5] = dc
|
||||||
b[y*8+6] = dc
|
src[y*8+6] = dc
|
||||||
b[y*8+7] = dc
|
src[y*8+7] = dc
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prescale.
|
// Prescale.
|
||||||
x0 := (b[y*8+0] << 11) + 128
|
x0 := (src[y*8+0] << 11) + 128
|
||||||
x1 := b[y*8+4] << 11
|
x1 := src[y*8+4] << 11
|
||||||
x2 := b[y*8+6]
|
x2 := src[y*8+6]
|
||||||
x3 := b[y*8+2]
|
x3 := src[y*8+2]
|
||||||
x4 := b[y*8+1]
|
x4 := src[y*8+1]
|
||||||
x5 := b[y*8+7]
|
x5 := src[y*8+7]
|
||||||
x6 := b[y*8+5]
|
x6 := src[y*8+5]
|
||||||
x7 := b[y*8+3]
|
x7 := src[y*8+3]
|
||||||
|
|
||||||
// Stage 1.
|
// Stage 1.
|
||||||
x8 := w7 * (x4 + x5)
|
x8 := w7 * (x4 + x5)
|
||||||
@ -119,14 +123,14 @@ func idct(b *block) {
|
|||||||
x4 = (r2*(x4-x5) + 128) >> 8
|
x4 = (r2*(x4-x5) + 128) >> 8
|
||||||
|
|
||||||
// Stage 4.
|
// Stage 4.
|
||||||
b[8*y+0] = (x7 + x1) >> 8
|
src[8*y+0] = (x7 + x1) >> 8
|
||||||
b[8*y+1] = (x3 + x2) >> 8
|
src[8*y+1] = (x3 + x2) >> 8
|
||||||
b[8*y+2] = (x0 + x4) >> 8
|
src[8*y+2] = (x0 + x4) >> 8
|
||||||
b[8*y+3] = (x8 + x6) >> 8
|
src[8*y+3] = (x8 + x6) >> 8
|
||||||
b[8*y+4] = (x8 - x6) >> 8
|
src[8*y+4] = (x8 - x6) >> 8
|
||||||
b[8*y+5] = (x0 - x4) >> 8
|
src[8*y+5] = (x0 - x4) >> 8
|
||||||
b[8*y+6] = (x3 - x2) >> 8
|
src[8*y+6] = (x3 - x2) >> 8
|
||||||
b[8*y+7] = (x7 - x1) >> 8
|
src[8*y+7] = (x7 - x1) >> 8
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vertical 1-D IDCT.
|
// Vertical 1-D IDCT.
|
||||||
@ -136,14 +140,14 @@ func idct(b *block) {
|
|||||||
// we do not bother to check for the all-zero case.
|
// we do not bother to check for the all-zero case.
|
||||||
|
|
||||||
// Prescale.
|
// Prescale.
|
||||||
y0 := (b[8*0+x] << 8) + 8192
|
y0 := (src[8*0+x] << 8) + 8192
|
||||||
y1 := b[8*4+x] << 8
|
y1 := src[8*4+x] << 8
|
||||||
y2 := b[8*6+x]
|
y2 := src[8*6+x]
|
||||||
y3 := b[8*2+x]
|
y3 := src[8*2+x]
|
||||||
y4 := b[8*1+x]
|
y4 := src[8*1+x]
|
||||||
y5 := b[8*7+x]
|
y5 := src[8*7+x]
|
||||||
y6 := b[8*5+x]
|
y6 := src[8*5+x]
|
||||||
y7 := b[8*3+x]
|
y7 := src[8*3+x]
|
||||||
|
|
||||||
// Stage 1.
|
// Stage 1.
|
||||||
y8 := w7*(y4+y5) + 4
|
y8 := w7*(y4+y5) + 4
|
||||||
@ -173,18 +177,28 @@ func idct(b *block) {
|
|||||||
y4 = (r2*(y4-y5) + 128) >> 8
|
y4 = (r2*(y4-y5) + 128) >> 8
|
||||||
|
|
||||||
// Stage 4.
|
// Stage 4.
|
||||||
b[8*0+x] = (y7 + y1) >> 14
|
src[8*0+x] = (y7 + y1) >> 14
|
||||||
b[8*1+x] = (y3 + y2) >> 14
|
src[8*1+x] = (y3 + y2) >> 14
|
||||||
b[8*2+x] = (y0 + y4) >> 14
|
src[8*2+x] = (y0 + y4) >> 14
|
||||||
b[8*3+x] = (y8 + y6) >> 14
|
src[8*3+x] = (y8 + y6) >> 14
|
||||||
b[8*4+x] = (y8 - y6) >> 14
|
src[8*4+x] = (y8 - y6) >> 14
|
||||||
b[8*5+x] = (y0 - y4) >> 14
|
src[8*5+x] = (y0 - y4) >> 14
|
||||||
b[8*6+x] = (y3 - y2) >> 14
|
src[8*6+x] = (y3 - y2) >> 14
|
||||||
b[8*7+x] = (y7 - y1) >> 14
|
src[8*7+x] = (y7 - y1) >> 14
|
||||||
}
|
}
|
||||||
|
|
||||||
// Level shift.
|
// Level shift by +128, clip to [0, 255], and write to dst.
|
||||||
for i := range *b {
|
for y := 0; y < 8; y++ {
|
||||||
b[i] += 128
|
for x := 0; x < 8; x++ {
|
||||||
|
c := src[y*8+x]
|
||||||
|
if c < -128 {
|
||||||
|
c = 0
|
||||||
|
} else if c > 127 {
|
||||||
|
c = 255
|
||||||
|
} else {
|
||||||
|
c += 128
|
||||||
|
}
|
||||||
|
dst[y*stride+x] = uint8(c)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,6 @@ type decoder struct {
|
|||||||
huff [maxTc + 1][maxTh + 1]huffman
|
huff [maxTc + 1][maxTh + 1]huffman
|
||||||
quant [maxTq + 1]block
|
quant [maxTq + 1]block
|
||||||
b bits
|
b bits
|
||||||
blocks [nComponent][maxH * maxV]block
|
|
||||||
tmp [1024]byte
|
tmp [1024]byte
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,45 +181,6 @@ func (d *decoder) processDQT(n int) os.Error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clip x to the range [0, 255] inclusive.
|
|
||||||
func clip(x int) uint8 {
|
|
||||||
if x < 0 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
if x > 255 {
|
|
||||||
return 255
|
|
||||||
}
|
|
||||||
return uint8(x)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store the MCU to the image.
|
|
||||||
func (d *decoder) storeMCU(mx, my int) {
|
|
||||||
h0, v0 := d.comps[0].h, d.comps[0].v
|
|
||||||
// Store the luma blocks.
|
|
||||||
for v := 0; v < v0; v++ {
|
|
||||||
for h := 0; h < h0; h++ {
|
|
||||||
p := 8 * ((v0*my+v)*d.img.YStride + (h0*mx + h))
|
|
||||||
for y := 0; y < 8; y++ {
|
|
||||||
for x := 0; x < 8; x++ {
|
|
||||||
d.img.Y[p] = clip(d.blocks[0][h0*v+h][8*y+x])
|
|
||||||
p++
|
|
||||||
}
|
|
||||||
p += d.img.YStride - 8
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Store the chroma blocks.
|
|
||||||
p := 8 * (my*d.img.CStride + mx)
|
|
||||||
for y := 0; y < 8; y++ {
|
|
||||||
for x := 0; x < 8; x++ {
|
|
||||||
d.img.Cb[p] = clip(d.blocks[1][0][8*y+x])
|
|
||||||
d.img.Cr[p] = clip(d.blocks[2][0][8*y+x])
|
|
||||||
p++
|
|
||||||
}
|
|
||||||
p += d.img.CStride - 8
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Specified in section B.2.3.
|
// Specified in section B.2.3.
|
||||||
func (d *decoder) processSOS(n int) os.Error {
|
func (d *decoder) processSOS(n int) os.Error {
|
||||||
if n != 4+2*nComponent {
|
if n != 4+2*nComponent {
|
||||||
@ -275,14 +235,18 @@ func (d *decoder) processSOS(n int) os.Error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
mcu, expectedRST := 0, uint8(rst0Marker)
|
mcu, expectedRST := 0, uint8(rst0Marker)
|
||||||
var allZeroes block
|
var (
|
||||||
var dc [nComponent]int
|
allZeroes, b block
|
||||||
|
dc [nComponent]int
|
||||||
|
)
|
||||||
for my := 0; my < myy; my++ {
|
for my := 0; my < myy; my++ {
|
||||||
for mx := 0; mx < mxx; mx++ {
|
for mx := 0; mx < mxx; mx++ {
|
||||||
for i := 0; i < nComponent; i++ {
|
for i := 0; i < nComponent; i++ {
|
||||||
qt := &d.quant[d.comps[i].tq]
|
qt := &d.quant[d.comps[i].tq]
|
||||||
for j := 0; j < d.comps[i].h*d.comps[i].v; j++ {
|
for j := 0; j < d.comps[i].h*d.comps[i].v; j++ {
|
||||||
d.blocks[i][j] = allZeroes
|
// TODO(nigeltao): make this a "var b block" once the compiler's escape
|
||||||
|
// analysis is good enough to allocate it on the stack, not the heap.
|
||||||
|
b = allZeroes
|
||||||
|
|
||||||
// Decode the DC coefficient, as specified in section F.2.2.1.
|
// Decode the DC coefficient, as specified in section F.2.2.1.
|
||||||
value, err := d.decodeHuffman(&d.huff[dcTableClass][scanComps[i].td])
|
value, err := d.decodeHuffman(&d.huff[dcTableClass][scanComps[i].td])
|
||||||
@ -297,7 +261,7 @@ func (d *decoder) processSOS(n int) os.Error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
dc[i] += dcDelta
|
dc[i] += dcDelta
|
||||||
d.blocks[i][j][0] = dc[i] * qt[0]
|
b[0] = dc[i] * qt[0]
|
||||||
|
|
||||||
// Decode the AC coefficients, as specified in section F.2.2.2.
|
// Decode the AC coefficients, as specified in section F.2.2.2.
|
||||||
for k := 1; k < blockSize; k++ {
|
for k := 1; k < blockSize; k++ {
|
||||||
@ -316,7 +280,7 @@ func (d *decoder) processSOS(n int) os.Error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
d.blocks[i][j][unzig[k]] = ac * qt[k]
|
b[unzig[k]] = ac * qt[k]
|
||||||
} else {
|
} else {
|
||||||
if val0 != 0x0f {
|
if val0 != 0x0f {
|
||||||
break
|
break
|
||||||
@ -325,10 +289,19 @@ func (d *decoder) processSOS(n int) os.Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
idct(&d.blocks[i][j])
|
// Perform the inverse DCT and store the MCU component to the image.
|
||||||
|
switch i {
|
||||||
|
case 0:
|
||||||
|
mx0 := h0*mx + (j % 2)
|
||||||
|
my0 := v0*my + (j / 2)
|
||||||
|
idct(d.img.Y[8*(my0*d.img.YStride+mx0):], d.img.YStride, &b)
|
||||||
|
case 1:
|
||||||
|
idct(d.img.Cb[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
|
||||||
|
case 2:
|
||||||
|
idct(d.img.Cr[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
|
||||||
|
}
|
||||||
} // for j
|
} // for j
|
||||||
} // for i
|
} // for i
|
||||||
d.storeMCU(mx, my)
|
|
||||||
mcu++
|
mcu++
|
||||||
if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
|
if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
|
||||||
// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
|
// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
|
||||||
|
Loading…
Reference in New Issue
Block a user