diff --git a/src/pkg/image/jpeg/idct.go b/src/pkg/image/jpeg/idct.go
index e5a2f40f5db..b387dfdffd1 100644
--- a/src/pkg/image/jpeg/idct.go
+++ b/src/pkg/image/jpeg/idct.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+package jpeg
+
 // This is a Go translation of idct.c from
 //
 // http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
@@ -35,8 +37,6 @@
  *
  */
 
-package jpeg
-
 const (
 	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
 	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
@@ -55,41 +55,45 @@ const (
 	r2 = 181 // 256/sqrt(2)
 )
 
-// 2-D Inverse Discrete Cosine Transformation, followed by a +128 level shift.
+// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
+// +128 level shift and a clip to [0, 255], writing the results to dst.
+// stride is the number of elements between successive rows of dst.
 //
-// The input coefficients should already have been multiplied by the appropriate quantization table.
-// We use fixed-point computation, with the number of bits for the fractional component varying over the
-// intermediate stages. The final values are expected to range within [0, 255], after a +128 level shift.
+// The input coefficients should already have been multiplied by the
+// appropriate quantization table. We use fixed-point computation, with the
+// number of bits for the fractional component varying over the intermediate
+// stages.
 //
-// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the discrete W transform and
-// for the discrete Fourier transform", IEEE Trans. on ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(b *block) {
+// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+// ASSP, Vol. ASSP-32, pp. 803-816, Aug. 1984.
+func idct(dst []byte, stride int, src *block) {
 	// Horizontal 1-D IDCT.
 	for y := 0; y < 8; y++ {
 		// If all the AC components are zero, then the IDCT is trivial.
-		if b[y*8+1] == 0 && b[y*8+2] == 0 && b[y*8+3] == 0 &&
-			b[y*8+4] == 0 && b[y*8+5] == 0 && b[y*8+6] == 0 && b[y*8+7] == 0 {
-			dc := b[y*8+0] << 3
-			b[y*8+0] = dc
-			b[y*8+1] = dc
-			b[y*8+2] = dc
-			b[y*8+3] = dc
-			b[y*8+4] = dc
-			b[y*8+5] = dc
-			b[y*8+6] = dc
-			b[y*8+7] = dc
+		if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
+			src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
+			dc := src[y*8+0] << 3
+			src[y*8+0] = dc
+			src[y*8+1] = dc
+			src[y*8+2] = dc
+			src[y*8+3] = dc
+			src[y*8+4] = dc
+			src[y*8+5] = dc
+			src[y*8+6] = dc
+			src[y*8+7] = dc
 			continue
 		}
 
 		// Prescale.
-		x0 := (b[y*8+0] << 11) + 128
-		x1 := b[y*8+4] << 11
-		x2 := b[y*8+6]
-		x3 := b[y*8+2]
-		x4 := b[y*8+1]
-		x5 := b[y*8+7]
-		x6 := b[y*8+5]
-		x7 := b[y*8+3]
+		x0 := (src[y*8+0] << 11) + 128
+		x1 := src[y*8+4] << 11
+		x2 := src[y*8+6]
+		x3 := src[y*8+2]
+		x4 := src[y*8+1]
+		x5 := src[y*8+7]
+		x6 := src[y*8+5]
+		x7 := src[y*8+3]
 
 		// Stage 1.
 		x8 := w7 * (x4 + x5)
@@ -119,14 +123,14 @@ func idct(b *block) {
 		x4 = (r2*(x4-x5) + 128) >> 8
 
 		// Stage 4.
-		b[8*y+0] = (x7 + x1) >> 8
-		b[8*y+1] = (x3 + x2) >> 8
-		b[8*y+2] = (x0 + x4) >> 8
-		b[8*y+3] = (x8 + x6) >> 8
-		b[8*y+4] = (x8 - x6) >> 8
-		b[8*y+5] = (x0 - x4) >> 8
-		b[8*y+6] = (x3 - x2) >> 8
-		b[8*y+7] = (x7 - x1) >> 8
+		src[8*y+0] = (x7 + x1) >> 8
+		src[8*y+1] = (x3 + x2) >> 8
+		src[8*y+2] = (x0 + x4) >> 8
+		src[8*y+3] = (x8 + x6) >> 8
+		src[8*y+4] = (x8 - x6) >> 8
+		src[8*y+5] = (x0 - x4) >> 8
+		src[8*y+6] = (x3 - x2) >> 8
+		src[8*y+7] = (x7 - x1) >> 8
 	}
 
 	// Vertical 1-D IDCT.
@@ -136,14 +140,14 @@ func idct(b *block) {
 		// we do not bother to check for the all-zero case.
 
 		// Prescale.
-		y0 := (b[8*0+x] << 8) + 8192
-		y1 := b[8*4+x] << 8
-		y2 := b[8*6+x]
-		y3 := b[8*2+x]
-		y4 := b[8*1+x]
-		y5 := b[8*7+x]
-		y6 := b[8*5+x]
-		y7 := b[8*3+x]
+		y0 := (src[8*0+x] << 8) + 8192
+		y1 := src[8*4+x] << 8
+		y2 := src[8*6+x]
+		y3 := src[8*2+x]
+		y4 := src[8*1+x]
+		y5 := src[8*7+x]
+		y6 := src[8*5+x]
+		y7 := src[8*3+x]
 
 		// Stage 1.
 		y8 := w7*(y4+y5) + 4
@@ -173,18 +177,28 @@ func idct(b *block) {
 		y4 = (r2*(y4-y5) + 128) >> 8
 
 		// Stage 4.
-		b[8*0+x] = (y7 + y1) >> 14
-		b[8*1+x] = (y3 + y2) >> 14
-		b[8*2+x] = (y0 + y4) >> 14
-		b[8*3+x] = (y8 + y6) >> 14
-		b[8*4+x] = (y8 - y6) >> 14
-		b[8*5+x] = (y0 - y4) >> 14
-		b[8*6+x] = (y3 - y2) >> 14
-		b[8*7+x] = (y7 - y1) >> 14
+		src[8*0+x] = (y7 + y1) >> 14
+		src[8*1+x] = (y3 + y2) >> 14
+		src[8*2+x] = (y0 + y4) >> 14
+		src[8*3+x] = (y8 + y6) >> 14
+		src[8*4+x] = (y8 - y6) >> 14
+		src[8*5+x] = (y0 - y4) >> 14
+		src[8*6+x] = (y3 - y2) >> 14
+		src[8*7+x] = (y7 - y1) >> 14
 	}
 
-	// Level shift.
-	for i := range *b {
-		b[i] += 128
+	// Level shift by +128, clip to [0, 255], and write to dst.
+	for y := 0; y < 8; y++ {
+		for x := 0; x < 8; x++ {
+			c := src[y*8+x]
+			if c < -128 {
+				c = 0
+			} else if c > 127 {
+				c = 255
+			} else {
+				c += 128
+			}
+			dst[y*stride+x] = uint8(c)
+		}
 	}
 }
diff --git a/src/pkg/image/jpeg/reader.go b/src/pkg/image/jpeg/reader.go
index 21a6fff9698..74df9ac4b76 100644
--- a/src/pkg/image/jpeg/reader.go
+++ b/src/pkg/image/jpeg/reader.go
@@ -96,7 +96,6 @@ type decoder struct {
 	huff          [maxTc + 1][maxTh + 1]huffman
 	quant         [maxTq + 1]block
 	b             bits
-	blocks        [nComponent][maxH * maxV]block
 	tmp           [1024]byte
 }
 
@@ -182,45 +181,6 @@ func (d *decoder) processDQT(n int) os.Error {
 	return nil
 }
 
-// Clip x to the range [0, 255] inclusive.
-func clip(x int) uint8 {
-	if x < 0 {
-		return 0
-	}
-	if x > 255 {
-		return 255
-	}
-	return uint8(x)
-}
-
-// Store the MCU to the image.
-func (d *decoder) storeMCU(mx, my int) {
-	h0, v0 := d.comps[0].h, d.comps[0].v
-	// Store the luma blocks.
-	for v := 0; v < v0; v++ {
-		for h := 0; h < h0; h++ {
-			p := 8 * ((v0*my+v)*d.img.YStride + (h0*mx + h))
-			for y := 0; y < 8; y++ {
-				for x := 0; x < 8; x++ {
-					d.img.Y[p] = clip(d.blocks[0][h0*v+h][8*y+x])
-					p++
-				}
-				p += d.img.YStride - 8
-			}
-		}
-	}
-	// Store the chroma blocks.
-	p := 8 * (my*d.img.CStride + mx)
-	for y := 0; y < 8; y++ {
-		for x := 0; x < 8; x++ {
-			d.img.Cb[p] = clip(d.blocks[1][0][8*y+x])
-			d.img.Cr[p] = clip(d.blocks[2][0][8*y+x])
-			p++
-		}
-		p += d.img.CStride - 8
-	}
-}
-
 // Specified in section B.2.3.
 func (d *decoder) processSOS(n int) os.Error {
 	if n != 4+2*nComponent {
@@ -275,14 +235,18 @@ func (d *decoder) processSOS(n int) os.Error {
 	}
 
 	mcu, expectedRST := 0, uint8(rst0Marker)
-	var allZeroes block
-	var dc [nComponent]int
+	var (
+		allZeroes, b block
+		dc           [nComponent]int
+	)
 	for my := 0; my < myy; my++ {
 		for mx := 0; mx < mxx; mx++ {
 			for i := 0; i < nComponent; i++ {
 				qt := &d.quant[d.comps[i].tq]
 				for j := 0; j < d.comps[i].h*d.comps[i].v; j++ {
-					d.blocks[i][j] = allZeroes
+					// TODO(nigeltao): make this a "var b block" once the compiler's escape
+					// analysis is good enough to allocate it on the stack, not the heap.
+					b = allZeroes
 
 					// Decode the DC coefficient, as specified in section F.2.2.1.
 					value, err := d.decodeHuffman(&d.huff[dcTableClass][scanComps[i].td])
@@ -297,7 +261,7 @@ func (d *decoder) processSOS(n int) os.Error {
 						return err
 					}
 					dc[i] += dcDelta
-					d.blocks[i][j][0] = dc[i] * qt[0]
+					b[0] = dc[i] * qt[0]
 
 					// Decode the AC coefficients, as specified in section F.2.2.2.
 					for k := 1; k < blockSize; k++ {
@@ -316,7 +280,7 @@ func (d *decoder) processSOS(n int) os.Error {
 							if err != nil {
 								return err
 							}
-							d.blocks[i][j][unzig[k]] = ac * qt[k]
+							b[unzig[k]] = ac * qt[k]
 						} else {
 							if val0 != 0x0f {
 								break
@@ -325,10 +289,19 @@ func (d *decoder) processSOS(n int) os.Error {
 						}
 					}
 
-					idct(&d.blocks[i][j])
+					// Perform the inverse DCT and store the MCU component to the image.
+					switch i {
+					case 0:
+						mx0 := h0*mx + (j % 2)
+						my0 := v0*my + (j / 2)
+						idct(d.img.Y[8*(my0*d.img.YStride+mx0):], d.img.YStride, &b)
+					case 1:
+						idct(d.img.Cb[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
+					case 2:
+						idct(d.img.Cr[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
+					}
 				} // for j
 			} // for i
-			d.storeMCU(mx, my)
 			mcu++
 			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
 				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,