// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// runtime·duffzero is a Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// the routine to zero chunks of memory.
// Do not change duffzero without also
// changing clearfat in cmd/?g/ggen.go.
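//
// For example (an illustration, not the compiler's exact algorithm): on
// 386, duffzero below is 128 STOSL instructions followed by a RET, and
// each STOSL zeroes 4 bytes. To zero n bytes (a multiple of 4, at most
// 128*4 = 512), the compiler can enter duffzero at the offset that
// leaves exactly n/4 STOSLs before the RET.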

// runtime·duffcopy is a Duff's device for copying memory.
// The compiler jumps to computed addresses within
// the routine to copy chunks of memory.
// Source and destination must not overlap.
// Do not change duffcopy without also
// changing blockcopy in cmd/?g/cgen.go.

// See the zero* and copy* generators below
// for architecture-specific comments.

// mkduff generates duff_*.s.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
)

func main() {
	gen("amd64", notags, zeroAMD64, copyAMD64)
	gen("386", notags, zero386, copy386)
	gen("arm", notags, zeroARM, copyARM)
	gen("arm64", notags, zeroARM64, copyARM64)
	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
}

func gen(arch string, tags, zero, copy func(io.Writer)) {
	var buf bytes.Buffer

	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkduff.go")
	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
	fmt.Fprintln(&buf, "// See mkduff.go for comments.")
	tags(&buf)
	fmt.Fprintln(&buf, "#include \"textflag.h\"")
	fmt.Fprintln(&buf)
	zero(&buf)
	fmt.Fprintln(&buf)
	copy(&buf)

	if err := ioutil.WriteFile("duff_"+arch+".s", buf.Bytes(), 0644); err != nil {
		log.Fatalln(err)
	}
}
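
// For reference (derived from the Fprintln calls in gen), each generated
// duff_*.s file begins:
//
//	// AUTO-GENERATED by mkduff.go
//	// Run go generate from src/runtime to update.
//	// See mkduff.go for comments.
//	<arch-specific build tags, if any>
//	#include "textflag.h"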

func notags(w io.Writer) { fmt.Fprintln(w) }

func zeroAMD64(w io.Writer) {
	// AX: zero
	// DI: ptr to memory to be zeroed
	// DI is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	// A hybrid zeroing strategy (see CL 2585): mostly 32-byte
	// MOVQ/ADDQ blocks, which have higher throughput than STOSQ
	// for large clears, plus a short STOSQ tail for small clears,
	// where the DI pre-adjustment needed to jump into the middle
	// of a MOVQ block would not pay for itself.
for i := 0; i < 31; i++ {
		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
		fmt.Fprintln(w, "\tADDQ\t$32,DI")
		fmt.Fprintln(w)
	}
	for i := 0; i < 4; i++ {
		fmt.Fprintln(w, "\tSTOSQ")
}
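	// In total, the routine zeroes 31*32 + 4*8 = 1024 bytes.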
	fmt.Fprintln(w, "\tRET")
}

func copyAMD64(w io.Writer) {
	// SI: ptr to source memory
	// DI: ptr to destination memory
	// SI and DI are updated as a side effect.
	//
	// This is equivalent to a sequence of MOVSQ but
	// for some reason that is 3.5x slower than this code.
	// The STOSQ instructions in duffzero seem fine, though.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVQ\t(SI), CX")
		fmt.Fprintln(w, "\tADDQ\t$8, SI")
		fmt.Fprintln(w, "\tMOVQ\tCX, (DI)")
		fmt.Fprintln(w, "\tADDQ\t$8, DI")
		fmt.Fprintln(w)
}
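	// In total, the routine copies 128*8 = 1024 bytes, matching duffzero.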
	fmt.Fprintln(w, "\tRET")
}

func zero386(w io.Writer) {
	// AX: zero
	// DI: ptr to memory to be zeroed
	// DI is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tSTOSL")
	}
	fmt.Fprintln(w, "\tRET")
}

func copy386(w io.Writer) {
	// SI: ptr to source memory
	// DI: ptr to destination memory
	// SI and DI are updated as a side effect.
	//
	// This is equivalent to a sequence of MOVSL but
	// for some reason MOVSL is really slow.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVL\t(SI), CX")
		fmt.Fprintln(w, "\tADDL\t$4, SI")
		fmt.Fprintln(w, "\tMOVL\tCX, (DI)")
		fmt.Fprintln(w, "\tADDL\t$4, DI")
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}

func zeroARM(w io.Writer) {
	// R0: zero
	// R1: ptr to memory to be zeroed
	// R1 is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
for i := 0; i < 128; i++ {
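		// MOVW.P is a post-indexed store: it writes R0 to [R1], then
		// advances R1 by 4, which is how R1 ends up updated on return.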
		fmt.Fprintln(w, "\tMOVW.P\tR0, 4(R1)")
	}
	fmt.Fprintln(w, "\tRET")
}

func copyARM(w io.Writer) {
	// R0: scratch space
	// R1: ptr to source memory
	// R2: ptr to destination memory
	// R1 and R2 are updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVW.P\t4(R1), R0")
		fmt.Fprintln(w, "\tMOVW.P\tR0, 4(R2)")
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}

func zeroARM64(w io.Writer) {
	// ZR: always zero
	// R16 (aka REGRT1): ptr to memory to be zeroed - 8
	// On return, R16 points to the last zeroed dword.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
for i := 0; i < 128; i++ {
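		// MOVD.W is a pre-indexed store with write-back: it stores ZR
		// at [R16+8] and updates R16 to that address, matching the
		// "ptr - 8" entry convention above.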
		fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
	}
	fmt.Fprintln(w, "\tRET")
}

func copyARM64(w io.Writer) {
	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
}

func tagsPPC64x(w io.Writer) {
	fmt.Fprintln(w)
	fmt.Fprintln(w, "// +build ppc64 ppc64le")
	fmt.Fprintln(w)
}

func zeroPPC64x(w io.Writer) {
	// R0: always zero
	// R3 (aka REGRT1): ptr to memory to be zeroed - 8
	// On return, R3 points to the last zeroed dword.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
for i := 0; i < 128; i++ {
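		// MOVDU is a store-with-update: it stores R0 at [R3+8] and
		// updates R3 to that address, mirroring zeroARM64 above.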
		fmt.Fprintln(w, "\tMOVDU\tR0, 8(R3)")
	}
	fmt.Fprintln(w, "\tRETURN")
}

func copyPPC64x(w io.Writer) {
	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
}