1
0
mirror of https://github.com/golang/go synced 2024-10-02 06:28:33 -06:00
go/src/runtime/mkduff.go

220 lines
6.0 KiB
Go
Raw Normal View History

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// runtime·duffzero is a Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// the routine to zero chunks of memory.
// Do not change duffzero without also
// changing the uses in cmd/compile/internal/*/*.go.
// runtime·duffcopy is a Duff's device for copying memory.
// The compiler jumps to computed addresses within
// the routine to copy chunks of memory.
// Source and destination must not overlap.
// Do not change duffcopy without also
// changing the uses in cmd/compile/internal/*/*.go.
// See the zero* and copy* generators below
// for architecture-specific comments.
// mkduff generates duff_*.s.
package main
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"log"
)
func main() {
gen("amd64", notags, zeroAMD64, copyAMD64)
gen("386", notags, zero386, copy386)
gen("arm", notags, zeroARM, copyARM)
gen("arm64", notags, zeroARM64, copyARM64)
gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
}
// gen assembles the contents of duff_<arch>.s in memory and writes the
// file into the current directory with mode 0644.
//
// The file consists of a fixed three-line generated-code banner, whatever
// tags emits, the textflag.h include followed by a blank line, the output
// of zero, a blank line, and finally the output of copy. A failure to
// write the file is fatal.
func gen(arch string, tags, zero, copy func(io.Writer)) {
	out := new(bytes.Buffer)

	banner := [...]string{
		"// Code generated by mkduff.go; DO NOT EDIT.",
		"// Run go generate from src/runtime to update.",
		"// See mkduff.go for comments.",
	}
	for _, line := range banner {
		fmt.Fprintln(out, line)
	}

	tags(out)
	fmt.Fprint(out, "#include \"textflag.h\"\n")
	fmt.Fprint(out, "\n")
	zero(out)
	fmt.Fprint(out, "\n")
	copy(out)

	name := "duff_" + arch + ".s"
	err := ioutil.WriteFile(name, out.Bytes(), 0644)
	if err != nil {
		log.Fatalln(err)
	}
}
// notags is the tag emitter for architectures that need no build
// constraint: it writes only the single blank line that separates the
// generated-file banner from the include directive.
func notags(w io.Writer) {
	fmt.Fprint(w, "\n")
}
// zeroAMD64 emits runtime·duffzero for amd64.
//
// Register contract of the generated routine:
//
//	X0: zero
//	DI: ptr to memory to be zeroed
//
// DI is updated as a side effect.
//
// The routine is 16 identical units. Each unit issues four 16-byte MOVUPS
// stores (64 bytes in total) and then advances DI by 64, so the unit
// boundaries are the addresses callers can jump to.
func zeroAMD64(w io.Writer) {
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	for i := 0; i < 16; i++ {
		fmt.Fprintln(w, "\tMOVUPS\tX0,(DI)")
		fmt.Fprintln(w, "\tMOVUPS\tX0,16(DI)")
		fmt.Fprintln(w, "\tMOVUPS\tX0,32(DI)")
		fmt.Fprintln(w, "\tMOVUPS\tX0,48(DI)")
		// We use lea instead of add, to avoid clobbering flags.
		fmt.Fprintln(w, "\tLEAQ\t64(DI),DI")
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}
// copyAMD64 emits runtime·duffcopy for amd64.
//
// Register contract of the generated routine:
//
//	SI: ptr to source memory
//	DI: ptr to destination memory
//
// SI and DI are updated as a side effect, and X0 carries each 16-byte
// chunk from source to destination.
//
// This is equivalent to a sequence of MOVSQ, but that form was found to
// be about 3.5x slower than the explicit load/store pairs emitted here.
func copyAMD64(w io.Writer) {
	unit := [...]string{
		"\tMOVUPS\t(SI), X0",
		"\tADDQ\t$16, SI",
		"\tMOVUPS\tX0, (DI)",
		"\tADDQ\t$16, DI",
		"", // blank line separating one unit from the next
	}

	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for rep := 0; rep < 64; rep++ {
		for _, line := range unit {
			fmt.Fprintln(w, line)
		}
	}
	fmt.Fprintln(w, "\tRET")
}
// zero386 emits runtime·duffzero for 386.
//
// Register contract of the generated routine:
//
//	AX: zero
//	DI: ptr to memory to be zeroed
//
// DI is updated as a side effect. The body is 128 consecutive STOSL
// instructions followed by RET.
func zero386(w io.Writer) {
	const stores = 128

	fmt.Fprint(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0\n")
	for left := stores; left > 0; left-- {
		fmt.Fprint(w, "\tSTOSL\n")
	}
	fmt.Fprint(w, "\tRET\n")
}
// copy386 emits runtime·duffcopy for 386.
//
// Register contract of the generated routine:
//
//	SI: ptr to source memory
//	DI: ptr to destination memory
//
// SI and DI are updated as a side effect, and CX holds each 4-byte word
// in transit.
//
// This is equivalent to a sequence of MOVSL, which is avoided because
// MOVSL was found to be really slow.
func copy386(w io.Writer) {
	unit := [...]string{
		"\tMOVL\t(SI), CX",
		"\tADDL\t$4, SI",
		"\tMOVL\tCX, (DI)",
		"\tADDL\t$4, DI",
		"", // blank line separating one unit from the next
	}

	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for rep := 0; rep < 128; rep++ {
		for _, line := range unit {
			fmt.Fprintln(w, line)
		}
	}
	fmt.Fprintln(w, "\tRET")
}
// zeroARM emits runtime·duffzero for arm.
//
// Register contract of the generated routine:
//
//	R0: zero
//	R1: ptr to memory to be zeroed
//
// R1 is updated as a side effect. The body is the same MOVW.P store
// repeated 128 times, then RET.
func zeroARM(w io.Writer) {
	const (
		header = "TEXT runtime·duffzero(SB), NOSPLIT, $0-0"
		store  = "\tMOVW.P\tR0, 4(R1)"
		count  = 128
	)

	fmt.Fprintln(w, header)
	for n := 1; n <= count; n++ {
		fmt.Fprintln(w, store)
	}
	fmt.Fprintln(w, "\tRET")
}
// copyARM emits runtime·duffcopy for arm.
//
// Register contract of the generated routine:
//
//	R0: scratch space
//	R1: ptr to source memory
//	R2: ptr to destination memory
//
// R1 and R2 are updated as a side effect. Each of the 128 units is a
// MOVW.P load into R0 followed by a MOVW.P store from R0.
func copyARM(w io.Writer) {
	fmt.Fprint(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0\n")
	for left := 128; left > 0; left-- {
		fmt.Fprint(w, "\tMOVW.P\t4(R1), R0\n")
		fmt.Fprint(w, "\tMOVW.P\tR0, 4(R2)\n")
		fmt.Fprint(w, "\n")
	}
	fmt.Fprint(w, "\tRET\n")
}
// zeroARM64 emits runtime·duffzero for arm64.
//
// Register contract of the generated routine:
//
//	ZR: always zero
//	R16 (aka REGRT1): ptr to memory to be zeroed
//
// On return, R16 points to the last zeroed dword.
//
// The body is 63 STP.P stores of the (ZR, ZR) pair followed by one plain
// STP at (R16), i.e. 64 pair stores in total.
func zeroARM64(w io.Writer) {
	const pairs = 64

	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
	for k := 1; k <= pairs; k++ {
		if k == pairs {
			// The final pair is stored with a plain STP.
			fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)")
			break
		}
		fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)")
	}
	fmt.Fprintln(w, "\tRET")
}
// copyARM64 emits runtime·duffcopy for arm64.
//
// Register contract of the generated routine:
//
//	R16 (aka REGRT1): ptr to source memory
//	R17 (aka REGRT2): ptr to destination memory
//	R27 (aka REGTMP): scratch space
//
// R16 and R17 are updated as a side effect. Each of the 128 units is a
// MOVD.P load into R27 followed by a MOVD.P store from R27.
func copyARM64(w io.Writer) {
	unit := [...]string{
		"\tMOVD.P\t8(R16), R27",
		"\tMOVD.P\tR27, 8(R17)",
		"", // blank line separating one unit from the next
	}

	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for rep := 0; rep < 128; rep++ {
		for _, line := range unit {
			fmt.Fprintln(w, line)
		}
	}
	fmt.Fprintln(w, "\tRET")
}
// tagsPPC64x emits the build constraint line for the shared ppc64 /
// ppc64le file, with one blank line before it and one after it.
func tagsPPC64x(w io.Writer) {
	for _, line := range [...]string{
		"",
		"// +build ppc64 ppc64le",
		"",
	} {
		fmt.Fprintln(w, line)
	}
}
// zeroPPC64x emits runtime·duffzero for ppc64 and ppc64le.
//
// Register contract of the generated routine:
//
//	R0: always zero
//	R3 (aka REGRT1): ptr to memory to be zeroed - 8
//
// On return, R3 points to the last zeroed dword. The body is the same
// MOVDU store repeated 128 times, then RET.
func zeroPPC64x(w io.Writer) {
	const (
		header = "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0"
		store  = "\tMOVDU\tR0, 8(R3)"
		count  = 128
	)

	fmt.Fprintln(w, header)
	for n := 1; n <= count; n++ {
		fmt.Fprintln(w, store)
	}
	fmt.Fprintln(w, "\tRET")
}
// copyPPC64x is a placeholder: no duffcopy routine is generated for
// ppc64/ppc64le, so it writes only a TODO comment line into the output.
func copyPPC64x(w io.Writer) {
	const placeholder = "// TODO: Implement runtime·duffcopy."
	fmt.Fprint(w, placeholder+"\n")
}
// tagsMIPS64x emits the build constraint line for the shared mips64 /
// mips64le file, with one blank line before it and one after it.
func tagsMIPS64x(w io.Writer) {
	const constraint = "// +build mips64 mips64le"

	fmt.Fprint(w, "\n")
	fmt.Fprint(w, constraint+"\n")
	fmt.Fprint(w, "\n")
}
// zeroMIPS64x emits runtime·duffzero for mips64 and mips64le.
//
// Register contract of the generated routine:
//
//	R0: always zero
//	R1 (aka REGRT1): ptr to memory to be zeroed - 8
//
// On return, R1 points to the last zeroed dword. Each of the 128 units
// stores R0 at 8(R1) with MOVV and then adds 8 to R1 with ADDV.
func zeroMIPS64x(w io.Writer) {
	unit := [...]string{
		"\tMOVV\tR0, 8(R1)",
		"\tADDV\t$8, R1",
	}

	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
	for rep := 0; rep < 128; rep++ {
		for _, line := range unit {
			fmt.Fprintln(w, line)
		}
	}
	fmt.Fprintln(w, "\tRET")
}
// copyMIPS64x is a placeholder: no duffcopy routine is generated for
// mips64/mips64le, so it writes only a TODO comment line into the output.
func copyMIPS64x(w io.Writer) {
	const placeholder = "// TODO: Implement runtime·duffcopy."
	fmt.Fprint(w, placeholder+"\n")
}