// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// runtime·duffzero is a Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// the routine to zero chunks of memory.
// Do not change duffzero without also
// changing clearfat in cmd/?g/ggen.go.
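//
// For example (an illustration, not the compiler's exact algorithm): on
// 386, duffzero below is 128 STOSL instructions followed by a RET, and
// each STOSL zeroes 4 bytes. To zero n bytes (a multiple of 4, at most
// 128*4 = 512), the compiler can enter duffzero at the offset that
// leaves exactly n/4 STOSLs before the RET.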

// runtime·duffcopy is a Duff's device for copying memory.
// The compiler jumps to computed addresses within
// the routine to copy chunks of memory.
// Source and destination must not overlap.
// Do not change duffcopy without also
// changing blockcopy in cmd/?g/cgen.go.

// See the zero* and copy* generators below
// for architecture-specific comments.

// mkduff generates duff_*.s.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
)

func main() {
	gen("amd64", notags, zeroAMD64, copyAMD64)
	gen("386", notags, zero386, copy386)
	gen("arm", notags, zeroARM, copyARM)
	gen("arm64", notags, zeroARM64, copyARM64)
	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
}

func gen(arch string, tags, zero, copy func(io.Writer)) {
	var buf bytes.Buffer

	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkduff.go")
	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
	fmt.Fprintln(&buf, "// See mkduff.go for comments.")
	tags(&buf)
	fmt.Fprintln(&buf, "#include \"textflag.h\"")
	fmt.Fprintln(&buf)
	zero(&buf)
	fmt.Fprintln(&buf)
	copy(&buf)

	if err := ioutil.WriteFile("duff_"+arch+".s", buf.Bytes(), 0644); err != nil {
		log.Fatalln(err)
	}
}
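
// For reference (derived from the Fprintln calls in gen), each generated
// duff_*.s file begins:
//
//	// AUTO-GENERATED by mkduff.go
//	// Run go generate from src/runtime to update.
//	// See mkduff.go for comments.
//	<arch-specific build tags, if any>
//	#include "textflag.h"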

func notags(w io.Writer) { fmt.Fprintln(w) }

func zeroAMD64(w io.Writer) {
	// AX: zero
	// DI: ptr to memory to be zeroed
	// DI is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	// A hybrid zeroing strategy (see CL 2585): mostly 32-byte
	// MOVQ/ADDQ blocks, which have higher throughput than STOSQ
	// for large clears, plus a short STOSQ tail for small clears,
	// where the DI pre-adjustment needed to jump into the middle
	// of a MOVQ block would not pay for itself.
for i := 0; i < 31; i++ {
		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
		fmt.Fprintln(w, "\tADDQ\t$32,DI")
		fmt.Fprintln(w)
	}
	for i := 0; i < 4; i++ {
		fmt.Fprintln(w, "\tSTOSQ")
}
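	// In total, the routine zeroes 31*32 + 4*8 = 1024 bytes.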
	fmt.Fprintln(w, "\tRET")
}

func copyAMD64(w io.Writer) {
	// SI: ptr to source memory
	// DI: ptr to destination memory
	// SI and DI are updated as a side effect.
	//
	// This is equivalent to a sequence of MOVSQ but
	// for some reason that is 3.5x slower than this code.
	// The STOSQ instructions in duffzero seem fine, though.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVQ\t(SI), CX")
		fmt.Fprintln(w, "\tADDQ\t$8, SI")
		fmt.Fprintln(w, "\tMOVQ\tCX, (DI)")
		fmt.Fprintln(w, "\tADDQ\t$8, DI")
		fmt.Fprintln(w)
}
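	// In total, the routine copies 128*8 = 1024 bytes, matching duffzero.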
	fmt.Fprintln(w, "\tRET")
}

func zero386(w io.Writer) {
	// AX: zero
	// DI: ptr to memory to be zeroed
	// DI is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tSTOSL")
	}
	fmt.Fprintln(w, "\tRET")
}

func copy386(w io.Writer) {
	// SI: ptr to source memory
	// DI: ptr to destination memory
	// SI and DI are updated as a side effect.
	//
	// This is equivalent to a sequence of MOVSL but
	// for some reason MOVSL is really slow.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVL\t(SI), CX")
		fmt.Fprintln(w, "\tADDL\t$4, SI")
		fmt.Fprintln(w, "\tMOVL\tCX, (DI)")
		fmt.Fprintln(w, "\tADDL\t$4, DI")
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}

func zeroARM(w io.Writer) {
	// R0: zero
	// R1: ptr to memory to be zeroed
	// R1 is updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
for i := 0; i < 128; i++ {
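		// MOVW.P is a post-indexed store: it writes R0 to [R1], then
		// advances R1 by 4, which is how R1 ends up updated on return.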
		fmt.Fprintln(w, "\tMOVW.P\tR0, 4(R1)")
	}
	fmt.Fprintln(w, "\tRET")
}

func copyARM(w io.Writer) {
	// R0: scratch space
	// R1: ptr to source memory
	// R2: ptr to destination memory
	// R1 and R2 are updated as a side effect.
	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Fprintln(w, "\tMOVW.P\t4(R1), R0")
		fmt.Fprintln(w, "\tMOVW.P\tR0, 4(R2)")
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, "\tRET")
}

func zeroARM64(w io.Writer) {
	// ZR: always zero
	// R16 (aka REGRT1): ptr to memory to be zeroed - 8
	// On return, R16 points to the last zeroed dword.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
for i := 0; i < 128; i++ {
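		// MOVD.W is a pre-indexed store with write-back: it stores ZR
		// at [R16+8] and updates R16 to that address, matching the
		// "ptr - 8" entry convention above.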
		fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
	}
	fmt.Fprintln(w, "\tRET")
}

func copyARM64(w io.Writer) {
	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
}

func tagsPPC64x(w io.Writer) {
	fmt.Fprintln(w)
	fmt.Fprintln(w, "// +build ppc64 ppc64le")
	fmt.Fprintln(w)
}

func zeroPPC64x(w io.Writer) {
	// R0: always zero
	// R3 (aka REGRT1): ptr to memory to be zeroed - 8
	// On return, R3 points to the last zeroed dword.
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
for i := 0; i < 128; i++ {
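		// MOVDU is a store-with-update: it stores R0 at [R3+8] and
		// updates R3 to that address, mirroring zeroARM64 above.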
		fmt.Fprintln(w, "\tMOVDU\tR0, 8(R3)")
	}
	fmt.Fprintln(w, "\tRETURN")
}

func copyPPC64x(w io.Writer) {
	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
}