1
0
mirror of https://github.com/golang/go synced 2024-09-29 13:24:28 -06:00

runtime, cmd/compile: implement and use DUFFCOPY on MIPS64

OS: Linux loongson 3.10.84 mips64el
CPU: Loongson 3A3000 quad core

name                   old time/op    new time/op    delta
BinaryTree17              23.5s ± 1%     23.2s ± 0%  -1.12%  (p=0.008 n=5+5)
Fannkuch11                10.2s ± 0%     10.1s ± 0%  -0.19%  (p=0.008 n=5+5)
FmtFprintfEmpty           450ns ± 0%     446ns ± 1%  -0.89%  (p=0.024 n=5+5)
FmtFprintfString          722ns ± 1%     721ns ± 1%    ~     (p=0.762 n=5+5)
FmtFprintfInt             693ns ± 2%     691ns ± 2%    ~     (p=0.889 n=5+5)
FmtFprintfIntInt          912ns ± 1%     911ns ± 0%    ~     (p=0.722 n=5+5)
FmtFprintfPrefixedInt    1.35µs ± 2%    1.35µs ± 2%    ~     (p=1.000 n=5+5)
FmtFprintfFloat          1.79µs ± 0%    1.78µs ± 0%    ~     (p=0.683 n=5+5)
FmtManyArgs              3.46µs ± 1%    3.48µs ± 1%    ~     (p=0.246 n=5+5)
GobDecode                48.8ms ± 1%    48.6ms ± 0%    ~     (p=0.222 n=5+5)
GobEncode                37.7ms ± 1%    37.4ms ± 1%    ~     (p=0.095 n=5+5)
Gzip                      1.72s ± 1%     1.72s ± 0%    ~     (p=0.905 n=5+4)
Gunzip                    342ms ± 0%     342ms ± 0%    ~     (p=0.421 n=5+5)
HTTPClientServer          219µs ± 1%     219µs ± 1%    ~     (p=1.000 n=5+5)
JSONEncode               89.1ms ± 1%    89.4ms ± 1%    ~     (p=0.222 n=5+5)
JSONDecode                292ms ± 1%     291ms ± 0%    ~     (p=0.421 n=5+5)
Mandelbrot200            15.7ms ± 0%    15.6ms ± 0%    ~     (p=0.690 n=5+5)
GoParse                  19.5ms ± 1%    19.6ms ± 1%    ~     (p=0.310 n=5+5)
RegexpMatchEasy0_32       534ns ± 1%     529ns ± 1%    ~     (p=0.056 n=5+5)
RegexpMatchEasy0_1K      2.75µs ± 0%    2.74µs ± 0%  -0.46%  (p=0.008 n=5+5)
RegexpMatchEasy1_32       572ns ± 2%     565ns ± 3%    ~     (p=0.310 n=5+5)
RegexpMatchEasy1_1K      4.15µs ± 0%    4.15µs ± 1%    ~     (p=0.548 n=5+5)
RegexpMatchMedium_32     31.2ns ± 0%    31.1ns ± 0%  -0.45%  (p=0.016 n=5+4)
RegexpMatchMedium_1K      235µs ± 1%     235µs ± 0%    ~     (p=1.000 n=5+5)
RegexpMatchHard_32       13.9µs ± 1%    13.5µs ± 1%  -2.74%  (p=0.008 n=5+5)
RegexpMatchHard_1K        416µs ± 2%     410µs ± 2%    ~     (p=0.056 n=5+5)
Revcomp                   6.36s ± 0%     6.34s ± 0%  -0.31%  (p=0.008 n=5+5)
Template                  352ms ± 1%     353ms ± 0%  +0.45%  (p=0.032 n=5+5)
TimeParse                2.04µs ± 4%    2.01µs ± 0%    ~     (p=0.056 n=5+5)
TimeFormat               2.97µs ± 0%    2.97µs ± 0%    ~     (p=1.000 n=5+5)

name                   old speed      new speed      delta
GobDecode              15.7MB/s ± 1%  15.8MB/s ± 0%    ~     (p=0.206 n=5+5)
GobEncode              20.4MB/s ± 1%  20.5MB/s ± 1%    ~     (p=0.056 n=5+5)
Gzip                   11.3MB/s ± 1%  11.3MB/s ± 0%    ~     (p=0.841 n=5+4)
Gunzip                 56.7MB/s ± 0%  56.8MB/s ± 0%    ~     (p=0.389 n=5+5)
JSONEncode             21.8MB/s ± 1%  21.7MB/s ± 1%    ~     (p=0.246 n=5+5)
JSONDecode             6.66MB/s ± 0%  6.67MB/s ± 0%    ~     (p=0.857 n=4+5)
GoParse                2.97MB/s ± 1%  2.96MB/s ± 1%    ~     (p=0.238 n=5+5)
RegexpMatchEasy0_32    59.9MB/s ± 1%  60.5MB/s ± 1%  +0.92%  (p=0.032 n=5+5)
RegexpMatchEasy0_1K     372MB/s ± 0%   374MB/s ± 0%  +0.46%  (p=0.008 n=5+5)
RegexpMatchEasy1_32    56.0MB/s ± 2%  56.7MB/s ± 3%    ~     (p=0.310 n=5+5)
RegexpMatchEasy1_1K     247MB/s ± 0%   247MB/s ± 1%    ~     (p=0.548 n=5+5)
RegexpMatchMedium_32   32.0MB/s ± 0%  32.1MB/s ± 0%    ~     (p=0.135 n=5+5)
RegexpMatchMedium_1K   4.35MB/s ± 1%  4.35MB/s ± 1%    ~     (p=0.825 n=5+5)
RegexpMatchHard_32     2.30MB/s ± 1%  2.37MB/s ± 1%  +2.78%  (p=0.008 n=5+5)
RegexpMatchHard_1K     2.47MB/s ± 1%  2.50MB/s ± 2%    ~     (p=0.095 n=5+5)
Revcomp                40.0MB/s ± 0%  40.1MB/s ± 0%  +0.31%  (p=0.016 n=5+5)
Template               5.51MB/s ± 1%  5.49MB/s ± 0%    ~     (p=0.190 n=5+5)

Change-Id: I540a2e4e7992376ce04f93b332f64fc3b6071237
Reviewed-on: https://go-review.googlesource.com/c/go/+/185078
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
Meng Zhuo 2019-07-08 23:45:44 +08:00 committed by Cherry Zhang
parent b58d9bb8c8
commit 307544f427
7 changed files with 720 additions and 4 deletions

View File

@ -426,6 +426,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p4.Reg = mips.REG_R1
p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p2)
case ssa.OpMIPS64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpMIPS64LoweredMove:
// SUBV $8, R1
// MOVV 8(R1), Rtmp

View File

@ -370,6 +370,18 @@
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem)))
// medium move uses a duff device
(Move [s] {t} dst src mem)
&& s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0
&& !config.noDuffDevice ->
(DUFFCOPY [16 * (128 - s/8)] dst src mem)
// 16 and 128 are magic constants. 16 is the number of bytes to encode:
// MOVV (R1), R23
// ADDV $8, R1
// MOVV R23, (R2)
// ADDV $8, R2
// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy.
// large or unaligned move uses a loop
(Move [s] {t} dst src mem)
&& s > 24 || t.(*types.Type).Alignment()%8 != 0 ->

View File

@ -292,6 +292,24 @@ func init() {
faultOnNilArg0: true,
},
// duffcopy
// arg0 = address of dst memory (in R2, changed as side effect)
// arg1 = address of src memory (in R1, changed as side effect)
// arg2 = mem
// auxint = offset into duffcopy code to start executing
// returns mem
{
name: "DUFFCOPY",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R2"), buildReg("R1")},
clobbers: buildReg("R1 R2 R31"),
},
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// large or unaligned zeroing
// arg0 = address of memory to zero (in R1, changed as side effect)
// arg1 = address of the last element to zero

View File

@ -1635,6 +1635,7 @@ const (
OpMIPS64CALLclosure
OpMIPS64CALLinter
OpMIPS64DUFFZERO
OpMIPS64DUFFCOPY
OpMIPS64LoweredZero
OpMIPS64LoweredMove
OpMIPS64LoweredAtomicLoad8
@ -21768,6 +21769,20 @@ var opcodeTable = [...]opInfo{
clobbers: 134217730, // R1 R31
},
},
{
name: "DUFFCOPY",
auxType: auxInt64,
argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
{0, 4}, // R2
{1, 2}, // R1
},
clobbers: 134217734, // R1 R2 R31
},
},
{
name: "LoweredZero",
auxType: auxInt64,

View File

@ -6969,6 +6969,25 @@ func rewriteValueMIPS64_OpMove_10(v *Value) bool {
return true
}
// match: (Move [s] {t} dst src mem)
// cond: s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice
// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
for {
s := v.AuxInt
t := v.Aux
mem := v.Args[2]
dst := v.Args[0]
src := v.Args[1]
if !(s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice) {
break
}
v.reset(OpMIPS64DUFFCOPY)
v.AuxInt = 16 * (128 - s/8)
v.AddArg(dst)
v.AddArg(src)
v.AddArg(mem)
return true
}
// match: (Move [s] {t} dst src mem)
// cond: s > 24 || t.(*types.Type).Alignment()%8 != 0
// result: (LoweredMove [t.(*types.Type).Alignment()] dst src (ADDVconst <src.Type> src [s-moveSize(t.(*types.Type).Alignment(), config)]) mem)
for {

View File

@ -265,7 +265,645 @@ TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
ADDV $8, R1
RET
// TODO: Implement runtime·duffcopy.
TEXT runtime·duffcopy(SB),NOSPLIT|NOFRAME,$0-0
MOVV R0, 2(R0)
TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
MOVV (R1), R23
ADDV $8, R1
MOVV R23, (R2)
ADDV $8, R2
RET

View File

@ -216,5 +216,13 @@ func zeroMIPS64x(w io.Writer) {
}
func copyMIPS64x(w io.Writer) {
fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
for i := 0; i < 128; i++ {
fmt.Fprintln(w, "\tMOVV\t(R1), R23")
fmt.Fprintln(w, "\tADDV\t$8, R1")
fmt.Fprintln(w, "\tMOVV\tR23, (R2)")
fmt.Fprintln(w, "\tADDV\t$8, R2")
fmt.Fprintln(w)
}
fmt.Fprintln(w, "\tRET")
}