go/src/runtime/memmove_arm64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $-8-24
	MOVD	to+0(FP), R3
	MOVD	from+8(FP), R4
	MOVD	n+16(FP), R5
	CMP	$0, R5
	BNE	check
	RET

check:
	AND	$~7, R5, R7	// R7 is N&~7
	// TODO(mwhudson): this is written this way to avoid tickling
	// warnings from addpool when written as AND $7, R5, R6 (see
	// https://golang.org/issue/12708)
	SUB	R7, R5, R6	// R6 is N&7

	CMP	R3, R4
	BLT	backward

	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
	// R3 and R4 are advanced as we copy.

        // (There may be implementations of armv8 where copying by bytes until
        // at least one of source or dest is word aligned is a worthwhile
        // optimization, but the on the one tested so far (xgene) it did not
        // make a significance difference.)

	CMP	$0, R7		// Do we need to do any word-by-word copying?
	BEQ	noforwardlarge

	ADD	R3, R7, R9	// R9 points just past where we copy by word

forwardlargeloop:
	MOVD.P	8(R4), R8	// R8 is just a scratch register
	MOVD.P	R8, 8(R3)
	CMP	R3, R9
	BNE	forwardlargeloop

noforwardlarge:
	CMP	$0, R6		// Do we need to do any byte-by-byte copying?
	BNE	forwardtail
	RET

forwardtail:
	ADD	R3, R6, R9	// R9 points just past the destination memory

forwardtailloop:
	MOVBU.P 1(R4), R8
	MOVBU.P	R8, 1(R3)
	CMP	R3, R9
	BNE	forwardtailloop
	RET

backward:
	// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	R4, R5, R4	// R4 points just past the last source byte
	ADD	R3, R5, R3	// R3 points just past the last destination byte

	CMP	$0, R6		// Do we need to do any byte-by-byte copying?
	BEQ	nobackwardtail

	SUB	R6, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
backwardtailloop:
	MOVBU.W	-1(R4), R8
	MOVBU.W	R8, -1(R3)
	CMP	R9, R3
	BNE	backwardtailloop

nobackwardtail:
	CMP     $0, R7		// Do we need to do any word-by-word copying?
	BNE	backwardlarge
	RET

backwardlarge:
        SUB	R7, R3, R9      // R9 points at the lowest destination byte

backwardlargeloop:
	MOVD.W	-8(R4), R8
	MOVD.W	R8, -8(R3)
	CMP	R9, R3
	BNE	backwardlargeloop
	RET
runtime: add support for linux/arm64 Change-Id: Ibda6a5bedaff57fd161d63fc04ad260931d34413 Reviewed-on: https://go-review.googlesource.com/7142 Reviewed-by: Russ Cox <rsc@golang.org> 2015-03-08 07:20:20 -06:00			`// Copyright 2014 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`#include "textflag.h"`

			`// void runtime·memmove(void, void, uintptr)`
			`TEXT runtime·memmove(SB), NOSPLIT, $-8-24`
			`MOVD to+0(FP), R3`
			`MOVD from+8(FP), R4`
			`MOVD n+16(FP), R5`
			`CMP $0, R5`
			`BNE check`
			`RET`

			`check:`
runtime: adjust the arm64 memmove and memclr to operate by word as much as they can Not only is this an obvious optimization: benchmark old MB/s new MB/s speedup BenchmarkMemmove1-4 35.35 29.65 0.84x BenchmarkMemmove2-4 63.78 52.53 0.82x BenchmarkMemmove3-4 89.72 73.96 0.82x BenchmarkMemmove4-4 109.94 95.73 0.87x BenchmarkMemmove5-4 127.60 112.80 0.88x BenchmarkMemmove6-4 143.59 126.67 0.88x BenchmarkMemmove7-4 157.90 138.92 0.88x BenchmarkMemmove8-4 167.18 231.81 1.39x BenchmarkMemmove9-4 175.23 252.07 1.44x BenchmarkMemmove10-4 165.68 261.10 1.58x BenchmarkMemmove11-4 174.43 263.31 1.51x BenchmarkMemmove12-4 180.76 267.56 1.48x BenchmarkMemmove13-4 189.06 284.93 1.51x BenchmarkMemmove14-4 186.31 284.72 1.53x BenchmarkMemmove15-4 195.75 281.62 1.44x BenchmarkMemmove16-4 202.96 439.23 2.16x BenchmarkMemmove32-4 264.77 775.77 2.93x BenchmarkMemmove64-4 306.81 1209.64 3.94x BenchmarkMemmove128-4 357.03 1515.41 4.24x BenchmarkMemmove256-4 380.77 2066.01 5.43x BenchmarkMemmove512-4 385.05 2556.45 6.64x BenchmarkMemmove1024-4 381.23 2804.10 7.36x BenchmarkMemmove2048-4 379.06 2814.83 7.43x BenchmarkMemmove4096-4 387.43 3064.96 7.91x BenchmarkMemmoveUnaligned1-4 28.91 25.40 0.88x BenchmarkMemmoveUnaligned2-4 56.13 47.56 0.85x BenchmarkMemmoveUnaligned3-4 74.32 69.31 0.93x BenchmarkMemmoveUnaligned4-4 97.02 83.58 0.86x BenchmarkMemmoveUnaligned5-4 110.17 103.62 0.94x BenchmarkMemmoveUnaligned6-4 124.95 113.26 0.91x BenchmarkMemmoveUnaligned7-4 142.37 130.82 0.92x BenchmarkMemmoveUnaligned8-4 151.20 205.64 1.36x BenchmarkMemmoveUnaligned9-4 166.97 215.42 1.29x BenchmarkMemmoveUnaligned10-4 148.49 221.22 1.49x BenchmarkMemmoveUnaligned11-4 159.47 239.57 1.50x BenchmarkMemmoveUnaligned12-4 163.52 247.32 1.51x BenchmarkMemmoveUnaligned13-4 167.55 256.54 1.53x BenchmarkMemmoveUnaligned14-4 175.12 251.03 1.43x BenchmarkMemmoveUnaligned15-4 192.10 267.13 1.39x BenchmarkMemmoveUnaligned16-4 190.76 378.87 1.99x BenchmarkMemmoveUnaligned32-4 259.02 562.98 2.17x BenchmarkMemmoveUnaligned64-4 317.72 842.44 2.65x BenchmarkMemmoveUnaligned128-4 355.43 1274.49 3.59x BenchmarkMemmoveUnaligned256-4 378.17 1815.74 4.80x BenchmarkMemmoveUnaligned512-4 362.15 2180.81 6.02x BenchmarkMemmoveUnaligned1024-4 376.07 2453.58 6.52x BenchmarkMemmoveUnaligned2048-4 381.66 2568.32 6.73x BenchmarkMemmoveUnaligned4096-4 398.51 2669.36 6.70x BenchmarkMemclr5-4 113.83 107.93 0.95x BenchmarkMemclr16-4 223.84 389.63 1.74x BenchmarkMemclr64-4 421.99 1209.58 2.87x BenchmarkMemclr256-4 525.94 2411.58 4.59x BenchmarkMemclr4096-4 581.66 4372.20 7.52x BenchmarkMemclr65536-4 565.84 4747.48 8.39x BenchmarkGoMemclr5-4 194.63 160.31 0.82x BenchmarkGoMemclr16-4 295.30 630.07 2.13x BenchmarkGoMemclr64-4 480.24 1884.03 3.92x BenchmarkGoMemclr256-4 540.23 2926.49 5.42x but it turns out that it's necessary to avoid the GC seeing partially written pointers. It's of course possible to be more sophisticated (using ldp/stp to move 16 bytes at a time in the core loop and unrolling the tail copying loops being the obvious ideas) but I wanted something simple and (reasonably) obviously correct. Fixes #12552 Change-Id: Iaeaf8a812cd06f4747ba2f792de1ded738890735 Reviewed-on: https://go-review.googlesource.com/14813 Reviewed-by: Austin Clements <austin@google.com> 2015-09-21 20:34:39 -06:00			`AND $~7, R5, R7 // R7 is N&~7`
			`// TODO(mwhudson): this is written this way to avoid tickling`
			`// warnings from addpool when written as AND $7, R5, R6 (see`
			`// https://golang.org/issue/12708)`
			`SUB R7, R5, R6 // R6 is N&7`

runtime: add support for linux/arm64 Change-Id: Ibda6a5bedaff57fd161d63fc04ad260931d34413 Reviewed-on: https://go-review.googlesource.com/7142 Reviewed-by: Russ Cox <rsc@golang.org> 2015-03-08 07:20:20 -06:00			`CMP R3, R4`
			`BLT backward`

runtime: adjust the arm64 memmove and memclr to operate by word as much as they can Not only is this an obvious optimization: benchmark old MB/s new MB/s speedup BenchmarkMemmove1-4 35.35 29.65 0.84x BenchmarkMemmove2-4 63.78 52.53 0.82x BenchmarkMemmove3-4 89.72 73.96 0.82x BenchmarkMemmove4-4 109.94 95.73 0.87x BenchmarkMemmove5-4 127.60 112.80 0.88x BenchmarkMemmove6-4 143.59 126.67 0.88x BenchmarkMemmove7-4 157.90 138.92 0.88x BenchmarkMemmove8-4 167.18 231.81 1.39x BenchmarkMemmove9-4 175.23 252.07 1.44x BenchmarkMemmove10-4 165.68 261.10 1.58x BenchmarkMemmove11-4 174.43 263.31 1.51x BenchmarkMemmove12-4 180.76 267.56 1.48x BenchmarkMemmove13-4 189.06 284.93 1.51x BenchmarkMemmove14-4 186.31 284.72 1.53x BenchmarkMemmove15-4 195.75 281.62 1.44x BenchmarkMemmove16-4 202.96 439.23 2.16x BenchmarkMemmove32-4 264.77 775.77 2.93x BenchmarkMemmove64-4 306.81 1209.64 3.94x BenchmarkMemmove128-4 357.03 1515.41 4.24x BenchmarkMemmove256-4 380.77 2066.01 5.43x BenchmarkMemmove512-4 385.05 2556.45 6.64x BenchmarkMemmove1024-4 381.23 2804.10 7.36x BenchmarkMemmove2048-4 379.06 2814.83 7.43x BenchmarkMemmove4096-4 387.43 3064.96 7.91x BenchmarkMemmoveUnaligned1-4 28.91 25.40 0.88x BenchmarkMemmoveUnaligned2-4 56.13 47.56 0.85x BenchmarkMemmoveUnaligned3-4 74.32 69.31 0.93x BenchmarkMemmoveUnaligned4-4 97.02 83.58 0.86x BenchmarkMemmoveUnaligned5-4 110.17 103.62 0.94x BenchmarkMemmoveUnaligned6-4 124.95 113.26 0.91x BenchmarkMemmoveUnaligned7-4 142.37 130.82 0.92x BenchmarkMemmoveUnaligned8-4 151.20 205.64 1.36x BenchmarkMemmoveUnaligned9-4 166.97 215.42 1.29x BenchmarkMemmoveUnaligned10-4 148.49 221.22 1.49x BenchmarkMemmoveUnaligned11-4 159.47 239.57 1.50x BenchmarkMemmoveUnaligned12-4 163.52 247.32 1.51x BenchmarkMemmoveUnaligned13-4 167.55 256.54 1.53x BenchmarkMemmoveUnaligned14-4 175.12 251.03 1.43x BenchmarkMemmoveUnaligned15-4 192.10 267.13 1.39x BenchmarkMemmoveUnaligned16-4 190.76 378.87 1.99x BenchmarkMemmoveUnaligned32-4 259.02 562.98 2.17x BenchmarkMemmoveUnaligned64-4 317.72 842.44 2.65x BenchmarkMemmoveUnaligned128-4 355.43 1274.49 3.59x BenchmarkMemmoveUnaligned256-4 378.17 1815.74 4.80x BenchmarkMemmoveUnaligned512-4 362.15 2180.81 6.02x BenchmarkMemmoveUnaligned1024-4 376.07 2453.58 6.52x BenchmarkMemmoveUnaligned2048-4 381.66 2568.32 6.73x BenchmarkMemmoveUnaligned4096-4 398.51 2669.36 6.70x BenchmarkMemclr5-4 113.83 107.93 0.95x BenchmarkMemclr16-4 223.84 389.63 1.74x BenchmarkMemclr64-4 421.99 1209.58 2.87x BenchmarkMemclr256-4 525.94 2411.58 4.59x BenchmarkMemclr4096-4 581.66 4372.20 7.52x BenchmarkMemclr65536-4 565.84 4747.48 8.39x BenchmarkGoMemclr5-4 194.63 160.31 0.82x BenchmarkGoMemclr16-4 295.30 630.07 2.13x BenchmarkGoMemclr64-4 480.24 1884.03 3.92x BenchmarkGoMemclr256-4 540.23 2926.49 5.42x but it turns out that it's necessary to avoid the GC seeing partially written pointers. It's of course possible to be more sophisticated (using ldp/stp to move 16 bytes at a time in the core loop and unrolling the tail copying loops being the obvious ideas) but I wanted something simple and (reasonably) obviously correct. Fixes #12552 Change-Id: Iaeaf8a812cd06f4747ba2f792de1ded738890735 Reviewed-on: https://go-review.googlesource.com/14813 Reviewed-by: Austin Clements <austin@google.com> 2015-09-21 20:34:39 -06:00			`// Copying forward proceeds by copying R7/8 words then copying R6 bytes.`
			`// R3 and R4 are advanced as we copy.`

			`// (There may be implementations of armv8 where copying by bytes until`
			`// at least one of source or dest is word aligned is a worthwhile`
			`// optimization, but the on the one tested so far (xgene) it did not`
			`// make a significance difference.)`

			`CMP $0, R7 // Do we need to do any word-by-word copying?`
			`BEQ noforwardlarge`

			`ADD R3, R7, R9 // R9 points just past where we copy by word`

			`forwardlargeloop:`
			`MOVD.P 8(R4), R8 // R8 is just a scratch register`
			`MOVD.P R8, 8(R3)`
			`CMP R3, R9`
			`BNE forwardlargeloop`

			`noforwardlarge:`
			`CMP $0, R6 // Do we need to do any byte-by-byte copying?`
			`BNE forwardtail`
			`RET`

			`forwardtail:`
			`ADD R3, R6, R9 // R9 points just past the destination memory`

			`forwardtailloop:`
			`MOVBU.P 1(R4), R8`
			`MOVBU.P R8, 1(R3)`
			`CMP R3, R9`
			`BNE forwardtailloop`
runtime: add support for linux/arm64 Change-Id: Ibda6a5bedaff57fd161d63fc04ad260931d34413 Reviewed-on: https://go-review.googlesource.com/7142 Reviewed-by: Russ Cox <rsc@golang.org> 2015-03-08 07:20:20 -06:00			`RET`

			`backward:`
runtime: adjust the arm64 memmove and memclr to operate by word as much as they can Not only is this an obvious optimization: benchmark old MB/s new MB/s speedup BenchmarkMemmove1-4 35.35 29.65 0.84x BenchmarkMemmove2-4 63.78 52.53 0.82x BenchmarkMemmove3-4 89.72 73.96 0.82x BenchmarkMemmove4-4 109.94 95.73 0.87x BenchmarkMemmove5-4 127.60 112.80 0.88x BenchmarkMemmove6-4 143.59 126.67 0.88x BenchmarkMemmove7-4 157.90 138.92 0.88x BenchmarkMemmove8-4 167.18 231.81 1.39x BenchmarkMemmove9-4 175.23 252.07 1.44x BenchmarkMemmove10-4 165.68 261.10 1.58x BenchmarkMemmove11-4 174.43 263.31 1.51x BenchmarkMemmove12-4 180.76 267.56 1.48x BenchmarkMemmove13-4 189.06 284.93 1.51x BenchmarkMemmove14-4 186.31 284.72 1.53x BenchmarkMemmove15-4 195.75 281.62 1.44x BenchmarkMemmove16-4 202.96 439.23 2.16x BenchmarkMemmove32-4 264.77 775.77 2.93x BenchmarkMemmove64-4 306.81 1209.64 3.94x BenchmarkMemmove128-4 357.03 1515.41 4.24x BenchmarkMemmove256-4 380.77 2066.01 5.43x BenchmarkMemmove512-4 385.05 2556.45 6.64x BenchmarkMemmove1024-4 381.23 2804.10 7.36x BenchmarkMemmove2048-4 379.06 2814.83 7.43x BenchmarkMemmove4096-4 387.43 3064.96 7.91x BenchmarkMemmoveUnaligned1-4 28.91 25.40 0.88x BenchmarkMemmoveUnaligned2-4 56.13 47.56 0.85x BenchmarkMemmoveUnaligned3-4 74.32 69.31 0.93x BenchmarkMemmoveUnaligned4-4 97.02 83.58 0.86x BenchmarkMemmoveUnaligned5-4 110.17 103.62 0.94x BenchmarkMemmoveUnaligned6-4 124.95 113.26 0.91x BenchmarkMemmoveUnaligned7-4 142.37 130.82 0.92x BenchmarkMemmoveUnaligned8-4 151.20 205.64 1.36x BenchmarkMemmoveUnaligned9-4 166.97 215.42 1.29x BenchmarkMemmoveUnaligned10-4 148.49 221.22 1.49x BenchmarkMemmoveUnaligned11-4 159.47 239.57 1.50x BenchmarkMemmoveUnaligned12-4 163.52 247.32 1.51x BenchmarkMemmoveUnaligned13-4 167.55 256.54 1.53x BenchmarkMemmoveUnaligned14-4 175.12 251.03 1.43x BenchmarkMemmoveUnaligned15-4 192.10 267.13 1.39x BenchmarkMemmoveUnaligned16-4 190.76 378.87 1.99x BenchmarkMemmoveUnaligned32-4 259.02 562.98 2.17x BenchmarkMemmoveUnaligned64-4 317.72 842.44 2.65x BenchmarkMemmoveUnaligned128-4 355.43 1274.49 3.59x BenchmarkMemmoveUnaligned256-4 378.17 1815.74 4.80x BenchmarkMemmoveUnaligned512-4 362.15 2180.81 6.02x BenchmarkMemmoveUnaligned1024-4 376.07 2453.58 6.52x BenchmarkMemmoveUnaligned2048-4 381.66 2568.32 6.73x BenchmarkMemmoveUnaligned4096-4 398.51 2669.36 6.70x BenchmarkMemclr5-4 113.83 107.93 0.95x BenchmarkMemclr16-4 223.84 389.63 1.74x BenchmarkMemclr64-4 421.99 1209.58 2.87x BenchmarkMemclr256-4 525.94 2411.58 4.59x BenchmarkMemclr4096-4 581.66 4372.20 7.52x BenchmarkMemclr65536-4 565.84 4747.48 8.39x BenchmarkGoMemclr5-4 194.63 160.31 0.82x BenchmarkGoMemclr16-4 295.30 630.07 2.13x BenchmarkGoMemclr64-4 480.24 1884.03 3.92x BenchmarkGoMemclr256-4 540.23 2926.49 5.42x but it turns out that it's necessary to avoid the GC seeing partially written pointers. It's of course possible to be more sophisticated (using ldp/stp to move 16 bytes at a time in the core loop and unrolling the tail copying loops being the obvious ideas) but I wanted something simple and (reasonably) obviously correct. Fixes #12552 Change-Id: Iaeaf8a812cd06f4747ba2f792de1ded738890735 Reviewed-on: https://go-review.googlesource.com/14813 Reviewed-by: Austin Clements <austin@google.com> 2015-09-21 20:34:39 -06:00			`// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.`
			`// R3 and R4 are advanced to the end of the destination/source buffers`
			`// respectively and moved back as we copy.`

			`ADD R4, R5, R4 // R4 points just past the last source byte`
			`ADD R3, R5, R3 // R3 points just past the last destination byte`

			`CMP $0, R6 // Do we need to do any byte-by-byte copying?`
			`BEQ nobackwardtail`

			`SUB R6, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.`
			`backwardtailloop:`
			`MOVBU.W -1(R4), R8`
			`MOVBU.W R8, -1(R3)`
			`CMP R9, R3`
			`BNE backwardtailloop`

			`nobackwardtail:`
			`CMP $0, R7 // Do we need to do any word-by-word copying?`
			`BNE backwardlarge`
			`RET`

			`backwardlarge:`
			`SUB R7, R3, R9 // R9 points at the lowest destination byte`

			`backwardlargeloop:`
			`MOVD.W -8(R4), R8`
			`MOVD.W R8, -8(R3)`
			`CMP R9, R3`
			`BNE backwardlargeloop`
runtime: add support for linux/arm64 Change-Id: Ibda6a5bedaff57fd161d63fc04ad260931d34413 Reviewed-on: https://go-review.googlesource.com/7142 Reviewed-by: Russ Cox <rsc@golang.org> 2015-03-08 07:20:20 -06:00			`RET`