diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 2829945af0b..0ceedcd7d22 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -56,32 +56,34 @@ var ARM struct {
 // The booleans in ARM64 contain the correspondingly named cpu feature bit.
 // The struct is padded to avoid false sharing.
 var ARM64 struct {
-	_           CacheLinePad
-	HasFP       bool
-	HasASIMD    bool
-	HasEVTSTRM  bool
-	HasAES      bool
-	HasPMULL    bool
-	HasSHA1     bool
-	HasSHA2     bool
-	HasCRC32    bool
-	HasATOMICS  bool
-	HasFPHP     bool
-	HasASIMDHP  bool
-	HasCPUID    bool
-	HasASIMDRDM bool
-	HasJSCVT    bool
-	HasFCMA     bool
-	HasLRCPC    bool
-	HasDCPOP    bool
-	HasSHA3     bool
-	HasSM3      bool
-	HasSM4      bool
-	HasASIMDDP  bool
-	HasSHA512   bool
-	HasSVE      bool
-	HasASIMDFHM bool
-	_           CacheLinePad
+	_            CacheLinePad
+	HasFP        bool
+	HasASIMD     bool
+	HasEVTSTRM   bool
+	HasAES       bool
+	HasPMULL     bool
+	HasSHA1      bool
+	HasSHA2      bool
+	HasCRC32     bool
+	HasATOMICS   bool
+	HasFPHP      bool
+	HasASIMDHP   bool
+	HasCPUID     bool
+	HasASIMDRDM  bool
+	HasJSCVT     bool
+	HasFCMA      bool
+	HasLRCPC     bool
+	HasDCPOP     bool
+	HasSHA3      bool
+	HasSM3       bool
+	HasSM4       bool
+	HasASIMDDP   bool
+	HasSHA512    bool
+	HasSVE       bool
+	HasASIMDFHM  bool
+	IsNeoverseN1 bool
+	IsZeus       bool
+	_            CacheLinePad
 }
 
 var MIPS64X struct {
diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go
index 533bea24700..8fde39f03e1 100644
--- a/src/internal/cpu/cpu_arm64.go
+++ b/src/internal/cpu/cpu_arm64.go
@@ -18,6 +18,7 @@ const (
 	hwcap_SHA2    = 1 << 6
 	hwcap_CRC32   = 1 << 7
 	hwcap_ATOMICS = 1 << 8
+	hwcap_CPUID   = 1 << 11
 )
 
 func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
 		{Name: "sha2", Feature: &ARM64.HasSHA2},
 		{Name: "crc32", Feature: &ARM64.HasCRC32},
 		{Name: "atomics", Feature: &ARM64.HasATOMICS},
+		{Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
+		{Name: "isZeus", Feature: &ARM64.IsZeus},
 	}
 
 	switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
 		ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
 		ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
 		ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
+		ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)
 
 		// The Samsung S9+ kernel reports support for atomics, but not all cores
 		// actually support them, resulting in SIGILL. See issue #28431.
 		// TODO(elias.naur): Only disable the optimization on bad chipsets on android.
 		ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"
 
+		// Check whether we are running on a Neoverse N1 or Zeus core. Before
+		// reading MIDR_EL1, check the AUXV for the CPUID bit: getMIDR executes
+		// an MRS instruction that is normally illegal in user mode, but the
+		// kernel traps it, sanitizes the value and returns it. Without the
+		// CPUID bit the kernel does not trap the instruction and the process
+		// is terminated with SIGILL.
+		if ARM64.HasCPUID {
+			midr := getMIDR()
+			part_num := uint16((midr >> 4) & 0xfff)
+			implementor := byte((midr >> 24) & 0xff)
+
+			if implementor == 'A' && part_num == 0xd0c {
+				ARM64.IsNeoverseN1 = true
+			}
+			if implementor == 'A' && part_num == 0xd40 {
+				ARM64.IsZeus = true
+			}
+		}
+
 	case "freebsd":
 		// Retrieve info from system register ID_AA64ISAR0_EL1.
 		isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
 }
 
 func getisar0() uint64
+
+func getMIDR() uint64
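// Illustrative sketch (not part of the patch): how the MIDR_EL1 fields used in
// doinit above are laid out. The implementer code sits in bits [31:24] and the
// part number in bits [15:4]; those are the only two fields the runtime looks
// at. The package and helper names below are hypothetical.
package midrsketch

type midrInfo struct {
	implementer byte   // 'A' (0x41) identifies Arm Ltd.
	partNum     uint16 // 0xd0c is the Neoverse N1, 0xd40 is the part the patch calls Zeus
}

// decodeMIDR extracts the implementer and part number with the same shifts
// and masks used in doinit.
func decodeMIDR(midr uint64) midrInfo {
	return midrInfo{
		implementer: byte((midr >> 24) & 0xff),
		partNum:     uint16((midr >> 4) & 0xfff),
	}
}

// isNeoverseN1 mirrors the comparison that sets ARM64.IsNeoverseN1.
func isNeoverseN1(midr uint64) bool {
	info := decodeMIDR(midr)
	return info.implementer == 'A' && info.partNum == 0xd0c
}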
diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s
index d85914973f9..d6e7f443739 100644
--- a/src/internal/cpu/cpu_arm64.s
+++ b/src/internal/cpu/cpu_arm64.s
@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
 	MRS	ID_AA64ISAR0_EL1, R0
 	MOVD	R0, ret+0(FP)
 	RET
+
+// func getMIDR() uint64
+TEXT ·getMIDR(SB), NOSPLIT, $0-8
+	MRS	MIDR_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go
new file mode 100644
index 00000000000..7576bef4a75
--- /dev/null
+++ b/src/runtime/cpuflags_arm64.go
@@ -0,0 +1,17 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/cpu"
+)
+
+var arm64UseAlignedLoads bool
+
+func init() {
+	if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
+		arm64UseAlignedLoads = true
+	}
+}
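// Illustrative sketch (not part of the patch): the decide-once-at-startup
// dispatch that cpuflags_arm64.go implements, restated as ordinary Go. The
// runtime variable is written during init and never changes afterwards, so the
// memmove assembly further below can read it with a single MOVBU and a
// well-predicted branch. All names here are hypothetical.
package dispatchsketch

var useAlignedLoads bool // written once in init, read-only afterwards

func init() {
	useAlignedLoads = detectAlignedLoadPreference()
}

// detectAlignedLoadPreference stands in for the cpu.ARM64.IsNeoverseN1 ||
// cpu.ARM64.IsZeus check; the real detection lives in internal/cpu.
func detectAlignedLoadPreference() bool { return false }

// copyBytes picks an implementation per call, the way memmove chooses between
// aligning its loads and aligning its stores for large copies.
func copyBytes(dst, src []byte) int {
	if useAlignedLoads {
		return copyAligned(dst, src)
	}
	return copyUnaligned(dst, src)
}

func copyAligned(dst, src []byte) int   { return copy(dst, src) }
func copyUnaligned(dst, src []byte) int { return copy(dst, src) }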
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dbb7e9a28a0..43d27629e5b 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
 
 // See memmove Go doc for important implementation constraints.
 
+// Register map
+//
+// dstin  R0
+// src    R1
+// count  R2
+// dst    R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data   R6-R17
+// tmp1   R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
-	CBNZ	R5, check
-	RET
+	MOVD	to+0(FP), R0
+	MOVD	from+8(FP), R1
+	MOVD	n+16(FP), R2
+	CBZ	R2, copy0
 
-check:
-	CMP	$16, R5
+	// Small copies: 1..16 bytes
+	CMP	$16, R2
 	BLE	copy16
 
-	AND	$~31, R5, R7	// R7 is N&~31
-	SUB	R7, R5, R6	// R6 is N&31
+	// Large copies
+	CMP	$128, R2
+	BHI	copy_long
+	CMP	$32, R2
+	BHI	copy32_128
 
-	CMP	R3, R4
-	BLT	backward
-
-	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
-	// R3 and R4 are advanced as we copy.
-
-	// (There may be implementations of armv8 where copying by bytes until
-	// at least one of source or dest is word aligned is a worthwhile
-	// optimization, but the on the one tested so far (xgene) it did not
-	// make a significance difference.)
-
-	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
-
-	ADD	R3, R7, R9	// R9 points just past where we copy by word
-
-forwardlargeloop:
-	// Copy 32 bytes at a time.
-	LDP.P	32(R4), (R8, R10)
-	STP.P	(R8, R10), 32(R3)
-	LDP	-16(R4), (R11, R12)
-	STP	(R11, R12), -16(R3)
-	SUB	$32, R7, R7
-	CBNZ	R7, forwardlargeloop
-
-noforwardlarge:
-	CBNZ	R6, forwardtail	// Do we need to copy any tail bytes?
+	// Small copies: 17..32 bytes.
+	LDP	(R1), (R6, R7)
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	LDP	-16(R4), (R12, R13)
+	STP	(R6, R7), (R0)
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	STP	(R12, R13), -16(R5)
 	RET
 
-forwardtail:
-	// There are R6 <= 31 bytes remaining to copy.
-	// This is large enough to still contain pointers,
-	// which must be copied atomically.
-	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
-	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
-	LDP.P	16(R4), (R8, R10)
-	STP.P	(R8, R10), 16(R3)
-
-	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
-	MOVD.P	8(R4), R8
-	MOVD.P	R8, 8(R3)
-
-	AND	$7, R6
-	CBNZ	R6, 2(PC)
-	RET
-
-	ADD	R3, R6, R9	// R9 points just past the destination memory
-
-forwardtailloop:
-	MOVBU.P	1(R4), R8
-	MOVBU.P	R8, 1(R3)
-	CMP	R3, R9
-	BNE	forwardtailloop
-	RET
-
-	// Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
 copy16:
-	ADD	R4, R5, R8	// R8 points just past the last source byte
-	ADD	R3, R5, R9	// R9 points just past the last destination byte
-	CMP	$8, R5
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	CMP	$8, R2
 	BLT	copy7
-	MOVD	(R4), R6
-	MOVD	-8(R8), R7
-	MOVD	R6, (R3)
-	MOVD	R7, -8(R9)
+	MOVD	(R1), R6
+	MOVD	-8(R4), R7
+	MOVD	R6, (R0)
+	MOVD	R7, -8(R5)
 	RET
 
 copy7:
-	TBZ	$2, R5, copy3
-	MOVWU	(R4), R6
-	MOVWU	-4(R8), R7
-	MOVW	R6, (R3)
-	MOVW	R7, -4(R9)
+	TBZ	$2, R2, copy3
+	MOVWU	(R1), R6
+	MOVWU	-4(R4), R7
+	MOVW	R6, (R0)
+	MOVW	R7, -4(R5)
 	RET
 
 copy3:
-	TBZ	$1, R5, copy1
-	MOVHU	(R4), R6
-	MOVHU	-2(R8), R7
-	MOVH	R6, (R3)
-	MOVH	R7, -2(R9)
+	TBZ	$1, R2, copy1
+	MOVHU	(R1), R6
+	MOVHU	-2(R4), R7
+	MOVH	R6, (R0)
+	MOVH	R7, -2(R5)
 	RET
 
 copy1:
-	MOVBU	(R4), R6
-	MOVB	R6, (R3)
+	MOVBU	(R1), R6
+	MOVB	R6, (R0)
+
+copy0:
 	RET
 
-backward:
-	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
-	// R3 and R4 are advanced to the end of the destination/source buffers
-	// respectively and moved back as we copy.
-
-	ADD	R4, R5, R4	// R4 points just past the last source byte
-	ADD	R3, R5, R3	// R3 points just past the last destination byte
-
-	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
-
-	AND	$7, R6, R12
-	CBZ	R12, backwardtaillarge
-
-	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
-	// Copy sub-pointer-size tail.
-	MOVBU.W	-1(R4), R8
-	MOVBU.W	R8, -1(R3)
-	CMP	R9, R3
-	BNE	backwardtailloop
-
-backwardtaillarge:
-	// Do 8/16-byte write if possible.
-	// See comment at forwardtail.
-	TBZ	$3, R6, 3(PC)
-	MOVD.W	-8(R4), R8
-	MOVD.W	R8, -8(R3)
-
-	TBZ	$4, R6, 3(PC)
-	LDP.W	-16(R4), (R8, R10)
-	STP.W	(R8, R10), -16(R3)
-
-nobackwardtail:
-	CBNZ	R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+	// Medium copies: 33..128 bytes.
+copy32_128:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	LDP	(R1), (R6, R7)
+	LDP	16(R1), (R8, R9)
+	LDP	-32(R4), (R10, R11)
+	LDP	-16(R4), (R12, R13)
+	CMP	$64, R2
+	BHI	copy128
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
 	RET
 
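// Illustrative sketch (not part of the patch): the overlapping head/tail trick
// used by the 17..32 byte case and by copy32_128 above, written in Go. For
// 17 <= n <= 32, copying the first 16 and the last 16 bytes covers the whole
// range; when n < 32 the two stores simply overlap. Loading both halves before
// storing keeps the result correct even when dst and src overlap. The package
// and function names are hypothetical.
package smallcopysketch

// copy17to32 copies n bytes (17 <= n <= 32) from src to dst with two 16-byte
// moves, mirroring the "Small copies: 17..32 bytes" block in the assembly.
func copy17to32(dst, src []byte, n int) {
	var head, tail [16]byte
	copy(head[:], src[:16])    // Load A: first 16 source bytes
	copy(tail[:], src[n-16:n]) // Load B: last 16 source bytes
	copy(dst[:16], head[:])    // Store A
	copy(dst[n-16:n], tail[:]) // Store B, overlapping Store A when n < 32
}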
-backwardlarge:
-	SUB	R7, R3, R9	// R9 points at the lowest destination byte
+	// Copy 65..128 bytes.
+copy128:
+	LDP	32(R1), (R14, R15)
+	LDP	48(R1), (R16, R17)
+	CMP	$96, R2
+	BLS	copy96
+	LDP	-64(R4), (R2, R3)
+	LDP	-48(R4), (R1, R4)
+	STP	(R2, R3), -64(R5)
+	STP	(R1, R4), -48(R5)
 
-backwardlargeloop:
-	LDP	-16(R4), (R8, R10)
-	STP	(R8, R10), -16(R3)
-	LDP.W	-32(R4), (R11, R12)
-	STP.W	(R11, R12), -32(R3)
-	CMP	R9, R3
-	BNE	backwardlargeloop
+copy96:
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R14, R15), 32(R0)
+	STP	(R16, R17), 48(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
+
+	// Copy more than 128 bytes.
+copy_long:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	MOVD	ZR, R7
+	MOVD	ZR, R8
+
+	CMP	$1024, R2
+	BLT	backward_check
+	// feature detect to decide how to align
+	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
+	CBNZ	R6, use_aligned_loads
+	MOVD	R0, R7
+	MOVD	R5, R8
+	B	backward_check
+use_aligned_loads:
+	MOVD	R1, R7
+	MOVD	R4, R8
+	// R7 and R8 are used here for the realignment calculation. In
+	// the use_aligned_loads case, R7 is the src pointer and R8 is
+	// the srcend pointer, which is used in the backward copy case.
+	// When doing aligned stores, R7 is the dst pointer and R8 is
+	// the dstend pointer.
+
+backward_check:
+	// Use backward copy if there is an overlap.
+	SUB	R1, R0, R14
+	CBZ	R14, copy0
+	CMP	R2, R14
+	BCC	copy_long_backward
+
+	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+	LDP	(R1), (R12, R13)	// Load A
+	AND	$15, R7, R14		// Calculate the realignment offset
+	SUB	R14, R1, R1
+	SUB	R14, R0, R3		// move dst back same amount as src
+	ADD	R14, R2, R2
+	LDP	16(R1), (R6, R7)	// Load B
+	STP	(R12, R13), (R0)	// Store A
+	LDP	32(R1), (R8, R9)	// Load C
+	LDP	48(R1), (R10, R11)	// Load D
+	LDP.W	64(R1), (R12, R13)	// Load E
+	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+	SUBS	$144, R2, R2
+	BLS	copy64_from_end
+
+loop64:
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	16(R1), (R6, R7)	// Load B (next iteration)
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	32(R1), (R8, R9)	// Load C
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	48(R1), (R10, R11)	// Load D
+	STP.W	(R12, R13), 64(R3)	// Store E
+	LDP.W	64(R1), (R12, R13)	// Load E
+	SUBS	$64, R2, R2
+	BHI	loop64
+
+	// Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+	LDP	-64(R4), (R14, R15)	// Load F
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	-48(R4), (R6, R7)	// Load G
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	-32(R4), (R8, R9)	// Load H
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	-16(R4), (R10, R11)	// Load I
+	STP	(R12, R13), 64(R3)	// Store E
+	STP	(R14, R15), -64(R5)	// Store F
+	STP	(R6, R7), -48(R5)	// Store G
+	STP	(R8, R9), -32(R5)	// Store H
+	STP	(R10, R11), -16(R5)	// Store I
+	RET
+
+	// Large backward copy for overlapping copies.
+	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+	LDP	-16(R4), (R12, R13)
+	AND	$15, R8, R14
+	SUB	R14, R4, R4
+	SUB	R14, R2, R2
+	LDP	-16(R4), (R6, R7)
+	STP	(R12, R13), -16(R5)
+	LDP	-32(R4), (R8, R9)
+	LDP	-48(R4), (R10, R11)
+	LDP.W	-64(R4), (R12, R13)
+	SUB	R14, R5, R5
+	SUBS	$128, R2, R2
+	BLS	copy64_from_start
+
+loop64_backward:
+	STP	(R6, R7), -16(R5)
+	LDP	-16(R4), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	-32(R4), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	-48(R4), (R10, R11)
+	STP.W	(R12, R13), -64(R5)
+	LDP.W	-64(R4), (R12, R13)
+	SUBS	$64, R2, R2
+	BHI	loop64_backward
+
+	// Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+	LDP	48(R1), (R2, R3)
+	STP	(R6, R7), -16(R5)
+	LDP	32(R1), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	16(R1), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	(R1), (R10, R11)
+	STP	(R12, R13), -64(R5)
+	STP	(R2, R3), 48(R0)
+	STP	(R6, R7), 32(R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), (R0)
 	RET
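// Illustrative sketch (not part of the patch): the overlap test performed at
// backward_check in the assembly above. dst-src is computed as an unsigned
// value, so it is smaller than n exactly when dst lies inside [src, src+n);
// that is the case where a forward copy would overwrite source bytes before
// reading them, so the backward path is taken instead. The package and
// function names are hypothetical.
package overlapsketch

import "unsafe"

// needsBackwardCopy reports whether an n-byte move from src to dst must run
// from high addresses down, mirroring the SUB/CMP/BCC sequence. The dst == src
// case is excluded because the assembly branches to copy0 and does nothing.
func needsBackwardCopy(dst, src unsafe.Pointer, n uintptr) bool {
	diff := uintptr(dst) - uintptr(src) // wraps around when dst < src
	return diff != 0 && diff < n
}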
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index b549433f71c..7c9d2ada45f 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -286,6 +286,9 @@ var bufSizes = []int{
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 	16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
 }
+var bufSizesOverlap = []int{
+	32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
 
 func BenchmarkMemmove(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
@@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+16)
+		for i := 0; i < b.N; i++ {
+			copy(x[16:n+16], x[:n])
+		}
+	})
+}
+
 func BenchmarkMemmoveUnalignedDst(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
 		x := make([]byte, n+1)
@@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+16)
+		for i := 0; i < b.N; i++ {
+			copy(x[16:n+16], x[1:n+1])
+		}
+	})
+}
+
 func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
 	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
 		x := make([]byte, n)
@@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
 	})
 }
 
+func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
+	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+		x := make([]byte, n+1)
+		for i := 0; i < b.N; i++ {
+			copy(x[1:n+1], x[:n])
+		}
+	})
+}
+
 func TestMemclr(t *testing.T) {
 	size := 512
 	if testing.Short() {
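// Illustrative sketch (not part of the patch): the overlap benchmarks added
// above always move data toward higher addresses (dst > src), which drives the
// copy_long_backward path for large sizes. A hypothetical companion that moves
// data toward lower addresses would exercise the forward large-copy loop on
// overlapping buffers instead. It would slot into memmove_test.go next to the
// benchmarks above and reuses benchmarkSizes and bufSizesOverlap from there.
func BenchmarkMemmoveOverlapTowardLowerAddresses(b *testing.B) {
	benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
		x := make([]byte, n+16)
		for i := 0; i < b.N; i++ {
			copy(x[:n], x[16:n+16])
		}
	})
}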