diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 26a6205e615..b8f283b8fdc 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -41,9 +41,20 @@ tail:
 	CMPQ	BX, $256
 	JBE	_129through256
 
+	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
+	JNE	skip_erms
+
+	// If the size is less than 2KB, do not use ERMS as it has a big start-up cost.
+	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
+	// in the Intel Optimization Guide shows better performance for ERMSB starting
+	// from 2KB. Benchmarks show a similar threshold for REP STOS vs AVX.
+	CMPQ	BX, $2048
+	JAE	loop_preheader_erms
+
+skip_erms:
 #ifndef hasAVX2
 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
-	JE	loop_preheader_avx2
+	JE	loop_preheader_avx2
 	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
 
 loop:
@@ -71,12 +82,13 @@ loop:
 #endif
 
 loop_preheader_avx2:
-	VPXOR	Y0, Y0, Y0
+	VPXOR	X0, X0, X0
 	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
 	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
 	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
 	CMPQ	BX, $0x2000000
-	JAE	loop_preheader_avx2_huge
+	JAE	loop_preheader_avx2_huge
+
 loop_avx2:
 	VMOVDQU	Y0, 0(DI)
 	VMOVDQU	Y0, 32(DI)
@@ -92,6 +104,25 @@ loop_avx2:
 	VMOVDQU	Y0, -128(DI)(BX*1)
 	VZEROUPPER
 	RET
+
+loop_preheader_erms:
+#ifndef hasAVX2
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE	loop_erms
+#endif
+
+	VPXOR	X0, X0, X0
+	// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
+	// write protocol, ERMS could show the same or slower performance compared to
+	// Non-Temporal Stores when the size is bigger than the LLC, depending on hardware.
+	CMPQ	BX, $0x2000000
+	JAE	loop_preheader_avx2_huge
+
+loop_erms:
+	MOVQ	BX, CX
+	REP;	STOSB
+	RET
+
 loop_preheader_avx2_huge:
 	// Align to 32 byte boundary
 	VMOVDQU	Y0, 0(DI)
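
Not part of the patch: below is a minimal benchmark sketch (file and identifier names are made up for illustration) for checking where the 2KB ERMS cutover and the 32MB (0x2000000) non-temporal cutover land on a given machine. It relies on the Go compiler lowering the byte-slice clearing loop to a call to runtime.memclrNoHeapPointers, so it exercises the routine changed above without needing to call the unexported function directly.

// memclr_bench_test.go — illustrative sketch only, not part of the patch.
package memclr_test

import (
	"fmt"
	"testing"
)

// Sizes straddle the two cutovers used in the patch: 2 KB (below it the
// ERMS start-up cost dominates) and 32 MB (above it the non-temporal
// AVX2 path is taken instead of REP STOSB).
var clearSizes = []int{512, 1 << 10, 2 << 10, 4 << 10, 64 << 10, 1 << 20, 32 << 20, 64 << 20}

func BenchmarkMemclr(b *testing.B) {
	for _, n := range clearSizes {
		buf := make([]byte, n)
		b.Run(fmt.Sprintf("%dB", n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				// The compiler recognizes this clearing loop and lowers it
				// to a call to runtime.memclrNoHeapPointers.
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}

The runtime package also ships its own Memclr benchmarks (roughly `go test runtime -run=None -bench=Memclr`), which may be preferable for before/after comparisons of this change.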