From 2423370136d4b1915d06bb1aaacbedaa900bc5c7 Mon Sep 17 00:00:00 2001 From: qmuntal Date: Fri, 25 Nov 2022 14:22:36 +0100 Subject: [PATCH] utf16: reduce utf16.Decode allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL avoids allocating in utf16.Decode for code point sequences with less than 64 elements. It does so by splitting the function in two, one that can be inlined that preallocates a buffer and the other that does the heavy-lifting. The mid-stack inliner will allocate the buffer in the caller stack, and in many cases this will be enough to avoid the allocation. unicode/utf16 benchmarks: name old time/op new time/op delta DecodeValidASCII-12 60.1ns ± 3% 16.0ns ±20% -73.40% (p=0.000 n=8+10) DecodeValidJapaneseChars-12 61.3ns ±10% 14.9ns ±39% -75.71% (p=0.000 n=10+10) name old alloc/op new alloc/op delta DecodeValidASCII-12 48.0B ± 0% 0.0B -100.00% (p=0.000 n=10+10) DecodeValidJapaneseChars-12 48.0B ± 0% 0.0B -100.00% (p=0.000 n=10+10) name old allocs/op new allocs/op delta DecodeValidASCII-12 1.00 ± 0% 0.00 -100.00% (p=0.000 n=10+10) DecodeValidJapaneseChars-12 1.00 ± 0% 0.00 -100.00% (p=0.000 n=10+10) I've also benchmarked os.File.ReadDir with this change applied to demonstrate that it does make a difference in the caller site, in this case via syscall.UTF16ToString: name old time/op new time/op delta ReadDir-12 592µs ± 8% 620µs ±16% ~ (p=0.280 n=10+10) name old alloc/op new alloc/op delta ReadDir-12 30.4kB ± 0% 22.4kB ± 0% -26.10% (p=0.000 n=8+10) name old allocs/op new allocs/op delta ReadDir-12 402 ± 0% 272 ± 0% -32.34% (p=0.000 n=10+10) Change-Id: I65cf5caa3fd3b3a466c0ed837a50a96e975bbe6b Reviewed-on: https://go-review.googlesource.com/c/go/+/453415 Reviewed-by: Damien Neil Reviewed-by: Cherry Mui Reviewed-by: Alex Brainman TryBot-Result: Gopher Robot Run-TryBot: Quim Muntal --- src/cmd/compile/internal/test/inl_test.go | 3 +++ src/unicode/utf16/utf16.go | 22 +++++++++++++++------- src/unicode/utf16/utf16_test.go | 17 +++++++++++++++++ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index 201f5773e97..e59104df531 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -123,6 +123,9 @@ func TestIntendedInlining(t *testing.T) { "AppendRune", "ValidRune", }, + "unicode/utf16": { + "Decode", + }, "reflect": { "Value.Bool", "Value.Bytes", diff --git a/src/unicode/utf16/utf16.go b/src/unicode/utf16/utf16.go index 38d8be60602..1c6d2c66c30 100644 --- a/src/unicode/utf16/utf16.go +++ b/src/unicode/utf16/utf16.go @@ -103,23 +103,31 @@ func AppendRune(a []uint16, r rune) []uint16 { // Decode returns the Unicode code point sequence represented // by the UTF-16 encoding s. func Decode(s []uint16) []rune { - a := make([]rune, len(s)) - n := 0 + // Preallocate capacity to hold up to 64 runes. + // Decode inlines, so the allocation can live on the stack. + buf := make([]rune, 0, 64) + return decode(s, buf) +} + +// decode appends to buf the Unicode code point sequence represented +// by the UTF-16 encoding s and return the extended buffer. +func decode(s []uint16, buf []rune) []rune { for i := 0; i < len(s); i++ { + var ar rune switch r := s[i]; { case r < surr1, surr3 <= r: // normal rune - a[n] = rune(r) + ar = rune(r) case surr1 <= r && r < surr2 && i+1 < len(s) && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence - a[n] = DecodeRune(rune(r), rune(s[i+1])) + ar = DecodeRune(rune(r), rune(s[i+1])) i++ default: // invalid surrogate sequence - a[n] = replacementChar + ar = replacementChar } - n++ + buf = append(buf, ar) } - return a[:n] + return buf } diff --git a/src/unicode/utf16/utf16_test.go b/src/unicode/utf16/utf16_test.go index be339b1fdf1..a5a503d3874 100644 --- a/src/unicode/utf16/utf16_test.go +++ b/src/unicode/utf16/utf16_test.go @@ -5,6 +5,7 @@ package utf16_test import ( + "internal/testenv" "reflect" "testing" "unicode" @@ -103,6 +104,22 @@ var decodeTests = []decodeTest{ {[]uint16{0xdfff}, []rune{0xfffd}}, } +func TestAllocationsDecode(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + + for _, tt := range decodeTests { + allocs := testing.AllocsPerRun(10, func() { + out := Decode(tt.in) + if out == nil { + t.Errorf("Decode(%x) = nil", tt.in) + } + }) + if allocs > 0 { + t.Errorf("Decode allocated %v times", allocs) + } + } +} + func TestDecode(t *testing.T) { for _, tt := range decodeTests { out := Decode(tt.in)