1
0
mirror of https://github.com/golang/go synced 2024-11-11 19:21:37 -07:00

unicode/utf16: add func RuneLen

This CL adds func RuneLen and, while here, also uses RuneLen to simplify
the code in Encode.

Fixes #44940

Change-Id: Ifd3b537f69880dfd32a69a6733d8d3c2b5d4ecba
Reviewed-on: https://go-review.googlesource.com/c/go/+/569755
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Commit-Queue: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
This commit is contained in:
Jes Cok 2024-03-07 21:36:47 +08:00 committed by Gopher Robot
parent e0ba596c15
commit ef4f2a0597
5 changed files with 43 additions and 5 deletions

1
api/next/44940.txt Normal file
View File

@ -0,0 +1 @@
pkg unicode/utf16, func RuneLen(int32) int #44940

View File

@ -0,0 +1,3 @@
The [`unicode/utf16.RuneLen`](/pkg/unicode/utf16#RuneLen) function returns
the number of 16-bit words in the UTF-16 encoding of the rune. It returns -1
if the rune is not a valid value to encode in UTF-16.

View File

@ -6,6 +6,9 @@ package utf16
// Extra names for constants so we can validate them during testing. // Extra names for constants so we can validate them during testing.
const ( const (
Surr1 = surr1
Surr3 = surr3
SurrSelf = surrSelf
MaxRune = maxRune MaxRune = maxRune
ReplacementChar = replacementChar ReplacementChar = replacementChar
) )

View File

@ -52,6 +52,19 @@ func EncodeRune(r rune) (r1, r2 rune) {
return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
} }
// RuneLen reports how many 16-bit words the UTF-16 encoding of r occupies.
// The result is -1 when r is not a valid value to encode in UTF-16.
func RuneLen(r rune) int {
	// Runes in either of these two ranges are encoded as a single word.
	beforeSurr := r >= 0 && r < surr1
	afterSurr := r >= surr3 && r < surrSelf
	if beforeSurr || afterSurr {
		return 1
	}
	// Runes from surrSelf through maxRune are encoded as two words.
	if r >= surrSelf && r <= maxRune {
		return 2
	}
	// Everything else cannot be encoded.
	return -1
}
// Encode returns the UTF-16 encoding of the Unicode code point sequence s. // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []rune) []uint16 { func Encode(s []rune) []uint16 {
n := len(s) n := len(s)
@ -64,13 +77,11 @@ func Encode(s []rune) []uint16 {
a := make([]uint16, n) a := make([]uint16, n)
n = 0 n = 0
for _, v := range s { for _, v := range s {
switch { switch RuneLen(v) {
case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: case 1: // normal rune
// normal rune
a[n] = uint16(v) a[n] = uint16(v)
n++ n++
case surrSelf <= v && v <= maxRune: case 2: // needs surrogate sequence
// needs surrogate sequence
r1, r2 := EncodeRune(v) r1, r2 := EncodeRune(v)
a[n] = uint16(r1) a[n] = uint16(r1)
a[n+1] = uint16(r2) a[n+1] = uint16(r2)

View File

@ -22,6 +22,26 @@ func TestConstants(t *testing.T) {
} }
} }
// TestRuneLen checks RuneLen at the edges of each range it distinguishes,
// as well as just past MaxRune and for a negative rune.
func TestRuneLen(t *testing.T) {
	type runeLenCase struct {
		r    rune
		want int
	}
	cases := []runeLenCase{
		{0, 1},
		{Surr1 - 1, 1},
		{Surr3, 1},
		{SurrSelf - 1, 1},
		{SurrSelf, 2},
		{MaxRune, 2},
		{MaxRune + 1, -1},
		{-1, -1},
	}
	for _, tc := range cases {
		got := RuneLen(tc.r)
		if got == tc.want {
			continue
		}
		t.Errorf("RuneLen(%#U) = %d, want %d", tc.r, got, tc.want)
	}
}
type encodeTest struct { type encodeTest struct {
in []rune in []rune
out []uint16 out []uint16