From 57d9de3ac88feba88c51922154edc1a9da71ee48 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 21 Apr 2010 16:27:18 -0700 Subject: [PATCH] utf16: add DecodeRune, EncodeRune R=r CC=golang-dev https://golang.org/cl/970041 --- src/pkg/utf16/utf16.go | 35 +++++++++++++++++++++++++++++++---- src/pkg/utf16/utf16_test.go | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/pkg/utf16/utf16.go b/src/pkg/utf16/utf16.go index 3031624526..372e38a718 100644 --- a/src/pkg/utf16/utf16.go +++ b/src/pkg/utf16/utf16.go @@ -18,6 +18,33 @@ const ( surrSelf = 0x10000 ) +// IsSurrogate returns true if the specified Unicode code point +// can appear in a surrogate pair. +func IsSurrogate(rune int) bool { + return surr1 <= rune && rune < surr3 +} + +// DecodeRune returns the UTF-16 decoding of a surrogate pair. +// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns +// the Unicode replacement code point U+FFFD. +func DecodeRune(r1, r2 int) int { + if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { + return (int(r1)-surr1)<<10 | (int(r2) - surr2) + 0x10000 + } + return unicode.ReplacementChar +} + +// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. +// If the rune is not a valid Unicode code point or does not need encoding, +// EncodeRune returns U+FFFD, U+FFFD. +func EncodeRune(rune int) (r1, r2 int) { + if rune < surrSelf || rune > unicode.MaxRune || IsSurrogate(rune) { + return unicode.ReplacementChar, unicode.ReplacementChar + } + rune -= surrSelf + return surr1 + (rune>>10)&0x3ff, surr2 + rune&0x3ff +} + // Encode returns the UTF-16 encoding of the Unicode code point sequence s. func Encode(s []int) []uint16 { n := len(s) @@ -38,9 +65,9 @@ func Encode(s []int) []uint16 { a[n] = uint16(v) n++ default: - v -= surrSelf - a[n] = uint16(surr1 + (v>>10)&0x3ff) - a[n+1] = uint16(surr2 + v&0x3ff) + r1, r2 := EncodeRune(v) + a[n] = uint16(r1) + a[n+1] = uint16(r2) n += 2 } } @@ -57,7 +84,7 @@ func Decode(s []uint16) []int { case surr1 <= r && r < surr2 && i+1 < len(s) && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence - a[n] = (int(r)-surr1)<<10 | (int(s[i+1]) - surr2) + 0x10000 + a[n] = DecodeRune(int(r), int(s[i+1])) i++ n++ case surr1 <= r && r < surr3: diff --git a/src/pkg/utf16/utf16_test.go b/src/pkg/utf16/utf16_test.go index c6e269aad0..c0848aa387 100644 --- a/src/pkg/utf16/utf16_test.go +++ b/src/pkg/utf16/utf16_test.go @@ -8,6 +8,7 @@ import ( "fmt" "reflect" "testing" + "unicode" ) type encodeTest struct { @@ -32,6 +33,41 @@ func TestEncode(t *testing.T) { } } +func TestEncodeRune(t *testing.T) { + for i, tt := range encodeTests { + j := 0 + for _, r := range tt.in { + r1, r2 := EncodeRune(r) + if r < 0x10000 || r > unicode.MaxRune { + if j >= len(tt.out) { + t.Errorf("#%d: ran out of tt.out", i) + break + } + if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { + t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2) + } + j++ + } else { + if j+1 >= len(tt.out) { + t.Errorf("#%d: ran out of tt.out", i) + break + } + if r1 != int(tt.out[j]) || r2 != int(tt.out[j+1]) { + t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1]) + } + j += 2 + dec := DecodeRune(r1, r2) + if dec != r { + t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r) + } + } + } + if j != len(tt.out) { + t.Errorf("#%d: EncodeRune didn't generate enough output", i) + } + } +} + type decodeTest struct { in []uint16 out []int