mirror of
https://github.com/golang/go
synced 2024-11-21 22:54:40 -07:00
cmd/gc: string conversion for surrogates
The spec requires that converting a surrogate half to a string produce the replacement character (U+FFFD). The fix lies in lib9's rune code.
R=golang-dev, nigeltao, rsc
CC=golang-dev
https://golang.org/cl/6443109
This commit is contained in:
parent
b7627d3d1f
commit
363ec80dec
@ -36,12 +36,14 @@ enum
|
|||||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||||
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
|
||||||
/* 0001 1111 1111 1111 1111 1111 */
|
|
||||||
|
|
||||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
|
||||||
|
SurrogateMin = 0xD800,
|
||||||
|
SurrogateMax = 0xDFFF,
|
||||||
|
|
||||||
Bad = Runeerror,
|
Bad = Runeerror,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -122,6 +124,8 @@ charntorune(Rune *rune, const char *str, int length)
|
|||||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||||
if(l <= Rune2)
|
if(l <= Rune2)
|
||||||
goto bad;
|
goto bad;
|
||||||
|
if (SurrogateMin <= l && l <= SurrogateMax)
|
||||||
|
goto bad;
|
||||||
*rune = l;
|
*rune = l;
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
@ -138,7 +142,7 @@ charntorune(Rune *rune, const char *str, int length)
|
|||||||
goto bad;
|
goto bad;
|
||||||
if (c < T5) {
|
if (c < T5) {
|
||||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
if (l <= Rune3)
|
if (l <= Rune3 || l > Runemax)
|
||||||
goto bad;
|
goto bad;
|
||||||
*rune = l;
|
*rune = l;
|
||||||
return 4;
|
return 4;
|
||||||
@ -208,6 +212,8 @@ chartorune(Rune *rune, const char *str)
|
|||||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||||
if(l <= Rune2)
|
if(l <= Rune2)
|
||||||
goto bad;
|
goto bad;
|
||||||
|
if (SurrogateMin <= l && l <= SurrogateMax)
|
||||||
|
goto bad;
|
||||||
*rune = l;
|
*rune = l;
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
@ -221,7 +227,7 @@ chartorune(Rune *rune, const char *str)
|
|||||||
goto bad;
|
goto bad;
|
||||||
if (c < T5) {
|
if (c < T5) {
|
||||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
if (l <= Rune3)
|
if (l <= Rune3 || l > Runemax)
|
||||||
goto bad;
|
goto bad;
|
||||||
*rune = l;
|
*rune = l;
|
||||||
return 4;
|
return 4;
|
||||||
@ -273,13 +279,15 @@ runetochar(char *str, const Rune *rune)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the Rune is out of range, convert it to the error rune.
|
* If the Rune is out of range or a surrogate half, convert it to the error rune.
|
||||||
* Do this test here because the error rune encodes to three bytes.
|
* Do this test here because the error rune encodes to three bytes.
|
||||||
* Doing it earlier would duplicate work, since an out of range
|
* Doing it earlier would duplicate work, since an out of range
|
||||||
* Rune wouldn't have fit in one or two bytes.
|
* Rune wouldn't have fit in one or two bytes.
|
||||||
*/
|
*/
|
||||||
if (c > Runemax)
|
if (c > Runemax)
|
||||||
c = Runeerror;
|
c = Runeerror;
|
||||||
|
if (SurrogateMin <= c && c <= SurrogateMax)
|
||||||
|
c = Runeerror;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* three character sequence
|
* three character sequence
|
||||||
|
@ -69,7 +69,7 @@ var utf8map = []Utf8Map{
|
|||||||
|
|
||||||
var surrogateMap = []Utf8Map{
|
var surrogateMap = []Utf8Map{
|
||||||
{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
|
{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
|
||||||
{0xdfff, "\xed bf bf"}, // surrogate max decodes to (RuneError, 1)
|
{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
var testStrings = []string{
|
var testStrings = []string{
|
||||||
@ -355,7 +355,9 @@ var validTests = []ValidTest{
|
|||||||
{string([]byte{66, 250}), false},
|
{string([]byte{66, 250}), false},
|
||||||
{string([]byte{66, 250, 67}), false},
|
{string([]byte{66, 250, 67}), false},
|
||||||
{"a\uFFFDb", true},
|
{"a\uFFFDb", true},
|
||||||
{string("\xF7\xBF\xBF\xBF"), true}, // U+1FFFFF
|
{string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF
|
||||||
|
{string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range
|
||||||
|
{string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range
|
||||||
{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
|
{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
|
||||||
{string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect
|
{string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect
|
||||||
{string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic)
|
{string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic)
|
||||||
|
@ -93,7 +93,7 @@ func main() {
|
|||||||
"backslashes 2 (backquote)")
|
"backslashes 2 (backquote)")
|
||||||
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)")
|
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)")
|
||||||
|
|
||||||
// test large runes. perhaps not the most logical place for this test.
|
// test large and surrogate-half runes. perhaps not the most logical place for these tests.
|
||||||
var r int32
|
var r int32
|
||||||
r = 0x10ffff // largest rune value
|
r = 0x10ffff // largest rune value
|
||||||
s = string(r)
|
s = string(r)
|
||||||
@ -101,6 +101,28 @@ func main() {
|
|||||||
r = 0x10ffff + 1
|
r = 0x10ffff + 1
|
||||||
s = string(r)
|
s = string(r)
|
||||||
assert(s, "\xef\xbf\xbd", "too-large rune")
|
assert(s, "\xef\xbf\xbd", "too-large rune")
|
||||||
|
r = 0xD800
|
||||||
|
s = string(r)
|
||||||
|
assert(s, "\xef\xbf\xbd", "surrogate rune min")
|
||||||
|
r = 0xDFFF
|
||||||
|
s = string(r)
|
||||||
|
assert(s, "\xef\xbf\xbd", "surrogate rune max")
|
||||||
|
r = -1
|
||||||
|
s = string(r)
|
||||||
|
assert(s, "\xef\xbf\xbd", "negative rune")
|
||||||
|
|
||||||
|
// the large rune tests again, this time using constants instead of a variable.
|
||||||
|
// these conversions will be done at compile time.
|
||||||
|
s = string(0x10ffff) // largest rune value
|
||||||
|
assert(s, "\xf4\x8f\xbf\xbf", "largest rune constant")
|
||||||
|
s = string(0x10ffff + 1)
|
||||||
|
assert(s, "\xef\xbf\xbd", "too-large rune constant")
|
||||||
|
s = string(0xD800)
|
||||||
|
assert(s, "\xef\xbf\xbd", "surrogate rune min constant")
|
||||||
|
s = string(0xDFFF)
|
||||||
|
assert(s, "\xef\xbf\xbd", "surrogate rune max constant")
|
||||||
|
s = string(-1)
|
||||||
|
assert(s, "\xef\xbf\xbd", "negative rune")
|
||||||
|
|
||||||
assert(string(gr1), gx1, "global ->[]rune")
|
assert(string(gr1), gx1, "global ->[]rune")
|
||||||
assert(string(gr2), gx2fix, "global invalid ->[]rune")
|
assert(string(gr2), gx2fix, "global invalid ->[]rune")
|
||||||
|
Loading…
Reference in New Issue
Block a user