From 5ea413e2a0414c4bd6dbf9d00d760b6ed8b156ec Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Wed, 27 Jul 2011 15:54:23 -0700 Subject: [PATCH] unicode: fix case-mapping for roman numerals!! Hurray! Also fix the mystical U+0345 COMBINING GREEK YPOGEGRAMMENI, so everyone is satisfied. Also add a -local flag to use local files for faster turnaround when debugging. R=rsc CC=golang-dev https://golang.org/cl/4825054 --- src/pkg/unicode/letter_test.go | 4 ++ src/pkg/unicode/maketables.go | 84 ++++++++++++++++++++----------- src/pkg/unicode/tables.go | 91 +++------------------------------- 3 files changed, 65 insertions(+), 114 deletions(-) diff --git a/src/pkg/unicode/letter_test.go b/src/pkg/unicode/letter_test.go index c4e26df580..8d2665a44f 100644 --- a/src/pkg/unicode/letter_test.go +++ b/src/pkg/unicode/letter_test.go @@ -212,6 +212,10 @@ var caseTest = []caseT{ {UpperCase, 0x10450, 0x10450}, {LowerCase, 0x10450, 0x10450}, {TitleCase, 0x10450, 0x10450}, + + // Non-letters with case. + {LowerCase, 0x2161, 0x2171}, + {UpperCase, 0x0345, 0x0399}, } func TestIsLetter(t *testing.T) { diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index 0020dc427c..b586bc655f 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -14,6 +14,7 @@ import ( "http" "log" "os" + "path/filepath" "sort" "strconv" "strings" @@ -54,10 +55,47 @@ var cases = flag.Bool("cases", var test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data") +var localFiles = flag.Bool("local", + false, + "data files have been copied to current directory; for debugging only") var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? 
*; ([A-Za-z_]+)$`)
 var logger = log.New(os.Stderr, "", log.Lshortfile)
 
+type reader struct { // buffered input plus the handle that close must release
+	*bufio.Reader
+	fd   *os.File       // set only when reading a local copy (-local)
+	resp *http.Response // set only when the data was fetched over HTTP
+}
+
+// open returns a reader for url: the local file named by its base name if -local is set, else an HTTP GET of url itself.
+func open(url string) *reader {
+	file := filepath.Base(url)
+	if *localFiles {
+		fd, err := os.Open(file)
+		if err != nil {
+			logger.Fatal(err)
+		}
+		return &reader{bufio.NewReader(fd), fd, nil}
+	}
+	resp, err := http.Get(url)
+	if err != nil {
+		logger.Fatal(err)
+	}
+	if resp.StatusCode != 200 {
+		logger.Fatalf("bad GET status for %s: %s", file, resp.Status)
+	}
+	return &reader{bufio.NewReader(resp.Body), nil, resp}
+}
+
+func (r *reader) close() { // releases whichever of fd or resp.Body open set
+	if r.fd != nil {
+		r.fd.Close()
+	} else {
+		r.resp.Body.Close()
+	}
+}
+
 var category = map[string]bool{
 	// Nd Lu etc.
 	// We use one-character names to identify merged categories
@@ -192,7 +230,7 @@ func parseCategory(line string) (state State) {
 		char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
 	case "Lt":
 		char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
-	case "Lm", "Lo":
+	default:
 		char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
 	}
 	switch {
@@ -272,14 +310,7 @@ func loadChars() {
 	if *dataURL == "" {
 		flag.Set("data", *url+"UnicodeData.txt")
 	}
-	resp, err := http.Get(*dataURL)
-	if err != nil {
-		logger.Fatal(err)
-	}
-	if resp.StatusCode != 200 {
-		logger.Fatal("bad GET status for UnicodeData.txt", resp.Status)
-	}
-	input := bufio.NewReader(resp.Body)
+	input := open(*dataURL)
 	var first uint32 = 0
 	for {
 		line, err := input.ReadString('\n')
@@ -310,21 +341,14 @@ func loadChars() {
 			first = 0
 		}
 	}
-	resp.Body.Close()
+	input.close()
 }
 
 func loadCasefold() {
 	if *casefoldingURL == "" {
 		flag.Set("casefolding", *url+"CaseFolding.txt")
 	}
-	resp, err := http.Get(*casefoldingURL)
-	if err != nil {
-		logger.Fatal(err)
-	}
-	if resp.StatusCode != 200 {
-		logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
-	}
-
input := bufio.NewReader(resp.Body) + input := open(*casefoldingURL) for { line, err := input.ReadString('\n') if err != nil { @@ -355,7 +379,7 @@ func loadCasefold() { } chars[p1].foldCase = int(p2) } - resp.Body.Close() + input.close() } const progHeader = `// Generated by running @@ -662,15 +686,7 @@ func printScriptOrProperty(doProps bool) { if flaglist == "" { return } - var err os.Error - resp, err := http.Get(*url + file) - if err != nil { - logger.Fatal(err) - } - if resp.StatusCode != 200 { - logger.Fatal("bad GET status for ", file, ":", resp.Status) - } - input := bufio.NewReader(resp.Body) + input := open(*url + file) for { line, err := input.ReadString('\n') if err != nil { @@ -681,7 +697,7 @@ func printScriptOrProperty(doProps bool) { } parseScript(line[0:len(line)-1], table) } - resp.Body.Close() + input.close() // Find out which scripts to dump list := strings.Split(flaglist, ",") @@ -864,6 +880,16 @@ func getCaseState(i int) (c *caseState) { case ch.titleCase: c._case = CaseTitle } + // Some things such as roman numeral U+2161 don't describe themselves + // as upper case, but have a lower case. Second-guess them. + if c._case == CaseNone && ch.lowerCase != 0 { + c._case = CaseUpper + } + // Same in the other direction. 
+ if c._case == CaseNone && ch.upperCase != 0 { + c._case = CaseLower + } + if ch.upperCase != 0 { c.deltaToUpper = ch.upperCase - i } diff --git a/src/pkg/unicode/tables.go b/src/pkg/unicode/tables.go index 9fe43017e5..88b5c0fbaa 100644 --- a/src/pkg/unicode/tables.go +++ b/src/pkg/unicode/tables.go @@ -5266,6 +5266,7 @@ var _CaseRanges = []CaseRange{ {0x028A, 0x028B, d{-217, 0, -217}}, {0x028C, 0x028C, d{-71, 0, -71}}, {0x0292, 0x0292, d{-219, 0, -219}}, + {0x0345, 0x0345, d{84, 0, 84}}, {0x0370, 0x0373, d{UpperLower, UpperLower, UpperLower}}, {0x0376, 0x0377, d{UpperLower, UpperLower, UpperLower}}, {0x037B, 0x037D, d{130, 0, 130}}, @@ -5375,7 +5376,11 @@ var _CaseRanges = []CaseRange{ {0x212B, 0x212B, d{0, -8262, 0}}, {0x2132, 0x2132, d{0, 28, 0}}, {0x214E, 0x214E, d{-28, 0, -28}}, + {0x2160, 0x216F, d{0, 16, 0}}, + {0x2170, 0x217F, d{-16, 0, -16}}, {0x2183, 0x2184, d{UpperLower, UpperLower, UpperLower}}, + {0x24B6, 0x24CF, d{0, 26, 0}}, + {0x24D0, 0x24E9, d{-26, 0, -26}}, {0x2C00, 0x2C2E, d{0, 48, 0}}, {0x2C30, 0x2C5E, d{-48, 0, -48}}, {0x2C60, 0x2C61, d{UpperLower, UpperLower, UpperLower}}, @@ -5734,90 +5739,6 @@ var caseOrbit = []foldPair{ {0x2126, 0x03A9}, {0x212A, 0x004B}, {0x212B, 0x00C5}, - {0x2160, 0x2170}, - {0x2161, 0x2171}, - {0x2162, 0x2172}, - {0x2163, 0x2173}, - {0x2164, 0x2174}, - {0x2165, 0x2175}, - {0x2166, 0x2176}, - {0x2167, 0x2177}, - {0x2168, 0x2178}, - {0x2169, 0x2179}, - {0x216A, 0x217A}, - {0x216B, 0x217B}, - {0x216C, 0x217C}, - {0x216D, 0x217D}, - {0x216E, 0x217E}, - {0x216F, 0x217F}, - {0x2170, 0x2160}, - {0x2171, 0x2161}, - {0x2172, 0x2162}, - {0x2173, 0x2163}, - {0x2174, 0x2164}, - {0x2175, 0x2165}, - {0x2176, 0x2166}, - {0x2177, 0x2167}, - {0x2178, 0x2168}, - {0x2179, 0x2169}, - {0x217A, 0x216A}, - {0x217B, 0x216B}, - {0x217C, 0x216C}, - {0x217D, 0x216D}, - {0x217E, 0x216E}, - {0x217F, 0x216F}, - {0x24B6, 0x24D0}, - {0x24B7, 0x24D1}, - {0x24B8, 0x24D2}, - {0x24B9, 0x24D3}, - {0x24BA, 0x24D4}, - {0x24BB, 0x24D5}, - {0x24BC, 0x24D6}, - 
{0x24BD, 0x24D7}, - {0x24BE, 0x24D8}, - {0x24BF, 0x24D9}, - {0x24C0, 0x24DA}, - {0x24C1, 0x24DB}, - {0x24C2, 0x24DC}, - {0x24C3, 0x24DD}, - {0x24C4, 0x24DE}, - {0x24C5, 0x24DF}, - {0x24C6, 0x24E0}, - {0x24C7, 0x24E1}, - {0x24C8, 0x24E2}, - {0x24C9, 0x24E3}, - {0x24CA, 0x24E4}, - {0x24CB, 0x24E5}, - {0x24CC, 0x24E6}, - {0x24CD, 0x24E7}, - {0x24CE, 0x24E8}, - {0x24CF, 0x24E9}, - {0x24D0, 0x24B6}, - {0x24D1, 0x24B7}, - {0x24D2, 0x24B8}, - {0x24D3, 0x24B9}, - {0x24D4, 0x24BA}, - {0x24D5, 0x24BB}, - {0x24D6, 0x24BC}, - {0x24D7, 0x24BD}, - {0x24D8, 0x24BE}, - {0x24D9, 0x24BF}, - {0x24DA, 0x24C0}, - {0x24DB, 0x24C1}, - {0x24DC, 0x24C2}, - {0x24DD, 0x24C3}, - {0x24DE, 0x24C4}, - {0x24DF, 0x24C5}, - {0x24E0, 0x24C6}, - {0x24E1, 0x24C7}, - {0x24E2, 0x24C8}, - {0x24E3, 0x24C9}, - {0x24E4, 0x24CA}, - {0x24E5, 0x24CB}, - {0x24E6, 0x24CC}, - {0x24E7, 0x24CD}, - {0x24E8, 0x24CE}, - {0x24E9, 0x24CF}, } // FoldCategory maps a category name to a table of @@ -6102,4 +6023,4 @@ var FoldScript = map[string]*RangeTable{} // Range entries: 3391 16-bit, 659 32-bit, 4050 total. // Range bytes: 20346 16-bit, 7908 32-bit, 28254 total. -// Fold orbit bytes: 147 pairs, 588 bytes +// Fold orbit bytes: 63 pairs, 252 bytes