1
0
mirror of https://github.com/golang/go synced 2024-11-22 02:44:39 -07:00

exp/locale/collate: Added functionality to parse and process LDML files

for both locale-specific exemplar characters and tailorings to
the collation table.
Some specifices:
- Moved stringSet to the beginning of the file and added some functionality
  to parse command line files.
- openReader now modifies the input URL for localFiles to guarantee that
  any http source listed in the generated file is indeed this source.
- Note that the implementation of the Tailoring API used by maketables.go
  is not yet checked in. So for now adding tailorings are simply no-ops.
- The generated file of exemplar characters will be used somewhere else.
  Here is a snippet of how the body of the generated file looks like:

type exemplarType int
const (
        exCharacters exemplarType = iota
        exContractions
        exPunctuation
        exAuxiliary
        exCurrency
        exIndex
        exN
)

var exemplarCharacters = map[string][exN]string{
        "af": [exN]string{
                0: "a á â b c d e é è ê ë f g h i î ï j k l m n o ô ö p q r s t u û v w x y z",
                3: "á à â ä ã æ ç é è ê ë í ì î ï ó ò ô ö ú ù û ü ý",
                4: "a b c d e f g h i j k l m n o p q r s t u v w x y z",
        },
        ...
}

R=r
CC=golang-dev
https://golang.org/cl/6501070
This commit is contained in:
Marcel van Lohuizen 2012-09-01 14:15:00 +02:00
parent 18aa55c169
commit 5a78e5ea4c
2 changed files with 543 additions and 43 deletions

View File

@ -10,13 +10,16 @@
package main
import (
"archive/zip"
"bufio"
"bytes"
"encoding/xml"
"exp/locale/collate"
"exp/locale/collate/build"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"os"
@ -26,17 +29,155 @@ import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
var ducet = flag.String("ducet",
var (
// TODO: change default to "http://unicode.org/Public/UCA/"+unicode.Version+"/CollationAuxiliary.zip"
root = flag.String("root",
"http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
"URL of the Default Unicode Collation Element Table (DUCET).")
var test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data")
var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")
`URL of the Default Unicode Collation Element Table (DUCET). This can be a zip
file containing the file allkeys_CLDR.txt or an allkeys.txt file.`)
cldr = flag.String("cldr",
"http://www.unicode.org/Public/cldr/2.0.1/core.zip",
"URL of CLDR archive.")
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
localFiles = flag.Bool("local", false,
"data files have been copied to the current directory; for debugging only.")
short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
draft = flag.Bool("draft", false, `Use draft versions, when available.`)
tags = flag.String("tags", "", "build tags to be included after +build directive")
pkg = flag.String("package", "collate",
"the name of the package in which the generated file is to be included")
tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
"comma-spearated list of tables to generate.")
exclude = flagStringSet("exclude", "zh2", "",
"comma-separated list of languages to exclude.")
include = flagStringSet("include", "", "",
"comma-separated list of languages to include. Include trumps exclude.")
types = flagStringSetAllowAll("types", "", "",
"comma-separated list of types that should be included in addition to the standard type.")
)
// stringSet implements an ordered set based on a list. It implements flag.Value
// to allow a set to be specified as a comma-separated list.
type stringSet struct {
s []string
allowed *stringSet
dirty bool // needs compaction if true
all bool
allowAll bool
}
func flagStringSet(name, def, allowed, usage string) *stringSet {
ss := &stringSet{}
if allowed != "" {
usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
ss.allowed = &stringSet{}
failOnError(ss.allowed.Set(allowed))
}
ss.Set(def)
flag.Var(ss, name, usage)
return ss
}
func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
ss := &stringSet{allowAll: true}
if allowed == "" {
flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
} else {
ss.allowed = &stringSet{}
failOnError(ss.allowed.Set(allowed))
flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
}
ss.Set(def)
return ss
}
func (ss stringSet) Len() int {
return len(ss.s)
}
func (ss stringSet) String() string {
return strings.Join(ss.s, ",")
}
func (ss *stringSet) Set(s string) error {
if ss.allowAll && s == "all" {
ss.s = nil
ss.all = true
return nil
}
ss.s = ss.s[:0]
for _, s := range strings.Split(s, ",") {
if s := strings.TrimSpace(s); s != "" {
if ss.allowed != nil && !ss.allowed.contains(s) {
return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
}
ss.add(s)
}
}
ss.compact()
return nil
}
func (ss *stringSet) add(s string) {
ss.s = append(ss.s, s)
ss.dirty = true
}
func (ss *stringSet) values() []string {
ss.compact()
return ss.s
}
func (ss *stringSet) contains(s string) bool {
if ss.all {
return true
}
for _, v := range ss.s {
if v == s {
return true
}
}
return false
}
func (ss *stringSet) compact() {
if !ss.dirty {
return
}
a := ss.s
sort.Strings(a)
k := 0
for i := 1; i < len(a); i++ {
if a[k] != a[i] {
a[k+1] = a[i]
k++
}
}
ss.s = a[:k+1]
ss.dirty = false
}
func skipLang(l string) bool {
if include.Len() > 0 {
return !include.contains(l)
}
return exclude.contains(l)
}
func skipAlt(a string) bool {
if *draft && a == "proposed" {
return false
}
if *short && a == "short" {
return false
}
return true
}
func failOnError(e error) {
if e != nil {
@ -44,31 +185,55 @@ func failOnError(e error) {
}
}
// openReader opens the url or file given by url and returns it as an io.ReadCloser
// openReader opens the URL or file given by url and returns it as an io.ReadCloser
// or nil on error.
func openReader(url string) (io.ReadCloser, error) {
func openReader(url *string) (io.ReadCloser, error) {
if *localFiles {
pwd, _ := os.Getwd()
url = "file://" + path.Join(pwd, path.Base(url))
*url = "file://" + path.Join(pwd, path.Base(*url))
}
t := &http.Transport{}
t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
c := &http.Client{Transport: t}
resp, err := c.Get(url)
resp, err := c.Get(*url)
if err != nil {
return nil, err
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status)
return nil, fmt.Errorf(`bad GET status for "%s": %s`, *url, resp.Status)
}
return resp.Body, nil
}
func openArchive(url *string) *zip.Reader {
f, err := openReader(url)
failOnError(err)
buffer, err := ioutil.ReadAll(f)
f.Close()
failOnError(err)
archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
failOnError(err)
return archive
}
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format.
// It returns the variable top.
func parseUCA(builder *build.Builder) {
r, err := openReader(*ducet)
var r io.ReadCloser
var err error
if strings.HasSuffix(*root, ".zip") {
for _, f := range openArchive(root).File {
if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
r, err = f.Open()
}
}
if r == nil {
err = fmt.Errorf("file allkeys_CLDR.txt not found in archive %q", *root)
}
} else {
r, err = openReader(root)
}
failOnError(err)
defer r.Close()
input := bufio.NewReader(r)
@ -147,30 +312,350 @@ func convHex(line int, s string) int {
var testInput = stringSet{}
type stringSet struct {
set []string
// LDML holds all collation information parsed from an LDML XML file.
// The format of these files is defined in http://unicode.org/reports/tr35/.
type LDML struct {
XMLName xml.Name `xml:"ldml"`
Language Attr `xml:"identity>language"`
Territory Attr `xml:"identity>territory"`
Chars *struct {
ExemplarCharacters []AttrValue `xml:"exemplarCharacters"`
MoreInformaton string `xml:"moreInformation,omitempty"`
} `xml:"characters"`
Default Attr `xml:"collations>default"`
Collations []Collation `xml:"collations>collation"`
}
func (ss *stringSet) add(s string) {
ss.set = append(ss.set, s)
type Attr struct {
XMLName xml.Name
Attr string `xml:"type,attr"`
}
func (ss *stringSet) values() []string {
ss.compact()
return ss.set
func (t Attr) String() string {
return t.Attr
}
func (ss *stringSet) compact() {
a := ss.set
sort.Strings(a)
k := 0
for i := 1; i < len(a); i++ {
if a[k] != a[i] {
a[k+1] = a[i]
k++
type AttrValue struct {
Type string `xml:"type,attr"`
Key string `xml:"key,attr,omitempty"`
Draft string `xml:"draft,attr,omitempty"`
Value string `xml:",innerxml"`
}
type Collation struct {
Type string `xml:"type,attr"`
Alt string `xml:"alt,attr"`
SuppressContraction string `xml:"suppress_contractions,omitempty"`
Settings *Settings `xml:"settings"`
Optimize string `xml:"optimize"`
Rules Rules `xml:"rules"`
}
type Optimize struct {
XMLName xml.Name `xml:"optimize"`
Data string `xml:"chardata"`
}
type Suppression struct {
XMLName xml.Name `xml:"suppress_contractions"`
Data string `xml:"chardata"`
}
type Settings struct {
Strength string `xml:"strenght,attr,omitempty"`
Backwards string `xml:"backwards,attr,omitempty"`
Normalization string `xml:"normalization,attr,omitempty"`
CaseLevel string `xml:"caseLevel,attr,omitempty"`
CaseFirst string `xml:"caseFirst,attr,omitempty"`
HiraganaQuarternary string `xml:"hiraganaQuartenary,attr,omitempty"`
Numeric string `xml:"numeric,attr,omitempty"`
VariableTop string `xml:"variableTop,attr,omitempty"`
}
type Rules struct {
XMLName xml.Name `xml:"rules"`
Any []RuleElem `xml:",any"`
}
type RuleElem struct {
XMLName xml.Name
Value string `xml:",innerxml"`
Before string `xml:"before,attr"`
Any []RuleElem `xml:",any"` // for <x> elements
}
var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
func (r *RuleElem) rewrite() {
// Convert hexadecimal Unicode codepoint notation to a string.
if m := charRe.FindAllStringSubmatch(r.Value, -1); m != nil {
runes := []rune{}
for _, sa := range m {
runes = append(runes, rune(convHex(-1, sa[1])))
}
r.Value = string(runes)
}
// Strip spaces from reset positions.
if m := tagRe.FindStringSubmatch(r.Value); m != nil {
r.Value = fmt.Sprintf("<%s/>", m[1])
}
for _, rr := range r.Any {
rr.rewrite()
}
}
func decodeXML(f *zip.File) *LDML {
r, err := f.Open()
failOnError(err)
d := xml.NewDecoder(r)
var x LDML
err = d.Decode(&x)
failOnError(err)
return &x
}
var mainLocales = []string{}
// charsets holds a list of exemplar characters per category.
type charSets map[string][]string
func (p charSets) fprint(w io.Writer) {
fmt.Fprintln(w, "[exN]string{")
for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
if set := p[k]; len(set) != 0 {
fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
}
}
fmt.Fprintln(w, "\t},")
}
var localeChars = make(map[string]charSets)
const exemplarHeader = `
type exemplarType int
const (
exCharacters exemplarType = iota
exContractions
exPunctuation
exAuxiliary
exCurrency
exIndex
exN
)
`
func printExemplarCharacters(w io.Writer) {
fmt.Fprintln(w, exemplarHeader)
fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
for _, loc := range mainLocales {
fmt.Fprintf(w, "\t%q: ", loc)
localeChars[loc].fprint(w)
}
fmt.Fprintln(w, "}")
}
var mainRe = regexp.MustCompile(`.*/main/(.*)\.xml`)
// parseMain parses XML files in the main directory of the CLDR core.zip file.
func parseMain() {
for _, f := range openArchive(cldr).File {
if m := mainRe.FindStringSubmatch(f.Name); m != nil {
locale := m[1]
x := decodeXML(f)
if skipLang(x.Language.Attr) {
continue
}
if x.Chars != nil {
for _, ec := range x.Chars.ExemplarCharacters {
if ec.Draft != "" {
continue
}
if _, ok := localeChars[locale]; !ok {
mainLocales = append(mainLocales, locale)
localeChars[locale] = make(charSets)
}
localeChars[locale][ec.Type] = parseCharacters(ec.Value)
}
}
}
}
}
func parseCharacters(chars string) []string {
parseSingle := func(s string) (r rune, tail string, escaped bool) {
if s[0] == '\\' {
if s[1] == 'u' || s[1] == 'U' {
r, _, tail, err := strconv.UnquoteChar(s, 0)
failOnError(err)
return r, tail, false
} else if strings.HasPrefix(s[1:], "&amp;") {
return '&', s[6:], false
}
return rune(s[1]), s[2:], true
} else if strings.HasPrefix(s, "&quot;") {
return '"', s[6:], false
}
r, sz := utf8.DecodeRuneInString(s)
return r, s[sz:], false
}
chars = strings.Trim(chars, "[ ]")
list := []string{}
var r, last, end rune
for len(chars) > 0 {
if chars[0] == '{' { // character sequence
buf := []rune{}
for chars = chars[1:]; len(chars) > 0; {
r, chars, _ = parseSingle(chars)
if r == '}' {
break
}
if r == ' ' {
log.Fatalf("space not supported in sequence %q", chars)
}
buf = append(buf, r)
}
list = append(list, string(buf))
last = 0
} else { // single character
escaped := false
r, chars, escaped = parseSingle(chars)
if r != ' ' {
if r == '-' && !escaped {
if last == 0 {
log.Fatal("'-' should be preceded by a character")
}
end, chars, _ = parseSingle(chars)
for ; last <= end; last++ {
list = append(list, string(last))
}
last = 0
} else {
list = append(list, string(r))
last = r
}
}
}
}
return list
}
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
func parseCollation(b *build.Builder) {
for _, f := range openArchive(cldr).File {
if m := fileRe.FindStringSubmatch(f.Name); m != nil {
lang := m[1]
x := decodeXML(f)
if skipLang(x.Language.Attr) {
continue
}
def := "standard"
if x.Default.Attr != "" {
def = x.Default.Attr
}
todo := make(map[string]Collation)
for _, c := range x.Collations {
if c.Type != def && !types.contains(c.Type) {
continue
}
if c.Alt != "" && skipAlt(c.Alt) {
continue
}
for j, _ := range c.Rules.Any {
c.Rules.Any[j].rewrite()
}
locale := lang
if c.Type != def {
locale += "_u_co_" + c.Type
}
_, exists := todo[locale]
if c.Alt != "" || !exists {
todo[locale] = c
}
}
for _, c := range x.Collations {
locale := lang
if c.Type != def {
locale += "_u_co_" + c.Type
}
if d, ok := todo[locale]; ok && d.Alt == c.Alt {
insertCollation(b, locale, &c)
}
}
}
}
}
var lmap = map[byte]collate.Level{
'p': collate.Primary,
's': collate.Secondary,
't': collate.Tertiary,
'i': collate.Identity,
}
// cldrIndex is a Unicode-reserved sentinel value used.
// We ignore any rule that starts with this rune.
// See http://unicode.org/reports/tr35/#Collation_Elements for details.
const cldrIndex = 0xFDD0
func insertTailoring(t *build.Tailoring, r RuleElem, context, extend string) {
switch l := r.XMLName.Local; l {
case "p", "s", "t", "i":
if []rune(r.Value)[0] != cldrIndex {
str := context + r.Value
if *test {
testInput.add(str)
}
err := t.Insert(lmap[l[0]], str, extend)
failOnError(err)
}
case "pc", "sc", "tc", "ic":
level := lmap[l[0]]
for _, s := range r.Value {
str := context + string(s)
if *test {
testInput.add(str)
}
err := t.Insert(level, str, extend)
failOnError(err)
}
default:
log.Fatalf("unsupported tag: %q", l)
}
}
func insertCollation(builder *build.Builder, locale string, c *Collation) {
t := builder.Tailoring(locale)
for _, r := range c.Rules.Any {
switch r.XMLName.Local {
case "reset":
if r.Before == "" {
failOnError(t.SetAnchor(r.Value))
} else {
failOnError(t.SetAnchorBefore(r.Value))
}
case "x":
var context, extend string
for _, r1 := range r.Any {
switch r1.XMLName.Local {
case "context":
context = r1.Value
case "extend":
extend = r1.Value
}
}
for _, r1 := range r.Any {
if t := r1.XMLName.Local; t == "context" || t == "extend" {
continue
}
insertTailoring(t, r1, context, extend)
}
default:
insertTailoring(t, r, "", "")
}
}
ss.set = a[:k+1]
}
func testCollator(c *collate.Collator) {
@ -214,7 +699,16 @@ func printCollators(c *collate.Collator) {
func main() {
flag.Parse()
b := build.NewBuilder()
if *root != "" {
parseUCA(b)
}
if *cldr != "" {
if tables.contains("chars") {
parseMain()
}
parseCollation(b)
}
c, err := b.Build()
failOnError(err)
@ -222,18 +716,24 @@ func main() {
testCollator(c)
} else {
fmt.Println("// Generated by running")
fmt.Printf("// maketables --ducet=%s\n", *ducet)
fmt.Printf("// maketables -root=%s -cldr=%s\n", *root, *cldr)
fmt.Println("// DO NOT EDIT")
fmt.Println("// TODO: implement more compact representation for sparse blocks.")
if *tags != "" {
fmt.Printf("// +build %s\n", *tags)
}
fmt.Println("")
fmt.Println("package collate")
fmt.Printf("package %s\n", *pkg)
if tables.contains("collate") {
fmt.Println("")
fmt.Println(`import "exp/norm"`)
fmt.Println("")
printCollators(c)
_, err = b.Print(os.Stdout)
failOnError(err)
}
if tables.contains("chars") {
printExemplarCharacters(os.Stdout)
}
}
}

View File

@ -1,5 +1,5 @@
// Generated by running
// maketables --ducet=http://unicode.org/Public/UCA/6.0.0/allkeys.txt
// maketables -root=file:///Users/mpvl/Projects/go/hg/gopub/src/pkg/exp/locale/collate/allkeys.txt -cldr=file:///Users/mpvl/Projects/go/hg/gopub/src/pkg/exp/locale/collate/core.zip
// DO NOT EDIT
// TODO: implement more compact representation for sparse blocks.