1
0
mirror of https://github.com/golang/go synced 2024-11-26 14:56:47 -07:00

exp/locale/collate/tools/colcmp: implementation of colcmp tool used for comparing

various implementations of collation.  The tool provides commands for sorting,
regressing one implementation against another, and benchmarking.
Currently it includes collation implementations for the Go collator, ICU,
and one using Darwin's CoreFoundation framework.
To avoid building this tool in the default build, the colcmp tag has been
added to all files. This allows other tools (e.g. it may make
sense to move maketables here) to be put in this directory as well.

R=r, rsc, mpvl
CC=golang-dev
https://golang.org/cl/6496118
This commit is contained in:
Marcel van Lohuizen 2012-09-24 13:22:03 +09:00
parent 0d82e69811
commit 5e47b77990
7 changed files with 2066 additions and 0 deletions

View File

@ -0,0 +1,7 @@
# Copyright 2012 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# chars regenerates chars.go by running ../maketables.go with -tables=chars
# and -package=main, then formats the generated file in place with gofmt.
chars:
go run ../maketables.go -tables=chars -package=main > chars.go
gofmt -w chars.go

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,95 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"exp/locale/collate"
"log"
"unicode/utf16"
)
// Input holds an input string in both UTF-8 and UTF-16 format, so that the
// same string can be handed to collators that work on either encoding.
type Input struct {
index int // used for restoring to original random order
UTF8 []byte // the string encoded as UTF-8
UTF16 []uint16 // the string encoded as UTF-16
key []byte // used for sorting
}
// String returns the UTF-8 form of the input converted to a string.
func (i Input) String() string {
	s := string(i.UTF8)
	return s
}
// makeInput wraps an already encoded UTF-8/UTF-16 pair in an Input.
// The slices are stored as given; they are not copied.
func makeInput(s8 []byte, s16 []uint16) Input {
	var in Input
	in.UTF8 = s8
	in.UTF16 = s16
	return in
}
// makeInputString builds an Input from s, deriving both the UTF-8 and the
// UTF-16 representation from the string.
func makeInputString(s string) Input {
	var in Input
	in.UTF8 = []byte(s)
	in.UTF16 = utf16.Encode([]rune(s))
	return in
}
// Collator is an interface for architecture-specific implementations of collation.
type Collator interface {
// Key generates a sort key for the given input. Implementations
// may return nil if a collator does not support sort keys.
Key(s Input) []byte
// Compare returns -1 if a < b, 1 if a > b and 0 if a == b.
Compare(a, b Input) int
}
// CollatorFactory creates a Collator for a given locale.
type CollatorFactory struct {
name string // name under which getCollator looks the factory up
makeFn func(locale string) (Collator, error) // builds the Collator for a locale
description string // human-readable summary of the implementation
}
// collators lists every registered factory, in registration order.
var collators = []CollatorFactory{}

// AddFactory makes f available to getCollator by appending it to the
// list of registered Collator factories.
func AddFactory(f CollatorFactory) {
	registered := append(collators, f)
	collators = registered
}
// getCollator finds the first factory registered under name and uses it to
// build a Collator for locale. The program is terminated via log.Fatal if
// the factory reports an error or if no factory has that name.
func getCollator(name, locale string) Collator {
	for i := range collators {
		factory := collators[i]
		if factory.name != name {
			continue
		}
		c, err := factory.makeFn(locale)
		if err != nil {
			log.Fatal(err)
		}
		return c
	}
	log.Fatalf("collator of type %q not found", name)
	return nil
}
// goCollator is an implementation of Collator using go's own collator.
type goCollator struct {
c *collate.Collator // the wrapped collator
buf collate.Buffer // buffer passed to c on every Key and Compare call
}
// init registers the Go collator under the name "go".
func init() {
	AddFactory(CollatorFactory{
		name:        "go",
		makeFn:      newGoCollator,
		description: "Go's native collator implementation.",
	})
}
// newGoCollator returns a Collator for locale that is backed by the collate
// package. The returned error is always nil.
func newGoCollator(locale string) (Collator, error) {
	gc := new(goCollator)
	gc.c = collate.New(locale)
	return gc, nil
}
// Key returns the sort key for the UTF-8 form of b, computed by the wrapped
// collator using c's buffer.
func (c *goCollator) Key(b Input) []byte {
	key := c.c.Key(&c.buf, b.UTF8)
	return key
}
// Compare compares the UTF-8 forms of a and b with the wrapped collator,
// using c's buffer, and returns its result.
func (c *goCollator) Compare(a, b Input) int {
	result := c.c.Compare(&c.buf, a.UTF8, b.UTF8)
	return result
}

View File

@ -0,0 +1,528 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"bytes"
"exp/norm"
"flag"
"fmt"
"io"
"log"
"os"
"runtime/pprof"
"sort"
"strconv"
"strings"
"text/template"
"time"
)
// Command line flags shared by all colcmp commands.
var (
	doNorm  = flag.Bool("norm", false, "normalize input strings")
	cases   = flag.Bool("case", false, "generate case variants")
	verbose = flag.Bool("verbose", false, "print results")
	debug   = flag.Bool("debug", false, "output debug information")
	locale  = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.")
	col     = flag.String("col", "go", "collator to test")
	gold    = flag.String("gold", "go", "collator used as the gold standard")
	// usecmp is a plain boolean switch; the help text must not advertise
	// string values such as "test" or "gold", which the flag cannot accept.
	usecmp     = flag.Bool("usecmp", false, "use comparison instead of sort keys when sorting")
	cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
	exclude    = flag.String("exclude", "", "exclude errors that contain any of the characters")
	limit      = flag.Int("limit", 5000000, "maximum number of samples to generate for one run")
)
// failOnError does nothing when err is nil; otherwise it logs err and
// panics by way of log.Panic.
func failOnError(err error) {
	if err == nil {
		return
	}
	log.Panic(err)
}
// Test holds test data for testing a locale-collator pair.
// Test also provides functionality that is commonly used by the various commands.
type Test struct {
ctxt *Context // the Context this test belongs to
Name string // printed as prefix of status messages in debug/verbose mode
Locale string // locale the collator was created for
ColName string // name of the collator implementation
Col Collator // the collator under test
UseCompare bool // sort with Col.Compare instead of sort keys
Input []Input // the strings to sort
Duration time.Duration // total time accumulated by all Start/Stop pairs
start time.Time // time of the last call to Start
msg string // message passed to the last call to Start
count int // number of operations since the last call to Start
}
// clear drops t's references to its collator and its input.
func (t *Test) clear() {
	t.Col, t.Input = nil, nil
}
// Status messages passed to SetStatus and Start for the phases of a test.
const (
msgGeneratingInput = "generating input"
msgGeneratingKeys = "generating keys"
msgSorting = "sorting"
)
// lastLen is the number of characters written by the most recent in-place
// status message.
var lastLen = 0

// SetStatus reports what t is currently doing. In debug or verbose mode the
// message is printed on a line of its own. Otherwise, if the context has an
// output writer, the previous status is blanked out with spaces and the new
// one is written over it, using backspaces to return to the start position.
func (t *Test) SetStatus(msg string) {
	if *debug || *verbose {
		fmt.Printf("%s: %s...\n", t.Name, msg)
		return
	}
	out := t.ctxt.out
	if out == nil {
		return
	}
	// Erase the previous message and move back to where it started.
	fmt.Fprint(out, strings.Repeat(" ", lastLen))
	fmt.Fprint(out, strings.Repeat("\b", lastLen))
	fmt.Fprint(out, msg, "...")
	lastLen = len(msg) + 3 // the message plus the three trailing dots
	fmt.Fprint(out, strings.Repeat("\b", lastLen))
}
// Start marks the beginning of an operation described by msg: it shows the
// status, resets the operation counter and records the message and the
// current time for the matching call to Stop.
func (t *Test) Start(msg string) {
	t.SetStatus(msg)
	t.count, t.msg = 0, msg
	t.start = time.Now()
}
// Stop marks the end of the operation begun by Start. It adds the elapsed
// time to t.Duration and returns that elapsed time together with the
// operation count. In debug or verbose mode a summary line is printed.
func (t *Test) Stop() (time.Duration, int) {
	elapsed := time.Since(t.start)
	t.Duration += elapsed
	if *debug || *verbose {
		fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, elapsed.Seconds(), t.count/1000)
	}
	return elapsed, t.count
}
// generateKeys asks the collator for a sort key for every input and stores
// it in the input's key field. In debug mode each string is printed with
// its UTF-16 form and its key.
func (t *Test) generateKeys() {
	for i := range t.Input {
		in := t.Input[i]
		key := t.Col.Key(in)
		t.Input[i].key = key
		if *debug {
			fmt.Printf("%s (%X): %X\n", string(in.UTF8), in.UTF16, key)
		}
	}
}
// Sort sorts the inputs. It generates sort keys if this is required by the
// chosen sort method.
//
// It returns the time spent and number of operations for key generation
// (tkey, nkey) and for sorting (tsort, nsort). When the inputs are sorted
// with the collator's Compare method, no keys are generated and tkey and
// nkey are zero. An empty input yields all zeros.
//
// If the cpuprofile flag is set, a CPU profile covering the call is written
// to the named file.
func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) {
	if len(t.Input) == 0 {
		// Nothing to sort. This also protects the t.Input[0] probe below.
		return
	}
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		failOnError(err)
		defer f.Close()
		failOnError(pprof.StartCPUProfile(f))
		// Deferred calls run last-in first-out: profiling stops before
		// the file is closed.
		defer pprof.StopCPUProfile()
	}
	// Fall back to comparisons when the collator does not support keys.
	if t.UseCompare || t.Col.Key(t.Input[0]) == nil {
		t.Start(msgSorting)
		// testCompare embeds a copy of *t, so its Less method counts
		// comparisons on the copy. Copy the count back so Stop reports it.
		// The copy shares t.Input's backing array, so t.Input ends up sorted.
		tc := &testCompare{*t}
		sort.Sort(tc)
		t.count = tc.count
		tsort, nsort = t.Stop()
		return
	}
	t.Start(msgGeneratingKeys)
	t.generateKeys()
	t.count = len(t.Input)
	tkey, nkey = t.Stop()
	t.Start(msgSorting)
	sort.Sort(t)
	tsort, nsort = t.Stop()
	return
}
// Swap exchanges the inputs at positions a and b. It is part of sort.Interface.
func (t *Test) Swap(a, b int) {
	in := t.Input
	in[a], in[b] = in[b], in[a]
}
// Less reports whether the sort key of input a orders before that of input
// b, comparing the keys bytewise. Every call is counted in t.count.
func (t *Test) Less(a, b int) bool {
	t.count++
	ka, kb := t.Input[a].key, t.Input[b].key
	return bytes.Compare(ka, kb) < 0
}
// Len returns the number of inputs. It is part of sort.Interface.
func (t Test) Len() int {
	n := len(t.Input)
	return n
}
// testCompare is a Test whose Less method orders inputs with the collator's
// Compare method instead of with sort keys.
type testCompare struct {
	Test
}

// Less reports whether the collator returns -1 when comparing input a with
// input b. Every call is counted in the embedded Test's count.
func (t *testCompare) Less(a, b int) bool {
	t.count++
	result := t.Col.Compare(t.Input[a], t.Input[b])
	return result == -1
}
// testRestore is a Test whose Less method orders inputs by their index
// field.
type testRestore struct {
	Test
}

// Less reports whether input a has a smaller index than input b.
func (t *testRestore) Less(a, b int) bool {
	ia, ib := t.Input[a].index, t.Input[b].index
	return ia < ib
}
// GenerateInput sets t.Input to the input phrases for t's locale. If the
// context's cached input was generated for the same locale it is reused:
// the keys are cleared and the inputs are sorted by their index field.
// Otherwise new input is generated and cached in the context.
func (t *Test) GenerateInput() {
	t.Input = nil
	if t.ctxt.lastLocale == t.Locale {
		t.Input = t.ctxt.lastInput
		for i := range t.Input {
			t.Input[i].key = nil
		}
		sort.Sort(&testRestore{*t})
		return
	}
	var gen phraseGenerator
	gen.init(t.Locale)
	t.SetStatus(msgGeneratingInput)
	t.ctxt.lastInput = nil // allow the previous value to be garbage collected.
	t.Input = gen.generate(*doNorm)
	t.ctxt.lastInput = t.Input
	t.ctxt.lastLocale = t.Locale
}
// Context holds all tests and settings translated from command line options.
type Context struct {
test []*Test // the tests, in the order created by parseTests
last *Test // test most recently returned by Test; cleared on the next call
lastLocale string // locale for which lastInput was generated
lastInput []Input // input cached by Test.GenerateInput for reuse
out io.Writer // output destination; set on demand by assertBuf
}
// Printf formats its arguments as fmt.Fprintf does and writes the result to
// the context's output, setting the output up first if necessary.
func (ts *Context) Printf(format string, a ...interface{}) {
	ts.assertBuf()
	w := ts.out
	fmt.Fprintf(w, format, a...)
}

// Print formats its arguments as fmt.Fprint does and writes the result to
// the context's output, setting the output up first if necessary.
func (ts *Context) Print(a ...interface{}) {
	ts.assertBuf()
	w := ts.out
	fmt.Fprint(w, a...)
}
// assertBuf makes sure ts.out is set. An existing writer is left alone.
// In debug and verbose mode a bytes.Buffer is installed, so that the regular
// output does not get mixed up with the additional output; otherwise output
// goes directly to stdout for a more responsive feel.
func (ts *Context) assertBuf() {
	switch {
	case ts.out != nil:
		return
	case *debug || *verbose:
		ts.out = new(bytes.Buffer)
	default:
		ts.out = os.Stdout
	}
}
// flush copies buffered output to stdout. Output is only buffered when
// ts.out is a *bytes.Buffer (installed by assertBuf in debug and verbose
// mode); for any other writer, including os.Stdout and an unset writer,
// flush does nothing. A failure to write to stdout is logged.
func (ts *Context) flush() {
	// The two-value assertion is false for a nil ts.out as well, and, unlike
	// an unchecked assertion to io.Reader, it cannot panic.
	buf, ok := ts.out.(*bytes.Buffer)
	if !ok {
		return
	}
	if _, err := io.Copy(os.Stdout, buf); err != nil {
		log.Printf("flushing output: %v", err)
	}
}
// parseTests builds a Context holding one Test for every combination of the
// comma-separated values of the locale and col flags. Surrounding white
// space of each value is trimmed. Tests are ordered by locale first and by
// collator second.
func parseTests() *Context {
	ctxt := new(Context)
	names := strings.Split(*col, ",")
	locales := strings.Split(*locale, ",")
	for _, loc := range locales {
		loc = strings.TrimSpace(loc)
		for _, name := range names {
			name = strings.TrimSpace(name)
			t := &Test{
				ctxt:       ctxt,
				Locale:     loc,
				ColName:    name,
				UseCompare: *usecmp,
				Col:        getCollator(name, loc),
			}
			ctxt.test = append(ctxt.test, t)
		}
	}
	return ctxt
}
// Len returns the number of tests held by c.
func (c *Context) Len() int {
	n := len(c.test)
	return n
}
// Test returns the ith test and remembers it as the last test handed out.
// The test returned by the previous call, if any, is cleared first.
func (c *Context) Test(i int) *Test {
	if prev := c.last; prev != nil {
		prev.clear()
	}
	next := c.test[i]
	c.last = next
	return next
}
// parseInput converts command line arguments to Inputs. Each argument is
// decoded character by character with strconv.UnquoteChar, using the single
// quote as quote character, so that Go escape sequences such as \u00E9 can
// be used. If the norm flag is set, the decoded string is normalized to NFC.
//
// The program is terminated with an error message if an argument contains
// an invalid escape sequence. UnquoteChar returns an empty tail on error,
// so ignoring the error would silently drop the rest of the argument and
// append a NUL character instead.
func parseInput(args []string) []Input {
	input := make([]Input, 0, len(args))
	for _, arg := range args {
		rs := make([]rune, 0, len(arg))
		for rest := arg; len(rest) > 0; {
			r, _, tail, err := strconv.UnquoteChar(rest, '\'')
			if err != nil {
				log.Fatalf("invalid character or escape sequence in %q at %q: %v", arg, rest, err)
			}
			rs = append(rs, r)
			rest = tail
		}
		s := string(rs)
		if *doNorm {
			s = norm.NFC.String(s)
		}
		input = append(input, makeInputString(s))
	}
	return input
}
// A Command is an implementation of a colcmp command.
type Command struct {
Run func(cmd *Context, args []string) // executes the command
Usage string // usage line; its first word is the command name (see Name)
Short string // one-line description shown in the command overview
Long string // detailed description shown by "help <command>"
}
// Name returns the name of the command, which is the part of the usage
// line before the first space, or the whole line if it has no space.
func (cmd Command) Name() string {
	usage := cmd.Usage
	if i := strings.Index(usage, " "); i >= 0 {
		return usage[:i]
	}
	return usage
}
// commands lists the available commands in the order in which they are
// shown in the help overview.
var commands = []*Command{
cmdSort,
cmdBench,
cmdRegress,
}
// sortHelp is the detailed help text of the sort command.
const sortHelp = `
Sort sorts a given list of strings. Strings are separated by whitespace.
`
// cmdSort is the sort command; it is implemented by runSort.
var cmdSort = &Command{
Run: runSort,
Usage: "sort <string>*",
Short: "sort a given list of strings",
Long: sortHelp,
}
// runSort implements the sort command. It sorts the strings given as
// arguments once for every test in ctxt and prints each sorted list on a
// line of its own. If there is more than one test, a header is printed and
// every line starts with the collator name and the locale. The program is
// terminated if there are no strings to sort.
func runSort(ctxt *Context, args []string) {
	input := parseInput(args)
	if len(input) == 0 {
		log.Fatalf("Nothing to sort.")
	}
	multi := ctxt.Len() > 1
	if multi {
		ctxt.Print("COLL LOCALE RESULT\n")
	}
	for i, n := 0, ctxt.Len(); i < n; i++ {
		t := ctxt.Test(i)
		t.Input = append(t.Input, input...)
		t.Sort()
		if multi {
			ctxt.Printf("%-5s %-5s ", t.ColName, t.Locale)
		}
		for k := range t.Input {
			ctxt.Print(string(t.Input[k].UTF8), " ")
		}
		ctxt.Print("\n")
	}
}
// benchHelp is the detailed help text of the bench command.
const benchHelp = `
Bench runs a benchmark for the given list of collator implementations.
If no collator implementations are given, the go collator will be used.
`
// cmdBench is the bench command; it is implemented by runBench.
var cmdBench = &Command{
Run: runBench,
Usage: "bench",
Short: "benchmark a given list of collator implementations",
Long: benchHelp,
}
// runBench implements the bench command. For every test in ctxt it
// generates the input for the test's locale, sorts it and prints one table
// row with the locale, the collator name, the number of inputs in thousands,
// the time spent on generating keys and on sorting, the average key length
// and the total time. In debug mode the sorted strings are printed as well.
// The arguments are not used.
func runBench(ctxt *Context, args []string) {
	ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL")
	// report prints one timing column: the duration in seconds followed by
	// the time per operation. The column is left blank if no time was spent.
	report := func(dur time.Duration, n int) {
		s := ""
		if dur > 0 {
			s = fmt.Sprintf("%6.3fs ", dur.Seconds())
			if n > 0 {
				s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n)))
			}
		}
		ctxt.Printf("%-24s ", s)
	}
	for i := 0; i < ctxt.Len(); i++ {
		t := ctxt.Test(i)
		ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName)
		t.GenerateInput()
		ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000))
		tkey, tsort, nkey, nsort := t.Sort()
		report(tkey, nkey)
		report(tsort, nsort)
		total := 0
		for _, s := range t.Input {
			total += len(s.key)
		}
		// Guard against a division by zero when no input was generated.
		avg := 0
		if n := t.Len(); n > 0 {
			avg = total / n
		}
		ctxt.Printf("%-5d ", avg)
		ctxt.Printf("%6.3fs\n", t.Duration.Seconds())
		if *debug {
			for _, s := range t.Input {
				fmt.Print(string(s.UTF8), " ")
			}
			fmt.Println()
		}
	}
}
// regressHelp is the detailed help text of the regress command.
const regressHelp = `
Regress runs a monkey test by comparing the results of randomly generated tests
between two implementations of a collator. The user may optionally pass a list
of strings to regress against instead of the default test set.
`

// cmdRegress is the regress command; it is implemented by runRegress.
// The collator under test is selected with the col flag and the reference
// implementation with the gold flag; there is no flag named test.
var cmdRegress = &Command{
	Run:   runRegress,
	Usage: "regress -gold=<col> -col=<col> [string]*",
	Short: "run a monkey test between two collators",
	Long:  regressHelp,
}
// failedKeyCompare is the format of the report for a pair of inputs whose
// sort keys compare differently from the gold standard. It takes eleven
// arguments: the position, then for each of a and b the string, its runes
// and its key, then the result, the wanted result and the gold keys of a
// and b.
const failedKeyCompare = `
%d: incorrect comparison result for input:
a: %q (%.4X)
key: %s
b: %q (%.4X)
key: %s
Compare(a, b) = %d; want %d.
gold keys:
a: %s
b: %s
`
// failedCompare is the format of the report for a pair of inputs for which
// Compare disagrees with the gold standard. It takes seven arguments: the
// position, then for each of a and b the string and its runes, then the
// result and the wanted result.
const failedCompare = `
%d: incorrect comparison result for input:
a: %q (%.4X)
b: %q (%.4X)
Compare(a, b) = %d; want %d.
`
// keyStr formats b as a sequence of two-digit upper case hexadecimal
// numbers, each followed by a space.
func keyStr(b []byte) string {
	var out bytes.Buffer
	for i := range b {
		out.WriteString(fmt.Sprintf("%.2X ", b[i]))
	}
	return out.String()
}
// runRegress implements the regress command. For every test in ctxt it
// sorts the input (the strings given as arguments or, if there are none,
// generated input) with the collator under test and then checks each pair
// of adjacent inputs against the collator selected by the gold flag:
//
//   - if sort keys were generated, the bytewise comparison of the keys must
//     equal the result of the gold collator's Compare;
//   - otherwise, or if the keys agree, the result of the tested collator's
//     Compare must equal it.
//
// Every mismatch is printed. Pairs are skipped if one of the strings
// contains a character listed in the exclude flag.
func runRegress(ctxt *Context, args []string) {
	input := parseInput(args)
	for i := 0; i < ctxt.Len(); i++ {
		t := ctxt.Test(i)
		if len(input) > 0 {
			t.Input = append(t.Input, input...)
		} else {
			t.GenerateInput()
		}
		t.Sort()
		count := 0
		goldCol := getCollator(*gold, t.Locale)
		for i := 1; i < len(t.Input); i++ {
			ia := t.Input[i-1]
			ib := t.Input[i]
			if bytes.IndexAny(ib.UTF8, *exclude) != -1 {
				// ib is also the first element of the next pair; skip that too.
				i++
				continue
			}
			if bytes.IndexAny(ia.UTF8, *exclude) != -1 {
				continue
			}
			goldCmp := goldCol.Compare(ia, ib)
			a := string(ia.UTF8)
			b := string(ib.UTF8)
			// Keys are missing if the input was sorted using Compare. Comparing
			// missing keys always yields 0 and would be reported as a failure.
			haveKeys := ia.key != nil && ib.key != nil
			if cmp := bytes.Compare(ia.key, ib.key); haveKeys && cmp != goldCmp {
				count++
				fmt.Printf(failedKeyCompare, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(goldCol.Key(ia)), keyStr(goldCol.Key(ib)))
			} else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp {
				count++
				// failedCompare, not failedKeyCompare: there are no keys to
				// print here and the argument list matches the shorter format.
				fmt.Printf(failedCompare, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp)
			}
		}
		if count > 0 {
			ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1)
		}
	}
}
// helpTemplate is the text/template for the command overview. It is
// executed with the list of commands. The hint at the end names the tool
// as it is invoked, colcmp.
const helpTemplate = `
colcmp is a tool for testing and benchmarking collation
Usage: colcmp command [arguments]
The commands are:
{{range .}}
{{.Name | printf "%-11s"}} {{.Short}}{{end}}
Use "colcmp help [topic]" for more information about that topic.
`

// detailedHelpTemplate is the text/template for the help on one command.
// It is executed with a *Command and relies on a template function named
// trim.
const detailedHelpTemplate = `
Usage: colcmp {{.Usage}}
{{.Long | trim}}
`
func runHelp(args []string) {
t := template.New("help")
t.Funcs(template.FuncMap{"trim": strings.TrimSpace})
if len(args) < 1 {
template.Must(t.Parse(helpTemplate))
failOnError(t.Execute(os.Stderr, &commands))
} else {
for _, cmd := range commands {
if cmd.Name() == args[0] {
template.Must(t.Parse(detailedHelpTemplate))
failOnError(t.Execute(os.Stderr, cmd))
os.Exit(0)
}
}
log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0])
}
os.Exit(0)
}
func main() {
flag.Parse()
log.SetFlags(0)
ctxt := parseTests()
if flag.NArg() < 1 {
runHelp(nil)
}
args := flag.Args()[1:]
if flag.Arg(0) == "help" {
runHelp(args)
}
for _, cmd := range commands {
if cmd.Name() == flag.Arg(0) {
cmd.Run(ctxt, args)
ctxt.flush()
return
}
}
runHelp(flag.Args())
}

View File

@ -0,0 +1,111 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin
package main
/*
#cgo LDFLAGS: -framework CoreFoundation
#include <CoreFoundation/CFBase.h>
#include <CoreFoundation/CoreFoundation.h>
*/
import "C"
import (
"unsafe"
)
// init registers the two CoreFoundation based collators: "osx", which
// compares the UTF-16 form of the input, and "osx8", which compares the
// UTF-8 form.
func init() {
	AddFactory(CollatorFactory{
		name:        "osx",
		makeFn:      newOSX16Collator,
		description: "OS X/Darwin collator, using native strings.",
	})
	AddFactory(CollatorFactory{
		name:        "osx8",
		makeFn:      newOSX8Collator,
		description: "OS X/Darwin collator for UTF-8.",
	})
}
// osxUInt8P returns a C pointer to the first byte of s.
// s must not be empty.
func osxUInt8P(s []byte) *C.UInt8 {
	first := unsafe.Pointer(&s[0])
	return (*C.UInt8)(first)
}

// osxCharP returns a C pointer to the first UTF-16 code unit of s.
// s must not be empty.
func osxCharP(s []uint16) *C.UniChar {
	first := unsafe.Pointer(&s[0])
	return (*C.UniChar)(first)
}
// osxCollator implements a Collator based on OS X's CoreFoundation.
type osxCollator struct {
loc C.CFLocaleRef // locale passed to the comparison function; set by init
opt C.CFStringCompareFlags // comparison flags passed to the comparison function
}
// init creates the CoreFoundation locale for the given locale identifier
// and stores it in c.loc. locale must not be empty.
func (c *osxCollator) init(locale string) {
	l := C.CFStringCreateWithBytes(
		nil,
		osxUInt8P([]byte(locale)),
		C.CFIndex(len(locale)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
	)
	// The identifier string was obtained from a Create function, so it is
	// owned by this function and must be released once the locale has been
	// created.
	defer C.CFRelease(C.CFTypeRef(l))
	c.loc = C.CFLocaleCreate(nil, l)
}
// newOSX8Collator returns a CoreFoundation based Collator for locale that
// compares the UTF-8 form of its input. The returned error is always nil.
func newOSX8Collator(locale string) (Collator, error) {
	col := new(osx8Collator)
	col.init(locale)
	return col, nil
}

// newOSX16Collator returns a CoreFoundation based Collator for locale that
// compares the UTF-16 form of its input. The returned error is always nil.
func newOSX16Collator(locale string) (Collator, error) {
	col := new(osx16Collator)
	col.init(locale)
	return col, nil
}
// Key always returns nil: sort keys are not supported by OS X CoreFoundation.
func (c osxCollator) Key(s Input) []byte {
	var none []byte
	return none
}
// osx8Collator is an osxCollator whose Compare method uses the UTF-8 form
// of the input.
type osx8Collator struct {
osxCollator
}
// osx16Collator is an osxCollator whose Compare method uses the UTF-16 form
// of the input.
type osx16Collator struct {
osxCollator
}
// Compare compares the UTF-16 forms of a and b with CoreFoundation, using
// the collator's locale and options, and returns the result converted to
// an int. Neither input may be empty.
//
// The CFStrings wrap the Go slices without copying them. They are released
// before Compare returns; without that, two CFString objects would be
// leaked on every comparison.
func (c osx16Collator) Compare(a, b Input) int {
	sa := C.CFStringCreateWithCharactersNoCopy(
		nil,
		osxCharP(a.UTF16),
		C.CFIndex(len(a.UTF16)),
		C.kCFAllocatorNull,
	)
	defer C.CFRelease(C.CFTypeRef(sa))
	sb := C.CFStringCreateWithCharactersNoCopy(
		nil,
		osxCharP(b.UTF16),
		C.CFIndex(len(b.UTF16)),
		C.kCFAllocatorNull,
	)
	defer C.CFRelease(C.CFTypeRef(sb))
	// Compare over the full length of the first string.
	rng := C.CFRangeMake(0, C.CFStringGetLength(sa))
	return int(C.CFStringCompareWithOptionsAndLocale(sa, sb, rng, c.opt, c.loc))
}
// Compare compares the UTF-8 forms of a and b with CoreFoundation, using
// the collator's locale and options, and returns the result converted to
// an int. Neither input may be empty.
//
// The CFStrings are created from the Go slices with the NoCopy variant of
// the constructor. They are released before Compare returns; without that,
// two CFString objects would be leaked on every comparison.
func (c osx8Collator) Compare(a, b Input) int {
	sa := C.CFStringCreateWithBytesNoCopy(
		nil,
		osxUInt8P(a.UTF8),
		C.CFIndex(len(a.UTF8)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
		C.kCFAllocatorNull,
	)
	defer C.CFRelease(C.CFTypeRef(sa))
	sb := C.CFStringCreateWithBytesNoCopy(
		nil,
		osxUInt8P(b.UTF8),
		C.CFIndex(len(b.UTF8)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
		C.kCFAllocatorNull,
	)
	defer C.CFRelease(C.CFTypeRef(sb))
	// Compare over the full length of the first string.
	rng := C.CFRangeMake(0, C.CFStringGetLength(sa))
	return int(C.CFStringCompareWithOptionsAndLocale(sa, sb, rng, c.opt, c.loc))
}

View File

@ -0,0 +1,179 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"exp/norm"
"math"
"math/rand"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
// parent returns the parent of locale, which is locale with its last
// underscore-separated element removed, or "root" if locale contains no
// underscore. The locale "root" has no parent; ok is false in that case.
func parent(locale string) (parent string, ok bool) {
	if locale == "root" {
		return "", false
	}
	i := strings.LastIndex(locale, "_")
	if i < 0 {
		return "root", true
	}
	return locale[:i], true
}
// rewriter is used to both unique strings and create variants of strings
// to add to the test set.
type rewriter struct {
	seen     map[string]bool // strings that have been added by insert so far
	addCases bool            // also add case variants of the first character
}

// newRewriter returns a rewriter that has not seen any strings and that
// does not add case variants.
func newRewriter() *rewriter {
	return &rewriter{
		seen: make(map[string]bool),
	}
}

// insert appends s to a, unless s has been inserted before through r, and
// returns the resulting slice.
func (r *rewriter) insert(a []string, s string) []string {
	if !r.seen[s] {
		r.seen[s] = true
		a = append(a, s)
	}
	return a
}

// rewrite takes a sequence of strings in, adds variants of these strings
// based on options and removes duplicates. Duplicates are detected across
// all calls to rewrite on the same rewriter.
//
// If addCases is set, each string is followed by its variants in which the
// first character is replaced by each of the other characters that are
// equal to it under simple case folding. An empty string has no variants.
func (r *rewriter) rewrite(ss []string) []string {
	ns := []string{}
	for _, s := range ss {
		ns = r.insert(ns, s)
		if !r.addCases {
			continue
		}
		rs := []rune(s)
		if len(rs) == 0 {
			// No first character to fold; indexing rs[0] would panic.
			continue
		}
		rn := rs[0]
		// SimpleFold cycles through the case variants and ends up at rn again.
		for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
			rs[0] = c
			ns = r.insert(ns, string(rs))
		}
	}
	return ns
}
// exemplarySet holds a parsed set of characters from the exemplarCharacters table.
type exemplarySet struct {
typ exemplarType
set []string // the phrases of the set
charIndex int // cumulative total of phrases, including this set
}
// phraseGenerator generates test input from the exemplar sets of a locale.
// It must be initialized with init before use.
type phraseGenerator struct {
sets [exN]exemplarySet // one set per index below exN; filled by init
n int // total number of phrases in sets; may be lowered by generate
}
// init loads the exemplar sets for locale. A set that is not defined for
// the locale itself is taken from the nearest parent locale that defines
// it. The phrases of each set are then passed through a rewriter, which
// removes duplicates and, if the case flag is set, adds case variants.
// Finally the cumulative phrase counts of the sets and the total number of
// phrases are computed.
func (g *phraseGenerator) init(locale string) {
	ec := exemplarCharacters
	// get sets for locale or parent locale if the set is not defined.
	for i := range g.sets {
		for loc, more := locale, true; more; loc, more = parent(loc) {
			set, found := ec[loc]
			if found && set[i] != "" {
				g.sets[i].set = strings.Split(set[i], " ")
				break
			}
		}
	}
	rw := newRewriter()
	rw.addCases = *cases
	for i := range g.sets {
		g.sets[i].set = rw.rewrite(g.sets[i].set)
	}
	// compute indexes
	for i := range g.sets {
		g.n += len(g.sets[i].set)
		g.sets[i].charIndex = g.n
	}
}
// phrase returns the ith phrase, where i < g.n, counting through the sets
// in order. It panics if i is not below the total number of phrases.
func (g *phraseGenerator) phrase(i int) string {
	for k := range g.sets {
		s := &g.sets[k]
		if i >= s.charIndex {
			continue
		}
		// first is the overall index of the first phrase of this set.
		first := s.charIndex - len(s.set)
		return s.set[i-first]
	}
	panic("index out of range")
}
// generate generates inputs by combining all pairs of exemplar strings.
// For each of the first g.n phrases it produces the phrase itself followed
// by its concatenation with each of the first g.n phrases, and it returns
// these inputs in random order. Each input records its position in that
// order in its index field.
// If doNorm is true, all input strings are normalized to NFC.
// The number of phrases used is limited to the square root of the limit
// flag; g.n is lowered accordingly.
// TODO: allow other variations, statistical models, and random
// trailing sequences.
func (g *phraseGenerator) generate(doNorm bool) []Input {
	const (
		M         = 1024 * 1024
		buf8Size  = 30 * M
		buf16Size = 10 * M
	)
	// TODO: use a better way to limit the input size.
	if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
		g.n = sq
	}
	// Every phrase contributes itself plus g.n combinations. Sizing the
	// slice for g.n*g.n only would force a reallocation of the whole slice.
	size := g.n * (g.n + 1)
	a := make([]Input, 0, size)
	// The strings of all inputs are carved out of large shared buffers
	// instead of being allocated one by one.
	buf8 := make([]byte, 0, buf8Size)
	buf16 := make([]uint16, 0, buf16Size)

	addInput := func(str string) {
		// Continue right after the data of the previous input.
		buf8 = buf8[len(buf8):]
		buf16 = buf16[len(buf16):]
		if len(str) > cap(buf8) {
			buf8 = make([]byte, 0, buf8Size)
		}
		if len(str) > cap(buf16) {
			buf16 = make([]uint16, 0, buf16Size)
		}
		if doNorm {
			buf8 = norm.NFC.AppendString(buf8, str)
		} else {
			buf8 = append(buf8, str...)
		}
		buf16 = appendUTF16(buf16, buf8)
		a = append(a, makeInput(buf8, buf16))
	}
	for i := 0; i < g.n; i++ {
		p1 := g.phrase(i)
		addInput(p1)
		for j := 0; j < g.n; j++ {
			p2 := g.phrase(j)
			addInput(p1 + p2)
		}
	}
	// permutate
	rnd := rand.New(rand.NewSource(int64(rand.Int())))
	for i := range a {
		j := i + rnd.Intn(len(a)-i)
		a[i], a[j] = a[j], a[i]
		a[i].index = i // allow restoring this order if input is used multiple times.
	}
	return a
}
// appendUTF16 appends the UTF-16 encoding of the UTF-8 encoded text s to
// buf and returns the extended slice. Runes for which utf16.EncodeRune
// yields a surrogate pair are appended as that pair; all other runes are
// appended as a single code unit.
func appendUTF16(buf []uint16, s []byte) []uint16 {
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRune(s[i:])
		i += sz
		// EncodeRune returns U+FFFD for runes that need no surrogate pair.
		if hi, lo := utf16.EncodeRune(r); hi != 0xFFFD {
			buf = append(buf, uint16(hi), uint16(lo))
			continue
		}
		buf = append(buf, uint16(r))
	}
	return buf
}

View File

@ -0,0 +1,209 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build icu
package main
/*
#cgo LDFLAGS: -licui18n -licuuc
#include <stdlib.h>
#include <unicode/ucol.h>
#include <unicode/uiter.h>
#include <unicode/utypes.h>
*/
import "C"
import (
"fmt"
"log"
"unicode/utf16"
"unicode/utf8"
"unsafe"
)
// init registers the three ICU based collators: "icu" uses the UTF-16 form
// of the input, "icu8" feeds the UTF-8 form to ICU through iterators and
// "icu16" converts the UTF-8 form to UTF-16 on every call.
func init() {
	AddFactory(CollatorFactory{
		name:        "icu",
		makeFn:      newUTF16,
		description: "Main ICU collator, using native strings.",
	})
	AddFactory(CollatorFactory{
		name:        "icu8",
		makeFn:      newUTF8iter,
		description: "ICU collator using ICU iterators to process UTF8.",
	})
	AddFactory(CollatorFactory{
		name:        "icu16",
		makeFn:      newUTF8conv,
		description: "ICU collation by first converting UTF8 to UTF16.",
	})
}
// icuCharP returns a C char pointer to the first byte of s.
// s must not be empty.
func icuCharP(s []byte) *C.char {
	first := unsafe.Pointer(&s[0])
	return (*C.char)(first)
}

// icuUInt8P returns a C uint8_t pointer to the first byte of s.
// s must not be empty.
func icuUInt8P(s []byte) *C.uint8_t {
	first := unsafe.Pointer(&s[0])
	return (*C.uint8_t)(first)
}

// icuUCharP returns a C UChar pointer to the first code unit of s.
// s must not be empty.
func icuUCharP(s []uint16) *C.UChar {
	first := unsafe.Pointer(&s[0])
	return (*C.UChar)(first)
}

// icuULen returns the number of code units of s as a C int32_t.
func icuULen(s []uint16) C.int32_t {
	n := len(s)
	return C.int32_t(n)
}

// icuSLen returns the number of bytes of s as a C int32_t.
func icuSLen(s []byte) C.int32_t {
	n := len(s)
	return C.int32_t(n)
}
// icuCollator implements a Collator based on ICU.
type icuCollator struct {
loc *C.char // C copy of the locale name; freed by Close
col *C.UCollator // the ICU collator; closed by Close
keyBuf []byte // buffer out of which the returned sort keys are sliced
}
// growBufSize is the capacity with which keyBuf is allocated. It is also the
// upper limit for the size of a single sort key (see extendBuf).
const growBufSize = 10 * 1024 * 1024
// init opens the ICU collator for locale and allocates the key buffer.
// It returns an error if ICU reports a failure. If ICU reports that it is
// using the default or a fallback collator instead, a warning naming the
// locale that is actually used is logged.
func (c *icuCollator) init(locale string) error {
	err := C.UErrorCode(0)
	c.loc = C.CString(locale)
	c.col = C.ucol_open(c.loc, &err)
	if err > 0 {
		// Do not leak the C copy of the locale name when opening fails.
		C.free(unsafe.Pointer(c.loc))
		c.loc = nil
		return fmt.Errorf("failed opening collator for %q", locale)
	}
	if err < 0 {
		// Negative codes are warnings. Look the code up before passing a
		// status variable to another ICU call, which may modify it.
		warnings := map[int]string{
			-127: "warning: using default collator: %s",
			-128: "warning: using fallback collator: %s",
		}
		if format, ok := warnings[int(err)]; ok {
			status := C.UErrorCode(0)
			loc := C.ucol_getLocaleByType(c.col, 0, &status)
			log.Printf(format, C.GoString(loc))
		}
	}
	c.keyBuf = make([]byte, 0, growBufSize)
	return nil
}
// buf returns a pointer to the unused part of the key buffer and the size
// of that part. A completely filled buffer is replaced by a new one first,
// so the unused part is never empty.
func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
	used, size := len(c.keyBuf), cap(c.keyBuf)
	if used == size {
		c.keyBuf = make([]byte, 0, growBufSize)
		used, size = 0, cap(c.keyBuf)
	}
	free := c.keyBuf[used:size]
	return icuUInt8P(free), icuSLen(free)
}
// extendBuf marks the next n bytes of the key buffer as used and returns
// them. If the buffer has less than n unused bytes, it is replaced by a new
// empty buffer and nil is returned, upon which the caller has to generate
// the key again. The program is terminated if n bytes do not even fit in an
// empty buffer.
func (c *icuCollator) extendBuf(n C.int32_t) []byte {
	start := len(c.keyBuf)
	end := start + int(n)
	if end <= cap(c.keyBuf) {
		c.keyBuf = c.keyBuf[:end]
		return c.keyBuf[start:end]
	}
	if start == 0 {
		log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
	}
	c.keyBuf = make([]byte, 0, growBufSize)
	return nil
}
// Close closes the ICU collator and frees the C copy of the locale name.
// It always returns nil.
func (c *icuCollator) Close() error {
	C.ucol_close(c.col)
	loc := unsafe.Pointer(c.loc)
	C.free(loc)
	return nil
}
// icuUTF16 implements the Collator interface by passing the UTF-16 form of
// the input directly to ICU.
type icuUTF16 struct {
	icuCollator
}

// newUTF16 returns an icuUTF16 collator for locale together with the error,
// if any, reported by its initialization.
func newUTF16(locale string) (Collator, error) {
	col := new(icuUTF16)
	err := col.init(locale)
	return col, err
}
// Compare compares the UTF-16 forms of a and b with ucol_strcoll and
// returns its result as an int. Neither input may be empty.
func (c *icuUTF16) Compare(a, b Input) int {
	result := C.ucol_strcoll(
		c.col,
		icuUCharP(a.UTF16), icuULen(a.UTF16),
		icuUCharP(b.UTF16), icuULen(b.UTF16),
	)
	return int(result)
}
// Key returns the ICU sort key for the UTF-16 form of s. The key is written
// into the unused part of the key buffer. If it did not fit, extendBuf has
// installed a new buffer and the key is generated once more.
func (c *icuUTF16) Key(s Input) []byte {
	for {
		bp, bn := c.buf()
		n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
		if key := c.extendBuf(n); key != nil {
			return key
		}
	}
}
// icuUTF8iter implements the Collator interface.
// It wraps the UTF-8 form of the input in ICU character iterators, which
// are then passed to the collator.
type icuUTF8iter struct {
	icuCollator
	a, b C.UCharIterator
}

// newUTF8iter returns an icuUTF8iter collator for locale together with the
// error, if any, reported by its initialization.
func newUTF8iter(locale string) (Collator, error) {
	col := new(icuUTF8iter)
	err := col.init(locale)
	return col, err
}
// Compare points the collator's two iterators at the UTF-8 forms of a and b
// and returns the result of ucol_strcollIter as an int. The ICU status code
// is not inspected. Neither input may be empty.
func (c *icuUTF8iter) Compare(a, b Input) int {
	status := C.UErrorCode(0)
	C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
	C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
	result := C.ucol_strcollIter(c.col, &c.a, &c.b, &status)
	return int(result)
}
// Key returns the ICU sort key for the UTF-8 form of s, which is read
// through an iterator. The key is written into the unused part of the key
// buffer. If ICU filled that part completely, a new buffer is forced and
// the key is generated once more. The ICU status code is not inspected.
func (c *icuUTF8iter) Key(s Input) []byte {
	for {
		status := C.UErrorCode(0)
		state := [2]C.uint32_t{}
		C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
		bp, bn := c.buf()
		n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &status)
		if n < bn {
			return c.extendBuf(n)
		}
		// Force failure.
		if c.extendBuf(n+1) != nil {
			log.Fatal("expected extension to fail")
		}
	}
}
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF-8 string to UTF-16
// and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}

// newUTF8conv returns an icuUTF8conv collator for locale together with the
// error, if any, reported by its initialization.
func newUTF8conv(locale string) (Collator, error) {
	col := new(icuUTF8conv)
	err := col.init(locale)
	return col, err
}
// Compare converts the UTF-8 forms of sa and sb to UTF-16, compares the
// results with ucol_strcoll and returns its result as an int. Neither input
// may be empty.
func (c *icuUTF8conv) Compare(sa, sb Input) int {
	a, b := encodeUTF16(sa.UTF8), encodeUTF16(sb.UTF8)
	result := C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b))
	return int(result)
}
// Key converts the UTF-8 form of s to UTF-16 and returns the ICU sort key
// for it. The key is written into the unused part of the key buffer. If it
// did not fit, extendBuf has installed a new buffer and the conversion and
// key generation are done once more.
func (c *icuUTF8conv) Key(s Input) []byte {
	for {
		a := encodeUTF16(s.UTF8)
		bp, bn := c.buf()
		n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
		if key := c.extendBuf(n); key != nil {
			return key
		}
	}
}
// encodeUTF16 returns the UTF-16 encoding of the UTF-8 encoded text b in a
// newly allocated slice. Runes for which utf16.EncodeRune yields a
// surrogate pair are encoded as that pair; all other runes are encoded as a
// single code unit.
func encodeUTF16(b []byte) []uint16 {
	out := []uint16{}
	for rest := b; len(rest) > 0; {
		r, sz := utf8.DecodeRune(rest)
		rest = rest[sz:]
		// EncodeRune returns U+FFFD for runes that need no surrogate pair.
		switch hi, lo := utf16.EncodeRune(r); hi {
		case 0xFFFD:
			out = append(out, uint16(r))
		default:
			out = append(out, uint16(hi), uint16(lo))
		}
	}
	return out
}