2019-07-01 15:08:29 -06:00
|
|
|
// Copyright 2019 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package fuzzy
|
|
|
|
|
|
|
|
import (
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// RuneRole describes the part a single input byte plays in fuzzy matching, as
// assigned by RuneRoles.
type RuneRole byte

const (
	// RNone marks a byte that has no role. RuneRoles assigns it to every ASCII
	// byte that is neither a letter, a digit nor a segment separator (for
	// example whitespace, '_' or '/').
	RNone RuneRole = iota
	// RSep marks a segment separator ('.' or ':').
	RSep
	// RTail marks a byte that continues a word and is not upper case.
	RTail
	// RUCTail marks an upper-case byte that follows another upper-case byte
	// and does not itself start a new word.
	RUCTail
	// RHead marks the first byte of a word.
	RHead
)
|
|
|
|
|
|
|
|
// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
|
|
|
|
// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
|
|
|
|
// or when it filled the output. If output is nil, then it gets created.
|
2019-10-21 22:07:21 -06:00
|
|
|
func RuneRoles(str string, reuse []RuneRole) []RuneRole {
|
2019-07-01 15:08:29 -06:00
|
|
|
var output []RuneRole
|
|
|
|
if cap(reuse) < len(str) {
|
|
|
|
output = make([]RuneRole, 0, len(str))
|
|
|
|
} else {
|
|
|
|
output = reuse[:0]
|
|
|
|
}
|
|
|
|
|
|
|
|
prev, prev2 := rtNone, rtNone
|
|
|
|
for i := 0; i < len(str); i++ {
|
|
|
|
r := rune(str[i])
|
|
|
|
|
|
|
|
role := RNone
|
|
|
|
|
|
|
|
curr := rtLower
|
|
|
|
if str[i] <= unicode.MaxASCII {
|
|
|
|
curr = runeType(rt[str[i]] - '0')
|
|
|
|
}
|
|
|
|
|
|
|
|
if curr == rtLower {
|
|
|
|
if prev == rtNone || prev == rtPunct {
|
|
|
|
role = RHead
|
|
|
|
} else {
|
|
|
|
role = RTail
|
|
|
|
}
|
|
|
|
} else if curr == rtUpper {
|
|
|
|
role = RHead
|
|
|
|
|
|
|
|
if prev == rtUpper {
|
|
|
|
// This and previous characters are both upper case.
|
|
|
|
|
|
|
|
if i+1 == len(str) {
|
|
|
|
// This is last character, previous was also uppercase -> this is UCTail
|
|
|
|
// i.e., (current char is C): aBC / BC / ABC
|
|
|
|
role = RUCTail
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if curr == rtPunct {
|
2019-10-21 22:07:21 -06:00
|
|
|
switch r {
|
|
|
|
case '.', ':':
|
2019-07-01 15:08:29 -06:00
|
|
|
role = RSep
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if curr != rtLower {
|
|
|
|
if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
|
|
|
|
// The previous two characters were uppercase. The current one is not a lower case, so the
|
|
|
|
// previous one can't be a HEAD. Make it a UCTail.
|
|
|
|
// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
|
|
|
|
output[i-1] = RUCTail
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
output = append(output, role)
|
|
|
|
prev2 = prev
|
|
|
|
prev = curr
|
|
|
|
}
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
|
|
|
// runeType is the coarse character class of a single byte. RuneRoles derives
// rune roles from the classes of neighbouring bytes.
type runeType byte

const (
	rtNone  runeType = iota // any ASCII byte not listed below, e.g. whitespace or '_'
	rtPunct                 // '.', '/' and ':'
	rtLower                 // 'a'-'z' and '0'-'9'; RuneRoles also uses it for non-ASCII bytes
	rtUpper                 // 'A'-'Z'
)

// rt maps every ASCII byte to its runeType, stored as the decimal digit of the
// runeType value: runeType(rt[b] - '0') is the class of byte b. It has exactly
// 128 entries, so it must only be indexed with ASCII bytes. Each row below
// covers 16 consecutive byte values.
const rt = "" +
	"0000000000000000" + // 0x00-0x0F: control characters
	"0000000000000000" + // 0x10-0x1F: control characters
	"0000000000000011" + // 0x20-0x2F: space and punctuation; '.' and '/' are rtPunct
	"2222222222100000" + // 0x30-0x3F: '0'-'9', then ':' (rtPunct), then ";<=>?"
	"0333333333333333" + // 0x40-0x4F: '@', then 'A'-'O'
	"3333333333300000" + // 0x50-0x5F: 'P'-'Z', then "[\]^_"
	"0222222222222222" + // 0x60-0x6F: '`', then 'a'-'o'
	"2222222222200000" //   0x70-0x7F: 'p'-'z', then "{|}~" and DEL
|
|
|
|
|
|
|
|
// LastSegment returns the substring representing the last segment from the input, where each
|
|
|
|
// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
|
|
|
|
// or Filename type.
|
|
|
|
func LastSegment(input string, roles []RuneRole) string {
|
|
|
|
// Exclude ending separators.
|
|
|
|
end := len(input) - 1
|
|
|
|
for end >= 0 && roles[end] == RSep {
|
|
|
|
end--
|
|
|
|
}
|
|
|
|
if end < 0 {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
|
|
|
|
start := end - 1
|
|
|
|
for start >= 0 && roles[start] != RSep {
|
|
|
|
start--
|
|
|
|
}
|
|
|
|
|
|
|
|
return input[start+1 : end+1]
|
|
|
|
}
|
|
|
|
|
|
|
|
// ToLower returns a byte slice of length len(input) holding input with the
// ASCII letters 'A'-'Z' replaced by their lower-case counterparts. Every other
// byte, including the bytes of multi-byte UTF-8 sequences, is copied unchanged.
//
// The backing array of reuse is recycled (and overwritten) when its capacity
// is at least len(input), regardless of its current length; otherwise a new
// slice is allocated. Passing nil for reuse is fine.
func ToLower(input string, reuse []byte) []byte {
	n := len(input)

	var output []byte
	if cap(reuse) < n {
		output = make([]byte, n)
	} else {
		// Reslice rather than index into reuse directly: its length may be
		// smaller than n even though its capacity is sufficient, and indexing
		// past the length would panic.
		output = reuse[:n]
	}

	for i := 0; i < n; i++ {
		c := input[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		output[i] = c
	}
	return output
}
|
|
|
|
|
|
|
|
// WordConsumer is a callback that receives one word, identified by the
// half-open byte range [start, end) it occupies in an input: start is the
// offset of the word's first byte and end is one past its last byte.
type WordConsumer func(start, end int)
|
|
|
|
|
|
|
|
// Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
|
|
|
|
// delimiters for each word are fed to the provided consumer function.
|
|
|
|
func Words(roles []RuneRole, consume WordConsumer) {
|
|
|
|
var wordStart int
|
|
|
|
for i, r := range roles {
|
|
|
|
switch r {
|
|
|
|
case RUCTail, RTail:
|
|
|
|
case RHead, RNone, RSep:
|
|
|
|
if i != wordStart {
|
|
|
|
consume(wordStart, i)
|
|
|
|
}
|
|
|
|
wordStart = i
|
|
|
|
if r != RHead {
|
|
|
|
// Skip this character.
|
|
|
|
wordStart = i + 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if wordStart != len(roles) {
|
|
|
|
consume(wordStart, len(roles))
|
|
|
|
}
|
|
|
|
}
|