mirror of
https://github.com/golang/go
synced 2024-11-18 15:14:44 -07:00
8c5978f193
This PR fixes internal/lsp/fuzzy/matcher.go:230:4: should replace skipPenalty += 1 with skipPenalty++ . Change-Id: I7d5b6c20e25503ea266e26783e68ad92fd8b36e0 GitHub-Last-Rev: 36fd2c25232f593f905051692e84d8c634fb7a46 GitHub-Pull-Request: golang/tools#194 Reviewed-on: https://go-review.googlesource.com/c/tools/+/212400 Reviewed-by: Rebecca Stambler <rstambler@golang.org> Run-TryBot: Rebecca Stambler <rstambler@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
399 lines
9.9 KiB
Go
399 lines
9.9 KiB
Go
// Copyright 2019 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package fuzzy implements a fuzzy matching algorithm.
|
|
package fuzzy
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
)
|
|
|
|
const (
|
|
// MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs
|
|
// will be truncated to this size.
|
|
MaxInputSize = 127
|
|
// MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer
|
|
// inputs are truncated to this size.
|
|
MaxPatternSize = 63
|
|
)
|
|
|
|
type scoreVal int
|
|
|
|
func (s scoreVal) val() int {
|
|
return int(s) >> 1
|
|
}
|
|
|
|
func (s scoreVal) prevK() int {
|
|
return int(s) & 1
|
|
}
|
|
|
|
func score(val int, prevK int /*0 or 1*/) scoreVal {
|
|
return scoreVal(val<<1 + prevK)
|
|
}
|
|
|
|
// Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern.
|
|
// The matcher does not support parallel usage.
|
|
type Matcher struct {
|
|
pattern string
|
|
patternLower []byte // lower-case version of the pattern
|
|
patternShort []byte // first characters of the pattern
|
|
caseSensitive bool // set if the pattern is mix-cased
|
|
|
|
patternRoles []RuneRole // the role of each character in the pattern
|
|
roles []RuneRole // the role of each character in the tested string
|
|
|
|
scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal
|
|
|
|
scoreScale float32
|
|
|
|
lastCandidateLen int // in bytes
|
|
lastCandidateMatched bool
|
|
|
|
// Here we save the last candidate in lower-case. This is basically a byte slice we reuse for
|
|
// performance reasons, so the slice is not reallocated for every candidate.
|
|
lowerBuf [MaxInputSize]byte
|
|
rolesBuf [MaxInputSize]RuneRole
|
|
}
|
|
|
|
func (m *Matcher) bestK(i, j int) int {
|
|
if m.scores[i][j][0].val() < m.scores[i][j][1].val() {
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern.
|
|
func NewMatcher(pattern string) *Matcher {
|
|
if len(pattern) > MaxPatternSize {
|
|
pattern = pattern[:MaxPatternSize]
|
|
}
|
|
|
|
m := &Matcher{
|
|
pattern: pattern,
|
|
patternLower: ToLower(pattern, nil),
|
|
}
|
|
|
|
for i, c := range m.patternLower {
|
|
if pattern[i] != c {
|
|
m.caseSensitive = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if len(pattern) > 3 {
|
|
m.patternShort = m.patternLower[:3]
|
|
} else {
|
|
m.patternShort = m.patternLower
|
|
}
|
|
|
|
m.patternRoles = RuneRoles(pattern, nil)
|
|
|
|
if len(pattern) > 0 {
|
|
maxCharScore := 4
|
|
m.scoreScale = 1 / float32(maxCharScore*len(pattern))
|
|
}
|
|
|
|
return m
|
|
}
|
|
|
|
// Score returns the score returned by matching the candidate to the pattern.
|
|
// This is not designed for parallel use. Multiple candidates must be scored sequentially.
|
|
// Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
|
|
func (m *Matcher) Score(candidate string) float32 {
|
|
if len(candidate) > MaxInputSize {
|
|
candidate = candidate[:MaxInputSize]
|
|
}
|
|
lower := ToLower(candidate, m.lowerBuf[:])
|
|
m.lastCandidateLen = len(candidate)
|
|
|
|
if len(m.pattern) == 0 {
|
|
// Empty patterns perfectly match candidates.
|
|
return 1
|
|
}
|
|
|
|
if m.match(candidate, lower) {
|
|
sc := m.computeScore(candidate, lower)
|
|
if sc > minScore/2 && !m.poorMatch() {
|
|
m.lastCandidateMatched = true
|
|
if len(m.pattern) == len(candidate) {
|
|
// Perfect match.
|
|
return 1
|
|
}
|
|
|
|
if sc < 0 {
|
|
sc = 0
|
|
}
|
|
normalizedScore := float32(sc) * m.scoreScale
|
|
if normalizedScore > 1 {
|
|
normalizedScore = 1
|
|
}
|
|
|
|
return normalizedScore
|
|
}
|
|
}
|
|
|
|
m.lastCandidateMatched = false
|
|
return -1
|
|
}
|
|
|
|
const minScore = -10000
|
|
|
|
// MatchedRanges returns matches ranges for the last scored string as a flattened array of
|
|
// [begin, end) byte offset pairs.
|
|
func (m *Matcher) MatchedRanges() []int {
|
|
if len(m.pattern) == 0 || !m.lastCandidateMatched {
|
|
return nil
|
|
}
|
|
i, j := m.lastCandidateLen, len(m.pattern)
|
|
if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
|
|
return nil
|
|
}
|
|
|
|
var ret []int
|
|
k := m.bestK(i, j)
|
|
for i > 0 {
|
|
take := (k == 1)
|
|
k = m.scores[i][j][k].prevK()
|
|
if take {
|
|
if len(ret) == 0 || ret[len(ret)-1] != i {
|
|
ret = append(ret, i)
|
|
ret = append(ret, i-1)
|
|
} else {
|
|
ret[len(ret)-1] = i - 1
|
|
}
|
|
j--
|
|
}
|
|
i--
|
|
}
|
|
// Reverse slice.
|
|
for i := 0; i < len(ret)/2; i++ {
|
|
ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
|
|
}
|
|
return ret
|
|
}
|
|
|
|
func (m *Matcher) match(candidate string, candidateLower []byte) bool {
|
|
i, j := 0, 0
|
|
for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
|
|
if candidateLower[i] == m.patternLower[j] {
|
|
j++
|
|
}
|
|
}
|
|
if j != len(m.patternLower) {
|
|
return false
|
|
}
|
|
|
|
// The input passes the simple test against pattern, so it is time to classify its characters.
|
|
// Character roles are used below to find the last segment.
|
|
m.roles = RuneRoles(candidate, m.rolesBuf[:])
|
|
|
|
return true
|
|
}
|
|
|
|
func (m *Matcher) computeScore(candidate string, candidateLower []byte) int {
|
|
pattLen, candLen := len(m.pattern), len(candidate)
|
|
|
|
for j := 0; j <= len(m.pattern); j++ {
|
|
m.scores[0][j][0] = minScore << 1
|
|
m.scores[0][j][1] = minScore << 1
|
|
}
|
|
m.scores[0][0][0] = score(0, 0) // Start with 0.
|
|
|
|
segmentsLeft, lastSegStart := 1, 0
|
|
for i := 0; i < candLen; i++ {
|
|
if m.roles[i] == RSep {
|
|
segmentsLeft++
|
|
lastSegStart = i + 1
|
|
}
|
|
}
|
|
|
|
// A per-character bonus for a consecutive match.
|
|
consecutiveBonus := 2
|
|
wordIdx := 0 // Word count within segment.
|
|
for i := 1; i <= candLen; i++ {
|
|
|
|
role := m.roles[i-1]
|
|
isHead := role == RHead
|
|
|
|
if isHead {
|
|
wordIdx++
|
|
} else if role == RSep && segmentsLeft > 1 {
|
|
wordIdx = 0
|
|
segmentsLeft--
|
|
}
|
|
|
|
var skipPenalty int
|
|
if i == 1 || (i-1) == lastSegStart {
|
|
// Skipping the start of first or last segment.
|
|
skipPenalty++
|
|
}
|
|
|
|
for j := 0; j <= pattLen; j++ {
|
|
// By default, we don't have a match. Fill in the skip data.
|
|
m.scores[i][j][1] = minScore << 1
|
|
|
|
// Compute the skip score.
|
|
k := 0
|
|
if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
|
|
k = 1
|
|
}
|
|
|
|
skipScore := m.scores[i-1][j][k].val()
|
|
// Do not penalize missing characters after the last matched segment.
|
|
if j != pattLen {
|
|
skipScore -= skipPenalty
|
|
}
|
|
m.scores[i][j][0] = score(skipScore, k)
|
|
|
|
if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
|
|
// Not a match.
|
|
continue
|
|
}
|
|
pRole := m.patternRoles[j-1]
|
|
|
|
if role == RTail && pRole == RHead {
|
|
if j > 1 {
|
|
// Not a match: a head in the pattern matches a tail character in the candidate.
|
|
continue
|
|
}
|
|
// Special treatment for the first character of the pattern. We allow
|
|
// matches in the middle of a word if they are long enough, at least
|
|
// min(3, pattern.length) characters.
|
|
if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Compute the char score.
|
|
var charScore int
|
|
// Bonus 1: the char is in the candidate's last segment.
|
|
if segmentsLeft <= 1 {
|
|
charScore++
|
|
}
|
|
// Bonus 2: Case match or a Head in the pattern aligns with one in the word.
|
|
// Single-case patterns lack segmentation signals and we assume any character
|
|
// can be a head of a segment.
|
|
if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
|
|
charScore++
|
|
}
|
|
|
|
// Penalty 1: pattern char is Head, candidate char is Tail.
|
|
if role == RTail && pRole == RHead {
|
|
charScore--
|
|
}
|
|
// Penalty 2: first pattern character matched in the middle of a word.
|
|
if j == 1 && role == RTail {
|
|
charScore -= 4
|
|
}
|
|
|
|
// Third dimension encodes whether there is a gap between the previous match and the current
|
|
// one.
|
|
for k := 0; k < 2; k++ {
|
|
sc := m.scores[i-1][j-1][k].val() + charScore
|
|
|
|
isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
|
|
if isConsecutive {
|
|
// Bonus 3: a consecutive match. First character match also gets a bonus to
|
|
// ensure prefix final match score normalizes to 1.0.
|
|
// Logically, this is a part of charScore, but we have to compute it here because it
|
|
// only applies for consecutive matches (k == 1).
|
|
sc += consecutiveBonus
|
|
}
|
|
if k == 0 {
|
|
// Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack
|
|
// of alignment.
|
|
if role == RTail || role == RUCTail {
|
|
sc -= 3
|
|
}
|
|
}
|
|
|
|
if sc > m.scores[i][j][1].val() {
|
|
m.scores[i][j][1] = score(sc, k)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()
|
|
|
|
return result
|
|
}
|
|
|
|
// ScoreTable returns the score table computed for the provided candidate. Used only for debugging.
|
|
func (m *Matcher) ScoreTable(candidate string) string {
|
|
var buf bytes.Buffer
|
|
|
|
var line1, line2, separator bytes.Buffer
|
|
line1.WriteString("\t")
|
|
line2.WriteString("\t")
|
|
for j := 0; j < len(m.pattern); j++ {
|
|
line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
|
|
separator.WriteString("----------------")
|
|
}
|
|
|
|
buf.WriteString(line1.String())
|
|
buf.WriteString("\n")
|
|
buf.WriteString(separator.String())
|
|
buf.WriteString("\n")
|
|
|
|
for i := 1; i <= len(candidate); i++ {
|
|
line1.Reset()
|
|
line2.Reset()
|
|
|
|
line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
|
|
line2.WriteString("\t")
|
|
|
|
for j := 1; j <= len(m.pattern); j++ {
|
|
line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
|
|
line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
|
|
}
|
|
buf.WriteString(line1.String())
|
|
buf.WriteString("\n")
|
|
buf.WriteString(line2.String())
|
|
buf.WriteString("\n")
|
|
buf.WriteString(separator.String())
|
|
buf.WriteString("\n")
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
func dir(prevK int) rune {
|
|
if prevK == 0 {
|
|
return 'M'
|
|
}
|
|
return 'H'
|
|
}
|
|
|
|
func (m *Matcher) poorMatch() bool {
|
|
if len(m.pattern) < 2 {
|
|
return false
|
|
}
|
|
|
|
i, j := m.lastCandidateLen, len(m.pattern)
|
|
k := m.bestK(i, j)
|
|
|
|
var counter, len int
|
|
for i > 0 {
|
|
take := (k == 1)
|
|
k = m.scores[i][j][k].prevK()
|
|
if take {
|
|
len++
|
|
if k == 0 && len < 3 && m.roles[i-1] == RTail {
|
|
// Short match in the middle of a word
|
|
counter++
|
|
if counter > 1 {
|
|
return true
|
|
}
|
|
}
|
|
j--
|
|
} else {
|
|
len = 0
|
|
}
|
|
i--
|
|
}
|
|
return false
|
|
}
|