go/internal/span/utf16.go

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package span

import (
	"fmt"
	"unicode/utf16"
	"unicode/utf8"
)

// ToUTF16Column calculates the utf16 column expressed by the point given the
// supplied file contents.
// This is used to convert from the native (always in bytes) column
// representation and the utf16 counts used by some editors.
func ToUTF16Column(p Point, content []byte) (int, error) {
	if content == nil {
		return -1, fmt.Errorf("ToUTF16Column: missing content")
	}
	if !p.HasPosition() {
		return -1, fmt.Errorf("ToUTF16Column: point is missing position")
	}
	if !p.HasOffset() {
		return -1, fmt.Errorf("ToUTF16Column: point is missing offset")
	}
	offset := p.Offset()      // 0-based
	colZero := p.Column() - 1 // 0-based
	if colZero == 0 {
		// 0-based column 0, so it must be chr 1
		return 1, nil
	} else if colZero < 0 {
		return -1, fmt.Errorf("ToUTF16Column: column is invalid (%v)", colZero)
	}
	// work out the offset at the start of the line using the column
	lineOffset := offset - colZero
	if lineOffset < 0 || offset > len(content) {
		return -1, fmt.Errorf("ToUTF16Column: offsets %v-%v outside file contents (%v)", lineOffset, offset, len(content))
	}
	// Use the offset to pick out the line start.
	// This cannot panic: offset > len(content) and lineOffset < offset.
	start := content[lineOffset:]

	// Now, truncate down to the supplied column.
	start = start[:colZero]

	// and count the number of utf16 characters
	// in theory we could do this by hand more efficiently...
	return len(utf16.Encode([]rune(string(start)))) + 1, nil
}

// FromUTF16Column advances the point by the utf16 character offset given the
// supplied line contents.
// This is used to convert from the utf16 counts used by some editors to the
// native (always in bytes) column representation.
func FromUTF16Column(p Point, chr int, content []byte) (Point, error) {
	if !p.HasOffset() {
		return Point{}, fmt.Errorf("FromUTF16Column: point is missing offset")
	}
	// if chr is 1 then no adjustment needed
	if chr <= 1 {
		return p, nil
	}
	if p.Offset() >= len(content) {
		return p, fmt.Errorf("FromUTF16Column: offset (%v) greater than length of content (%v)", p.Offset(), len(content))
	}
	remains := content[p.Offset():]
	// scan forward the specified number of characters
	for count := 1; count < chr; count++ {
		if len(remains) <= 0 {
			return Point{}, fmt.Errorf("FromUTF16Column: chr goes beyond the content")
		}
		r, w := utf8.DecodeRune(remains)
		if r == '\n' {
			// Per the LSP spec:
			//
			// > If the character value is greater than the line length it
			// > defaults back to the line length.
			break
		}
		remains = remains[w:]
		if r >= 0x10000 {
			// a two point rune
			count++
			// if we finished in a two point rune, do not advance past the first
			if count >= chr {
				break
			}
		}
		p.v.Column += w
		p.v.Offset += w
	}
	return p, nil
}