blob: 85997c24dc4672a6305644d3250ffce877878b62 [file] [log] [blame]
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package protocol
// This file defines Mapper, which wraps a file content buffer
// ([]byte) and provides efficient conversion between every kind of
// position representation.
//
// gopls uses four main representations of position:
//
// 1. byte offsets, e.g. (start, end int), starting from zero.
//
// 2. go/token notation. Use these types when interacting directly
// with the go/* syntax packages:
//
// token.Pos
// token.FileSet
// token.File
//
// Because File.Offset and File.Pos panic on invalid inputs,
// we do not call them directly and instead use the safetoken package
// for these conversions. This is enforced by a static check.
//
// Beware also that the methods of token.File have two bugs for which
// safetoken contains workarounds:
// - #57490, whereby the parser may create ast.Nodes during error
// recovery whose computed positions are out of bounds (EOF+1).
// - #41029, whereby the wrong line number is returned for the EOF position.
//
// 3. the cmd package.
//
// cmd.point = (line, col8, offset).
// cmd.Span = (uri URI, start, end cmd.point)
//
// Line and column are 1-based.
// Columns are measured in bytes (UTF-8 codes).
// All fields are optional.
//
// These types are useful as intermediate conversions of validated
// ranges (though MappedRange is superior as it is self contained
// and universally convertible). Since their fields are optional
// they are also useful for parsing user-provided positions (e.g. in
// the CLI) before we have access to file contents.
//
// 4. protocol, the LSP RPC message format.
//
// protocol.Position = (Line, Character uint32)
// protocol.Range = (start, end Position)
// protocol.Location = (URI, protocol.Range)
//
// Line and Character are 0-based.
// Characters (columns) are measured in UTF-16 codes.
//
// protocol.Mapper holds the (URI, Content) of a file, enabling
// efficient mapping between byte offsets, cmd ranges, and
// protocol ranges.
//
// protocol.MappedRange holds a protocol.Mapper and valid (start,
// end int) byte offsets, enabling infallible, efficient conversion
// to any other format.
import (
"bytes"
"fmt"
"go/ast"
"go/token"
"sort"
"strings"
"sync"
"unicode/utf8"
"golang.org/x/tools/gopls/internal/util/bug"
"golang.org/x/tools/gopls/internal/util/safetoken"
)
// A Mapper wraps the content of a file and provides mapping
// between byte offsets and notations of position such as:
//
// - (line, col8) pairs, where col8 is a 1-based UTF-8 column number
// (bytes), as used by the go/token and cmd packages.
//
// - (line, col16) pairs, where col16 is a 1-based UTF-16 column
// number, as used by the LSP protocol.
//
// All conversion methods are named "FromTo", where From and To are the two types.
// For example, the PointPosition method converts from a Point to a Position.
//
// Mapper does not intrinsically depend on go/token-based
// representations. Use safetoken to map between token.Pos <=> byte
// offsets, or the convenience methods such as PosPosition,
// NodePosition, or NodeRange.
//
// See overview comments at top of this file.
type Mapper struct {
URI DocumentURI
Content []byte
// Line-number information is requested only for a tiny
// fraction of Mappers, so we compute it lazily.
// Call initLines() before accessing fields below.
linesOnce sync.Once
lineStart []int // byte offset of start of ith line (0-based); last=EOF iff \n-terminated
nonASCII bool
// TODO(adonovan): adding an extra lineStart entry for EOF
// might simplify every method that accesses it. Try it out.
}
// NewMapper creates a new mapper for the given URI and content.
func NewMapper(uri DocumentURI, content []byte) *Mapper {
return &Mapper{URI: uri, Content: content}
}
// initLines populates the lineStart table.
func (m *Mapper) initLines() {
m.linesOnce.Do(func() {
nlines := bytes.Count(m.Content, []byte("\n"))
m.lineStart = make([]int, 1, nlines+1) // initially []int{0}
for offset, b := range m.Content {
if b == '\n' {
m.lineStart = append(m.lineStart, offset+1)
}
if b >= utf8.RuneSelf {
m.nonASCII = true
}
}
})
}
// LineCol8Position converts a valid line and UTF-8 column number,
// both 1-based, to a protocol (UTF-16) position.
func (m *Mapper) LineCol8Position(line, col8 int) (Position, error) {
// Report a bug for inputs that are invalid for any file content.
if line < 1 {
return Position{}, bug.Errorf("invalid 1-based line number: %d", line)
}
if col8 < 1 {
return Position{}, bug.Errorf("invalid 1-based column number: %d", col8)
}
m.initLines()
line0 := line - 1 // 0-based
if !(0 <= line0 && line0 < len(m.lineStart)) {
return Position{}, fmt.Errorf("line number %d out of range (max %d)", line, len(m.lineStart))
}
// content[start:end] is the preceding partial line.
start := m.lineStart[line0]
end := start + col8 - 1
// Validate column.
if end > len(m.Content) {
return Position{}, fmt.Errorf("column is beyond end of file")
} else if line0+1 < len(m.lineStart) && end >= m.lineStart[line0+1] {
return Position{}, fmt.Errorf("column is beyond end of line")
}
char := UTF16Len(m.Content[start:end])
return Position{Line: uint32(line0), Character: uint32(char)}, nil
}
// -- conversions from byte offsets --
// OffsetLocation converts a byte-offset interval to a protocol (UTF-16) location.
func (m *Mapper) OffsetLocation(start, end int) (Location, error) {
rng, err := m.OffsetRange(start, end)
if err != nil {
return Location{}, err
}
return m.RangeLocation(rng), nil
}
// OffsetRange converts a byte-offset interval to a protocol (UTF-16) range.
func (m *Mapper) OffsetRange(start, end int) (Range, error) {
if start > end {
return Range{}, fmt.Errorf("start offset (%d) > end (%d)", start, end)
}
startPosition, err := m.OffsetPosition(start)
if err != nil {
return Range{}, fmt.Errorf("start: %v", err)
}
endPosition, err := m.OffsetPosition(end)
if err != nil {
return Range{}, fmt.Errorf("end: %v", err)
}
return Range{Start: startPosition, End: endPosition}, nil
}
// OffsetPosition converts a byte offset to a protocol (UTF-16) position.
func (m *Mapper) OffsetPosition(offset int) (Position, error) {
if !(0 <= offset && offset <= len(m.Content)) {
return Position{}, fmt.Errorf("invalid offset %d (want 0-%d)", offset, len(m.Content))
}
// No error may be returned after this point,
// even if the offset does not fall at a rune boundary.
// (See panic in MappedRange.Range reachable.)
line, col16 := m.lineCol16(offset)
return Position{Line: uint32(line), Character: uint32(col16)}, nil
}
// lineCol16 converts a valid byte offset to line and UTF-16 column numbers, both 0-based.
func (m *Mapper) lineCol16(offset int) (int, int) {
line, start, cr := m.line(offset)
var col16 int
if m.nonASCII {
col16 = UTF16Len(m.Content[start:offset])
} else {
col16 = offset - start
}
if cr {
col16-- // retreat from \r at line end
}
return line, col16
}
// OffsetLineCol8 converts a valid byte offset to line and UTF-8 column numbers, both 1-based.
func (m *Mapper) OffsetLineCol8(offset int) (int, int) {
line, start, cr := m.line(offset)
col8 := offset - start
if cr {
col8-- // retreat from \r at line end
}
return line + 1, col8 + 1
}
// line returns:
// - the 0-based index of the line that encloses the (valid) byte offset;
// - the start offset of that line; and
// - whether the offset denotes a carriage return (\r) at line end.
func (m *Mapper) line(offset int) (int, int, bool) {
m.initLines()
// In effect, binary search returns a 1-based result.
line := sort.Search(len(m.lineStart), func(i int) bool {
return offset < m.lineStart[i]
})
// Adjustment for line-endings: \r|\n is the same as |\r\n.
var eol int
if line == len(m.lineStart) {
eol = len(m.Content) // EOF
} else {
eol = m.lineStart[line] - 1
}
cr := offset == eol && offset > 0 && m.Content[offset-1] == '\r'
line-- // 0-based
return line, m.lineStart[line], cr
}
// OffsetMappedRange returns a MappedRange for the given byte offsets.
// A MappedRange can be converted to any other form.
func (m *Mapper) OffsetMappedRange(start, end int) (MappedRange, error) {
if !(0 <= start && start <= end && end <= len(m.Content)) {
return MappedRange{}, fmt.Errorf("invalid offsets (%d, %d) (file %s has size %d)", start, end, m.URI, len(m.Content))
}
return MappedRange{m, start, end}, nil
}
// -- conversions from protocol (UTF-16) domain --
// RangeOffsets converts a protocol (UTF-16) range to start/end byte offsets.
func (m *Mapper) RangeOffsets(r Range) (int, int, error) {
start, err := m.PositionOffset(r.Start)
if err != nil {
return 0, 0, err
}
end, err := m.PositionOffset(r.End)
if err != nil {
return 0, 0, err
}
return start, end, nil
}
// PositionOffset converts a protocol (UTF-16) position to a byte offset.
func (m *Mapper) PositionOffset(p Position) (int, error) {
m.initLines()
// Validate line number.
if p.Line > uint32(len(m.lineStart)) {
return 0, fmt.Errorf("line number %d out of range 0-%d", p.Line, len(m.lineStart))
} else if p.Line == uint32(len(m.lineStart)) {
if p.Character == 0 {
return len(m.Content), nil // EOF
}
return 0, fmt.Errorf("column is beyond end of file")
}
offset := m.lineStart[p.Line]
content := m.Content[offset:] // rest of file from start of enclosing line
// Advance bytes up to the required number of UTF-16 codes.
col8 := 0
for col16 := 0; col16 < int(p.Character); col16++ {
r, sz := utf8.DecodeRune(content)
if sz == 0 {
return 0, fmt.Errorf("column is beyond end of file")
}
if r == '\n' {
return 0, fmt.Errorf("column is beyond end of line")
}
if sz == 1 && r == utf8.RuneError {
return 0, fmt.Errorf("buffer contains invalid UTF-8 text")
}
content = content[sz:]
if r >= 0x10000 {
col16++ // rune was encoded by a pair of surrogate UTF-16 codes
if col16 == int(p.Character) {
break // requested position is in the middle of a rune
}
}
col8 += sz
}
return offset + col8, nil
}
// -- go/token domain convenience methods --
// PosPosition converts a token pos to a protocol (UTF-16) position.
func (m *Mapper) PosPosition(tf *token.File, pos token.Pos) (Position, error) {
offset, err := safetoken.Offset(tf, pos)
if err != nil {
return Position{}, err
}
return m.OffsetPosition(offset)
}
// PosLocation converts a token range to a protocol (UTF-16) location.
func (m *Mapper) PosLocation(tf *token.File, start, end token.Pos) (Location, error) {
startOffset, endOffset, err := safetoken.Offsets(tf, start, end)
if err != nil {
return Location{}, err
}
rng, err := m.OffsetRange(startOffset, endOffset)
if err != nil {
return Location{}, err
}
return m.RangeLocation(rng), nil
}
// PosRange converts a token range to a protocol (UTF-16) range.
func (m *Mapper) PosRange(tf *token.File, start, end token.Pos) (Range, error) {
startOffset, endOffset, err := safetoken.Offsets(tf, start, end)
if err != nil {
return Range{}, err
}
return m.OffsetRange(startOffset, endOffset)
}
// NodeRange converts a syntax node range to a protocol (UTF-16) range.
func (m *Mapper) NodeRange(tf *token.File, node ast.Node) (Range, error) {
return m.PosRange(tf, node.Pos(), node.End())
}
// RangeLocation pairs a protocol Range with its URI, in a Location.
func (m *Mapper) RangeLocation(rng Range) Location {
return Location{URI: m.URI, Range: rng}
}
// PosMappedRange returns a MappedRange for the given token.Pos range.
func (m *Mapper) PosMappedRange(tf *token.File, start, end token.Pos) (MappedRange, error) {
startOffset, endOffset, err := safetoken.Offsets(tf, start, end)
if err != nil {
return MappedRange{}, nil
}
return m.OffsetMappedRange(startOffset, endOffset)
}
// NodeMappedRange returns a MappedRange for the given node range.
func (m *Mapper) NodeMappedRange(tf *token.File, node ast.Node) (MappedRange, error) {
return m.PosMappedRange(tf, node.Pos(), node.End())
}
// -- MappedRange --
// A MappedRange represents a valid byte-offset range of a file.
// Through its Mapper it can be converted into other forms such
// as protocol.Range or UTF-8.
//
// Construct one by calling Mapper.OffsetMappedRange with start/end offsets.
// From the go/token domain, call safetoken.Offsets first,
// or use a helper such as parsego.File.MappedPosRange.
//
// Two MappedRanges produced the same Mapper are equal if and only if they
// denote the same range. Two MappedRanges produced by different Mappers
// are unequal even when they represent the same range of the same file.
type MappedRange struct {
Mapper *Mapper
start, end int // valid byte offsets: 0 <= start <= end <= len(Mapper.Content)
}
// Offsets returns the (start, end) byte offsets of this range.
func (mr MappedRange) Offsets() (start, end int) { return mr.start, mr.end }
// -- convenience functions --
// URI returns the URI of the range's file.
func (mr MappedRange) URI() DocumentURI {
return mr.Mapper.URI
}
// Range returns the range in protocol (UTF-16) form.
func (mr MappedRange) Range() Range {
rng, err := mr.Mapper.OffsetRange(mr.start, mr.end)
if err != nil {
panic(err) // can't happen
}
return rng
}
// Location returns the range in protocol location (UTF-16) form.
func (mr MappedRange) Location() Location {
return mr.Mapper.RangeLocation(mr.Range())
}
// String formats the range in UTF-8 notation.
func (mr MappedRange) String() string {
var s strings.Builder
startLine, startCol8 := mr.Mapper.OffsetLineCol8(mr.start)
fmt.Fprintf(&s, "%d:%d", startLine, startCol8)
if mr.end != mr.start {
endLine, endCol8 := mr.Mapper.OffsetLineCol8(mr.end)
if endLine == startLine {
fmt.Fprintf(&s, "-%d", endCol8)
} else {
fmt.Fprintf(&s, "-%d:%d", endLine, endCol8)
}
}
return s.String()
}
// LocationTextDocumentPositionParams converts its argument to its result.
func LocationTextDocumentPositionParams(loc Location) TextDocumentPositionParams {
return TextDocumentPositionParams{
TextDocument: TextDocumentIdentifier{URI: loc.URI},
Position: loc.Range.Start,
}
}