From 5a75ac88c98c7bfeb907b655bc10f3ee35a4f82b Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Mon, 26 Oct 2009 18:32:51 -0700 Subject: [PATCH] - indexing component for godoc R=rsc http://go/go-review/1015014 --- src/cmd/godoc/index.go | 664 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 664 insertions(+) create mode 100644 src/cmd/godoc/index.go diff --git a/src/cmd/godoc/index.go b/src/cmd/godoc/index.go new file mode 100644 index 00000000000..1cd6c5ca5db --- /dev/null +++ b/src/cmd/godoc/index.go @@ -0,0 +1,664 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains the infrastructure to create an +// (identifier) index for a set of Go files. +// +// Basic indexing algorithm: +// - traverse all .go files of the file tree specified by root +// - for each word (identifier) encountered, collect all occurences (spots) +// into a list; this produces a list of spots for each word +// - reduce the lists: from a list of spots to a list of FileRuns, +// and from a list of FileRuns into a list of PakRuns +// - make a HitList from the PakRuns +// +// Details: +// - keep two lists per word: one containing package-level declarations +// that have snippets, and one containing all other spots +// - keep the snippets in a separate table indexed by snippet index +// and store the snippet index in place of the line number in a SpotInfo +// (the line number for spots with snippets is stored in the snippet) +// - at the end, create lists of alternative spellings for a given +// word + +package main + +import ( + "container/vector"; + "go/ast"; + "go/parser"; + "go/token"; + "os"; + pathutil "path"; + "sort"; + "strings"; +) + + +// ---------------------------------------------------------------------------- +// Data structures used during indexing + +// A RunList is a vector of entries that can be sorted according to some +// criteria. A RunList may be compressed by grouping "runs" of entries +// which are equal (according to the sort critera) into a new RunList of +// runs. For instance, a RunList containing pairs (x, y) may be compressed +// into a RunList containing pair runs (x, {y}) where each run consists of +// a list of y's with the same x. +type RunList struct { + vector.Vector; + less func(x, y interface{}) bool; +} + +func (h *RunList) Less(i, j int) bool { + return h.less(h.At(i), h.At(j)); +} + + +func (h *RunList) sort(less func(x, y interface{}) bool) { + h.less = less; + sort.Sort(h); +} + + +// Compress entries which are the same according to a sort criteria +// (specified by less) into "runs". +func (h *RunList) reduce(less func(x, y interface{}) bool, newRun func(h *RunList, i, j int) interface{}) *RunList { + // create runs of entries with equal values + h.sort(less); + + // for each run, make a new run object and collect them in a new RunList + var hh RunList; + i := 0; + for j := 0; j < h.Len(); j++ { + if less(h.At(i), h.At(j)) { + hh.Push(newRun(h, i, j)); + i = j; // start a new run + } + } + // add final run, if any + if i < h.Len() { + hh.Push(newRun(h, i, h.Len())); + } + + return &hh; +} + + +// A SpotInfo value describes a particular identifier spot in a given file; +// It encodes three values: the SpotKind (declaration or use), a line or +// snippet index "lori", and whether it's a line or index. +// +// The following encoding is used: +// +// bits 32 4 1 0 +// value [lori|kind|isIndex] +// +type SpotInfo uint32 + +// SpotKind describes whether an identifier is declared (and what kind of +// declaration) or used. +type SpotKind uint32 + +const ( + ImportDecl SpotKind = iota; + ConstDecl; + TypeDecl; + VarDecl; + FuncDecl; + MethodDecl; + Use; + nKinds; +) + + +// makeSpotInfo makes a SpotInfo. +func makeSpotInfo(kind SpotKind, lori int, isIndex bool) SpotInfo { + // encode lori: bits [4..32) + x := SpotInfo(lori)<<4; + if int(x>>4) != lori { + // lori value doesn't fit - since snippet indices are + // most certainly always smaller then 1<<28, this can + // only happen for line numbers; give it no line number (= 0) + x = 0; + } + // encode kind: bits [1..4) + x |= SpotInfo(kind)<<1; + // encode isIndex: bit 0 + if isIndex { + x |= 1; + } + return x; +} + + +func (x SpotInfo) less(y SpotInfo) bool { + return x.Lori() < y.Lori(); +} + + +func (x SpotInfo) Kind() SpotKind { + return SpotKind(x>>1&7); +} + + +func (x SpotInfo) Lori() int { + return int(x>>4); +} + + +func (x SpotInfo) IsIndex() bool { + return x&1 != 0; +} + + +// A Pak describes a Go package. +type Pak struct { + Path string; // directory name containing the package + Name string; // package name as declared by package clause +} + + +func (p *Pak) less(q *Pak) bool { + return p.Path < q.Path || p.Name < q.Name; +} + + +// A File describes a Go file. +type File struct { + Path string; // complete file name + Pak Pak; // the package to which the file belongs +} + + +func (f *File) less(g *File) bool { + return f.Path < g.Path; +} + + +// A Spot describes a single occurence of a word. +type Spot struct { + File *File; + Info SpotInfo; +} + + +// Spots are sorted by filename. +func lessSpot(x, y interface{}) bool { + return x.(Spot).File.less(y.(Spot).File); +} + + +// A FileRun describes a run of Spots of a word in a single file. +type FileRun struct { + File *File; + Infos []SpotInfo; +} + + +// newFileRun allocates a new *FileRun from the Spot run [i, j) in h. +func newFileRun(h *RunList, i, j int) interface{} { + file := h.At(i).(Spot).File; + lines := make([]SpotInfo, j-i); + prev := 0; + k := 0; + for ; i < j; i++ { + info := h.At(i).(Spot).Info; + // ignore line duplicates + // (if lori is a snippet index it is unique - no need to check IsIndex()) + lori := info.Lori(); + if lori != prev { + lines[k] = info; + prev = lori; + k++; + } + } + return &FileRun{file, lines[0:k]}; +} + + +// FileRuns are sorted by package. +func lessFileRun(x, y interface{}) bool { + return x.(*FileRun).File.Pak.less(&y.(*FileRun).File.Pak); +} + + +// A PakRun describes a run of *FileRuns of a package. +type PakRun struct { + Pak Pak; + Files []*FileRun; +} + +// Sorting support for files within a PakRun. +func (p *PakRun) Len() int { + return len(p.Files); +} +func (p *PakRun) Less(i, j int) bool { + return p.Files[i].File.less(p.Files[j].File); +} +func (p *PakRun) Swap(i, j int) { + p.Files[i], p.Files[j] = p.Files[j], p.Files[i]; +} + + +// newPakRun allocates a new *PakRun from the *FileRun run [i, j) in h. +func newPakRun(h *RunList, i, j int) interface{} { + pak := h.At(i).(*FileRun).File.Pak; + files := make([]*FileRun, j-i); + k := 0; + for ; i < j; i++ { + files[k] = h.At(i).(*FileRun); + k++; + } + run := &PakRun{pak, files}; + sort.Sort(run); // files were sorted by package; sort them by file now + return run; +} + + +// PakRuns are sorted by package. +func lessPakRun(x, y interface{}) bool { + return x.(*PakRun).Pak.less(&y.(*PakRun).Pak); +} + + +// A HitList describes a list of PakRuns. +type HitList []*PakRun + + +func reduce(h0 *RunList) HitList { + // reduce a list of Spots into a list of FileRuns + h1 := h0.reduce(lessSpot, newFileRun); + // reduce a list of FileRuns into a list of PakRuns + h2 := h1.reduce(lessFileRun, newPakRun); + // sort the list of PakRuns by package + h2.sort(lessPakRun); + // create a HitList + h := make(HitList, h2.Len()); + for i := 0; i < h2.Len(); i++ { + h[i] = h2.At(i).(*PakRun); + } + return h; +} + + +func (h HitList) filter(pakname string) HitList { + // determine number of matching packages (most of the time just one) + n := 0; + for _, p := range h { + if p.Pak.Name == pakname { + n++; + } + } + // create filtered HitList + hh := make(HitList, n); + i := 0; + for _, p := range h { + if p.Pak.Name == pakname { + hh[i] = p; + i++; + } + } + return hh; +} + + +type wordPair struct { + canon string; // canonical word spelling (all lowercase) + alt string; // alternative spelling +} + + +// An AltWords describes a list of alternative spellings for a +// canonical (all lowercase) spelling of a word. +type AltWords struct { + Canon string; // canonical word spelling (all lowercase) + Alts []string; // alternative spelling for the same word +} + + +func lessWordPair(x, y interface{}) bool { + return x.(*wordPair).canon < y.(*wordPair).canon; +} + + +// newAltWords allocates a new *AltWords from the *wordPair run [i, j) in h. +func newAltWords(h *RunList, i, j int) interface{} { + canon := h.At(i).(*wordPair).canon; + alts := make([]string, j-i); + k := 0; + for ; i < j; i++ { + alts[k] = h.At(i).(*wordPair).alt; + k++; + } + return &AltWords{canon, alts}; +} + + +func (a *AltWords) filter(s string) *AltWords { + if len(a.Alts) == 1 && a.Alts[0] == s { + // there are no different alternatives + return nil; + } + + // make a new AltWords with the current spelling removed + alts := make([]string, len(a.Alts)); + i := 0; + for _, w := range a.Alts { + if w != s { + alts[i] = w; + i++; + } + } + return &AltWords{a.Canon, alts[0:i]}; +} + + +// ---------------------------------------------------------------------------- +// Indexer + +type IndexResult struct { + Decls RunList; // package-level declarations (with snippets) + Others RunList; // all other occurences +} + + +// An Indexer maintains the data structures and provides the machinery +// for indexing .go files under a file tree. It implements the path.Visitor +// interface for walking file trees, and the ast.Visitor interface for +// walking Go ASTs. +type Indexer struct { + words map[string]*IndexResult; // RunLists of Spots + snippets vector.Vector; // vector of *Snippets, indexed by snippet indices + file *File; // current file + decl ast.Decl; // current decl + nspots int; // number of spots encountered +} + + +func (x *Indexer) addSnippet(s *Snippet) int { + index := x.snippets.Len(); + x.snippets.Push(s); + return index; +} + + +func (x *Indexer) visitComment(c *ast.CommentGroup) { + if c != nil { + ast.Walk(x, c); + } +} + + +func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) { + if id != nil { + lists, found := x.words[id.Value]; + if !found { + lists = new(IndexResult); + x.words[id.Value] = lists; + } + + if kind == Use || x.decl == nil { + // not a declaration or no snippet required + info := makeSpotInfo(kind, id.Pos().Line, false); + lists.Others.Push(Spot{x.file, info}); + } else { + // a declaration with snippet + index := x.addSnippet(NewSnippet(x.decl, id)); + info := makeSpotInfo(kind, index, true); + lists.Decls.Push(Spot{x.file, info}); + } + + x.nspots++; + } +} + + +func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) { + switch n := spec.(type) { + case *ast.ImportSpec: + x.visitComment(n.Doc); + x.visitIdent(ImportDecl, n.Name); + for _, s := range n.Path { + ast.Walk(x, s); + } + x.visitComment(n.Comment); + + case *ast.ValueSpec: + x.visitComment(n.Doc); + kind := ConstDecl; + if isVarDecl { + kind = VarDecl; + } + for _, n := range n.Names { + x.visitIdent(kind, n); + } + ast.Walk(x, n.Type); + for _, v := range n.Values { + ast.Walk(x, v); + } + x.visitComment(n.Comment); + + case *ast.TypeSpec: + x.visitComment(n.Doc); + x.visitIdent(TypeDecl, n.Name); + ast.Walk(x, n.Type); + x.visitComment(n.Comment); + } +} + + +func (x *Indexer) Visit(node interface{}) bool { + // TODO(gri): methods in interface types are categorized as VarDecl + switch n := node.(type) { + case *ast.Ident: + x.visitIdent(Use, n); + + case *ast.Field: + x.decl = nil; // no snippets for fields + x.visitComment(n.Doc); + for _, m := range n.Names { + x.visitIdent(VarDecl, m); + } + ast.Walk(x, n.Type); + for _, s := range n.Tag { + ast.Walk(x, s); + } + x.visitComment(n.Comment); + + case *ast.DeclStmt: + if decl, ok := n.Decl.(*ast.GenDecl); ok { + // local declarations can only be *ast.GenDecls + x.decl = nil; // no snippets for local declarations + x.visitComment(decl.Doc); + for _, s := range decl.Specs { + x.visitSpec(s, decl.Tok == token.VAR); + } + } else { + // handle error case gracefully + ast.Walk(x, n.Decl); + } + + case *ast.GenDecl: + x.decl = n; + x.visitComment(n.Doc); + for _, s := range n.Specs { + x.visitSpec(s, n.Tok == token.VAR); + } + + case *ast.FuncDecl: + x.visitComment(n.Doc); + kind := FuncDecl; + if n.Recv != nil { + kind = MethodDecl; + ast.Walk(x, n.Recv); + } + x.decl = n; + x.visitIdent(kind, n.Name); + ast.Walk(x, n.Type); + if n.Body != nil { + ast.Walk(x, n.Type); + } + + default: + return true; + } + + return false; +} + + +func (x *Indexer) VisitDir(path string, d *os.Dir) bool { + return true; +} + + +func (x *Indexer) VisitFile(path string, d *os.Dir) { + if !isGoFile(d) { + return; + } + + file, err := parser.ParseFile(path, nil, parser.ParseComments); + if err != nil { + return; // ignore files with (parse) errors + } + + dir, _ := pathutil.Split(path); + pak := Pak{dir, file.Name.Value}; + x.file = &File{path, pak}; + ast.Walk(x, file); +} + + +// ---------------------------------------------------------------------------- +// Index + +type LookupResult struct { + Decls HitList; // package-level declarations (with snippets) + Others HitList; // all other occurences +} + + +type Index struct { + words map[string]*LookupResult; // maps words to hit lists + alts map[string]*AltWords; // maps canonical(words) to lists of alternative spellings + snippets []*Snippet; // all snippets, indexed by snippet index + nspots int; // number of spots indexed (a measure of the index size) +} + + +func canonical(w string) string { + return strings.ToLower(w); +} + + +// NewIndex creates a new index for the file tree rooted at root. +func NewIndex(root string) *Index { + var x Indexer; + + // initialize Indexer + x.words = make(map[string]*IndexResult); + + // collect all Spots + pathutil.Walk(root, &x, nil); + + // for each word, reduce the RunLists into a LookupResult; + // also collect the word with its canonical spelling in a + // word list for later computation of alternative spellings + words := make(map[string]*LookupResult); + var wlist RunList; + for w, h := range x.words { + decls := reduce(&h.Decls); + others := reduce(&h.Others); + words[w] = &LookupResult{ + Decls: decls, + Others: others, + }; + wlist.Push(&wordPair{canonical(w), w}); + } + + // reduce the word list {canonical(w), w} into + // a list of AltWords runs {canonical(w), {w}} + alist := wlist.reduce(lessWordPair, newAltWords); + + // convert alist into a map of alternative spellings + alts := make(map[string]*AltWords); + for i := 0; i < alist.Len(); i++ { + a := alist.At(i).(*AltWords); + alts[a.Canon] = a; + } + + // convert snippet vector into a list + snippets := make([]*Snippet, x.snippets.Len()); + for i := 0; i < x.snippets.Len(); i++ { + snippets[i] = x.snippets.At(i).(*Snippet); + } + + return &Index{words, alts, snippets, x.nspots}; +} + + +// Size returns the number of different words and +// spots indexed as a measure for the index size. +func (x *Index) Size() (nwords int, nspots int) { + return len(x.words), x.nspots; +} + + +func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) { + match, _ = x.words[w]; + alt, _ = x.alts[canonical(w)]; + // remove current spelling from alternatives + // (if there is no match, the alternatives do + // not contain the current spelling) + if match != nil && alt != nil { + alt = alt.filter(w); + } + return; +} + + +// For a given string s, which is either a single identifier or a qualified +// identifier, Lookup returns a LookupResult, and a list of alternative +// spellings, if any. +func (x *Index) Lookup(s string) (match *LookupResult, alt *AltWords) { + ss := strings.Split(s, ".", 0); + + switch len(ss) { + case 1: + match, alt = x.LookupWord(ss[0]); + + case 2: + pakname := ss[0]; + match, alt = x.LookupWord(ss[1]); + if match != nil { + // found a match - filter by package name + decls := match.Decls.filter(pakname); + others := match.Others.filter(pakname); + match = &LookupResult{decls, others}; + } + if alt != nil { + // alternative spellings found - add package name + // TODO(gri): At the moment this is not very smart + // and likely will produce suggestions that have + // no match. Should filter incorrect alternatives. + canon := pakname + "." + alt.Canon; // for completeness (currently not used) + alts := make([]string, len(alt.Alts)); + for i, a := range alt.Alts { + alts[i] = pakname+"."+a; + } + alt = &AltWords{canon, alts}; + } + } + + return; +} + + +func (x *Index) Snippet(i int) *Snippet { + // handle illegal snippet indices gracefully + if 0 <= i && i < len(x.snippets) { + return x.snippets[i]; + } + return nil; +}