mirror of
https://github.com/golang/go
synced 2024-11-18 13:04:46 -07:00
cmd/digraph: support Go-style double-quotes in input data.
+ test. LGTM=sameer R=sameer CC=golang-codereviews, gri https://golang.org/cl/170090043
This commit is contained in:
parent
fb44a24d4c
commit
c097262a24
@ -16,19 +16,26 @@ package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"strconv"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
const Usage = `digraph: queries over directed graphs in text form.
|
||||
|
||||
Graph format:
|
||||
|
||||
Each line contains zero or more whitespace-separated fields.
|
||||
Each line contains zero or more words. Words are separated by
|
||||
unquoted whitespace; words may contain Go-style double-quoted portions,
|
||||
allowing spaces and other characters to be expressed.
|
||||
|
||||
Each field declares a node, and if there are more than one,
|
||||
an edge from the first to each subsequent one.
|
||||
The graph is provided on the standard input.
|
||||
@ -38,7 +45,7 @@ Graph format:
|
||||
|
||||
% cat clothes.txt
|
||||
socks shoes
|
||||
shorts pants
|
||||
"boxer shorts" pants
|
||||
pants belt shoes
|
||||
shirt tie sweater
|
||||
sweater jacket
|
||||
@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset {
|
||||
func parse(rd io.Reader) (graph, error) {
|
||||
g := make(graph)
|
||||
|
||||
var linenum int
|
||||
in := bufio.NewScanner(rd)
|
||||
for in.Scan() {
|
||||
words := strings.Fields(in.Text())
|
||||
linenum++
|
||||
// Split into words, honoring double-quotes per Go spec.
|
||||
words, err := split(in.Text())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("at line %d: %v", linenum, err)
|
||||
}
|
||||
if len(words) > 0 {
|
||||
g.addEdges(words[0], words[1:]...)
|
||||
}
|
||||
@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// -- Utilities --------------------------------------------------------
|
||||
|
||||
// split splits a line into words, which are generally separated by
|
||||
// spaces, but Go-style double-quoted string literals are also supported.
|
||||
// (This approximates the behaviour of the Bourne shell.)
|
||||
//
|
||||
// `one "two three"` -> ["one" "two three"]
|
||||
// `a"\n"b` -> ["a\nb"]
|
||||
//
|
||||
func split(line string) ([]string, error) {
|
||||
var (
|
||||
words []string
|
||||
inWord bool
|
||||
current bytes.Buffer
|
||||
)
|
||||
|
||||
for len(line) > 0 {
|
||||
r, size := utf8.DecodeRuneInString(line)
|
||||
if unicode.IsSpace(r) {
|
||||
if inWord {
|
||||
words = append(words, current.String())
|
||||
current.Reset()
|
||||
inWord = false
|
||||
}
|
||||
} else if r == '"' {
|
||||
var ok bool
|
||||
size, ok = quotedLength(line)
|
||||
if !ok {
|
||||
return nil, errors.New("invalid quotation")
|
||||
}
|
||||
s, err := strconv.Unquote(line[:size])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
current.WriteString(s)
|
||||
inWord = true
|
||||
} else {
|
||||
current.WriteRune(r)
|
||||
inWord = true
|
||||
}
|
||||
line = line[size:]
|
||||
}
|
||||
if inWord {
|
||||
words = append(words, current.String())
|
||||
}
|
||||
return words, nil
|
||||
}
|
||||
|
||||
// quotedLength reports the length in bytes of the prefix of input that
// contains a possibly-valid double-quoted Go string literal.
//
// On success, ok is true and n is at least two (the literal "");
// input[:n] may be handed to strconv.Unquote to obtain its value, and
// input[n:] is the remainder of the input.
//
// On failure, quotedLength returns (0, false); the entire input can be
// passed to strconv.Unquote if an informative error message is desired.
//
// Only the overall shape of the literal is examined. Errors such as
// invalid hex or octal digits in an escape sequence are deliberately
// not detected, on the assumption that strconv.Unquote will be applied
// to the prefix. The sole guarantee is that if some prefix of input is
// a valid string literal, the length of that prefix is returned.
//
// TODO(adonovan): move this into a strconv-like utility package.
func quotedLength(input string) (n int, ok bool) {
	pos := 0

	// read decodes the rune at pos and steps past it. At end of input
	// it leaves pos untouched and yields -1.
	read := func() rune {
		if pos >= len(input) {
			return -1
		}
		r, width := utf8.DecodeRuneInString(input[pos:])
		pos += width
		return r
	}

	// The literal must open with a double-quote.
	if read() != '"' {
		return 0, false
	}

	for {
		switch r := read(); {
		case r < 0, r == '\n':
			// End of input or a raw newline: literal not terminated.
			return 0, false

		case r == '"':
			// Closing quote: pos is just past it.
			return pos, true

		case r == '\\':
			// Work out how many runes follow the escape character
			// itself, then step over them without inspecting them.
			extra := 0
			switch read() {
			case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
				// Single-character escape; nothing more to consume.
			case '0', '1', '2', '3', '4', '5', '6', '7', 'x':
				// Octal (two further digits) or hex (two digits).
				extra = 2
			case 'u':
				extra = 4
			case 'U':
				extra = 8
			default:
				// Unknown escape character.
				return 0, false
			}
			for ; extra > 0; extra-- {
				read()
			}
		}
	}
}
|
||||
|
@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
@ -60,3 +61,61 @@ d c
|
||||
// - test somepath (it's nondeterministic).
|
||||
// - test errors
|
||||
}
|
||||
|
||||
func TestSplit(t *testing.T) {
|
||||
for _, test := range []struct {
|
||||
line string
|
||||
want []string
|
||||
}{
|
||||
{`one "2a 2b" three`, []string{"one", "2a 2b", "three"}},
|
||||
{`one tw"\n\x0a\u000a\012"o three`, []string{"one", "tw\n\n\n\no", "three"}},
|
||||
} {
|
||||
got, err := split(test.line)
|
||||
if err != nil {
|
||||
t.Errorf("split(%s) failed: %v", test.line, err)
|
||||
}
|
||||
if !reflect.DeepEqual(got, test.want) {
|
||||
t.Errorf("split(%s) = %v, want %v", test.line, got, test.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestQuotedLength(t *testing.T) {
|
||||
for _, test := range []struct {
|
||||
input string
|
||||
want int
|
||||
}{
|
||||
{`"abc"`, 5},
|
||||
{`"abc"def`, 5},
|
||||
{`"abc\"d"ef`, 8}, // "abc\"d" is consumed, ef is residue
|
||||
{`"\012\n\x0a\u000a\U0000000a"`, 28},
|
||||
{"\"\xff\"", 3}, // bad UTF-8 is ok
|
||||
{`"\xff"`, 6}, // hex escape for bad UTF-8 is ok
|
||||
} {
|
||||
got, ok := quotedLength(test.input)
|
||||
if !ok {
|
||||
got = 0
|
||||
}
|
||||
if got != test.want {
|
||||
t.Errorf("quotedLength(%s) = %d, want %d", test.input, got, test.want)
|
||||
}
|
||||
}
|
||||
|
||||
// errors
|
||||
for _, input := range []string{
|
||||
``, // not a quotation
|
||||
`a`, // not a quotation
|
||||
`'a'`, // not a quotation
|
||||
`"a`, // not terminated
|
||||
`"\0"`, // short octal escape
|
||||
`"\x1"`, // short hex escape
|
||||
`"\u000"`, // short \u escape
|
||||
`"\U0000000"`, // short \U escape
|
||||
`"\k"`, // invalid escape
|
||||
"\"ab\nc\"", // newline
|
||||
} {
|
||||
if n, ok := quotedLength(input); ok {
|
||||
t.Errorf("quotedLength(%s) = %d, want !ok", input, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user