1
0
mirror of https://github.com/golang/go synced 2024-09-30 16:28:32 -06:00

cmd/digraph: support Go-style double-quotes in input data.

+ test.

LGTM=sameer
R=sameer
CC=golang-codereviews, gri
https://golang.org/cl/170090043
This commit is contained in:
Alan Donovan 2014-11-12 13:37:06 -05:00
parent fb44a24d4c
commit c097262a24
2 changed files with 192 additions and 4 deletions

View File

@ -16,19 +16,26 @@ package main
import (
"bufio"
"bytes"
"errors"
"flag"
"fmt"
"io"
"os"
"sort"
"strings"
"strconv"
"unicode"
"unicode/utf8"
)
const Usage = `digraph: queries over directed graphs in text form.
Graph format:
Each line contains zero or more whitespace-separated fields.
Each line contains zero or more words. Words are separated by
unquoted whitespace; words may contain Go-style double-quoted portions,
allowing spaces and other characters to be expressed.
Each field declares a node, and if there are more than one,
an edge from the first to each subsequent one.
The graph is provided on the standard input.
@ -38,7 +45,7 @@ Graph format:
% cat clothes.txt
socks shoes
shorts pants
"boxer shorts" pants
pants belt shoes
shirt tie sweater
sweater jacket
@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset {
func parse(rd io.Reader) (graph, error) {
g := make(graph)
var linenum int
in := bufio.NewScanner(rd)
for in.Scan() {
words := strings.Fields(in.Text())
linenum++
// Split into words, honoring double-quotes per Go spec.
words, err := split(in.Text())
if err != nil {
return nil, fmt.Errorf("at line %d: %v", linenum, err)
}
if len(words) > 0 {
g.addEdges(words[0], words[1:]...)
}
@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error {
return nil
}
// -- Utilities --------------------------------------------------------
// split splits a line into words, which are generally separated by
// spaces, but Go-style double-quoted string literals are also supported.
// (This approximates the behaviour of the Bourne shell.)
//
// `one "two three"` -> ["one" "two three"]
// `a"\n"b` -> ["a\nb"]
//
func split(line string) ([]string, error) {
var (
words []string
inWord bool
current bytes.Buffer
)
for len(line) > 0 {
r, size := utf8.DecodeRuneInString(line)
if unicode.IsSpace(r) {
if inWord {
words = append(words, current.String())
current.Reset()
inWord = false
}
} else if r == '"' {
var ok bool
size, ok = quotedLength(line)
if !ok {
return nil, errors.New("invalid quotation")
}
s, err := strconv.Unquote(line[:size])
if err != nil {
return nil, err
}
current.WriteString(s)
inWord = true
} else {
current.WriteRune(r)
inWord = true
}
line = line[size:]
}
if inWord {
words = append(words, current.String())
}
return words, nil
}
// quotedLength returns the length in bytes of the prefix of input that
// contain a possibly-valid double-quoted Go string literal.
//
// On success, n is at least two (""); input[:n] may be passed to
// strconv.Unquote to interpret its value, and input[n:] contains the
// rest of the input.
//
// On failure, quotedLength returns false, and the entire input can be
// passed to strconv.Unquote if an informative error message is desired.
//
// quotedLength does not and need not detect all errors, such as
// invalid hex or octal escape sequences, since it assumes
// strconv.Unquote will be applied to the prefix. It guarantees only
// that if there is a prefix of input containing a valid string literal,
// its length is returned.
//
// TODO(adonovan): move this into a strconv-like utility package.
//
func quotedLength(input string) (n int, ok bool) {
var offset int
// next returns the rune at offset, or -1 on EOF.
// offset advances to just after that rune.
next := func() rune {
if offset < len(input) {
r, size := utf8.DecodeRuneInString(input[offset:])
offset += size
return r
}
return -1
}
if next() != '"' {
return // error: not a quotation
}
for {
r := next()
if r == '\n' || r < 0 {
return // error: string literal not terminated
}
if r == '"' {
return offset, true // success
}
if r == '\\' {
var skip int
switch next() {
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
skip = 0
case '0', '1', '2', '3', '4', '5', '6', '7':
skip = 2
case 'x':
skip = 2
case 'u':
skip = 4
case 'U':
skip = 8
default:
return // error: invalid escape
}
for i := 0; i < skip; i++ {
next()
}
}
}
}

View File

@ -3,6 +3,7 @@ package main
import (
"bytes"
"fmt"
"reflect"
"strings"
"testing"
)
@ -60,3 +61,61 @@ d c
// - test somepath (it's nondeterministic).
// - test errors
}
func TestSplit(t *testing.T) {
for _, test := range []struct {
line string
want []string
}{
{`one "2a 2b" three`, []string{"one", "2a 2b", "three"}},
{`one tw"\n\x0a\u000a\012"o three`, []string{"one", "tw\n\n\n\no", "three"}},
} {
got, err := split(test.line)
if err != nil {
t.Errorf("split(%s) failed: %v", test.line, err)
}
if !reflect.DeepEqual(got, test.want) {
t.Errorf("split(%s) = %v, want %v", test.line, got, test.want)
}
}
}
func TestQuotedLength(t *testing.T) {
for _, test := range []struct {
input string
want int
}{
{`"abc"`, 5},
{`"abc"def`, 5},
{`"abc\"d"ef`, 8}, // "abc\"d" is consumed, ef is residue
{`"\012\n\x0a\u000a\U0000000a"`, 28},
{"\"\xff\"", 3}, // bad UTF-8 is ok
{`"\xff"`, 6}, // hex escape for bad UTF-8 is ok
} {
got, ok := quotedLength(test.input)
if !ok {
got = 0
}
if got != test.want {
t.Errorf("quotedLength(%s) = %d, want %d", test.input, got, test.want)
}
}
// errors
for _, input := range []string{
``, // not a quotation
`a`, // not a quotation
`'a'`, // not a quotation
`"a`, // not terminated
`"\0"`, // short octal escape
`"\x1"`, // short hex escape
`"\u000"`, // short \u escape
`"\U0000000"`, // short \U escape
`"\k"`, // invalid escape
"\"ab\nc\"", // newline
} {
if n, ok := quotedLength(input); ok {
t.Errorf("quotedLength(%s) = %d, want !ok", input, n)
}
}
}