diff --git a/cmd/digraph/digraph.go b/cmd/digraph/digraph.go index a5c623b58c..76d39ec063 100644 --- a/cmd/digraph/digraph.go +++ b/cmd/digraph/digraph.go @@ -16,19 +16,26 @@ package main import ( "bufio" + "bytes" + "errors" "flag" "fmt" "io" "os" "sort" - "strings" + "strconv" + "unicode" + "unicode/utf8" ) const Usage = `digraph: queries over directed graphs in text form. Graph format: - Each line contains zero or more whitespace-separated fields. + Each line contains zero or more words. Words are separated by + unquoted whitespace; words may contain Go-style double-quoted portions, + allowing spaces and other characters to be expressed. + Each field declares a node, and if there are more than one, an edge from the first to each subsequent one. The graph is provided on the standard input. @@ -38,7 +45,7 @@ Graph format: % cat clothes.txt socks shoes - shorts pants + "boxer shorts" pants pants belt shoes shirt tie sweater sweater jacket @@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset { func parse(rd io.Reader) (graph, error) { g := make(graph) + var linenum int in := bufio.NewScanner(rd) for in.Scan() { - words := strings.Fields(in.Text()) + linenum++ + // Split into words, honoring double-quotes per Go spec. + words, err := split(in.Text()) + if err != nil { + return nil, fmt.Errorf("at line %d: %v", linenum, err) + } if len(words) > 0 { g.addEdges(words[0], words[1:]...) } @@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error { return nil } + +// -- Utilities -------------------------------------------------------- + +// split splits a line into words, which are generally separated by +// spaces, but Go-style double-quoted string literals are also supported. +// (This approximates the behaviour of the Bourne shell.) +// +// `one "two three"` -> ["one" "two three"] +// `a"\n"b` -> ["a\nb"] +// +func split(line string) ([]string, error) { + var ( + words []string + inWord bool + current bytes.Buffer + ) + + for len(line) > 0 { + r, size := utf8.DecodeRuneInString(line) + if unicode.IsSpace(r) { + if inWord { + words = append(words, current.String()) + current.Reset() + inWord = false + } + } else if r == '"' { + var ok bool + size, ok = quotedLength(line) + if !ok { + return nil, errors.New("invalid quotation") + } + s, err := strconv.Unquote(line[:size]) + if err != nil { + return nil, err + } + current.WriteString(s) + inWord = true + } else { + current.WriteRune(r) + inWord = true + } + line = line[size:] + } + if inWord { + words = append(words, current.String()) + } + return words, nil +} + +// quotedLength returns the length in bytes of the prefix of input that +// contain a possibly-valid double-quoted Go string literal. +// +// On success, n is at least two (""); input[:n] may be passed to +// strconv.Unquote to interpret its value, and input[n:] contains the +// rest of the input. +// +// On failure, quotedLength returns false, and the entire input can be +// passed to strconv.Unquote if an informative error message is desired. +// +// quotedLength does not and need not detect all errors, such as +// invalid hex or octal escape sequences, since it assumes +// strconv.Unquote will be applied to the prefix. It guarantees only +// that if there is a prefix of input containing a valid string literal, +// its length is returned. +// +// TODO(adonovan): move this into a strconv-like utility package. +// +func quotedLength(input string) (n int, ok bool) { + var offset int + + // next returns the rune at offset, or -1 on EOF. + // offset advances to just after that rune. + next := func() rune { + if offset < len(input) { + r, size := utf8.DecodeRuneInString(input[offset:]) + offset += size + return r + } + return -1 + } + + if next() != '"' { + return // error: not a quotation + } + + for { + r := next() + if r == '\n' || r < 0 { + return // error: string literal not terminated + } + if r == '"' { + return offset, true // success + } + if r == '\\' { + var skip int + switch next() { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"': + skip = 0 + case '0', '1', '2', '3', '4', '5', '6', '7': + skip = 2 + case 'x': + skip = 2 + case 'u': + skip = 4 + case 'U': + skip = 8 + default: + return // error: invalid escape + } + + for i := 0; i < skip; i++ { + next() + } + } + } +} diff --git a/cmd/digraph/digraph_test.go b/cmd/digraph/digraph_test.go index 28403d3ebc..0c90304939 100644 --- a/cmd/digraph/digraph_test.go +++ b/cmd/digraph/digraph_test.go @@ -3,6 +3,7 @@ package main import ( "bytes" "fmt" + "reflect" "strings" "testing" ) @@ -60,3 +61,61 @@ d c // - test somepath (it's nondeterministic). // - test errors } + +func TestSplit(t *testing.T) { + for _, test := range []struct { + line string + want []string + }{ + {`one "2a 2b" three`, []string{"one", "2a 2b", "three"}}, + {`one tw"\n\x0a\u000a\012"o three`, []string{"one", "tw\n\n\n\no", "three"}}, + } { + got, err := split(test.line) + if err != nil { + t.Errorf("split(%s) failed: %v", test.line, err) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("split(%s) = %v, want %v", test.line, got, test.want) + } + } +} + +func TestQuotedLength(t *testing.T) { + for _, test := range []struct { + input string + want int + }{ + {`"abc"`, 5}, + {`"abc"def`, 5}, + {`"abc\"d"ef`, 8}, // "abc\"d" is consumed, ef is residue + {`"\012\n\x0a\u000a\U0000000a"`, 28}, + {"\"\xff\"", 3}, // bad UTF-8 is ok + {`"\xff"`, 6}, // hex escape for bad UTF-8 is ok + } { + got, ok := quotedLength(test.input) + if !ok { + got = 0 + } + if got != test.want { + t.Errorf("quotedLength(%s) = %d, want %d", test.input, got, test.want) + } + } + + // errors + for _, input := range []string{ + ``, // not a quotation + `a`, // not a quotation + `'a'`, // not a quotation + `"a`, // not terminated + `"\0"`, // short octal escape + `"\x1"`, // short hex escape + `"\u000"`, // short \u escape + `"\U0000000"`, // short \U escape + `"\k"`, // invalid escape + "\"ab\nc\"", // newline + } { + if n, ok := quotedLength(input); ok { + t.Errorf("quotedLength(%s) = %d, want !ok", input, n) + } + } +}