1
0
mirror of https://github.com/golang/go synced 2024-11-07 05:56:18 -07:00

strconv: add QuotedPrefix

QuotedPrefix is similar to Unquote, but returns the quoted string verbatim
and ignores any data after the quoted string.

Fixes #45033

Change-Id: I9f69fe9e3e45cbe9e63581cf1b457facb625045d
Reviewed-on: https://go-review.googlesource.com/c/go/+/314775
Trust: Joe Tsai <joetsai@digital-static.net>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
Joe Tsai 2021-04-28 17:39:46 -07:00 committed by Joe Tsai
parent 2422c5eae5
commit e3769299cd
4 changed files with 155 additions and 87 deletions

View File

@ -9,7 +9,7 @@ package strconv
import "internal/bytealg" import "internal/bytealg"
// contains reports whether the string contains the byte c. // index returns the index of the first instance of c in s, or -1 if missing.
func contains(s string, c byte) bool { func index(s string, c byte) int {
return bytealg.IndexByteString(s, c) != -1 return bytealg.IndexByteString(s, c)
} }

View File

@ -7,12 +7,12 @@
package strconv package strconv
// contains reports whether the string contains the byte c. // index returns the index of the first instance of c in s, or -1 if missing.
func contains(s string, c byte) bool { func index(s string, c byte) int {
for i := 0; i < len(s); i++ { for i := 0; i < len(s); i++ {
if s[i] == c { if s[i] == c {
return true return i
} }
} }
return false return -1
} }

View File

@ -15,6 +15,11 @@ const (
upperhex = "0123456789ABCDEF" upperhex = "0123456789ABCDEF"
) )
// contains reports whether the byte c occurs anywhere in s.
// It is a thin wrapper over index, which yields -1 when c is absent.
func contains(s string, c byte) bool {
	pos := index(s, c)
	return pos >= 0
}
func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
} }
@ -359,80 +364,132 @@ func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string,
return return
} }
// QuotedPrefix extracts the quoted string (in any form understood by Unquote)
// that begins s and returns it verbatim, i.e. with its surrounding quotes and
// any escape sequences left exactly as written. Data following the quoted
// string is ignored.
// If s does not start with a valid quoted string, QuotedPrefix returns an error.
func QuotedPrefix(s string) (string, error) {
	// unescape=false keeps the literal as written; the remaining suffix is discarded.
	prefix, _, err := unquote(s, false)
	return prefix, err
}
// Unquote interprets s as a single-quoted, double-quoted, // Unquote interprets s as a single-quoted, double-quoted,
// or backquoted Go string literal, returning the string value // or backquoted Go string literal, returning the string value
// that s quotes. (If s is single-quoted, it would be a Go // that s quotes. (If s is single-quoted, it would be a Go
// character literal; Unquote returns the corresponding // character literal; Unquote returns the corresponding
// one-character string.) // one-character string.)
func Unquote(s string) (string, error) { func Unquote(s string) (string, error) {
n := len(s) out, rem, err := unquote(s, true)
if n < 2 { if len(rem) > 0 {
return "", ErrSyntax return "", ErrSyntax
} }
quote := s[0] return out, err
if quote != s[n-1] { }
return "", ErrSyntax
}
s = s[1 : n-1]
if quote == '`' { // unquote parses a quoted string at the start of the input,
if contains(s, '`') { // returning the parsed prefix, the remaining suffix, and any parse errors.
return "", ErrSyntax // If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
func unquote(in string, unescape bool) (out, rem string, err error) {
// Determine the quote form and optimistically find the terminating quote.
if len(in) < 2 {
return "", in, ErrSyntax
} }
if contains(s, '\r') { quote := in[0]
// -1 because we know there is at least one \r to remove. end := index(in[1:], quote)
buf := make([]byte, 0, len(s)-1) if end < 0 {
for i := 0; i < len(s); i++ { return "", in, ErrSyntax
if s[i] != '\r' {
buf = append(buf, s[i])
}
}
return string(buf), nil
}
return s, nil
}
if quote != '"' && quote != '\'' {
return "", ErrSyntax
}
if contains(s, '\n') {
return "", ErrSyntax
} }
end += 2 // position after terminating quote; may be wrong if escape sequences are present
// Is it trivial? Avoid allocation. switch quote {
if !contains(s, '\\') && !contains(s, quote) { case '`':
switch {
case !unescape:
out = in[:end] // include quotes
case !contains(in[:end], '\r'):
out = in[len("`") : end-len("`")] // exclude quotes
default:
// Carriage return characters ('\r') inside raw string literals
// are discarded from the raw string value.
buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
for i := len("`"); i < end-len("`"); i++ {
if in[i] != '\r' {
buf = append(buf, in[i])
}
}
out = string(buf)
}
// NOTE: Prior implementations did not verify that raw strings consist
// of valid UTF-8 characters and we continue to not verify it as such.
// The Go specification does not explicitly require valid UTF-8,
// but only mentions that it is implicitly valid for Go source code
// (which must be valid UTF-8).
return out, in[end:], nil
case '"', '\'':
// Handle quoted strings without any escape sequences.
if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
var valid bool
switch quote { switch quote {
case '"': case '"':
if utf8.ValidString(s) { valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
return s, nil
}
case '\'': case '\'':
r, size := utf8.DecodeRuneInString(s) r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
if size == len(s) && (r != utf8.RuneError || size != 1) { valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
return s, nil
} }
if valid {
out = in[:end]
if unescape {
out = out[1 : end-1] // exclude quotes
}
return out, in[end:], nil
} }
} }
var runeTmp [utf8.UTFMax]byte // Handle quoted strings with escape sequences.
buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. var buf []byte
for len(s) > 0 { in0 := in
c, multibyte, ss, err := UnquoteChar(s, quote) in = in[1:] // skip starting quote
if err != nil { if unescape {
return "", err buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
} }
s = ss for len(in) > 0 && in[0] != quote {
if c < utf8.RuneSelf || !multibyte { // Process the next character,
buf = append(buf, byte(c)) // rejecting any unescaped newline characters which are invalid.
r, multibyte, rem, err := UnquoteChar(in, quote)
if in[0] == '\n' || err != nil {
return "", in0, ErrSyntax
}
in = rem
// Append the character if unescaping the input.
if unescape {
if r < utf8.RuneSelf || !multibyte {
buf = append(buf, byte(r))
} else { } else {
n := utf8.EncodeRune(runeTmp[:], c) var arr [utf8.UTFMax]byte
buf = append(buf, runeTmp[:n]...) n := utf8.EncodeRune(arr[:], r)
} buf = append(buf, arr[:n]...)
if quote == '\'' && len(s) != 0 {
// single-quoted must be single character
return "", ErrSyntax
} }
} }
return string(buf), nil
// Single quoted strings must be a single character.
if quote == '\'' {
break
}
}
// Verify that the string ends with a terminating quote.
if !(len(in) > 0 && in[0] == quote) {
return "", in0, ErrSyntax
}
in = in[1:] // skip terminating quote
if unescape {
return string(buf), in, nil
}
return in0[:len(in0)-len(in)], in, nil
default:
return "", in, ErrSyntax
}
} }
// bsearch16 returns the smallest i such that a[i] >= x. // bsearch16 returns the smallest i such that a[i] >= x.

View File

@ -6,6 +6,7 @@ package strconv_test
import ( import (
. "strconv" . "strconv"
"strings"
"testing" "testing"
"unicode" "unicode"
) )
@ -297,6 +298,7 @@ var misquoted = []string{
`"\z"`, `"\z"`,
"`", "`",
"`xxx", "`xxx",
"``x\r",
"`\"", "`\"",
`"\'"`, `"\'"`,
`'\"'`, `'\"'`,
@ -307,22 +309,13 @@ var misquoted = []string{
func TestUnquote(t *testing.T) { func TestUnquote(t *testing.T) {
for _, tt := range unquotetests { for _, tt := range unquotetests {
if out, err := Unquote(tt.in); err != nil || out != tt.out { testUnquote(t, tt.in, tt.out, nil)
t.Errorf("Unquote(%#q) = %q, %v want %q, nil", tt.in, out, err, tt.out)
} }
}
// run the quote tests too, backward
for _, tt := range quotetests { for _, tt := range quotetests {
if in, err := Unquote(tt.out); in != tt.in { testUnquote(t, tt.out, tt.in, nil)
t.Errorf("Unquote(%#q) = %q, %v, want %q, nil", tt.out, in, err, tt.in)
} }
}
for _, s := range misquoted { for _, s := range misquoted {
if out, err := Unquote(s); out != "" || err != ErrSyntax { testUnquote(t, s, "", ErrSyntax)
t.Errorf("Unquote(%#q) = %q, %v want %q, %v", s, out, err, "", ErrSyntax)
}
} }
} }
@ -333,26 +326,44 @@ func TestUnquoteInvalidUTF8(t *testing.T) {
// one of: // one of:
want string want string
wantErr string wantErr error
}{ }{
{in: `"foo"`, want: "foo"}, {in: `"foo"`, want: "foo"},
{in: `"foo`, wantErr: "invalid syntax"}, {in: `"foo`, wantErr: ErrSyntax},
{in: `"` + "\xc0" + `"`, want: "\xef\xbf\xbd"}, {in: `"` + "\xc0" + `"`, want: "\xef\xbf\xbd"},
{in: `"a` + "\xc0" + `"`, want: "a\xef\xbf\xbd"}, {in: `"a` + "\xc0" + `"`, want: "a\xef\xbf\xbd"},
{in: `"\t` + "\xc0" + `"`, want: "\t\xef\xbf\xbd"}, {in: `"\t` + "\xc0" + `"`, want: "\t\xef\xbf\xbd"},
} }
for i, tt := range tests { for _, tt := range tests {
got, err := Unquote(tt.in) testUnquote(t, tt.in, tt.want, tt.wantErr)
var gotErr string
if err != nil {
gotErr = err.Error()
} }
if gotErr != tt.wantErr { }
t.Errorf("%d. Unquote(%q) = err %v; want %q", i, tt.in, err, tt.wantErr)
func testUnquote(t *testing.T, in, want string, wantErr error) {
// Test Unquote.
got, gotErr := Unquote(in)
if got != want || gotErr != wantErr {
t.Errorf("Unquote(%q) = (%q, %v), want (%q, %v)", in, got, gotErr, want, wantErr)
} }
if tt.wantErr == "" && err == nil && got != tt.want {
t.Errorf("%d. Unquote(%q) = %02x; want %02x", i, tt.in, []byte(got), []byte(tt.want)) // Test QuotedPrefix.
// Adding an arbitrary suffix should not change the result of QuotedPrefix,
// assuming that the suffix doesn't accidentally terminate a truncated input.
if gotErr == nil {
want = in
} }
suffix := "\n\r\\\"`'" // special characters for quoted strings
if len(in) > 0 {
suffix = strings.ReplaceAll(suffix, in[:1], "")
}
in += suffix
got, gotErr = QuotedPrefix(in)
if gotErr == nil && wantErr != nil {
_, wantErr = Unquote(got) // original input had trailing junk, reparse with only valid prefix
want = got
}
if got != want || gotErr != wantErr {
t.Errorf("QuotedPrefix(%q) = (%q, %v), want (%q, %v)", in, got, gotErr, want, wantErr)
} }
} }