1
0
mirror of https://github.com/golang/go synced 2024-11-07 03:36:12 -07:00

strconv: add QuotedPrefix

QuotedPrefix is similar to Unquote, but returns the quoted string verbatim
and ignores any data after the quoted string.

Fixes #45033

Change-Id: I9f69fe9e3e45cbe9e63581cf1b457facb625045d
Reviewed-on: https://go-review.googlesource.com/c/go/+/314775
Trust: Joe Tsai <joetsai@digital-static.net>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
Joe Tsai 2021-04-28 17:39:46 -07:00 committed by Joe Tsai
parent 2422c5eae5
commit e3769299cd
4 changed files with 155 additions and 87 deletions

View File

@ -9,7 +9,7 @@ package strconv
import "internal/bytealg"
// contains reports whether the string contains the byte c.
func contains(s string, c byte) bool {
return bytealg.IndexByteString(s, c) != -1
// index returns the index of the first instance of c in s, or -1 if missing.
func index(s string, c byte) int {
return bytealg.IndexByteString(s, c)
}

View File

@ -7,12 +7,12 @@
package strconv
// contains reports whether the string contains the byte c.
func contains(s string, c byte) bool {
// index returns the index of the first instance of c in s, or -1 if missing.
func index(s string, c byte) int {
for i := 0; i < len(s); i++ {
if s[i] == c {
return true
return i
}
}
return false
return -1
}

View File

@ -15,6 +15,11 @@ const (
upperhex = "0123456789ABCDEF"
)
// contains reports whether the string contains the byte c.
func contains(s string, c byte) bool {
return index(s, c) != -1
}
func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
}
@ -359,80 +364,132 @@ func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string,
return
}
// QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
// If s does not start with a valid quoted string, QuotedPrefix returns an error.
func QuotedPrefix(s string) (string, error) {
out, _, err := unquote(s, false)
return out, err
}
// Unquote interprets s as a single-quoted, double-quoted,
// or backquoted Go string literal, returning the string value
// that s quotes. (If s is single-quoted, it would be a Go
// character literal; Unquote returns the corresponding
// one-character string.)
func Unquote(s string) (string, error) {
n := len(s)
if n < 2 {
out, rem, err := unquote(s, true)
if len(rem) > 0 {
return "", ErrSyntax
}
quote := s[0]
if quote != s[n-1] {
return "", ErrSyntax
}
s = s[1 : n-1]
return out, err
}
if quote == '`' {
if contains(s, '`') {
return "", ErrSyntax
}
if contains(s, '\r') {
// -1 because we know there is at least one \r to remove.
buf := make([]byte, 0, len(s)-1)
for i := 0; i < len(s); i++ {
if s[i] != '\r' {
buf = append(buf, s[i])
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
func unquote(in string, unescape bool) (out, rem string, err error) {
// Determine the quote form and optimistically find the terminating quote.
if len(in) < 2 {
return "", in, ErrSyntax
}
quote := in[0]
end := index(in[1:], quote)
if end < 0 {
return "", in, ErrSyntax
}
end += 2 // position after terminating quote; may be wrong if escape sequences are present
switch quote {
case '`':
switch {
case !unescape:
out = in[:end] // include quotes
case !contains(in[:end], '\r'):
out = in[len("`") : end-len("`")] // exclude quotes
default:
// Carriage return characters ('\r') inside raw string literals
// are discarded from the raw string value.
buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
for i := len("`"); i < end-len("`"); i++ {
if in[i] != '\r' {
buf = append(buf, in[i])
}
}
return string(buf), nil
out = string(buf)
}
return s, nil
}
if quote != '"' && quote != '\'' {
return "", ErrSyntax
}
if contains(s, '\n') {
return "", ErrSyntax
}
// Is it trivial? Avoid allocation.
if !contains(s, '\\') && !contains(s, quote) {
switch quote {
case '"':
if utf8.ValidString(s) {
return s, nil
// NOTE: Prior implementations did not verify that raw strings consist
// of valid UTF-8 characters and we continue to not verify it as such.
// The Go specification does not explicitly require valid UTF-8,
// but only mention that it is implicitly valid for Go source code
// (which must be valid UTF-8).
return out, in[end:], nil
case '"', '\'':
// Handle quoted strings without any escape sequences.
if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
var valid bool
switch quote {
case '"':
valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
case '\'':
r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
}
case '\'':
r, size := utf8.DecodeRuneInString(s)
if size == len(s) && (r != utf8.RuneError || size != 1) {
return s, nil
if valid {
out = in[:end]
if unescape {
out = out[1 : end-1] // exclude quotes
}
return out, in[end:], nil
}
}
}
var runeTmp [utf8.UTFMax]byte
buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
for len(s) > 0 {
c, multibyte, ss, err := UnquoteChar(s, quote)
if err != nil {
return "", err
// Handle quoted strings with escape sequences.
var buf []byte
in0 := in
in = in[1:] // skip starting quote
if unescape {
buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
}
s = ss
if c < utf8.RuneSelf || !multibyte {
buf = append(buf, byte(c))
} else {
n := utf8.EncodeRune(runeTmp[:], c)
buf = append(buf, runeTmp[:n]...)
for len(in) > 0 && in[0] != quote {
// Process the next character,
// rejecting any unescaped newline characters which are invalid.
r, multibyte, rem, err := UnquoteChar(in, quote)
if in[0] == '\n' || err != nil {
return "", in0, ErrSyntax
}
in = rem
// Append the character if unescaping the input.
if unescape {
if r < utf8.RuneSelf || !multibyte {
buf = append(buf, byte(r))
} else {
var arr [utf8.UTFMax]byte
n := utf8.EncodeRune(arr[:], r)
buf = append(buf, arr[:n]...)
}
}
// Single quoted strings must be a single character.
if quote == '\'' {
break
}
}
if quote == '\'' && len(s) != 0 {
// single-quoted must be single character
return "", ErrSyntax
// Verify that the string ends with a terminating quote.
if !(len(in) > 0 && in[0] == quote) {
return "", in0, ErrSyntax
}
in = in[1:] // skip terminating quote
if unescape {
return string(buf), in, nil
}
return in0[:len(in0)-len(in)], in, nil
default:
return "", in, ErrSyntax
}
return string(buf), nil
}
// bsearch16 returns the smallest i such that a[i] >= x.

View File

@ -6,6 +6,7 @@ package strconv_test
import (
. "strconv"
"strings"
"testing"
"unicode"
)
@ -297,6 +298,7 @@ var misquoted = []string{
`"\z"`,
"`",
"`xxx",
"``x\r",
"`\"",
`"\'"`,
`'\"'`,
@ -307,22 +309,13 @@ var misquoted = []string{
func TestUnquote(t *testing.T) {
for _, tt := range unquotetests {
if out, err := Unquote(tt.in); err != nil || out != tt.out {
t.Errorf("Unquote(%#q) = %q, %v want %q, nil", tt.in, out, err, tt.out)
}
testUnquote(t, tt.in, tt.out, nil)
}
// run the quote tests too, backward
for _, tt := range quotetests {
if in, err := Unquote(tt.out); in != tt.in {
t.Errorf("Unquote(%#q) = %q, %v, want %q, nil", tt.out, in, err, tt.in)
}
testUnquote(t, tt.out, tt.in, nil)
}
for _, s := range misquoted {
if out, err := Unquote(s); out != "" || err != ErrSyntax {
t.Errorf("Unquote(%#q) = %q, %v want %q, %v", s, out, err, "", ErrSyntax)
}
testUnquote(t, s, "", ErrSyntax)
}
}
@ -333,26 +326,44 @@ func TestUnquoteInvalidUTF8(t *testing.T) {
// one of:
want string
wantErr string
wantErr error
}{
{in: `"foo"`, want: "foo"},
{in: `"foo`, wantErr: "invalid syntax"},
{in: `"foo`, wantErr: ErrSyntax},
{in: `"` + "\xc0" + `"`, want: "\xef\xbf\xbd"},
{in: `"a` + "\xc0" + `"`, want: "a\xef\xbf\xbd"},
{in: `"\t` + "\xc0" + `"`, want: "\t\xef\xbf\xbd"},
}
for i, tt := range tests {
got, err := Unquote(tt.in)
var gotErr string
if err != nil {
gotErr = err.Error()
}
if gotErr != tt.wantErr {
t.Errorf("%d. Unquote(%q) = err %v; want %q", i, tt.in, err, tt.wantErr)
}
if tt.wantErr == "" && err == nil && got != tt.want {
t.Errorf("%d. Unquote(%q) = %02x; want %02x", i, tt.in, []byte(got), []byte(tt.want))
}
for _, tt := range tests {
testUnquote(t, tt.in, tt.want, tt.wantErr)
}
}
func testUnquote(t *testing.T, in, want string, wantErr error) {
// Test Unquote.
got, gotErr := Unquote(in)
if got != want || gotErr != wantErr {
t.Errorf("Unquote(%q) = (%q, %v), want (%q, %v)", in, got, gotErr, want, wantErr)
}
// Test QuotedPrefix.
// Adding an arbitrary suffix should not change the result of QuotedPrefix
// assume that the suffix doesn't accidentally terminate a truncated input.
if gotErr == nil {
want = in
}
suffix := "\n\r\\\"`'" // special characters for quoted strings
if len(in) > 0 {
suffix = strings.ReplaceAll(suffix, in[:1], "")
}
in += suffix
got, gotErr = QuotedPrefix(in)
if gotErr == nil && wantErr != nil {
_, wantErr = Unquote(got) // original input had trailing junk, reparse with only valid prefix
want = got
}
if got != want || gotErr != wantErr {
t.Errorf("QuotedPrefix(%q) = (%q, %v), want (%q, %v)", in, got, gotErr, want, wantErr)
}
}