
http: add lexing functions

In particular, add a field-value tokenizer that respects quoting rules.
The code is intended for use in tokenizing the Transfer-Encoding and Trailer fields.
The lexing function is not yet connected to the main parsing code; that will happen in the next CL.

R=rsc
CC=golang-dev
https://golang.org/cl/190085
Petar Maymounkov 2010-01-28 15:14:54 -08:00 committed by Russ Cox
parent a0e6f03add
commit dd77c63d3d
3 changed files with 223 additions and 0 deletions
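
To make the quoting behaviour concrete, here is a small test-style sketch of what the new tokenizer is expected to produce for an invented Trailer-style value. It is illustrative only, not part of this CL, and it assumes placement in package http next to lex_test.go, since httpSplitFieldValue is unexported; the file name is hypothetical.

// lex_example_test.go (hypothetical file name, package http)
package http

import "testing"

func TestSplitFieldValueSketch(t *testing.T) {
    // An invented Trailer-style value: token, quoted string, token.
    eaten, parts := httpSplitFieldValue(`Content-MD5, "X-Custom", Expires`)
    want := []string{"Content-MD5", "X-Custom", "Expires"}
    if eaten != 32 || len(parts) != len(want) {
        t.Fatalf("eaten=%d parts=%v, want eaten=32 parts=%v", eaten, parts, want)
    }
    for i := range want {
        if parts[i] != want[i] {
            t.Errorf("part %d: have %q, want %q", i, parts[i], want[i])
        }
    }
}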

src/pkg/http/Makefile

@@ -9,6 +9,7 @@ GOFILES=\
    chunked.go\
    client.go\
    fs.go\
    lex.go\
    request.go\
    response.go\
    server.go\

src/pkg/http/lex.go (new file, 152 lines)

@@ -0,0 +1,152 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package http

import (
    "strings"
)

// This file deals with lexical matters of HTTP
func isSeparator(c byte) bool {
    switch c {
    case '(', ')', '<', '>', '@', ',', ';', ':', '\\', '"', '/', '[', ']', '?', '=', '{', '}', ' ', '\t':
        return true
    default:
        return false
    }
    panic()
}

func isSpace(c byte) bool {
    switch c {
    case ' ', '\t', '\r', '\n':
        return true
    default:
        return false
    }
    panic()
}

func isCtl(c byte) bool { return (0 <= c && c <= 31) || c == 127 }

func isChar(c byte) bool { return 0 <= c && c <= 127 }

func isAnyText(c byte) bool { return !isCtl(c) }

func isQdText(c byte) bool { return isAnyText(c) && c != '"' }

func isToken(c byte) bool { return isChar(c) && !isCtl(c) && !isSeparator(c) }
// Valid escaped sequences are not specified in RFC 2616, so for now, we assume
// that they coincide with the common-sense ones used by Go. Malformed
// characters should probably not be treated as errors by a robust (forgiving)
// parser, so we replace them with the '?' character.
func httpUnquotePair(b byte) byte {
    // skip the first byte, which should always be '\'
    switch b {
    case 'a':
        return '\a'
    case 'b':
        return '\b'
    case 'f':
        return '\f'
    case 'n':
        return '\n'
    case 'r':
        return '\r'
    case 't':
        return '\t'
    case 'v':
        return '\v'
    case '\\':
        return '\\'
    case '\'':
        return '\''
    case '"':
        return '"'
    }
    return '?'
}
// raw must begin with a valid quoted string. Only the first quoted string is
// parsed and is unquoted in result. eaten is the number of bytes parsed, or -1
// upon failure.
func httpUnquote(raw []byte) (eaten int, result string) {
    buf := make([]byte, len(raw))
    if raw[0] != '"' {
        return -1, ""
    }
    eaten = 1
    j := 0 // # of bytes written in buf
    for i := 1; i < len(raw); i++ {
        switch b := raw[i]; b {
        case '"':
            eaten++
            buf = buf[0:j]
            return i + 1, string(buf)
        case '\\':
            if len(raw) < i+2 {
                return -1, ""
            }
            buf[j] = httpUnquotePair(raw[i+1])
            eaten += 2
            j++
            i++
        default:
            if isQdText(b) {
                buf[j] = b
            } else {
                buf[j] = '?'
            }
            eaten++
            j++
        }
    }
    return -1, ""
}
// This is a best-effort parse, so errors are not returned; instead, not all of
// the input string might be parsed. result is always non-nil.
func httpSplitFieldValue(fv string) (eaten int, result []string) {
    result = make([]string, 0, len(fv))
    raw := strings.Bytes(fv)
    i := 0
    chunk := ""
    for i < len(raw) {
        b := raw[i]
        switch {
        case b == '"':
            eaten, unq := httpUnquote(raw[i:len(raw)])
            if eaten < 0 {
                return i, result
            } else {
                i += eaten
                chunk += unq
            }
        case isSeparator(b):
            if chunk != "" {
                result = result[0 : len(result)+1]
                result[len(result)-1] = chunk
                chunk = ""
            }
            i++
        case isToken(b):
            chunk += string(b)
            i++
        case b == '\n' || b == '\r':
            i++
        default:
            chunk += "?"
            i++
        }
    }
    if chunk != "" {
        result = result[0 : len(result)+1]
        result[len(result)-1] = chunk
        chunk = ""
    }
    return i, result
}
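
As a further illustration of the httpUnquote contract documented above (eaten counts both quote characters, -1 signals malformed input, and unrecognized bytes become '?'), here is a sketch that would likewise live in package http. It is not part of the CL and the file name is invented.

// unquote_example_test.go (hypothetical file name, package http)
package http

import "testing"

func TestUnquoteSketch(t *testing.T) {
    // eaten includes both quotes; the \t escape collapses to a single tab byte.
    if eaten, s := httpUnquote([]byte(`"a\tb" trailing`)); eaten != 6 || s != "a\tb" {
        t.Errorf(`"a\tb": eaten=%d s=%q`, eaten, s)
    }
    // Input that does not start with '"' is rejected with eaten == -1.
    if eaten, _ := httpUnquote([]byte("abc")); eaten != -1 {
        t.Errorf("abc: eaten=%d, want -1", eaten)
    }
    // An escape outside the recognized set is replaced with '?', not treated as an error.
    if _, s := httpUnquote([]byte(`"\yab"`)); s != "?ab" {
        t.Errorf(`"\yab": s=%q, want "?ab"`, s)
    }
}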

src/pkg/http/lex_test.go (new file, 70 lines)

@@ -0,0 +1,70 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package http

import (
    "testing"
)

type lexTest struct {
    Raw    string
    Parsed int // # of parsed characters
    Result []string
}
var lexTests = []lexTest{
    lexTest{
        Raw:    `"abc"def,:ghi`,
        Parsed: 13,
        Result: []string{"abcdef", "ghi"},
    },
    // My understanding of the RFC is that escape sequences outside of
    // quotes are not interpreted?
    lexTest{
        Raw:    `"\t"\t"\t"`,
        Parsed: 10,
        Result: []string{"\t", "t\t"},
    },
    lexTest{
        Raw:    `"\yab"\r\n`,
        Parsed: 10,
        Result: []string{"?ab", "r", "n"},
    },
    lexTest{
        Raw:    "ab\f",
        Parsed: 3,
        Result: []string{"ab?"},
    },
    lexTest{
        Raw:    "\"ab \" c,de f, gh, ij\n\t\r",
        Parsed: 23,
        Result: []string{"ab ", "c", "de", "f", "gh", "ij"},
    },
}
func min(x, y int) int {
    if x <= y {
        return x
    }
    return y
}

func TestSplitFieldValue(t *testing.T) {
    for k, l := range lexTests {
        parsed, result := httpSplitFieldValue(l.Raw)
        if parsed != l.Parsed {
            t.Errorf("#%d: Parsed %d, expected %d", k, parsed, l.Parsed)
        }
        if len(result) != len(l.Result) {
            t.Errorf("#%d: Result len %d, expected %d", k, len(result), len(l.Result))
        }
        for i := 0; i < min(len(result), len(l.Result)); i++ {
            if result[i] != l.Result[i] {
                t.Errorf("#%d: %d-th entry mismatch. Have {%s}, expect {%s}",
                    k, i, result[i], l.Result[i])
            }
        }
    }
}
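
The wiring of the lexer into the request/response parser is deferred to the next CL. Purely as a hedged sketch of the intended use on Transfer-Encoding, the following shows one plausible caller; the isChunked helper and file name are invented here and are not taken from this or any follow-up CL.

// transfer_sketch.go (hypothetical file name, package http)
package http

import "strings"

// isChunked is an invented helper: it tokenizes a Transfer-Encoding field value
// with the new lexer and reports whether "chunked" appears among the codings.
func isChunked(transferEncoding string) bool {
    _, codings := httpSplitFieldValue(transferEncoding)
    for _, c := range codings {
        if strings.ToLower(c) == "chunked" {
            return true
        }
    }
    return false
}

For example, isChunked("gzip, chunked") would report true, since the lexer splits the value into the tokens "gzip" and "chunked".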