From 78c89d21bcb33b71d716165a9204c397cf1eaf63 Mon Sep 17 00:00:00 2001 From: David Symonds Date: Thu, 21 Jul 2011 08:38:35 +1000 Subject: [PATCH] http: sniffing algorithm. This follows draft-ietf-websec-mime-sniff-03 in its intent, though not its algorithmic specification. R=rsc CC=golang-dev https://golang.org/cl/4746042 --- src/pkg/http/server.go | 5 +- src/pkg/http/sniff.go | 170 +++++++++++++++++++++++++++++++++++-- src/pkg/http/sniff_test.go | 40 +++++++++ 3 files changed, 205 insertions(+), 10 deletions(-) create mode 100644 src/pkg/http/sniff_test.go diff --git a/src/pkg/http/server.go b/src/pkg/http/server.go index ca9ab647425..dd4547c25f4 100644 --- a/src/pkg/http/server.go +++ b/src/pkg/http/server.go @@ -359,10 +359,7 @@ func (w *response) sniff() { w.needSniff = false data := w.conn.body - ctype := detectContentType(data) - if ctype != "" { - fmt.Fprintf(w.conn.buf, "Content-Type: %s\r\n", ctype) - } + fmt.Fprintf(w.conn.buf, "Content-Type: %s\r\n", DetectContentType(data)) io.WriteString(w.conn.buf, "\r\n") if w.chunking && len(data) > 0 { diff --git a/src/pkg/http/sniff.go b/src/pkg/http/sniff.go index 9fd6fee9b51..bf8ce245450 100644 --- a/src/pkg/http/sniff.go +++ b/src/pkg/http/sniff.go @@ -4,15 +4,173 @@ package http +import ( + "bytes" +) + // Content-type sniffing algorithm. -// http://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03 +// References in this file refer to this draft specification: +// http://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03 // The algorithm prefers to use sniffLen bytes to make its decision. -const sniffLen = 1024 +const sniffLen = 512 -// detectContentType returns the sniffed Content-Type string -// for the given data. -func detectContentType(data []byte) string { - // TODO(dsymonds,rsc): Implement algorithm from draft. +// DetectContentType returns the sniffed Content-Type string +// for the given data. This function always returns a valid MIME type. +func DetectContentType(data []byte) string { + if len(data) > sniffLen { + data = data[:sniffLen] + } + + // Index of the first non-whitespace byte in data. + firstNonWS := 0 + for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ { + } + + for _, sig := range sniffSignatures { + if ct := sig.match(data, firstNonWS); ct != "" { + return ct + } + } + + return "application/octet-stream" // fallback +} + +func isWS(b byte) bool { + return bytes.IndexByte([]byte("\t\n\x0C\n "), b) != -1 +} + +type sniffSig interface { + // match returns the MIME type of the data, or "" if unknown. + match(data []byte, firstNonWS int) string +} + +// Data matching the table in section 6. +var sniffSignatures = []sniffSig{ + htmlSig([]byte("' { + return "" + } return "text/html; charset=utf-8" } + +type textSig int + +func (textSig) match(data []byte, firstNonWS int) string { + // c.f. section 5, step 4. + for _, b := range data[firstNonWS:] { + switch { + case 0x00 <= b && b <= 0x08, + b == 0x0B, + 0x0E <= b && b <= 0x1A, + 0x1C <= b && b <= 0x1F: + return "" + } + } + return "text/plain; charset=utf-8" +} diff --git a/src/pkg/http/sniff_test.go b/src/pkg/http/sniff_test.go new file mode 100644 index 00000000000..770496f4051 --- /dev/null +++ b/src/pkg/http/sniff_test.go @@ -0,0 +1,40 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package http + +import ( + "testing" +) + +var sniffTests = []struct { + desc string + data []byte + exp string +}{ + // Some nonsense. + {"Empty", []byte{}, "text/plain; charset=utf-8"}, + {"Binary", []byte{1, 2, 3}, "application/octet-stream"}, + + {"HTML document #1", []byte(`blah blah blah`), "text/html; charset=utf-8"}, + {"HTML document #2", []byte(``), "text/html; charset=utf-8"}, + {"HTML document #3 (leading whitespace)", []byte(` ...`), "text/html; charset=utf-8"}, + + {"Plain text", []byte(`This is not HTML. It has ☃ though.`), "text/plain; charset=utf-8"}, + + {"XML", []byte("\n