From 65fc379daeda784d085f98d621a9ab712c096148 Mon Sep 17 00:00:00 2001
From: Alexandre Cesaro
Date: Thu, 24 Sep 2015 23:45:13 +0200
Subject: [PATCH] mime: limit UTF-8 encoded-word length to 75 characters

As specified by RFC 2047 section 2, encoded-words may not be more than
75 characters long. We only enforce this rule when the charset is
UTF-8, since multi-byte characters must not be split across
encoded-words (see section 5.3).

Fixes #12300

Change-Id: I72a43fc3fe6ddeb3dab54dcdce0837d7ebf658f0
Reviewed-on: https://go-review.googlesource.com/14957
Run-TryBot: Brad Fitzpatrick
TryBot-Result: Gobot Gobot
Reviewed-by: Brad Fitzpatrick
---
 src/mime/encodedword.go      | 138 +++++++++++++++++++++++++++++------
 src/mime/encodedword_test.go |   8 ++
 2 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/src/mime/encodedword.go b/src/mime/encodedword.go
index ebf6164bb6..3b414dd5c4 100644
--- a/src/mime/encodedword.go
+++ b/src/mime/encodedword.go
@@ -54,35 +54,129 @@ func (e WordEncoder) encodeWord(charset, s string) string {
 	buf := getBuffer()
 	defer putBuffer(buf)
 
+	e.openWord(buf, charset)
+	if e == BEncoding {
+		e.bEncode(buf, charset, s)
+	} else {
+		e.qEncode(buf, charset, s)
+	}
+	closeWord(buf)
+
+	return buf.String()
+}
+
+const (
+	// The maximum length of an encoded-word is 75 characters.
+	// See RFC 2047, section 2.
+	maxEncodedWordLen = 75
+	// maxContentLen is how much content can be encoded, ignoring the header and
+	// 2-byte footer.
+	maxContentLen = maxEncodedWordLen - len("=?UTF-8?") - len("?=")
+)
+
+var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
+
+// bEncode encodes s using base64 encoding and writes it to buf.
+func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
+	w := base64.NewEncoder(base64.StdEncoding, buf)
+	// If the charset is not UTF-8 or if the content is short, do not bother
+	// splitting the encoded-word.
+	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
+		io.WriteString(w, s)
+		w.Close()
+		return
+	}
+
+	var currentLen, last, runeLen int
+	for i := 0; i < len(s); i += runeLen {
+		// Multi-byte characters must not be split across encoded-words.
+		// See RFC 2047, section 5.3.
+		_, runeLen = utf8.DecodeRuneInString(s[i:])
+
+		if currentLen+runeLen <= maxBase64Len {
+			currentLen += runeLen
+		} else {
+			io.WriteString(w, s[last:i])
+			w.Close()
+			e.splitWord(buf, charset)
+			last = i
+			currentLen = runeLen
+		}
+	}
+	io.WriteString(w, s[last:])
+	w.Close()
+}
+
+// qEncode encodes s using Q encoding and writes it to buf. It splits the
+// encoded-words when necessary.
+func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
+	// We only split encoded-words when the charset is UTF-8.
+	if !isUTF8(charset) {
+		writeQString(buf, s)
+		return
+	}
+
+	var currentLen, runeLen int
+	for i := 0; i < len(s); i += runeLen {
+		b := s[i]
+		// Multi-byte characters must not be split across encoded-words.
+		// See RFC 2047, section 5.3.
+		var encLen int
+		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
+			runeLen, encLen = 1, 1
+		} else {
+			_, runeLen = utf8.DecodeRuneInString(s[i:])
+			encLen = 3 * runeLen
+		}
+
+		if currentLen+encLen > maxContentLen {
+			e.splitWord(buf, charset)
+			currentLen = 0
+		}
+		writeQString(buf, s[i:i+runeLen])
+		currentLen += encLen
+	}
+}
+
+// writeQString encodes s using Q encoding and writes it to buf.
+func writeQString(buf *bytes.Buffer, s string) {
+	for i := 0; i < len(s); i++ {
+		switch b := s[i]; {
+		case b == ' ':
+			buf.WriteByte('_')
+		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
+			buf.WriteByte(b)
+		default:
+			buf.WriteByte('=')
+			buf.WriteByte(upperhex[b>>4])
+			buf.WriteByte(upperhex[b&0x0f])
+		}
+	}
+}
+
+// openWord writes the beginning of an encoded-word into buf.
+func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
 	buf.WriteString("=?")
 	buf.WriteString(charset)
 	buf.WriteByte('?')
 	buf.WriteByte(byte(e))
 	buf.WriteByte('?')
+}
 
-	if e == BEncoding {
-		w := base64.NewEncoder(base64.StdEncoding, buf)
-		io.WriteString(w, s)
-		w.Close()
-	} else {
-		enc := make([]byte, 3)
-		for i := 0; i < len(s); i++ {
-			b := s[i]
-			switch {
-			case b == ' ':
-				buf.WriteByte('_')
-			case b <= '~' && b >= '!' && b != '=' && b != '?' && b != '_':
-				buf.WriteByte(b)
-			default:
-				enc[0] = '='
-				enc[1] = upperhex[b>>4]
-				enc[2] = upperhex[b&0x0f]
-				buf.Write(enc)
-			}
-		}
-	}
+// closeWord writes the end of an encoded-word into buf.
+func closeWord(buf *bytes.Buffer) {
 	buf.WriteString("?=")
-	return buf.String()
+}
+
+// splitWord closes the current encoded-word and opens a new one.
+func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
+	closeWord(buf)
+	buf.WriteByte(' ')
+	e.openWord(buf, charset)
+}
+
+func isUTF8(charset string) bool {
+	return strings.EqualFold(charset, "UTF-8")
 }
 
 const upperhex = "0123456789ABCDEF"
diff --git a/src/mime/encodedword_test.go b/src/mime/encodedword_test.go
index b30ecba3b9..5fcd7a06dd 100644
--- a/src/mime/encodedword_test.go
+++ b/src/mime/encodedword_test.go
@@ -27,6 +27,14 @@ func TestEncodeWord(t *testing.T) {
 		{QEncoding, iso88591, "a", "a"},
 		{QEncoding, utf8, "123 456", "123 456"},
 		{QEncoding, utf8, "\t !\"#$%&'()*+,-./ :;<>?@[\\]^_`{|}~", "\t !\"#$%&'()*+,-./ :;<>?@[\\]^_`{|}~"},
+		{QEncoding, utf8, strings.Repeat("é", 10), "=?utf-8?q?" + strings.Repeat("=C3=A9", 10) + "?="},
+		{QEncoding, utf8, strings.Repeat("é", 11), "=?utf-8?q?" + strings.Repeat("=C3=A9", 10) + "?= =?utf-8?q?=C3=A9?="},
+		{QEncoding, iso88591, strings.Repeat("\xe9", 22), "=?iso-8859-1?q?" + strings.Repeat("=E9", 22) + "?="},
+		{QEncoding, utf8, strings.Repeat("\x80", 22), "=?utf-8?q?" + strings.Repeat("=80", 21) + "?= =?utf-8?q?=80?="},
+		{BEncoding, utf8, strings.Repeat("é", 24), "=?utf-8?b?" + strings.Repeat("w6nDqcOp", 8) + "?="},
+		{BEncoding, utf8, strings.Repeat("é", 27), "=?utf-8?b?" + strings.Repeat("w6nDqcOp", 8) + "?= =?utf-8?b?w6nDqcOp?="},
+		{BEncoding, iso88591, strings.Repeat("\xe9", 45), "=?iso-8859-1?b?" + strings.Repeat("6enp", 15) + "?="},
+		{BEncoding, utf8, strings.Repeat("\x80", 51), "=?utf-8?b?" + strings.Repeat("gICA", 16) + "?= =?utf-8?b?gICA?="},
 	}
 
 	for _, test := range tests {
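
For reference, the content budget the new constants establish can be recomputed outside the package. A minimal sketch, not part of the change itself; maxEncodedWordLen, maxContentLen and maxBase64Len below simply mirror the unexported names introduced by the patch:

	package main

	import (
		"encoding/base64"
		"fmt"
	)

	func main() {
		// Mirrors of the unexported constants from src/mime/encodedword.go.
		const maxEncodedWordLen = 75 // RFC 2047, section 2
		const maxContentLen = maxEncodedWordLen - len("=?UTF-8?") - len("?=")

		// For B encoding, the largest number of raw input bytes whose base64
		// form still fits within maxContentLen characters.
		maxBase64Len := base64.StdEncoding.DecodedLen(maxContentLen)

		fmt.Println(maxContentLen, maxBase64Len) // 65 48
	}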
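
A rough end-to-end illustration of the new behavior from a caller's perspective, assuming a toolchain that includes this change; the subject string is an arbitrary example and only the exported mime.WordEncoder/mime.WordDecoder API is used:

	package main

	import (
		"fmt"
		"mime"
		"strings"
		"unicode/utf8"
	)

	func main() {
		// 30 two-byte runes: too long for a single 75-character encoded-word,
		// so Encode now emits several space-separated encoded-words.
		subject := strings.Repeat("é", 30)
		encoded := mime.QEncoding.Encode("utf-8", subject)

		dec := new(mime.WordDecoder)
		for _, word := range strings.Split(encoded, " ") {
			// Each encoded-word decodes to whole runes on its own, i.e. no
			// multi-byte character was split across a word boundary.
			text, err := dec.Decode(word)
			if err != nil {
				panic(err)
			}
			fmt.Println(len(word), utf8.ValidString(text))
		}

		// DecodeHeader joins adjacent encoded-words back into the original text.
		decoded, err := dec.DecodeHeader(encoded)
		if err != nil {
			panic(err)
		}
		fmt.Println(decoded == subject) // true
	}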