From 1b9734b995f3d79ec9a412d7e33d3d50ad1a32be Mon Sep 17 00:00:00 2001
From: Robert Griesemer <gri@golang.org>
Date: Thu, 18 Jun 2009 17:06:08 -0700
Subject: [PATCH] 1) Fix a problem with tabwriter.Flush: any pending text not
 yet    in a cell makes a final cell in that line    (this showed up as
 occasionally missing single spaces in    godoc-formatted declarations that
 fit on a single line)

2) Cleaned up tabwriter implementation a bit:
   - replaced local unicodeLen() with utf8.RuneCount()
   - instead of having 2 parallel arrays for line widths and sizes,
     have a single array of cells containing a width and size
   - factored code a bit better
   - added more comments
   - added testnames to tabwriter tests
   - added more test cases and fixed a broken test case that
     now works correctly

R=r
DELTA=279  (133 added, 62 deleted, 84 changed)
OCL=30509
CL=30514
---
 src/pkg/tabwriter/tabwriter.go      | 241 ++++++++++++++++------------
 src/pkg/tabwriter/tabwriter_test.go |  72 +++++++--
 2 files changed, 192 insertions(+), 121 deletions(-)

diff --git a/src/pkg/tabwriter/tabwriter.go b/src/pkg/tabwriter/tabwriter.go
index 6799f72d10..aa942d482e 100644
--- a/src/pkg/tabwriter/tabwriter.go
+++ b/src/pkg/tabwriter/tabwriter.go
@@ -20,6 +20,16 @@ import (
 // ----------------------------------------------------------------------------
 // Filter implementation
 
+// A cell represents a segment of text delineated by tabs, form-feed,
+// or newline chars. The text itself is stored in a separate buffer;
+// cell only describes the segment's size in bytes and width in runes.
+//
+type cell struct {
+	size int;  // cell size in bytes
+	width int;  // cell width in runes
+}
+
+
 // A Writer is a filter that inserts padding around
 // tab-delimited columns in its input to align them
 // in the output.
@@ -37,8 +47,8 @@ import (
 // UTF-8 characters.
 //
 // If a Writer is configured to filter HTML, HTML tags and entities
-// are simply passed through and their widths are assumed to be zero
-// for formatting purposes.
+// are simply passed through. The widths of tags and entities are
+// assumed to be zero (tags) and one (entities) for formatting purposes.
 //
 // The form feed character ('\f') acts like a newline but it also
 // terminates all columns in the current line (effectively calling
@@ -59,22 +69,42 @@ type Writer struct {
 	flags uint;
 
 	// current state
-	html_char byte;  // terminating char of html tag/entity, or 0 ('>', ';', or 0)
 	buf io.ByteBuffer;  // collected text w/o tabs, newlines, or form feed chars
-	size int;  // size of incomplete cell in bytes
-	width int;  // width of incomplete cell in runes up to buf[pos] w/o ignored sections
 	pos int;  // buffer position up to which width of incomplete cell has been computed
-	lines_size vector.Vector;  // list of lines; each line is a list of cell sizes in bytes
-	lines_width vector.Vector;  // list of lines; each line is a list of cell widths in runes
+	cell cell;  // current incomplete cell; cell.width is up to buf[pos] w/o ignored sections
+	html_char byte;  // terminating char of html tag/entity, or 0 ('>', ';', or 0)
+	lines vector.Vector;  // list if lines; each line is a list of cells
 	widths vector.IntVector;  // list of column widths in runes - re-used during formatting
 }
 
 
+func (b *Writer) addLine() {
+	b.lines.Push(vector.New(0));
+}
+
+
+func (b *Writer) line(i int) *vector.Vector {
+	return b.lines.At(i).(*vector.Vector);
+}
+
+
+// Reset the current state.
+func (b *Writer) reset() {
+	b.buf.Reset();
+	b.pos = 0;
+	b.cell = cell{};
+	b.html_char = 0;
+	b.lines.Init(0);
+	b.widths.Init(0);
+	b.addLine();
+}
+
+
 // Internal representation (current state):
 //
 // - all text written is appended to buf; form feed chars, tabs and newlines are stripped away
 // - at any given time there is a (possibly empty) incomplete cell at the end
-//   (the cell starts after a tab or newline)
+//   (the cell starts after a tab, form feed, or newline)
 // - size is the number of bytes belonging to the cell so far
 // - width is text width in runes of that cell from the start of the cell to
 //   position pos; html tags and entities are excluded from this width if html
@@ -94,12 +124,6 @@ type Writer struct {
 // buf                start of incomplete cell  pos
 
 
-func (b *Writer) addLine() {
-	b.lines_size.Push(vector.NewIntVector(0));
-	b.lines_width.Push(vector.NewIntVector(0));
-}
-
-
 // Formatting can be controlled with these flags.
 const (
 	// Ignore html tags and treat entities (starting with '&'
@@ -144,22 +168,12 @@ func (b *Writer) Init(output io.Writer, cellwidth, padding int, padchar byte, fl
 	}
 	b.flags = flags;
 
-	b.lines_size.Init(0);
-	b.lines_width.Init(0);
-	b.widths.Init(0);
-	b.addLine();  // the very first line
+	b.reset();
 
 	return b;
 }
 
 
-func (b *Writer) line(i int) (*vector.IntVector, *vector.IntVector) {
-	return
-		b.lines_size.At(i).(*vector.IntVector),
-		b.lines_width.At(i).(*vector.IntVector);
-}
-
-
 // debugging support (keep code around)
 /*
 func (b *Writer) dump() {
@@ -219,19 +233,19 @@ func (b *Writer) writePadding(textw, cellw int) os.Error {
 func (b *Writer) writeLines(pos0 int, line0, line1 int) (int, os.Error) {
 	pos := pos0;
 	for i := line0; i < line1; i++ {
-		line_size, line_width := b.line(i);
-		for j := 0; j < line_size.Len(); j++ {
-			s, w := line_size.At(j), line_width.At(j);
+		line := b.line(i);
+		for j := 0; j < line.Len(); j++ {
+			c := line.At(j).(cell);
 
 			switch {
 			default: // align left
 
-				if err := b.write0(b.buf.Data()[pos : pos + s]); err != nil {
+				if err := b.write0(b.buf.Data()[pos : pos + c.size]); err != nil {
 					return pos, err;
 				}
-				pos += s;
+				pos += c.size;
 				if j < b.widths.Len() {
-					if err := b.writePadding(w, b.widths.At(j)); err != nil {
+					if err := b.writePadding(c.width, b.widths.At(j)); err != nil {
 						return pos, err;
 					}
 				}
@@ -239,24 +253,24 @@ func (b *Writer) writeLines(pos0 int, line0, line1 int) (int, os.Error) {
 			case b.flags & AlignRight != 0:  // align right
 
 				if j < b.widths.Len() {
-					if err := b.writePadding(w, b.widths.At(j)); err != nil {
+					if err := b.writePadding(c.width, b.widths.At(j)); err != nil {
 						return pos, err;
 					}
 				}
-				if err := b.write0(b.buf.Data()[pos : pos + s]); err != nil {
+				if err := b.write0(b.buf.Data()[pos : pos + c.size]); err != nil {
 					return pos, err;
 				}
-				pos += s;
+				pos += c.size;
 			}
 		}
 
-		if i+1 == b.lines_size.Len() {
+		if i+1 == b.lines.Len() {
 			// last buffered line - we don't have a newline, so just write
 			// any outstanding buffered data
-			if err := b.write0(b.buf.Data()[pos : pos + b.size]); err != nil {
+			if err := b.write0(b.buf.Data()[pos : pos + b.cell.size]); err != nil {
 				return pos, err;
 			}
-			pos += b.size;
+			pos += b.cell.size;
 		} else {
 			// not the last line - write newline
 			if err := b.write0(newline); err != nil {
@@ -273,9 +287,9 @@ func (b *Writer) format(pos0 int, line0, line1 int) (pos int, err os.Error) {
 	column := b.widths.Len();
 	last := line0;
 	for this := line0; this < line1; this++ {
-		line_size, line_width := b.line(this);
+		line := b.line(this);
 
-		if column < line_size.Len() - 1 {
+		if column < line.Len() - 1 {
 			// cell exists in this column
 			// (note that the last cell per line is ignored)
 
@@ -289,10 +303,10 @@ func (b *Writer) format(pos0 int, line0, line1 int) (pos int, err os.Error) {
 			// column block begin
 			width := b.cellwidth;  // minimal width
 			for ; this < line1; this++ {
-				line_size, line_width = b.line(this);
-				if column < line_size.Len() - 1 {
+				line = b.line(this);
+				if column < line.Len() - 1 {
 					// cell exists in this column => update width
-					w := line_width.At(column) + b.padding;
+					w := line.At(column).(cell).width + b.padding;
 					if w > width {
 						width = w;
 					}
@@ -316,40 +330,77 @@ func (b *Writer) format(pos0 int, line0, line1 int) (pos int, err os.Error) {
 }
 
 
+// Append text to current cell. Only update the cell width if updateWidth
+// is set (the cell width can only be updated if we know that we cannot be
+// in the middle of a UTF-8 encoded Unicode character).
+//
+func (b *Writer) append(text []byte, updateWidth bool) {
+	b.buf.Write(text);
+	b.cell.size += len(text);
+	if updateWidth {
+		b.cell.width += utf8.RuneCount(b.buf.Data()[b.pos : b.buf.Len()]);
+		b.pos = b.buf.Len();
+	}
+}
+
+
+// Start HTML-escape mode.
+func (b *Writer) startHTML(ch byte) {
+	if ch == '<' {
+		b.html_char = '>';
+	} else {
+		b.html_char = ';';
+	}
+}
+
+
+// Terminate HTML-escape mode. If the HTML text was an entity, its width
+// is assumed to be one for formatting purposes; otherwise it assumed to
+// be zero.
+//
+func (b *Writer) terminateHTML() {
+	if b.html_char == ';' {
+		// was entity, count as one rune
+		b.cell.width++;
+	}
+	b.pos = b.buf.Len();
+	b.html_char = 0;
+}
+
+
+// Terminate the current cell by adding it to the list of cells of the
+// current line. Returns the number of cells in that line.
+//
+func (b *Writer) terminateCell() int {
+	line := b.line(b.lines.Len() - 1);
+	line.Push(b.cell);
+	b.cell = cell{};
+	return line.Len();
+}
+
+
 // Flush should be called after the last call to Write to ensure
-// that any data buffered in the Writer is written to output.
+// that any data buffered in the Writer is written to output. Any
+// incomplete HTML tag or entity at the end is simply considered
+// complete for formatting purposes.
 //
 func (b *Writer) Flush() os.Error {
-	_, err := b.format(0, 0, b.lines_size.Len());
-	// reset (even in the presence of errors)
-	b.buf.Reset();
-	b.size, b.width = 0, 0;
-	b.pos = 0;
-	b.lines_size.Init(0);
-	b.lines_width.Init(0);
-	b.addLine();
-	return err;
-}
-
-
-func unicodeLen(buf []byte) int {
-	l := 0;
-	for i := 0; i < len(buf); {
-		if buf[i] < utf8.RuneSelf {
-			i++;
-		} else {
-			rune, size := utf8.DecodeRune(buf[i : len(buf)]);
-			i += size;
+	// add current cell if not empty
+	if b.cell.size > 0 {
+		if b.html_char != 0 {
+			// inside html tag/entity - terminate it even if incomplete
+			b.terminateHTML();
 		}
-		l++;
+		b.terminateCell();
 	}
-	return l;
-}
 
+	// format contents of buffer
+	_, err := b.format(0, 0, b.lines.Len());
 
-func (b *Writer) append(buf []byte) {
-	b.buf.Write(buf);
-	b.size += len(buf);
+	// reset, even in the presence of errors
+	b.reset();
+
+	return err;
 }
 
 
@@ -358,31 +409,21 @@ func (b *Writer) append(buf []byte) {
 // while writing to the underlying output stream.
 //
 func (b *Writer) Write(buf []byte) (written int, err os.Error) {
-	i0, n := 0, len(buf);
-
 	// split text into cells
-	for i := 0; i < n; i++ {
-		ch := buf[i];
-
+	i0 := 0;
+	for i, ch := range buf {
 		if b.html_char == 0 {
 			// outside html tag/entity
 			switch ch {
 			case '\t', '\n', '\f':
-				b.append(buf[i0 : i]);
-				i0 = i + 1;  // exclude ch from (next) cell
-				b.width += unicodeLen(b.buf.Data()[b.pos : b.buf.Len()]);
-				b.pos = b.buf.Len();
-
-				// terminate cell
-				last_size, last_width := b.line(b.lines_size.Len() - 1);
-				last_size.Push(b.size);
-				last_width.Push(b.width);
-				b.size, b.width = 0, 0;
-
+				// end of cell
+				b.append(buf[i0 : i], true);
+				i0 = i+1;  // exclude ch from (next) cell
+				ncells := b.terminateCell();
 				if ch != '\t' {
 					// terminate line
 					b.addLine();
-					if ch == '\f' || last_size.Len() == 1 {
+					if ch == '\f' || ncells == 1 {
 						// A '\f' always forces a flush. Otherwise, if the previous
 						// line has only one cell which does not have an impact on
 						// the formatting of the following lines (the last cell per
@@ -395,37 +436,29 @@ func (b *Writer) Write(buf []byte) (written int, err os.Error) {
 				}
 
 			case '<', '&':
+				// possibly an html tag/entity 
 				if b.flags & FilterHTML != 0 {
-					b.append(buf[i0 : i]);
+					// begin of tag/entity
+					b.append(buf[i0 : i], true);
 					i0 = i;
-					b.width += unicodeLen(b.buf.Data()[b.pos : b.buf.Len()]);
-					b.pos = -1;  // preventative - should not be used (will cause index out of bounds)
-					if ch == '<' {
-						b.html_char = '>';
-					} else {
-						b.html_char = ';';
-					}
+					b.startHTML(ch);
 				}
 			}
 
 		} else {
 			// inside html tag/entity
 			if ch == b.html_char {
-				// reached the end of tag/entity
-				b.append(buf[i0 : i + 1]);
-				i0 = i + 1;
-				if b.html_char == ';' {
-					b.width++;  // count as one char
-				}
-				b.pos = b.buf.Len();
-				b.html_char = 0;
+				// end of tag/entity
+				b.append(buf[i0 : i+1], false);
+				i0 = i+1;  // exclude ch from (next) cell
+				b.terminateHTML();
 			}
 		}
 	}
 
 	// append leftover text
-	b.append(buf[i0 : n]);
-	return n, nil;
+	b.append(buf[i0 : len(buf)], false);
+	return len(buf), nil;
 }
 
 
diff --git a/src/pkg/tabwriter/tabwriter_test.go b/src/pkg/tabwriter/tabwriter_test.go
index 7026446e62..7967a46743 100644
--- a/src/pkg/tabwriter/tabwriter_test.go
+++ b/src/pkg/tabwriter/tabwriter_test.go
@@ -47,31 +47,31 @@ func (b *buffer) String() string {
 }
 
 
-func write(t *testing.T, w *tabwriter.Writer, src string) {
+func write(t *testing.T, testname string, w *tabwriter.Writer, src string) {
 	written, err := io.WriteString(w, src);
 	if err != nil {
-		t.Errorf("--- src:\n%s\n--- write error: %v\n", src, err);
+		t.Errorf("--- test: %s\n--- src:\n%s\n--- write error: %v\n", testname, src, err);
 	}
 	if written != len(src) {
-		t.Errorf("--- src:\n%s\n--- written = %d, len(src) = %d\n", src, written, len(src));
+		t.Errorf("--- test: %s\n--- src:\n%s\n--- written = %d, len(src) = %d\n", testname, src, written, len(src));
 	}
 }
 
 
-func verify(t *testing.T, w *tabwriter.Writer, b *buffer, src, expected string) {
+func verify(t *testing.T, testname string, w *tabwriter.Writer, b *buffer, src, expected string) {
 	err := w.Flush();
 	if err != nil {
-		t.Errorf("--- src:\n%s\n--- flush error: %v\n", src, err);
+		t.Errorf("--- test: %s\n--- src:\n%s\n--- flush error: %v\n", testname, src, err);
 	}
 
 	res := b.String();
 	if res != expected {
-		t.Errorf("--- src:\n%s\n--- found:\n%s\n--- expected:\n%s\n", src, res, expected)
+		t.Errorf("--- test: %s\n--- src:\n%s\n--- found:\n%s\n--- expected:\n%s\n", testname, src, res, expected)
 	}
 }
 
 
-func check(t *testing.T, tabwidth, padding int, padchar byte, flags uint, src, expected string) {
+func check(t *testing.T, testname string, tabwidth, padding int, padchar byte, flags uint, src, expected string) {
 	var b buffer;
 	b.init(1000);
 
@@ -80,30 +80,31 @@ func check(t *testing.T, tabwidth, padding int, padchar byte, flags uint, src, e
 
 	// write all at once
 	b.clear();
-	write(t, &w, src);
-	verify(t, &w, &b, src, expected);
+	write(t, testname, &w, src);
+	verify(t, testname, &w, &b, src, expected);
 
 	// write byte-by-byte
 	b.clear();
 	for i := 0; i < len(src); i++ {
-		write(t, &w, src[i : i+1]);
+		write(t, testname, &w, src[i : i+1]);
 	}
-	verify(t, &w, &b, src, expected);
+	verify(t, testname, &w, &b, src, expected);
 
 	// write using Fibonacci slice sizes
 	b.clear();
 	for i, d := 0, 0; i < len(src); {
-		write(t, &w, src[i : i+d]);
+		write(t, testname, &w, src[i : i+d]);
 		i, d = i+d, d+1;
 		if i+d > len(src) {
 			d = len(src) - i;
 		}
 	}
-	verify(t, &w, &b, src, expected);
+	verify(t, testname, &w, &b, src, expected);
 }
 
 
 type entry struct {
+	testname string;
 	tabwidth, padding int;
 	padchar byte;
 	flags uint;
@@ -113,108 +114,133 @@ type entry struct {
 
 var tests = []entry {
 	entry{
+		"1",
 		8, 1, '.', 0,
 		"",
 		""
 	},
 
 	entry{
+		"2",
 		8, 1, '.', 0,
 		"\n\n\n",
 		"\n\n\n"
 	},
 
 	entry{
+		"3",
 		8, 1, '.', 0,
 		"a\nb\nc",
 		"a\nb\nc"
 	},
 
 	entry{
+		"4a",
 		8, 1, '.', 0,
 		"\t",  // '\t' terminates an empty cell on last line - nothing to print
 		""
 	},
 
 	entry{
+		"4b",
 		8, 1, '.', tabwriter.AlignRight,
 		"\t",  // '\t' terminates an empty cell on last line - nothing to print
 		""
 	},
 
 	entry{
+		"5",
 		8, 1, '.', 0,
 		"*\t*",
-		"**"
+		"*.......*"
 	},
 
 	entry{
+		"5b",
 		8, 1, '.', 0,
 		"*\t*\n",
 		"*.......*\n"
 	},
 
 	entry{
+		"5c",
 		8, 1, '.', 0,
 		"*\t*\t",
 		"*.......*"
 	},
 
 	entry{
+		"5d",
 		8, 1, '.', tabwriter.AlignRight,
 		"*\t*\t",
 		".......**"
 	},
 
 	entry{
+		"6",
 		8, 1, '.', 0,
 		"\t\n",
 		"........\n"
 	},
 
 	entry{
+		"7a",
 		8, 1, '.', 0,
 		"a) foo",
 		"a) foo"
 	},
 
 	entry{
+		"7b",
 		8, 1, ' ', 0,
-		"b) foo\tbar",  // "bar" is not in any cell - not formatted, just flushed
-		"b) foobar"
+		"b) foo\tbar",
+		"b) foo  bar"
 	},
 
 	entry{
+		"7c",
 		8, 1, '.', 0,
 		"c) foo\tbar\t",
 		"c) foo..bar"
 	},
 
 	entry{
+		"7d",
 		8, 1, '.', 0,
 		"d) foo\tbar\n",
 		"d) foo..bar\n"
 	},
 
 	entry{
+		"7e",
 		8, 1, '.', 0,
 		"e) foo\tbar\t\n",
 		"e) foo..bar.....\n"
 	},
 
 	entry{
+		"7f",
 		8, 1, '.', tabwriter.FilterHTML,
 		"f) f&lt;o\t<b>bar</b>\t\n",
 		"f) f&lt;o..<b>bar</b>.....\n"
 	},
 
 	entry{
+		"7g",
+		8, 1, '.', tabwriter.FilterHTML,
+		"g) f&lt;o\t<b>bar</b>\t non-terminated entity &amp",
+		"g) f&lt;o..<b>bar</b>..... non-terminated entity &amp"
+	},
+
+	entry{
+		"8",
 		8, 1, '*', 0,
 		"Hello, world!\n",
 		"Hello, world!\n"
 	},
 
 	entry{
+		"9a",
 		0, 0, '.', 0,
 		"1\t2\t3\t4\n"
 		"11\t222\t3333\t44444\n",
@@ -224,6 +250,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"9b",
 		0, 0, '.', tabwriter.FilterHTML,
 		"1\t2<!---\f--->\t3\t4\n"  // \f inside HTML is ignored
 		"11\t222\t3333\t44444\n",
@@ -233,6 +260,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"9c",
 		0, 0, '.', 0,
 		"1\t2\t3\t4\f"  // \f causes a newline and flush
 		"11\t222\t3333\t44444\n",
@@ -242,18 +270,21 @@ var tests = []entry {
 	},
 
 	entry{
+		"10a",
 		5, 0, '.', 0,
 		"1\t2\t3\t4\n",
 		"1....2....3....4\n"
 	},
 
 	entry{
+		"10b",
 		5, 0, '.', 0,
 		"1\t2\t3\t4\t\n",
 		"1....2....3....4....\n"
 	},
 
 	entry{
+		"11",
 		8, 1, '.', 0,
 		"本\tb\tc\n"
 		"aa\t\u672c\u672c\u672c\tcccc\tddddd\n"
@@ -265,6 +296,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"12a",
 		8, 1, ' ', tabwriter.AlignRight,
 		"a\tè\tc\t\n"
 		"aa\tèèè\tcccc\tddddd\t\n"
@@ -276,6 +308,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"12b",
 		2, 0, ' ', 0,
 		"a\tb\tc\n"
 		"aa\tbbb\tcccc\n"
@@ -287,6 +320,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"12c",
 		8, 1, '_', 0,
 		"a\tb\tc\n"
 		"aa\tbbb\tcccc\n"
@@ -298,6 +332,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"13a",
 		4, 1, '-', 0,
 		"4444\t日本語\t22\t1\t333\n"
 		"999999999\t22\n"
@@ -317,6 +352,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"13b",
 		4, 3, '.', 0,
 		"4444\t333\t22\t1\t333\n"
 		"999999999\t22\n"
@@ -336,6 +372,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"13c",
 		8, 1, '\t', tabwriter.FilterHTML,
 		"4444\t333\t22\t1\t333\n"
 		"999999999\t22\n"
@@ -355,6 +392,7 @@ var tests = []entry {
 	},
 
 	entry{
+		"14",
 		0, 2, ' ', tabwriter.AlignRight,
 		".0\t.3\t2.4\t-5.1\t\n"
 		"23.0\t12345678.9\t2.4\t-989.4\t\n"
@@ -375,6 +413,6 @@ var tests = []entry {
 
 func Test(t *testing.T) {
 	for _, e := range tests {
-		check(t, e.tabwidth, e.padding, e.padchar, e.flags, e.src, e.expected);
+		check(t, e.testname, e.tabwidth, e.padding, e.padchar, e.flags, e.src, e.expected);
 	}
 }