From 98abf91377f155266fa60505c0c12beccad38eeb Mon Sep 17 00:00:00 2001 From: unbyte Date: Tue, 8 Jun 2021 12:38:00 +0800 Subject: [PATCH] cmd/go: ignore UTF8 BOM when reading source code Fix the problem that UTF8 BOM can cause the parsing of import path and directives to fail. --- src/cmd/go/internal/imports/read.go | 18 ++++++++-- src/cmd/go/internal/imports/read_test.go | 26 ++++++++++++++ .../script/build_ignore_leading_bom.txt | 27 ++++++++++++++ src/go/build/read.go | 13 ++++++- src/go/build/read_test.go | 36 +++++++++++++++++++ 5 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 src/cmd/go/testdata/script/build_ignore_leading_bom.txt diff --git a/src/cmd/go/internal/imports/read.go b/src/cmd/go/internal/imports/read.go index 5e270781d7..70d5190450 100644 --- a/src/cmd/go/internal/imports/read.go +++ b/src/cmd/go/internal/imports/read.go @@ -8,6 +8,7 @@ package imports import ( "bufio" + "bytes" "errors" "io" "unicode/utf8" @@ -22,6 +23,19 @@ type importReader struct { nerr int } +var bom = []byte{0xef, 0xbb, 0xbf} + +func newImportReader(b *bufio.Reader) *importReader { + // Remove leading UTF-8 BOM. + // Per https://golang.org/ref/spec#Source_code_representation: + // a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF) + // if it is the first Unicode code point in the source text. + if leadingBytes, err := b.Peek(3); err == nil && bytes.Equal(leadingBytes, bom) { + b.Discard(3) + } + return &importReader{b: b} +} + func isIdent(c byte) bool { return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_' || c >= utf8.RuneSelf } @@ -201,7 +215,7 @@ func (r *importReader) readImport(imports *[]string) { // ReadComments is like io.ReadAll, except that it only reads the leading // block of comments in the file. func ReadComments(f io.Reader) ([]byte, error) { - r := &importReader{b: bufio.NewReader(f)} + r := newImportReader(bufio.NewReader(f)) r.peekByte(true) if r.err == nil && !r.eof { // Didn't reach EOF, so must have found a non-space byte. Remove it. @@ -213,7 +227,7 @@ func ReadComments(f io.Reader) ([]byte, error) { // ReadImports is like io.ReadAll, except that it expects a Go file as input // and stops reading the input once the imports have completed. func ReadImports(f io.Reader, reportSyntaxError bool, imports *[]string) ([]byte, error) { - r := &importReader{b: bufio.NewReader(f)} + r := newImportReader(bufio.NewReader(f)) r.readKeyword("package") r.readIdent() diff --git a/src/cmd/go/internal/imports/read_test.go b/src/cmd/go/internal/imports/read_test.go index 6ea356f1ff..6a1a6524a1 100644 --- a/src/cmd/go/internal/imports/read_test.go +++ b/src/cmd/go/internal/imports/read_test.go @@ -66,6 +66,10 @@ var readImportsTests = []readTest{ `, "", }, + { + "\ufeffš¯”»" + `package p; import "x";ā„™var x = 1`, + "", + }, } var readCommentsTests = []readTest{ @@ -81,6 +85,10 @@ var readCommentsTests = []readTest{ `ā„™package p; import . "x"`, "", }, + { + "\ufeffš¯”»" + `ā„™package p; import . "x"`, + "", + }, { `// foo @@ -90,6 +98,19 @@ var readCommentsTests = []readTest{ /*/ zot */ + // asdf + ā„™Hello, world`, + "", + }, + { + "\ufeffš¯”»" + `// foo + + /* bar */ + + /* quux */ // baz + + /*/ zot */ + // asdf ā„™Hello, world`, "", @@ -107,6 +128,11 @@ func testRead(t *testing.T, tests []readTest, read func(io.Reader) ([]byte, erro in = tt.in[:j] + tt.in[j+len("ā„™"):] testOut = tt.in[:j] } + d := strings.Index(tt.in, "š¯”»") + if d >= 0 { + in = in[:d] + in[d+len("š¯”»"):] + testOut = testOut[d+len("š¯”»"):] + } r := strings.NewReader(in) buf, err := read(r) if err != nil { diff --git a/src/cmd/go/testdata/script/build_ignore_leading_bom.txt b/src/cmd/go/testdata/script/build_ignore_leading_bom.txt new file mode 100644 index 0000000000..37141f3466 --- /dev/null +++ b/src/cmd/go/testdata/script/build_ignore_leading_bom.txt @@ -0,0 +1,27 @@ +# Per https://golang.org/ref/spec#Source_code_representation: +# a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF) +# if it is the first Unicode code point in the source text. + +go list -f 'Imports: {{.Imports}} EmbedFiles: {{.EmbedFiles}}' . +stdout '^Imports: \[embed m/hello\] EmbedFiles: \[.*file\]$' + +-- go.mod -- +module m + +go 1.16 +-- m.go -- +ļ»æpackage main + +import ( + _ "embed" + + "m/hello" +) + +//go:embed file +var s string + +-- hello/hello.go -- +package hello + +-- file -- diff --git a/src/go/build/read.go b/src/go/build/read.go index aa7c6ee59e..b98c7938a8 100644 --- a/src/go/build/read.go +++ b/src/go/build/read.go @@ -6,6 +6,7 @@ package build import ( "bufio" + "bytes" "errors" "fmt" "go/ast" @@ -28,9 +29,19 @@ type importReader struct { pos token.Position } +var bom = []byte{0xef, 0xbb, 0xbf} + func newImportReader(name string, r io.Reader) *importReader { + b := bufio.NewReader(r) + // Remove leading UTF-8 BOM. + // Per https://golang.org/ref/spec#Source_code_representation: + // a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF) + // if it is the first Unicode code point in the source text. + if leadingBytes, err := b.Peek(3); err == nil && bytes.Equal(leadingBytes, bom) { + b.Discard(3) + } return &importReader{ - b: bufio.NewReader(r), + b: b, pos: token.Position{ Filename: name, Line: 1, diff --git a/src/go/build/read_test.go b/src/go/build/read_test.go index 32e6bae008..1e5e1c2de2 100644 --- a/src/go/build/read_test.go +++ b/src/go/build/read_test.go @@ -66,6 +66,10 @@ var readGoInfoTests = []readTest{ `, "", }, + { + "\ufeffš¯”»" + `package p; import "x";ā„™var x = 1`, + "", + }, } var readCommentsTests = []readTest{ @@ -81,6 +85,10 @@ var readCommentsTests = []readTest{ `ā„™package p; import . "x"`, "", }, + { + "\ufeffš¯”»" + `ā„™package p; import . "x"`, + "", + }, { `// foo @@ -90,6 +98,19 @@ var readCommentsTests = []readTest{ /*/ zot */ + // asdf + ā„™Hello, world`, + "", + }, + { + "\ufeffš¯”»" + `// foo + + /* bar */ + + /* quux */ // baz + + /*/ zot */ + // asdf ā„™Hello, world`, "", @@ -107,6 +128,11 @@ func testRead(t *testing.T, tests []readTest, read func(io.Reader) ([]byte, erro in = tt.in[:j] + tt.in[j+len("ā„™"):] testOut = tt.in[:j] } + d := strings.Index(tt.in, "š¯”»") + if d >= 0 { + in = in[:d] + in[d+len("š¯”»"):] + testOut = testOut[d+len("š¯”»"):] + } r := strings.NewReader(in) buf, err := read(r) if err != nil { @@ -264,6 +290,12 @@ var readEmbedTests = []struct { test:3:14:y test:3:16:z`, }, + { + "\ufeffpackage p\nimport \"embed\"\n//go:embed x y z\nvar files embed.FS", + `test:3:12:x + test:3:14:y + test:3:16:z`, + }, { "package p\nimport \"embed\"\nvar s = \"/*\"\n//go:embed x\nvar files embed.FS", `test:4:12:x`, @@ -292,6 +324,10 @@ var readEmbedTests = []struct { "package p\n//go:embed x y z\nvar files embed.FS", // no import, no scan "", }, + { + "\ufeffpackage p\n//go:embed x y z\nvar files embed.FS", // no import, no scan + "", + }, } func TestReadEmbed(t *testing.T) {