From afd835434c34178eb8b8b8a89a7a41f5ec35c47c Mon Sep 17 00:00:00 2001 From: David Crawshaw Date: Sat, 27 Feb 2016 08:33:14 -0900 Subject: [PATCH] cmd/link: deduplicate read-only string data Many read-only strings in Go binaries are substrings of other read-only strings. A common source is the text form of type information, which will include both "struct { X int }" and "*struct { X int }" or "*bytes.Reader" and "func(*bytes.Reader)" in the same binary. Because this character data is referred to by separate string headers, we can skip writing the smaller string and modify the pointer relocation to point to the larger string. This CL does this deduplication in the linker after the reachable set of strings has been determined. This removes 765KB from juju (1.4% without DWARF). Link time goes at tip goes form 4.6s to 6.3s, but note that this CL is part of a series that recently reduced link time from 9.6s. For #6853. Change-Id: Ib2087cf627c9f1e9a1181f9b4c8f81d1a3f42191 Reviewed-on: https://go-review.googlesource.com/19987 Reviewed-by: Ian Lance Taylor Run-TryBot: David Crawshaw TryBot-Result: Gobot Gobot --- src/cmd/link/internal/ld/mergestrings.go | 91 +++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/src/cmd/link/internal/ld/mergestrings.go b/src/cmd/link/internal/ld/mergestrings.go index 1facd0aaf6..eb60c40e02 100644 --- a/src/cmd/link/internal/ld/mergestrings.go +++ b/src/cmd/link/internal/ld/mergestrings.go @@ -5,7 +5,10 @@ package ld import ( + "bytes" "cmd/internal/obj" + "index/suffixarray" + "sort" "strings" ) @@ -48,12 +51,87 @@ func mergestrings() { } } + // Sort the strings, shortest first. + // + // Ordering by length lets us use the largest matching substring + // index when there are multiple matches. This means we will not + // use a substring of a string that we will later in the pass + // mark as unreachable. + sort.Sort(strSymsByLen(strs)) + + // Build a suffix array. + dataOff := make([]int, len(strs)) + data := make([]byte, 0, size) + for i := range strs { + dataOff[i] = len(data) + data = append(data, strs[i].P...) + } + index := suffixarray.New(data) + + // Search for substring replacements. + type replacement struct { + str *LSym + off int + } + replacements := make(map[*LSym]replacement) + for i, s := range strs { + results := index.Lookup(s.P, -1) + if len(results) == 0 { + continue + } + var res int + for _, result := range results { + if result > res { + res = result + } + } + var off int + x := sort.SearchInts(dataOff, res) + if x == len(dataOff) || dataOff[x] > res { + x-- + off = res - dataOff[x] + } + if x == i { + continue // found ourself + } + if len(s.P) > len(strs[x].P[off:]) { + // Do not use substrings that match across strings. + // In theory it is possible, but it would + // complicate accounting for which future strings + // are already used. It doesn't appear to be common + // enough to do the extra work. + continue + } + if off%Thearch.Minalign != 0 { + continue // Cannot relcate to this substring. + } + replacements[s] = replacement{ + str: strs[x], + off: off, + } + } + // Put all string data into a single symbol and update the relocations. alldata := Linklookup(Ctxt, "go.string.alldata", 0) alldata.Type = obj.SGOSTRING alldata.Attr |= AttrReachable alldata.P = make([]byte, 0, size) for _, str := range strs { + str.Attr.Set(AttrReachable, false) + if rep, isReplaced := replacements[str]; isReplaced { + // As strs is sorted, the replacement string + // is always later in the strs range. Shift the + // relocations to the replacement string symbol + // and process then. + relocs := relocsToStrs[rep.str] + for _, r := range relocsToStrs[str] { + r.Add += int64(rep.off) + relocs = append(relocs, r) + } + relocsToStrs[rep.str] = relocs + continue + } + off := len(alldata.P) alldata.P = append(alldata.P, str.P...) // Architectures with Minalign > 1 cannot have relocations pointing @@ -62,7 +140,6 @@ func mergestrings() { for r := len(alldata.P) % Thearch.Minalign; r > 0; r-- { alldata.P = append(alldata.P, 0) } - str.Attr.Set(AttrReachable, false) for _, r := range relocsToStrs[str] { r.Add += int64(off) r.Sym = alldata @@ -70,3 +147,15 @@ func mergestrings() { } alldata.Size = int64(len(alldata.P)) } + +// strSymsByLen implements sort.Interface. It sorts *LSym by the length of P. +type strSymsByLen []*LSym + +func (s strSymsByLen) Len() int { return len(s) } +func (s strSymsByLen) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s strSymsByLen) Less(i, j int) bool { + if len(s[i].P) == len(s[j].P) { + return bytes.Compare(s[i].P, s[j].P) == -1 + } + return len(s[i].P) < len(s[j].P) +}