unicode: improve generated comments for categories

The comments on the category range tables in the unicode package are fairly redundent and require an external source to translate into human readable category names. This adds a look up table with the category descriptions and uses it if available when generating the comments for the range tables. Fixes #28954 Change-Id: I853e2d270def6492c2c1dd2ad0ec761a74c04e5d Reviewed-on: https://go-review.googlesource.com/c/151297 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
2024-11-23 00:10:07 -07:00 · 2018-11-26 15:11:45 +00:00 · 2018-11-26 15:11:45 +00:00 · d704b5c956
commit d704b5c956
parent 6bf531384d
2 changed files with 72 additions and 32 deletions
--- a/src/unicode/maketables.go
+++ b/src/unicode/maketables.go
@ -458,6 +458,39 @@ package unicode

 `

+var categoryMapping = map[string]string{
+	"Lu": "Letter, uppercase",
+	"Ll": "Letter, lowercase",
+	"Lt": "Letter, titlecase",
+	"Lm": "Letter, modifier",
+	"Lo": "Letter, other",
+	"Mn": "Mark, nonspacing",
+	"Mc": "Mark, spacing combining",
+	"Me": "Mark, enclosing",
+	"Nd": "Number, decimal digit",
+	"Nl": "Number, letter",
+	"No": "Number, other",
+	"Pc": "Punctuation, connector",
+	"Pd": "Punctuation, dash",
+	"Ps": "Punctuation, open",
+	"Pe": "Punctuation, close",
+	"Pi": "Punctuation, initial quote",
+	"Pf": "Punctuation, final quote",
+	"Po": "Punctuation, other",
+	"Sm": "Symbol, math",
+	"Sc": "Symbol, currency",
+	"Sk": "Symbol, modifier",
+	"So": "Symbol, other",
+	"Zs": "Separator, space",
+	"Zl": "Separator, line",
+	"Zp": "Separator, paragraph",
+	"Cc": "Other, control",
+	"Cf": "Other, format",
+	"Cs": "Other, surrogate",
+	"Co": "Other, private use",
+	"Cn": "Other, not assigned",
+}
+
 func printCategories() {
 	if *tablelist == "" {
 		return
@ -528,9 +561,16 @@ func printCategories() {
 			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
 		}
 		if len(name) > 1 {
-			varDecl += fmt.Sprintf(
-				"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
-				name, name, name, name)
+			desc, ok := categoryMapping[name]
+			if ok {
+				varDecl += fmt.Sprintf(
+					"\t%s = _%s;	// %s is the set of Unicode characters in category %s (%s).\n",
+					name, name, name, name, desc)
+			} else {
+				varDecl += fmt.Sprintf(
+					"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
+					name, name, name, name)
+			}
 		}
 		decl[ndecl] = varDecl
 		ndecl++
--- a/src/unicode/tables.go
+++ b/src/unicode/tables.go
@ -3380,53 +3380,53 @@ var _Zs = &RangeTable{

 // These variables have type *RangeTable.
 var (
-	Cc     = _Cc // Cc is the set of Unicode characters in category Cc.
-	Cf     = _Cf // Cf is the set of Unicode characters in category Cf.
-	Co     = _Co // Co is the set of Unicode characters in category Co.
-	Cs     = _Cs // Cs is the set of Unicode characters in category Cs.
+	Cc     = _Cc // Cc is the set of Unicode characters in category Cc (Other, control).
+	Cf     = _Cf // Cf is the set of Unicode characters in category Cf (Other, format).
+	Co     = _Co // Co is the set of Unicode characters in category Co (Other, private use).
+	Cs     = _Cs // Cs is the set of Unicode characters in category Cs (Other, surrogate).
 	Digit  = _Nd // Digit is the set of Unicode characters with the "decimal digit" property.
-	Nd     = _Nd // Nd is the set of Unicode characters in category Nd.
+	Nd     = _Nd // Nd is the set of Unicode characters in category Nd (Number, decimal digit).
 	Letter = _L  // Letter/L is the set of Unicode letters, category L.
 	L      = _L
-	Lm     = _Lm // Lm is the set of Unicode characters in category Lm.
-	Lo     = _Lo // Lo is the set of Unicode characters in category Lo.
+	Lm     = _Lm // Lm is the set of Unicode characters in category Lm (Letter, modifier).
+	Lo     = _Lo // Lo is the set of Unicode characters in category Lo (Letter, other).
 	Lower  = _Ll // Lower is the set of Unicode lower case letters.
-	Ll     = _Ll // Ll is the set of Unicode characters in category Ll.
+	Ll     = _Ll // Ll is the set of Unicode characters in category Ll (Letter, lowercase).
 	Mark   = _M  // Mark/M is the set of Unicode mark characters, category M.
 	M      = _M
-	Mc     = _Mc // Mc is the set of Unicode characters in category Mc.
-	Me     = _Me // Me is the set of Unicode characters in category Me.
-	Mn     = _Mn // Mn is the set of Unicode characters in category Mn.
-	Nl     = _Nl // Nl is the set of Unicode characters in category Nl.
-	No     = _No // No is the set of Unicode characters in category No.
+	Mc     = _Mc // Mc is the set of Unicode characters in category Mc (Mark, spacing combining).
+	Me     = _Me // Me is the set of Unicode characters in category Me (Mark, enclosing).
+	Mn     = _Mn // Mn is the set of Unicode characters in category Mn (Mark, nonspacing).
+	Nl     = _Nl // Nl is the set of Unicode characters in category Nl (Number, letter).
+	No     = _No // No is the set of Unicode characters in category No (Number, other).
 	Number = _N  // Number/N is the set of Unicode number characters, category N.
 	N      = _N
 	Other  = _C // Other/C is the set of Unicode control and special characters, category C.
 	C      = _C
-	Pc     = _Pc // Pc is the set of Unicode characters in category Pc.
-	Pd     = _Pd // Pd is the set of Unicode characters in category Pd.
-	Pe     = _Pe // Pe is the set of Unicode characters in category Pe.
-	Pf     = _Pf // Pf is the set of Unicode characters in category Pf.
-	Pi     = _Pi // Pi is the set of Unicode characters in category Pi.
-	Po     = _Po // Po is the set of Unicode characters in category Po.
-	Ps     = _Ps // Ps is the set of Unicode characters in category Ps.
+	Pc     = _Pc // Pc is the set of Unicode characters in category Pc (Punctuation, connector).
+	Pd     = _Pd // Pd is the set of Unicode characters in category Pd (Punctuation, dash).
+	Pe     = _Pe // Pe is the set of Unicode characters in category Pe (Punctuation, close).
+	Pf     = _Pf // Pf is the set of Unicode characters in category Pf (Punctuation, final quote).
+	Pi     = _Pi // Pi is the set of Unicode characters in category Pi (Punctuation, initial quote).
+	Po     = _Po // Po is the set of Unicode characters in category Po (Punctuation, other).
+	Ps     = _Ps // Ps is the set of Unicode characters in category Ps (Punctuation, open).
 	Punct  = _P  // Punct/P is the set of Unicode punctuation characters, category P.
 	P      = _P
-	Sc     = _Sc // Sc is the set of Unicode characters in category Sc.
-	Sk     = _Sk // Sk is the set of Unicode characters in category Sk.
-	Sm     = _Sm // Sm is the set of Unicode characters in category Sm.
-	So     = _So // So is the set of Unicode characters in category So.
+	Sc     = _Sc // Sc is the set of Unicode characters in category Sc (Symbol, currency).
+	Sk     = _Sk // Sk is the set of Unicode characters in category Sk (Symbol, modifier).
+	Sm     = _Sm // Sm is the set of Unicode characters in category Sm (Symbol, math).
+	So     = _So // So is the set of Unicode characters in category So (Symbol, other).
 	Space  = _Z  // Space/Z is the set of Unicode space characters, category Z.
 	Z      = _Z
 	Symbol = _S // Symbol/S is the set of Unicode symbol characters, category S.
 	S      = _S
 	Title  = _Lt // Title is the set of Unicode title case letters.
-	Lt     = _Lt // Lt is the set of Unicode characters in category Lt.
+	Lt     = _Lt // Lt is the set of Unicode characters in category Lt (Letter, titlecase).
 	Upper  = _Lu // Upper is the set of Unicode upper case letters.
-	Lu     = _Lu // Lu is the set of Unicode characters in category Lu.
-	Zl     = _Zl // Zl is the set of Unicode characters in category Zl.
-	Zp     = _Zp // Zp is the set of Unicode characters in category Zp.
-	Zs     = _Zs // Zs is the set of Unicode characters in category Zs.
+	Lu     = _Lu // Lu is the set of Unicode characters in category Lu (Letter, uppercase).
+	Zl     = _Zl // Zl is the set of Unicode characters in category Zl (Separator, line).
+	Zp     = _Zp // Zp is the set of Unicode characters in category Zp (Separator, paragraph).
+	Zs     = _Zs // Zs is the set of Unicode characters in category Zs (Separator, space).
 )

 // Generated by running