cmd/present2md: add command to convert legacy present to Markdown-enabled present

Accepting Markdown in present is nice, but the job isn't done unless we actually convert all our existing blog files to the new format. Otherwise we have two different kinds of files to understand when working in that directory. This tool lets us mechanically convert all the existing blog articles. For golang/go#33955. Change-Id: If8a8dd90eefa171f422644134d0571e2abe4c7cb Reviewed-on: https://go-review.googlesource.com/c/tools/+/222847 Reviewed-by: Rob Pike <r@golang.org>
2024-11-18 06:54:49 -07:00 · 2020-03-09 23:10:56 -04:00 · 2020-03-09 23:10:56 -04:00 · 4303120df7
commit 4303120df7
parent 8ac058ed9f
1 changed files with 468 additions and 0 deletions
--- a/cmd/present2md/main.go
+++ b/cmd/present2md/main.go
@ -0,0 +1,468 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Present2md converts legacy-syntax present files to Markdown-syntax present files.
+//
+// Usage:
+//
+//	present2md [-w] [file ...]
+//
+// By default, present2md prints the Markdown-syntax form of each input file to standard output.
+// If no input file is listed, standard input is used.
+//
+// The -w flag causes present2md to update the files in place, overwriting each with its
+// Markdown-syntax equivalent.
+//
+// Examples
+//
+//	present2md your.article
+//	present2md -w *.article
+//
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/url"
+	"os"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+
+	"golang.org/x/tools/present"
+)
+
+// usage prints the command-line synopsis to standard error and
+// terminates the process with exit status 2.
+func usage() {
+	fmt.Fprint(os.Stderr, "usage: present2md [-w] [file ...]\n")
+	os.Exit(2)
+}
+
+var (
+	// writeBack is the -w flag. When set, main asks convert to overwrite
+	// each input file with its converted form instead of printing it.
+	writeBack  = flag.Bool("w", false, "write conversions back to original files")
+	// exitStatus is the status passed to os.Exit at the end of main.
+	// It starts at 0 and is set to 1 when a file cannot be opened or converted.
+	exitStatus = 0
+)
+
+// main parses the command line and converts each named file, or standard
+// input when no file is named.
+//
+// Failures on individual files are logged and do not stop the remaining
+// files from being processed; the process exits with status 1 if any
+// conversion failed and 0 otherwise.
+func main() {
+	log.SetPrefix("present2md: ")
+	log.SetFlags(0)
+	flag.Usage = usage
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) == 0 {
+		// There is no file to write back to when reading standard input.
+		if *writeBack {
+			log.Fatalf("cannot use -w with standard input")
+		}
+		// Report a failed conversion instead of silently exiting 0.
+		if err := convert(os.Stdin, "stdin", false); err != nil {
+			log.Fatal(err)
+		}
+		return
+	}
+
+	for _, arg := range args {
+		f, err := os.Open(arg)
+		if err != nil {
+			log.Print(err)
+			exitStatus = 1
+			continue
+		}
+		err = convert(f, arg, *writeBack)
+		// Close before looking at err: convert has already read all of f,
+		// and with -w it has reopened the file by name to write it.
+		f.Close()
+		if err != nil {
+			log.Print(err)
+			exitStatus = 1
+		}
+	}
+	os.Exit(exitStatus)
+}
+
+// convert reads the data from r, parses it as legacy present,
+// and converts it to Markdown-enabled present.
+// If any errors occur, the data is reported as coming from file.
+// If writeBack is true, the converted version is written back to file.
+// If writeBack is false, the converted version is printed to standard output.
+//
+// Input that already begins with "# " is taken to be Markdown-enabled
+// present and is rejected with an error rather than converted again.
+// Errors from reading the input, parsing it, or writing the result
+// are returned to the caller.
+func convert(r io.Reader, file string, writeBack bool) error {
+	data, err := ioutil.ReadAll(r)
+	if err != nil {
+		return err
+	}
+	if bytes.HasPrefix(data, []byte("# ")) {
+		return fmt.Errorf("%v: already markdown", file)
+	}
+
+	doc, err := present.Parse(bytes.NewReader(data), file, 0)
+	if err != nil {
+		return err
+	}
+
+	// Title and Subtitle, Time, Tags.
+	var md bytes.Buffer
+	fmt.Fprintf(&md, "# %s\n", doc.Title)
+	if doc.Subtitle != "" {
+		fmt.Fprintf(&md, "%s\n", doc.Subtitle)
+	}
+	if !doc.Time.IsZero() {
+		fmt.Fprintf(&md, "%s\n", doc.Time.Format("2 Jan 2006"))
+	}
+	if len(doc.Tags) > 0 {
+		fmt.Fprintf(&md, "Tags: %s\n", strings.Join(doc.Tags, ", "))
+	}
+
+	// Summary, defaulting to first paragraph of section.
+	// (Summaries must be explicit for Markdown-enabled present,
+	// and the expectation is that they will be shorter than the
+	// whole first paragraph. But this is what the blog does today.)
+	if strings.HasSuffix(file, ".article") && len(doc.Sections) > 0 {
+		for _, elem := range doc.Sections[0].Elem {
+			text, ok := elem.(present.Text)
+			if !ok || text.Pre {
+				// Skip everything except ordinary (non-preformatted) text.
+				continue
+			}
+			// Join the paragraph's lines into a single Summary line.
+			fmt.Fprintf(&md, "Summary:")
+			for i, line := range text.Lines {
+				fmt.Fprintf(&md, " ")
+				printStyled(&md, line, i == 0)
+			}
+			fmt.Fprintf(&md, "\n")
+			break
+		}
+	}
+
+	// Authors, each introduced by a blank line.
+	for _, a := range doc.Authors {
+		fmt.Fprintf(&md, "\n")
+		for _, elem := range a.Elem {
+			switch elem := elem.(type) {
+			default:
+				// Can only happen if this type switch is incomplete, which is a bug.
+				log.Fatalf("%s: unexpected author type %T", file, elem)
+			case present.Text:
+				for _, line := range elem.Lines {
+					fmt.Fprintf(&md, "%s\n", markdownEscape(line, true))
+				}
+			case present.Link:
+				fmt.Fprintf(&md, "%s\n", markdownEscape(elem.Label, true))
+			}
+		}
+	}
+
+	// Invariant: the output ends in non-blank line now,
+	// and after printing any piece of the file below,
+	// the output should still end in a non-blank line.
+	// If a blank line separator is needed, it should be printed
+	// before the block that needs separating, not after.
+
+	if len(doc.TitleNotes) > 0 {
+		fmt.Fprintf(&md, "\n")
+		for _, line := range doc.TitleNotes {
+			fmt.Fprintf(&md, ": %s\n", line)
+		}
+	}
+
+	if len(doc.Sections) == 1 && strings.HasSuffix(file, ".article") {
+		// Blog drops section headers when there is only one section.
+		// Don't print a title in this case, to make clear that it's being dropped.
+		fmt.Fprintf(&md, "\n##\n")
+		printSectionBody(file, 1, &md, doc.Sections[0].Elem)
+	} else {
+		for _, s := range doc.Sections {
+			fmt.Fprintf(&md, "\n")
+			fmt.Fprintf(&md, "## %s\n", markdownEscape(s.Title, false))
+			printSectionBody(file, 2, &md, s.Elem)
+		}
+	}
+
+	if !writeBack {
+		// Report a failed write (for example a closed pipe) to the caller
+		// instead of dropping it.
+		_, err := os.Stdout.Write(md.Bytes())
+		return err
+	}
+	return ioutil.WriteFile(file, md.Bytes(), 0666)
+}
+
+// printSectionBody writes the Markdown-enabled present form of elems to w.
+//
+// file is used only to label fatal diagnostics. depth controls the heading
+// level of nested sections: a subsection heading is written with depth+2
+// '#' characters, and its own body is printed with depth+1.
+//
+// Every element is preceded by its own blank-line separator, so the buffer
+// is expected to end in a non-blank line on entry and does so again on
+// return. The exception is a run of consecutive present commands, which
+// are kept on adjacent lines.
+func printSectionBody(file string, depth int, w *bytes.Buffer, elems []present.Elem) {
+	for _, elem := range elems {
+		switch elem := elem.(type) {
+		default:
+			// Can only happen if this type switch is incomplete, which is a bug.
+			log.Fatalf("%s: unexpected present element type %T", file, elem)
+
+		case present.Text:
+			fmt.Fprintf(w, "\n")
+			if elem.Pre {
+				// Preformatted text becomes a tab-indented code block.
+				// Blank lines are emitted without the tab so that no
+				// trailing whitespace is produced.
+				for _, line := range strings.Split(strings.TrimRight(elem.Raw, "\n"), "\n") {
+					if line == "" {
+						fmt.Fprintf(w, "\n")
+					} else {
+						fmt.Fprintf(w, "\t%s\n", line)
+					}
+				}
+			} else {
+				// Drop leading blank lines: the separator has already been
+				// printed, and another would leave two blank lines in a row.
+				lines := elem.Lines
+				for len(lines) > 0 && lines[0] == "" {
+					lines = lines[1:]
+				}
+				for _, line := range lines {
+					printStyled(w, line, true)
+					fmt.Fprintf(w, "\n")
+				}
+			}
+
+		case present.List:
+			fmt.Fprintf(w, "\n")
+			for _, item := range elem.Bullet {
+				fmt.Fprintf(w, "  - ")
+				// Continuation lines of an item are indented to line up
+				// with the text after the "  - " marker.
+				for i, line := range strings.Split(item, "\n") {
+					if i > 0 {
+						fmt.Fprintf(w, "    ")
+					}
+					printStyled(w, line, false)
+					fmt.Fprintf(w, "\n")
+				}
+			}
+
+		case present.Section:
+			fmt.Fprintf(w, "\n")
+			// An untitled subsection is written as bare '#' characters,
+			// with no trailing space.
+			sep := " "
+			if elem.Title == "" {
+				sep = ""
+			}
+			fmt.Fprintf(w, "%s%s%s\n", strings.Repeat("#", depth+2), sep, markdownEscape(elem.Title, false))
+			printSectionBody(file, depth+1, w, elem.Elem)
+
+		case interface{ PresentCmd() string }:
+			// If there are multiple present commands in a row, don't print a blank line before the second etc.
+			// Find the start of the last line already in the buffer (ignoring
+			// its final newline) and check whether it begins with '.'.
+			b := w.Bytes()
+			sep := "\n"
+			if len(b) > 0 {
+				i := bytes.LastIndexByte(b[:len(b)-1], '\n')
+				if b[i+1] == '.' {
+					sep = ""
+				}
+			}
+			fmt.Fprintf(w, "%s%s\n", sep, elem.PresentCmd())
+		}
+	}
+}
+
+// markdownEscape returns s with a backslash inserted before each character
+// that could otherwise be read as Markdown markup:
+//
+//   - '#' as the first byte of s,
+//   - every '*' and '_',
+//   - '<' that is not preceded by a space and is followed by a non-space byte,
+//   - '[' when "](" occurs at or after it in s.
+//
+// The startLine argument is accepted but not consulted: a leading '#'
+// is escaped whatever its value.
+func markdownEscape(s string, startLine bool) string {
+	var out strings.Builder
+	for pos, ch := range s {
+		escape := false
+		switch ch {
+		case '*', '_':
+			escape = true
+		case '#':
+			escape = pos == 0
+		case '<':
+			afterSpace := pos > 0 && s[pos-1] == ' '
+			beforeText := pos+1 < len(s) && s[pos+1] != ' '
+			escape = !afterSpace && beforeText
+		case '[':
+			escape = strings.Contains(s[pos:], "](")
+		}
+		if escape {
+			out.WriteByte('\\')
+		}
+		out.WriteRune(ch)
+	}
+	return out.String()
+}
+
+// Copy of ../../present/style.go adjusted to produce Markdown instead of HTML.
+
+/*
+	Fonts are demarcated by an initial and final char bracketing a
+	space-delimited word, plus possibly some terminal punctuation.
+	The chars are
+		_ for italic
+		* for bold
+		` (back quote) for fixed width.
+	Inner appearances of the char become spaces. For instance,
+		_this_is_italic_!
+	becomes
+		_this is italic_!
+*/
+
+// printStyled appends text to w after passing it through font.
+// startLine is handed to font unchanged.
+func printStyled(w *bytes.Buffer, text string, startLine bool) {
+	styled := font(text, startLine)
+	w.WriteString(styled)
+}
+
+// font returns s with legacy present font indicators (_italic_, *bold*,
+// `code`) and [[inline links]] rewritten as Markdown, and with the
+// remaining text escaped by markdownEscape.
+//
+// s is processed one word at a time, where words are the pieces returned
+// by split. A word that does not form a well-delimited font span is kept
+// as plain escaped text. startLine is passed to markdownEscape for the
+// first word only.
+func font(s string, startLine bool) string {
+	// Fast path: no marker or link characters, so escaping is all that is needed.
+	if !strings.ContainsAny(s, "[`_*") {
+		return markdownEscape(s, startLine)
+	}
+	words := split(s)
+	var b bytes.Buffer
+Word:
+	for w, word := range words {
+		// Default result for this word; every "continue Word" below leaves it in place.
+		words[w] = markdownEscape(word, startLine && w == 0) // for all the continue Word
+		if len(word) < 2 {
+			continue Word
+		}
+		// A word that is an inline link is replaced by its Markdown form.
+		if link, _ := parseInlineLink(word); link != "" {
+			words[w] = link
+			continue Word
+		}
+		const marker = "_*`"
+		// Initial punctuation is OK but must be peeled off.
+		first := strings.IndexAny(word, marker)
+		if first == -1 {
+			continue Word
+		}
+		// Opening marker must be at the beginning of the token or else preceded by punctuation.
+		if first != 0 {
+			r, _ := utf8.DecodeLastRuneInString(word[:first])
+			if !unicode.IsPunct(r) {
+				continue Word
+			}
+		}
+		// open accumulates the escaped leading punctuation plus the Markdown
+		// opening delimiter; word is narrowed to start at the marker.
+		open, word := markdownEscape(word[:first], startLine && w == 0), word[first:]
+		char := word[0] // ASCII is OK.
+		close := ""
+		switch char {
+		default:
+			continue Word
+		case '_':
+			open += "_"
+			close = "_"
+		case '*':
+			// Legacy *bold* is written as Markdown **bold**.
+			open += "**"
+			close = "**"
+		case '`':
+			open += "`"
+			close = "`"
+		}
+		// Closing marker must be at the end of the token or else followed by punctuation.
+		last := strings.LastIndex(word, word[:1])
+		if last == 0 {
+			// The only marker is the opening one; there is nothing to close.
+			continue Word
+		}
+		if last+1 != len(word) {
+			r, _ := utf8.DecodeRuneInString(word[last+1:])
+			if !unicode.IsPunct(r) {
+				continue Word
+			}
+		}
+		// head is the marked span including both markers; tail is the
+		// trailing punctuation, copied through unchanged.
+		head, tail := word[:last+1], word[last+1:]
+		b.Reset()
+		var wid int
+		// Walk the text between the two markers.
+		for i := 1; i < len(head)-1; i += wid {
+			var r rune
+			r, wid = utf8.DecodeRuneInString(head[i:])
+			if r != rune(char) {
+				// Ordinary character.
+				b.WriteRune(r)
+				continue
+			}
+			if head[i+1] != char {
+				// Inner char becomes space.
+				b.WriteRune(' ')
+				continue
+			}
+			// Doubled char becomes real char.
+			// Not worth worrying about "_x__".
+			b.WriteByte(char)
+			wid++ // Consumed two chars, both ASCII.
+		}
+		text := b.String()
+		if close == "`" {
+			// Code text is not escaped. Instead the backquote fence is
+			// lengthened until the text no longer contains it.
+			for strings.Contains(text, close) {
+				open += "`"
+				close += "`"
+			}
+		} else {
+			text = markdownEscape(text, false)
+		}
+		words[w] = open + text + close + tail
+	}
+	return strings.Join(words, "")
+}
+
+// split is like strings.Fields but also returns the runs of spaces
+// and treats inline links as distinct words.
+//
+// The returned pieces are consecutive substrings of s, so joining them
+// with the empty string reproduces s.
+func split(s string) []string {
+	var (
+		words = make([]string, 0, 10)
+		start = 0 // byte offset in s of the first byte not yet appended to words
+	)
+
+	// appendWord appends the string s[start:end] to the words slice.
+	// If the word contains the beginning of a link, the non-link portion
+	// of the word and the entire link are appended as separate words,
+	// and the start index is advanced to the end of the link.
+	appendWord := func(end int) {
+		if j := strings.Index(s[start:end], "[["); j > -1 {
+			// The link is parsed from the rest of s, not just s[start:end],
+			// so it may extend past end (for example when its text contains spaces).
+			if _, l := parseInlineLink(s[start+j:]); l > 0 {
+				// Append portion before link, if any.
+				if j > 0 {
+					words = append(words, s[start:start+j])
+				}
+				// Append link itself.
+				words = append(words, s[start+j:start+j+l])
+				// Advance start index to end of link.
+				start = start + j + l
+				return
+			}
+		}
+		// No link; just add the word.
+		words = append(words, s[start:end])
+		start = end
+	}
+
+	// Cut a word at every transition between space and non-space.
+	// The i > start test suppresses cuts at offsets that an earlier
+	// appendWord call has already consumed as part of a link.
+	wasSpace := false
+	for i, r := range s {
+		isSpace := unicode.IsSpace(r)
+		if i > start && isSpace != wasSpace {
+			appendWord(i)
+		}
+		wasSpace = isSpace
+	}
+	// Flush the remainder. This is a loop because appendWord may stop
+	// at the end of a link and leave more text after it.
+	for start < len(s) {
+		appendWord(len(s))
+	}
+	return words
+}
+
+// parseInlineLink recognizes a legacy inline link, either [[url]] or
+// [[url][text]], at the very start of s. On success it returns the
+// equivalent Markdown link and the number of bytes of s the raw link
+// occupies. If s does not start with a well-formed inline link, it
+// returns "" and 0.
+func parseInlineLink(s string) (link string, length int) {
+	if !strings.HasPrefix(s, "[[") {
+		return "", 0
+	}
+	closeAt := strings.Index(s, "]]")
+	if closeAt == -1 {
+		return "", 0
+	}
+
+	// The URL runs from just after "[[" up to the first ']'.
+	bracket := strings.Index(s, "]")
+	target := s[2:bracket]
+	const badURLChars = `<>"{}|\^[] ` + "`" // per RFC2396 section 2.4.3
+	if strings.ContainsAny(target, badURLChars) {
+		return "", 0
+	}
+	total := closeAt + 2
+
+	if bracket != closeAt {
+		// Two-part form: the URL must be followed immediately by "][".
+		if s[bracket:bracket+2] != "][" {
+			return "", 0
+		}
+		label := s[bracket+2 : closeAt]
+		return renderLink(target, label), total
+	}
+
+	// Bare form: use the URL minus its scheme prefix as the link text.
+	// In other words, render [[http://golang.org]] as:
+	//   [golang.org](http://golang.org)
+	label := ""
+	if u, err := url.Parse(target); err == nil {
+		withSlashes, withColon := u.Scheme+"://", u.Scheme+":"
+		switch {
+		case strings.HasPrefix(target, withSlashes):
+			label = strings.TrimPrefix(target, withSlashes)
+		case strings.HasPrefix(target, withColon):
+			label = strings.TrimPrefix(target, withColon)
+		}
+	}
+	return renderLink(target, label), total
+}
+
+// renderLink builds the Markdown link "[text](href)". The link text is
+// text after processing by font; if that comes out empty, the escaped
+// href is used as the link text instead.
+func renderLink(href, text string) string {
+	label := font(text, false)
+	if label == "" {
+		label = markdownEscape(href, false)
+	}
+	return fmt.Sprintf("[%s](%s)", label, href)
+}