// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // This program takes an HTML file and outputs a corresponding article file in // present format. See: code.google.com/p/go.tools/present package main import ( "bufio" "bytes" "errors" "flag" "fmt" "io" "log" "os" "regexp" "strings" "code.google.com/p/go.net/html" "code.google.com/p/go.net/html/atom" ) func main() { flag.Parse() err := convert(os.Stdout, os.Stdin) if err != nil { log.Fatal(err) } } func convert(w io.Writer, r io.Reader) error { root, err := html.Parse(r) if err != nil { return err } style := find(root, isTag(atom.Style)) parseStyles(style) body := find(root, isTag(atom.Body)) if body == nil { return errors.New("couldn't find body") } article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) _, err = fmt.Fprintf(w, "Title\n\n%s", article) return err } type Style string const ( Bold Style = "*" Italic Style = "_" Code Style = "`" ) var cssRules = make(map[string]Style) func parseStyles(style *html.Node) { if style == nil || style.FirstChild == nil { log.Println("couldn't find styles") return } s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data)) findRule := func(b []byte, atEOF bool) (advance int, token []byte, err error) { if i := bytes.Index(b, []byte("{")); i >= 0 { token = bytes.TrimSpace(b[:i]) advance = i } return } findBody := func(b []byte, atEOF bool) (advance int, token []byte, err error) { if len(b) == 0 { return } if b[0] != '{' { err = fmt.Errorf("expected {, got %c", b[0]) return } if i := bytes.Index(b, []byte("}")); i < 0 { err = fmt.Errorf("can't find closing }") return } else { token = b[1:i] advance = i + 1 } return } s.Split(findRule) for s.Scan() { rule := s.Text() s.Split(findBody) if !s.Scan() { break } b := strings.ToLower(s.Text()) switch { case strings.Contains(b, "italic"): cssRules[rule] = Italic case strings.Contains(b, "bold"): cssRules[rule] = Bold case strings.Contains(b, "Consolas") || strings.Contains(b, "Courier New"): cssRules[rule] = Code } s.Split(findRule) } if err := s.Err(); err != nil { log.Println(err) } } var newlineRun = regexp.MustCompile(`\n\n+`) func limitNewlineRuns(s string) string { return newlineRun.ReplaceAllString(s, "\n\n") } func makeHeadings(body string) string { buf := new(bytes.Buffer) lines := strings.Split(body, "\n") for i, s := range lines { if i == 0 && !isBoldTitle(s) { buf.WriteString("* Introduction\n\n") } if isBoldTitle(s) { s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) s = "* " + s } buf.WriteString(s) buf.WriteByte('\n') } return buf.String() } func isBoldTitle(s string) bool { return !strings.Contains(s, " ") && strings.HasPrefix(s, "*") && strings.HasSuffix(s, "*") } func indent(buf *bytes.Buffer, s string) { for _, l := range strings.Split(s, "\n") { if l != "" { buf.WriteByte('\t') buf.WriteString(l) } buf.WriteByte('\n') } } func unwrap(buf *bytes.Buffer, s string) { var cont bool for _, l := range strings.Split(s, "\n") { l = strings.TrimSpace(l) if len(l) == 0 { if cont { buf.WriteByte('\n') buf.WriteByte('\n') } cont = false } else { if cont { buf.WriteByte(' ') } buf.WriteString(l) cont = true } } } func text(n *html.Node) string { var buf bytes.Buffer walk(n, func(n *html.Node) bool { switch n.Type { case html.TextNode: buf.WriteString(n.Data) return false case html.ElementNode: // no-op default: return true } a := n.DataAtom if a == atom.Span { switch { case hasStyle(Code)(n): a = atom.Code case hasStyle(Bold)(n): a = atom.B case hasStyle(Italic)(n): a = atom.I } } switch a { case atom.Br: buf.WriteByte('\n') case atom.P: unwrap(&buf, childText(n)) buf.WriteString("\n\n") case atom.Li: buf.WriteString("- ") unwrap(&buf, childText(n)) buf.WriteByte('\n') case atom.Pre: indent(&buf, childText(n)) buf.WriteByte('\n') case atom.A: fmt.Fprintf(&buf, "[[%s][%s]]", attr(n, "href"), childText(n)) case atom.Code: buf.WriteString(highlight(n, "`")) case atom.B: buf.WriteString(highlight(n, "*")) case atom.I: buf.WriteString(highlight(n, "_")) case atom.Img: src := attr(n, "src") fmt.Fprintf(&buf, ".image %s\n", src) case atom.Iframe: src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) case atom.Param: if attr(n, "name") == "movie" { // Old style YouTube embed. u := attr(n, "value") u = strings.Replace(u, "/v/", "/embed/", 1) if i := strings.Index(u, "&"); i >= 0 { u = u[:i] } fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) } default: return true } return false }) return buf.String() } func childText(node *html.Node) string { var buf bytes.Buffer for n := node.FirstChild; n != nil; n = n.NextSibling { fmt.Fprint(&buf, text(n)) } return buf.String() } func highlight(node *html.Node, char string) string { t := strings.Replace(childText(node), " ", char, -1) return fmt.Sprintf("%s%s%s", char, t, char) } type selector func(*html.Node) bool func isTag(a atom.Atom) selector { return func(n *html.Node) bool { return n.DataAtom == a } } func hasClass(name string) selector { return func(n *html.Node) bool { for _, a := range n.Attr { if a.Key == "class" { for _, c := range strings.Fields(a.Val) { if c == name { return true } } } } return false } } func hasStyle(s Style) selector { return func(n *html.Node) bool { for rule, s2 := range cssRules { if s2 != s { continue } if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { return true } if n.DataAtom.String() == rule { return true } } return false } } func hasAttr(key, val string) selector { return func(n *html.Node) bool { for _, a := range n.Attr { if a.Key == key && a.Val == val { return true } } return false } } func attr(node *html.Node, key string) (value string) { for _, attr := range node.Attr { if attr.Key == key { return attr.Val } } return "" } func findAll(node *html.Node, fn selector) (nodes []*html.Node) { walk(node, func(n *html.Node) bool { if fn(n) { nodes = append(nodes, n) } return true }) return } func find(n *html.Node, fn selector) *html.Node { var result *html.Node walk(n, func(n *html.Node) bool { if result != nil { return false } if fn(n) { result = n return false } return true }) return result } func walk(n *html.Node, fn selector) { if fn(n) { for c := n.FirstChild; c != nil; c = c.NextSibling { walk(c, fn) } } }