1
0
mirror of https://github.com/golang/go synced 2024-11-05 20:36:10 -07:00
go/cmd/html2article/conv.go

349 lines
6.8 KiB
Go
Raw Normal View History

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This program takes an HTML file and outputs a corresponding article file in
// present format. See: code.google.com/p/go.tools/present
package main
import (
"bufio"
"bytes"
"errors"
"flag"
"fmt"
"io"
"log"
"os"
"regexp"
"strings"
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/atom"
)
func main() {
flag.Parse()
err := convert(os.Stdout, os.Stdin)
if err != nil {
log.Fatal(err)
}
}
func convert(w io.Writer, r io.Reader) error {
root, err := html.Parse(r)
if err != nil {
return err
}
style := find(root, isTag(atom.Style))
parseStyles(style)
body := find(root, isTag(atom.Body))
if body == nil {
return errors.New("couldn't find body")
}
article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
_, err = fmt.Fprintf(w, "Title\n\n%s", article)
return err
}
type Style string
const (
Bold Style = "*"
Italic Style = "_"
Code Style = "`"
)
var cssRules = make(map[string]Style)
func parseStyles(style *html.Node) {
if style == nil || style.FirstChild == nil {
log.Println("couldn't find styles")
return
}
s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data))
findRule := func(b []byte, atEOF bool) (advance int, token []byte, err error) {
if i := bytes.Index(b, []byte("{")); i >= 0 {
token = bytes.TrimSpace(b[:i])
advance = i
}
return
}
findBody := func(b []byte, atEOF bool) (advance int, token []byte, err error) {
if len(b) == 0 {
return
}
if b[0] != '{' {
err = fmt.Errorf("expected {, got %c", b[0])
return
}
if i := bytes.Index(b, []byte("}")); i < 0 {
err = fmt.Errorf("can't find closing }")
return
} else {
token = b[1:i]
advance = i + 1
}
return
}
s.Split(findRule)
for s.Scan() {
rule := s.Text()
s.Split(findBody)
if !s.Scan() {
break
}
b := strings.ToLower(s.Text())
switch {
case strings.Contains(b, "italic"):
cssRules[rule] = Italic
case strings.Contains(b, "bold"):
cssRules[rule] = Bold
case strings.Contains(b, "Consolas") || strings.Contains(b, "Courier New"):
cssRules[rule] = Code
}
s.Split(findRule)
}
if err := s.Err(); err != nil {
log.Println(err)
}
}
var newlineRun = regexp.MustCompile(`\n\n+`)
func limitNewlineRuns(s string) string {
return newlineRun.ReplaceAllString(s, "\n\n")
}
func makeHeadings(body string) string {
buf := new(bytes.Buffer)
lines := strings.Split(body, "\n")
for i, s := range lines {
if i == 0 && !isBoldTitle(s) {
buf.WriteString("* Introduction\n\n")
}
if isBoldTitle(s) {
s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
s = "* " + s
}
buf.WriteString(s)
buf.WriteByte('\n')
}
return buf.String()
}
func isBoldTitle(s string) bool {
return !strings.Contains(s, " ") &&
strings.HasPrefix(s, "*") &&
strings.HasSuffix(s, "*")
}
func indent(buf *bytes.Buffer, s string) {
for _, l := range strings.Split(s, "\n") {
if l != "" {
buf.WriteByte('\t')
buf.WriteString(l)
}
buf.WriteByte('\n')
}
}
func unwrap(buf *bytes.Buffer, s string) {
var cont bool
for _, l := range strings.Split(s, "\n") {
l = strings.TrimSpace(l)
if len(l) == 0 {
if cont {
buf.WriteByte('\n')
buf.WriteByte('\n')
}
cont = false
} else {
if cont {
buf.WriteByte(' ')
}
buf.WriteString(l)
cont = true
}
}
}
func text(n *html.Node) string {
var buf bytes.Buffer
walk(n, func(n *html.Node) bool {
switch n.Type {
case html.TextNode:
buf.WriteString(n.Data)
return false
case html.ElementNode:
// no-op
default:
return true
}
a := n.DataAtom
if a == atom.Span {
switch {
case hasStyle(Code)(n):
a = atom.Code
case hasStyle(Bold)(n):
a = atom.B
case hasStyle(Italic)(n):
a = atom.I
}
}
switch a {
case atom.Br:
buf.WriteByte('\n')
case atom.P:
unwrap(&buf, childText(n))
buf.WriteString("\n\n")
case atom.Li:
buf.WriteString("- ")
unwrap(&buf, childText(n))
buf.WriteByte('\n')
case atom.Pre:
indent(&buf, childText(n))
buf.WriteByte('\n')
case atom.A:
fmt.Fprintf(&buf, "[[%s][%s]]", attr(n, "href"), childText(n))
case atom.Code:
buf.WriteString(highlight(n, "`"))
case atom.B:
buf.WriteString(highlight(n, "*"))
case atom.I:
buf.WriteString(highlight(n, "_"))
case atom.Img:
src := attr(n, "src")
fmt.Fprintf(&buf, ".image %s\n", src)
case atom.Iframe:
src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
case atom.Param:
if attr(n, "name") == "movie" {
// Old style YouTube embed.
u := attr(n, "value")
u = strings.Replace(u, "/v/", "/embed/", 1)
if i := strings.Index(u, "&"); i >= 0 {
u = u[:i]
}
fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
}
default:
return true
}
return false
})
return buf.String()
}
func childText(node *html.Node) string {
var buf bytes.Buffer
for n := node.FirstChild; n != nil; n = n.NextSibling {
fmt.Fprint(&buf, text(n))
}
return buf.String()
}
func highlight(node *html.Node, char string) string {
t := strings.Replace(childText(node), " ", char, -1)
return fmt.Sprintf("%s%s%s", char, t, char)
}
type selector func(*html.Node) bool
func isTag(a atom.Atom) selector {
return func(n *html.Node) bool {
return n.DataAtom == a
}
}
func hasClass(name string) selector {
return func(n *html.Node) bool {
for _, a := range n.Attr {
if a.Key == "class" {
for _, c := range strings.Fields(a.Val) {
if c == name {
return true
}
}
}
}
return false
}
}
func hasStyle(s Style) selector {
return func(n *html.Node) bool {
for rule, s2 := range cssRules {
if s2 != s {
continue
}
if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
return true
}
if n.DataAtom.String() == rule {
return true
}
}
return false
}
}
func hasAttr(key, val string) selector {
return func(n *html.Node) bool {
for _, a := range n.Attr {
if a.Key == key && a.Val == val {
return true
}
}
return false
}
}
func attr(node *html.Node, key string) (value string) {
for _, attr := range node.Attr {
if attr.Key == key {
return attr.Val
}
}
return ""
}
func findAll(node *html.Node, fn selector) (nodes []*html.Node) {
walk(node, func(n *html.Node) bool {
if fn(n) {
nodes = append(nodes, n)
}
return true
})
return
}
func find(n *html.Node, fn selector) *html.Node {
var result *html.Node
walk(n, func(n *html.Node) bool {
if result != nil {
return false
}
if fn(n) {
result = n
return false
}
return true
})
return result
}
func walk(n *html.Node, fn selector) {
if fn(n) {
for c := n.FirstChild; c != nil; c = c.NextSibling {
walk(c, fn)
}
}
}