diff options
-rw-r--r-- | cmd/html2article/conv.go | 350 |
1 files changed, 350 insertions, 0 deletions
diff --git a/cmd/html2article/conv.go b/cmd/html2article/conv.go new file mode 100644 index 0000000..1cb6f07 --- /dev/null +++ b/cmd/html2article/conv.go @@ -0,0 +1,350 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine + +// This program takes an HTML file and outputs an corresponding article file in +// present format. +package main + +import ( + "bufio" + "bytes" + "errors" + "flag" + "fmt" + "io" + "log" + "os" + "regexp" + "strings" + + "code.google.com/p/go.net/html" + "code.google.com/p/go.net/html/atom" +) + +func main() { + flag.Parse() + + err := convert(os.Stdout, os.Stdin) + if err != nil { + log.Fatal(err) + } +} + +func convert(w io.Writer, r io.Reader) error { + root, err := html.Parse(r) + if err != nil { + return err + } + + style := find(root, isTag(atom.Style)) + parseStyles(style) + + body := find(root, isTag(atom.Body)) + if body == nil { + return errors.New("couldn't find body") + } + article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) + _, err = fmt.Fprintf(w, "Title\n\n%s", article) + return err +} + +type Style string + +const ( + Bold Style = "*" + Italic Style = "_" + Code Style = "`" +) + +var cssRules = make(map[string]Style) + +func parseStyles(style *html.Node) { + if style == nil || style.FirstChild == nil { + log.Println("couldn't find styles") + return + } + s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data)) + + findRule := func(b []byte, atEOF bool) (advance int, token []byte, err error) { + if i := bytes.Index(b, []byte("{")); i >= 0 { + token = bytes.TrimSpace(b[:i]) + advance = i + } + return + } + findBody := func(b []byte, atEOF bool) (advance int, token []byte, err error) { + if len(b) == 0 { + return + } + if b[0] != '{' { + err = fmt.Errorf("expected {, got %c", b[0]) + return + } + if i := bytes.Index(b, []byte("}")); i < 0 { + err = fmt.Errorf("can't find closing }") + return + } else { + token = b[1:i] + advance = i + 1 + } + return + } + + s.Split(findRule) + for s.Scan() { + rule := s.Text() + s.Split(findBody) + if !s.Scan() { + break + } + b := strings.ToLower(s.Text()) + switch { + case strings.Contains(b, "italic"): + cssRules[rule] = Italic + case strings.Contains(b, "bold"): + cssRules[rule] = Bold + case strings.Contains(b, "Consolas") || strings.Contains(b, "Courier New"): + cssRules[rule] = Code + } + s.Split(findRule) + } + if err := s.Err(); err != nil { + log.Println(err) + } +} + +var newlineRun = regexp.MustCompile(`\n\n+`) + +func limitNewlineRuns(s string) string { + return newlineRun.ReplaceAllString(s, "\n\n") +} + +func makeHeadings(body string) string { + buf := new(bytes.Buffer) + lines := strings.Split(body, "\n") + for i, s := range lines { + if i == 0 && !isBoldTitle(s) { + buf.WriteString("* Introduction\n\n") + } + if isBoldTitle(s) { + s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) + s = "* " + s + } + buf.WriteString(s) + buf.WriteByte('\n') + } + return buf.String() +} + +func isBoldTitle(s string) bool { + return !strings.Contains(s, " ") && + strings.HasPrefix(s, "*") && + strings.HasSuffix(s, "*") +} + +func indent(buf *bytes.Buffer, s string) { + for _, l := range strings.Split(s, "\n") { + if l != "" { + buf.WriteByte('\t') + buf.WriteString(l) + } + buf.WriteByte('\n') + } +} + +func unwrap(buf *bytes.Buffer, s string) { + var cont bool + for _, l := range strings.Split(s, "\n") { + l = strings.TrimSpace(l) + if len(l) == 0 { + if cont { + buf.WriteByte('\n') + buf.WriteByte('\n') + } + cont = false + } else { + if cont { + buf.WriteByte(' ') + } + buf.WriteString(l) + cont = true + } + } +} + +func text(n *html.Node) string { + var buf bytes.Buffer + walk(n, func(n *html.Node) bool { + switch n.Type { + case html.TextNode: + buf.WriteString(n.Data) + return false + case html.ElementNode: + // no-op + default: + return true + } + a := n.DataAtom + if a == atom.Span { + switch { + case hasStyle(Code)(n): + a = atom.Code + case hasStyle(Bold)(n): + a = atom.B + case hasStyle(Italic)(n): + a = atom.I + } + } + switch a { + case atom.Br: + buf.WriteByte('\n') + case atom.P: + unwrap(&buf, childText(n)) + buf.WriteString("\n\n") + case atom.Li: + buf.WriteString("- ") + unwrap(&buf, childText(n)) + buf.WriteByte('\n') + case atom.Pre: + indent(&buf, childText(n)) + buf.WriteByte('\n') + case atom.A: + fmt.Fprintf(&buf, "[[%s][%s]]", attr(n, "href"), childText(n)) + case atom.Code: + buf.WriteString(highlight(n, "`")) + case atom.B: + buf.WriteString(highlight(n, "*")) + case atom.I: + buf.WriteString(highlight(n, "_")) + case atom.Img: + src := attr(n, "src") + fmt.Fprintf(&buf, ".image %s\n", src) + case atom.Iframe: + src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") + fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) + case atom.Param: + if attr(n, "name") == "movie" { + // Old style YouTube embed. + u := attr(n, "value") + u = strings.Replace(u, "/v/", "/embed/", 1) + if i := strings.Index(u, "&"); i >= 0 { + u = u[:i] + } + fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) + } + default: + return true + } + return false + }) + return buf.String() +} + +func childText(node *html.Node) string { + var buf bytes.Buffer + for n := node.FirstChild; n != nil; n = n.NextSibling { + fmt.Fprint(&buf, text(n)) + } + return buf.String() +} + +func highlight(node *html.Node, char string) string { + t := strings.Replace(childText(node), " ", char, -1) + return fmt.Sprintf("%s%s%s", char, t, char) +} + +type selector func(*html.Node) bool + +func isTag(a atom.Atom) selector { + return func(n *html.Node) bool { + return n.DataAtom == a + } +} + +func hasClass(name string) selector { + return func(n *html.Node) bool { + for _, a := range n.Attr { + if a.Key == "class" { + for _, c := range strings.Fields(a.Val) { + if c == name { + return true + } + } + } + } + return false + } +} + +func hasStyle(s Style) selector { + return func(n *html.Node) bool { + for rule, s2 := range cssRules { + if s2 != s { + continue + } + if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { + return true + } + if n.DataAtom.String() == rule { + return true + } + } + return false + } +} + +func hasAttr(key, val string) selector { + return func(n *html.Node) bool { + for _, a := range n.Attr { + if a.Key == key && a.Val == val { + return true + } + } + return false + } +} + +func attr(node *html.Node, key string) (value string) { + for _, attr := range node.Attr { + if attr.Key == key { + return attr.Val + } + } + return "" +} + +func findAll(node *html.Node, fn selector) (nodes []*html.Node) { + walk(node, func(n *html.Node) bool { + if fn(n) { + nodes = append(nodes, n) + } + return true + }) + return +} + +func find(n *html.Node, fn selector) *html.Node { + var result *html.Node + walk(n, func(n *html.Node) bool { + if result != nil { + return false + } + if fn(n) { + result = n + return false + } + return true + }) + return result +} + +func walk(n *html.Node, fn selector) { + if fn(n) { + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c, fn) + } + } +} |