aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/html2article/conv.go350
1 files changed, 350 insertions, 0 deletions
diff --git a/cmd/html2article/conv.go b/cmd/html2article/conv.go
new file mode 100644
index 0000000..1cb6f07
--- /dev/null
+++ b/cmd/html2article/conv.go
@@ -0,0 +1,350 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+
+// This program takes an HTML file and outputs a corresponding article file in
+// present format.
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "errors"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "regexp"
+ "strings"
+
+ "code.google.com/p/go.net/html"
+ "code.google.com/p/go.net/html/atom"
+)
+
+// main parses the command-line flags and then converts the HTML document
+// read from standard input into an article written to standard output.
+// A conversion error is logged and terminates the program.
+func main() {
+	flag.Parse()
+
+	if err := convert(os.Stdout, os.Stdin); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// convert parses the HTML document read from r and writes the equivalent
+// article to w. The output starts with a placeholder "Title" line followed
+// by the converted body text.
+//
+// It returns the parse error, an error if the document has no body element,
+// or the error from writing to w.
+func convert(w io.Writer, r io.Reader) error {
+	root, err := html.Parse(r)
+	if err != nil {
+		return err
+	}
+
+	// Load the style rules first: text consults them (through hasStyle)
+	// while it walks the body.
+	parseStyles(find(root, isTag(atom.Style)))
+
+	body := find(root, isTag(atom.Body))
+	if body == nil {
+		return errors.New("couldn't find body")
+	}
+
+	raw := strings.TrimSpace(text(body))
+	article := limitNewlineRuns(makeHeadings(raw))
+	if _, err := fmt.Fprintf(w, "Title\n\n%s", article); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Style identifies a kind of text emphasis that parseStyles can recognize in
+// the document's CSS.
+type Style string
+
+// The string values mirror the delimiter characters that text passes to
+// highlight for the corresponding code, b and i elements.
+const (
+ Bold Style = "*"
+ Italic Style = "_"
+ Code Style = "`"
+)
+
+// cssRules maps a CSS selector, as written in the document's style element,
+// to the Style implied by its declaration body. It is filled in by
+// parseStyles and consulted by hasStyle.
+var cssRules = make(map[string]Style)
+
+// parseStyles reads the CSS text held in the first child of the given style
+// element and records in cssRules which selectors imply italic, bold or
+// code (monospace) formatting.
+//
+// A nil style element, or one without children, is logged and ignored. A
+// rule whose closing brace is missing is logged and ends parsing; rules
+// seen before it are kept.
+func parseStyles(style *html.Node) {
+	if style == nil || style.FirstChild == nil {
+		log.Println("couldn't find styles")
+		return
+	}
+	s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data))
+
+	// One split function alternates between yielding a selector and
+	// yielding the declaration body that follows it. The split function
+	// cannot be swapped between tokens because bufio.Scanner.Split panics
+	// once scanning has started.
+	inBody := false
+	s.Split(func(b []byte, atEOF bool) (advance int, token []byte, err error) {
+		if !inBody {
+			i := bytes.IndexByte(b, '{')
+			if i < 0 {
+				// No rule starts in the buffered data: ask for more
+				// input, or let the scan finish at EOF.
+				return 0, nil, nil
+			}
+			inBody = true
+			// The '{' is left unconsumed so the body token starts on it.
+			// The token must be non-nil even for an empty selector,
+			// otherwise the Scanner treats it as "no token yet".
+			return i, append([]byte{}, bytes.TrimSpace(b[:i])...), nil
+		}
+
+		// In this state b always begins with the '{' left by the
+		// selector branch above.
+		i := bytes.IndexByte(b, '}')
+		if i < 0 {
+			if !atEOF {
+				// The closing brace may simply not be buffered yet.
+				return 0, nil, nil
+			}
+			return 0, nil, errors.New("can't find closing }")
+		}
+		inBody = false
+		return i + 1, b[1:i], nil
+	})
+
+	for s.Scan() {
+		rule := s.Text()
+		if !s.Scan() {
+			break
+		}
+		// The body is lower-cased, so every needle below must be
+		// lower-case too.
+		b := strings.ToLower(s.Text())
+		switch {
+		case strings.Contains(b, "italic"):
+			cssRules[rule] = Italic
+		case strings.Contains(b, "bold"):
+			cssRules[rule] = Bold
+		case strings.Contains(b, "consolas") || strings.Contains(b, "courier new"):
+			cssRules[rule] = Code
+		}
+	}
+	if err := s.Err(); err != nil {
+		log.Println(err)
+	}
+}
+
+// newlineRun matches two or more consecutive newline characters.
+var newlineRun = regexp.MustCompile(`\n\n+`)
+
+// limitNewlineRuns returns s with every run of two or more newlines
+// collapsed to exactly two, so that at most one blank line separates any
+// two lines of text.
+func limitNewlineRuns(s string) string {
+	const blankLine = "\n\n"
+	return newlineRun.ReplaceAllLiteralString(s, blankLine)
+}
+
+func makeHeadings(body string) string {
+ buf := new(bytes.Buffer)
+ lines := strings.Split(body, "\n")
+ for i, s := range lines {
+ if i == 0 && !isBoldTitle(s) {
+ buf.WriteString("* Introduction\n\n")
+ }
+ if isBoldTitle(s) {
+ s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
+ s = "* " + s
+ }
+ buf.WriteString(s)
+ buf.WriteByte('\n')
+ }
+ return buf.String()
+}
+
+// isBoldTitle reports whether s looks like a line consisting of a single
+// bold phrase: it contains no spaces, starts and ends with an asterisk,
+// and has at least one other character between the markers.
+func isBoldTitle(s string) bool {
+	if strings.Contains(s, " ") {
+		return false
+	}
+	if !strings.HasPrefix(s, "*") || !strings.HasSuffix(s, "*") {
+		return false
+	}
+	// A string made only of asterisks (such as "*" or "**") carries no
+	// title text and would otherwise turn into an empty heading.
+	return strings.Trim(s, "*") != ""
+}
+
+// indent writes each line of s to buf, prefixing every non-empty line with
+// a tab. Empty lines are written as a bare newline. Every element produced
+// by splitting s on "\n" yields one output line, so a trailing newline in s
+// produces an extra empty line.
+func indent(buf *bytes.Buffer, s string) {
+	for _, line := range strings.Split(s, "\n") {
+		if line == "" {
+			buf.WriteByte('\n')
+			continue
+		}
+		buf.WriteString("\t" + line + "\n")
+	}
+}
+
+func unwrap(buf *bytes.Buffer, s string) {
+ var cont bool
+ for _, l := range strings.Split(s, "\n") {
+ l = strings.TrimSpace(l)
+ if len(l) == 0 {
+ if cont {
+ buf.WriteByte('\n')
+ buf.WriteByte('\n')
+ }
+ cont = false
+ } else {
+ if cont {
+ buf.WriteByte(' ')
+ }
+ buf.WriteString(l)
+ cont = true
+ }
+ }
+}
+
+// text returns the article markup for the tree rooted at n.
+//
+// Text nodes are copied verbatim. Elements with a known conversion (br, p,
+// li, pre, a, code, b, i, img, iframe, param) are rendered by this function
+// and their children are not visited again by the walk; a param element is
+// consumed even when it produces no output. Any other node produces no
+// output of its own and the walk continues into its children. A span
+// element is treated as code, b or i when hasStyle reports a matching
+// style for it, checked in that order.
+func text(n *html.Node) string {
+	var out bytes.Buffer
+	walk(n, func(node *html.Node) bool {
+		if node.Type == html.TextNode {
+			out.WriteString(node.Data)
+			return false
+		}
+		if node.Type != html.ElementNode {
+			return true
+		}
+
+		// Map styled spans onto the element they imitate.
+		tag := node.DataAtom
+		if tag == atom.Span {
+			if hasStyle(Code)(node) {
+				tag = atom.Code
+			} else if hasStyle(Bold)(node) {
+				tag = atom.B
+			} else if hasStyle(Italic)(node) {
+				tag = atom.I
+			}
+		}
+
+		switch tag {
+		case atom.Br:
+			out.WriteByte('\n')
+		case atom.P:
+			unwrap(&out, childText(node))
+			out.WriteString("\n\n")
+		case atom.Li:
+			out.WriteString("- ")
+			unwrap(&out, childText(node))
+			out.WriteByte('\n')
+		case atom.Pre:
+			indent(&out, childText(node))
+			out.WriteByte('\n')
+		case atom.A:
+			fmt.Fprintf(&out, "[[%s][%s]]", attr(node, "href"), childText(node))
+		case atom.Code:
+			out.WriteString(highlight(node, "`"))
+		case atom.B:
+			out.WriteString(highlight(node, "*"))
+		case atom.I:
+			out.WriteString(highlight(node, "_"))
+		case atom.Img:
+			fmt.Fprintf(&out, ".image %s\n", attr(node, "src"))
+		case atom.Iframe:
+			// The height is written before the width.
+			fmt.Fprintf(&out, "\n.iframe %s %s %s\n",
+				attr(node, "src"), attr(node, "height"), attr(node, "width"))
+		case atom.Param:
+			if attr(node, "name") == "movie" {
+				// Old style YouTube embed.
+				movie := strings.Replace(attr(node, "value"), "/v/", "/embed/", 1)
+				if amp := strings.Index(movie, "&"); amp >= 0 {
+					movie = movie[:amp]
+				}
+				fmt.Fprintf(&out, "\n.iframe %s 540 304\n", movie)
+			}
+		default:
+			return true
+		}
+		return false
+	})
+	return out.String()
+}
+
+// childText returns the concatenation of text applied to each direct child
+// of node, in document order. The node itself contributes nothing.
+func childText(node *html.Node) string {
+	var out bytes.Buffer
+	child := node.FirstChild
+	for child != nil {
+		out.WriteString(text(child))
+		child = child.NextSibling
+	}
+	return out.String()
+}
+
+// highlight returns the text of node's children wrapped in char, with every
+// space inside the text also replaced by char.
+func highlight(node *html.Node, char string) string {
+	inner := strings.Replace(childText(node), " ", char, -1)
+	return char + inner + char
+}
+
+// selector reports whether a node satisfies some predicate. walk also uses
+// a selector's result to decide whether to descend into a node's children.
+type selector func(*html.Node) bool
+
+// isTag returns a selector that matches nodes whose DataAtom equals a.
+func isTag(a atom.Atom) selector {
+	match := func(n *html.Node) bool { return n.DataAtom == a }
+	return match
+}
+
+// hasClass returns a selector that matches nodes having name among the
+// whitespace-separated words of any of their "class" attributes.
+func hasClass(name string) selector {
+	return func(n *html.Node) bool {
+		for _, a := range n.Attr {
+			if a.Key != "class" {
+				continue
+			}
+			for _, class := range strings.Fields(a.Val) {
+				if class == name {
+					return true
+				}
+			}
+		}
+		return false
+	}
+}
+
+// hasStyle returns a selector that matches nodes to which some rule in
+// cssRules assigns the style s. A rule applies to a node when it is a
+// ".class" selector naming one of the node's classes, or when it equals
+// the node's tag name.
+func hasStyle(s Style) selector {
+	return func(n *html.Node) bool {
+		tag := n.DataAtom.String()
+		for rule, ruleStyle := range cssRules {
+			switch {
+			case ruleStyle != s:
+				// The rule describes a different style; skip it.
+			case rule == tag:
+				return true
+			case strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n):
+				return true
+			}
+		}
+		return false
+	}
+}
+
+// hasAttr returns a selector that matches nodes carrying an attribute whose
+// key is key and whose value is exactly val.
+func hasAttr(key, val string) selector {
+	return func(n *html.Node) bool {
+		for i := range n.Attr {
+			if a := &n.Attr[i]; a.Key == key && a.Val == val {
+				return true
+			}
+		}
+		return false
+	}
+}
+
+// attr returns the value of the first attribute of node whose key is key,
+// or the empty string if the node has no such attribute.
+func attr(node *html.Node, key string) (value string) {
+	for i := range node.Attr {
+		if a := node.Attr[i]; a.Key == key {
+			value = a.Val
+			break
+		}
+	}
+	return value
+}
+
+// findAll returns every node in the tree rooted at node, the root included,
+// for which fn reports true, in the order walk visits them. The whole tree
+// is always traversed.
+func findAll(node *html.Node, fn selector) (nodes []*html.Node) {
+	collect := func(n *html.Node) bool {
+		if fn(n) {
+			nodes = append(nodes, n)
+		}
+		// Keep descending whether or not this node matched.
+		return true
+	}
+	walk(node, collect)
+	return nodes
+}
+
+// find returns the first node in the tree rooted at n, in walk order, for
+// which fn reports true, or nil if there is none. Once a match has been
+// recorded fn is not called again and no further subtrees are descended
+// into.
+func find(n *html.Node, fn selector) *html.Node {
+	var found *html.Node
+	walk(n, func(c *html.Node) bool {
+		switch {
+		case found != nil:
+			return false
+		case fn(c):
+			found = c
+			return false
+		}
+		return true
+	})
+	return found
+}
+
+// walk calls fn on n and, if fn returns true, recursively walks each of
+// n's children in order. A false result prunes the subtree below n.
+func walk(n *html.Node, fn selector) {
+	if !fn(n) {
+		return
+	}
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		walk(child, fn)
+	}
+}