diff options
Diffstat (limited to 'vendor/github.com/jaytaylor/html2text/html2text.go')
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/html2text.go | 300 |
1 files changed, 300 insertions, 0 deletions
diff --git a/vendor/github.com/jaytaylor/html2text/html2text.go b/vendor/github.com/jaytaylor/html2text/html2text.go new file mode 100644 index 00000000..66454cfc --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/html2text.go @@ -0,0 +1,300 @@ +package html2text + +import ( + "bytes" + "io" + "regexp" + "strings" + "unicode" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +var ( + spacingRe = regexp.MustCompile(`[ \r\n\t]+`) + newlineRe = regexp.MustCompile(`\n\n+`) +) + +type textifyTraverseCtx struct { + Buf bytes.Buffer + + prefix string + blockquoteLevel int + lineLength int + endsWithSpace bool + endsWithNewline bool + justClosedDiv bool +} + +func (ctx *textifyTraverseCtx) traverse(node *html.Node) error { + switch node.Type { + + default: + return ctx.traverseChildren(node) + + case html.TextNode: + data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ") + return ctx.emit(data) + + case html.ElementNode: + + ctx.justClosedDiv = false + switch node.DataAtom { + case atom.Br: + return ctx.emit("\n") + + case atom.H1, atom.H2, atom.H3: + subCtx := textifyTraverseCtx{} + if err := subCtx.traverseChildren(node); err != nil { + return err + } + + str := subCtx.Buf.String() + dividerLen := 0 + for _, line := range strings.Split(str, "\n") { + if lineLen := len([]rune(line)); lineLen-1 > dividerLen { + dividerLen = lineLen - 1 + } + } + divider := "" + if node.DataAtom == atom.H1 { + divider = strings.Repeat("*", dividerLen) + } else { + divider = strings.Repeat("-", dividerLen) + } + + if node.DataAtom == atom.H3 { + return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") + } + return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") + + case atom.Blockquote: + ctx.blockquoteLevel++ + ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " + if err := ctx.emit("\n"); err != nil { + return err + } + if ctx.blockquoteLevel == 1 { + if err := ctx.emit("\n"); err != nil { + return err + } + } + if err := ctx.traverseChildren(node); err != nil { + return err + } + ctx.blockquoteLevel-- + ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + if ctx.blockquoteLevel > 0 { + ctx.prefix += " " + } + return ctx.emit("\n\n") + + case atom.Div: + if ctx.lineLength > 0 { + if err := ctx.emit("\n"); err != nil { + return err + } + } + if err := ctx.traverseChildren(node); err != nil { + return err + } + var err error + if ctx.justClosedDiv == false { + err = ctx.emit("\n") + } + ctx.justClosedDiv = true + return err + + case atom.Li: + if err := ctx.emit("* "); err != nil { + return err + } + + if err := ctx.traverseChildren(node); err != nil { + return err + } + + return ctx.emit("\n") + + case atom.B, atom.Strong: + subCtx := textifyTraverseCtx{} + subCtx.endsWithSpace = true + if err := subCtx.traverseChildren(node); err != nil { + return err + } + str := subCtx.Buf.String() + return ctx.emit("*" + str + "*") + + case atom.A: + // If image is the only child, take its alt text as the link text + if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { + if altText := getAttrVal(img, "alt"); altText != "" { + ctx.emit(altText) + } + } else if err := ctx.traverseChildren(node); err != nil { + return err + } + + hrefLink := "" + if attrVal := getAttrVal(node, "href"); attrVal != "" { + attrVal = ctx.normalizeHrefLink(attrVal) + if attrVal != "" { + hrefLink = "( " + attrVal + " )" + } + } + + return ctx.emit(hrefLink) + + case atom.P, atom.Ul, atom.Table: + if err := ctx.emit("\n\n"); err != nil { + return err + } + + if err := ctx.traverseChildren(node); err != nil { + return err + } + + return ctx.emit("\n\n") + + case atom.Tr: + if err := ctx.traverseChildren(node); err != nil { + return err + } + + return ctx.emit("\n") + + case atom.Style, atom.Script, atom.Head: + // Ignore the subtree + return nil + + default: + return ctx.traverseChildren(node) + } + } +} + +func (ctx *textifyTraverseCtx) traverseChildren(node *html.Node) error { + for c := node.FirstChild; c != nil; c = c.NextSibling { + if err := ctx.traverse(c); err != nil { + return err + } + } + + return nil +} + +func (ctx *textifyTraverseCtx) emit(data string) error { + if len(data) == 0 { + return nil + } + lines := ctx.breakLongLines(data) + var err error + for _, line := range lines { + runes := []rune(line) + startsWithSpace := unicode.IsSpace(runes[0]) + if !startsWithSpace && !ctx.endsWithSpace { + ctx.Buf.WriteByte(' ') + ctx.lineLength++ + } + ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) + for _, c := range line { + _, err = ctx.Buf.WriteString(string(c)) + if err != nil { + return err + } + ctx.lineLength++ + if c == '\n' { + ctx.lineLength = 0 + if ctx.prefix != "" { + _, err = ctx.Buf.WriteString(ctx.prefix) + if err != nil { + return err + } + } + } + } + } + return nil +} + +func (ctx *textifyTraverseCtx) breakLongLines(data string) []string { + // only break lines when we are in blockquotes + if ctx.blockquoteLevel == 0 { + return []string{data} + } + var ret []string + runes := []rune(data) + l := len(runes) + existing := ctx.lineLength + if existing >= 74 { + ret = append(ret, "\n") + existing = 0 + } + for l+existing > 74 { + i := 74 - existing + for i >= 0 && !unicode.IsSpace(runes[i]) { + i-- + } + if i == -1 { + // no spaces, so go the other way + i = 74 - existing + for i < l && !unicode.IsSpace(runes[i]) { + i++ + } + } + ret = append(ret, string(runes[:i])+"\n") + for i < l && unicode.IsSpace(runes[i]) { + i++ + } + runes = runes[i:] + l = len(runes) + existing = 0 + } + if len(runes) > 0 { + ret = append(ret, string(runes)) + } + return ret +} + +func (ctx *textifyTraverseCtx) normalizeHrefLink(link string) string { + link = strings.TrimSpace(link) + link = strings.TrimPrefix(link, "mailto:") + return link +} + +func getAttrVal(node *html.Node, attrName string) string { + for _, attr := range node.Attr { + if attr.Key == attrName { + return attr.Val + } + } + + return "" +} + +func FromReader(reader io.Reader) (string, error) { + doc, err := html.Parse(reader) + if err != nil { + return "", err + } + + ctx := textifyTraverseCtx{ + Buf: bytes.Buffer{}, + } + if err = ctx.traverse(doc); err != nil { + return "", err + } + + text := strings.TrimSpace(newlineRe.ReplaceAllString( + strings.Replace(ctx.Buf.String(), "\n ", "\n", -1), "\n\n")) + return text, nil +} + +func FromString(input string) (string, error) { + text, err := FromReader(strings.NewReader(input)) + if err != nil { + return "", err + } + return text, nil +} |