aboutsummaryrefslogtreecommitdiff
path: root/modules/mahonia/entity.go
diff options
context:
space:
mode:
Diffstat (limited to 'modules/mahonia/entity.go')
-rw-r--r--modules/mahonia/entity.go179
1 files changed, 179 insertions, 0 deletions
diff --git a/modules/mahonia/entity.go b/modules/mahonia/entity.go
new file mode 100644
index 00000000..ed31bbaa
--- /dev/null
+++ b/modules/mahonia/entity.go
@@ -0,0 +1,179 @@
+package mahonia
+
+// decoding HTML entities
+
+import (
+ "sort"
+)
+
+// EntityDecoder returns a Decoder that decodes HTML character entities.
+// If there is no valid character entity at the current position, it returns INVALID_CHAR.
+// So it needs to be combined with another Decoder via FallbackDecoder.
+func EntityDecoder() Decoder {
+ var leftover rune // leftover rune from two-rune entity
+ return func(p []byte) (r rune, size int, status Status) {
+ if leftover != 0 {
+ r = leftover
+ leftover = 0
+ return r, 0, SUCCESS
+ }
+
+ if len(p) == 0 {
+ return 0, 0, NO_ROOM
+ }
+
+ if p[0] != '&' {
+ return 0xfffd, 1, INVALID_CHAR
+ }
+
+ if len(p) < 3 {
+ return 0, 1, NO_ROOM
+ }
+
+ r, size, status = 0xfffd, 1, INVALID_CHAR
+ n := 1 // number of bytes read so far
+
+ if p[n] == '#' {
+ n++
+ c := p[n]
+ hex := false
+ if c == 'x' || c == 'X' {
+ hex = true
+ n++
+ }
+
+ var x rune
+ for n < len(p) {
+ c = p[n]
+ n++
+ if hex {
+ if '0' <= c && c <= '9' {
+ x = 16*x + rune(c) - '0'
+ continue
+ } else if 'a' <= c && c <= 'f' {
+ x = 16*x + rune(c) - 'a' + 10
+ continue
+ } else if 'A' <= c && c <= 'F' {
+ x = 16*x + rune(c) - 'A' + 10
+ continue
+ }
+ } else if '0' <= c && c <= '9' {
+ x = 10*x + rune(c) - '0'
+ continue
+ }
+ if c != ';' {
+ n--
+ }
+ break
+ }
+
+ if n == len(p) && p[n-1] != ';' {
+ return 0, 0, NO_ROOM
+ }
+
+ size = n
+ if p[n-1] == ';' {
+ n--
+ }
+ if hex {
+ n--
+ }
+ n--
+ // Now n is the number of actual digits read.
+ if n == 0 {
+ return 0xfffd, 1, INVALID_CHAR
+ }
+
+ if 0x80 <= x && x <= 0x9F {
+ // Replace characters from Windows-1252 with UTF-8 equivalents.
+ x = replacementTable[x-0x80]
+ } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
+ // Replace invalid characters with the replacement character.
+ return 0xfffd, size, INVALID_CHAR
+ }
+
+ r = x
+ status = SUCCESS
+ return
+ }
+
+ // Look for a named entity in EntityList.
+
+ possible := entityList
+ for len(possible) > 0 {
+ if len(p) <= n {
+ leftover = 0
+ return 0, 0, NO_ROOM
+ }
+
+ c := p[n]
+
+ // Narrow down the selection in possible to those items that have c in the
+ // appropriate byte.
+ first := sort.Search(len(possible), func(i int) bool {
+ e := possible[i].name
+ if len(e) < n {
+ return false
+ }
+ return e[n-1] >= c
+ })
+ possible = possible[first:]
+ last := sort.Search(len(possible), func(i int) bool {
+ return possible[i].name[n-1] > c
+ })
+ possible = possible[:last]
+
+ n++
+ if len(possible) > 0 && len(possible[0].name) == n-1 {
+ r, leftover = possible[0].r1, possible[0].r2
+ size = n
+ status = SUCCESS
+ // but don't return yet, since we need the longest match
+ }
+ }
+
+ return
+ }
+}
+
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming Windows-1252, to the Unicode code points
// they were meant to denote. Index i holds the replacement for 0x80+i.
//
// The table is copied from /src/pkg/html/escape.go in the Go source; see
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two further replacements are not stored here: 0x00->'\uFFFD' is handled
// programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	// 0x80 - 0x87
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 - 0x8F
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 - 0x97
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 - 0x9F
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
}