diff options
Diffstat (limited to 'modules/mahonia/entity.go')
-rw-r--r-- | modules/mahonia/entity.go | 179 |
1 files changed, 0 insertions, 179 deletions
diff --git a/modules/mahonia/entity.go b/modules/mahonia/entity.go deleted file mode 100644 index ed31bbaa..00000000 --- a/modules/mahonia/entity.go +++ /dev/null @@ -1,179 +0,0 @@ -package mahonia - -// decoding HTML entities - -import ( - "sort" -) - -// EntityDecoder returns a Decoder that decodes HTML character entities. -// If there is no valid character entity at the current position, it returns INVALID_CHAR. -// So it needs to be combined with another Decoder via FallbackDecoder. -func EntityDecoder() Decoder { - var leftover rune // leftover rune from two-rune entity - return func(p []byte) (r rune, size int, status Status) { - if leftover != 0 { - r = leftover - leftover = 0 - return r, 0, SUCCESS - } - - if len(p) == 0 { - return 0, 0, NO_ROOM - } - - if p[0] != '&' { - return 0xfffd, 1, INVALID_CHAR - } - - if len(p) < 3 { - return 0, 1, NO_ROOM - } - - r, size, status = 0xfffd, 1, INVALID_CHAR - n := 1 // number of bytes read so far - - if p[n] == '#' { - n++ - c := p[n] - hex := false - if c == 'x' || c == 'X' { - hex = true - n++ - } - - var x rune - for n < len(p) { - c = p[n] - n++ - if hex { - if '0' <= c && c <= '9' { - x = 16*x + rune(c) - '0' - continue - } else if 'a' <= c && c <= 'f' { - x = 16*x + rune(c) - 'a' + 10 - continue - } else if 'A' <= c && c <= 'F' { - x = 16*x + rune(c) - 'A' + 10 - continue - } - } else if '0' <= c && c <= '9' { - x = 10*x + rune(c) - '0' - continue - } - if c != ';' { - n-- - } - break - } - - if n == len(p) && p[n-1] != ';' { - return 0, 0, NO_ROOM - } - - size = n - if p[n-1] == ';' { - n-- - } - if hex { - n-- - } - n-- - // Now n is the number of actual digits read. - if n == 0 { - return 0xfffd, 1, INVALID_CHAR - } - - if 0x80 <= x && x <= 0x9F { - // Replace characters from Windows-1252 with UTF-8 equivalents. - x = replacementTable[x-0x80] - } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { - // Replace invalid characters with the replacement character. - return 0xfffd, size, INVALID_CHAR - } - - r = x - status = SUCCESS - return - } - - // Look for a named entity in EntityList. - - possible := entityList - for len(possible) > 0 { - if len(p) <= n { - leftover = 0 - return 0, 0, NO_ROOM - } - - c := p[n] - - // Narrow down the selection in possible to those items that have c in the - // appropriate byte. - first := sort.Search(len(possible), func(i int) bool { - e := possible[i].name - if len(e) < n { - return false - } - return e[n-1] >= c - }) - possible = possible[first:] - last := sort.Search(len(possible), func(i int) bool { - return possible[i].name[n-1] > c - }) - possible = possible[:last] - - n++ - if len(possible) > 0 && len(possible[0].name) == n-1 { - r, leftover = possible[0].r1, possible[0].r2 - size = n - status = SUCCESS - // but don't return yet, since we need the longest match - } - } - - return - } -} - -// This table is copied from /src/pkg/html/escape.go in the Go source -// -// These replacements permit compatibility with old numeric entities that -// assumed Windows-1252 encoding. -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference -var replacementTable = [...]rune{ - '\u20AC', // First entry is what 0x80 should be replaced with. - '\u0081', - '\u201A', - '\u0192', - '\u201E', - '\u2026', - '\u2020', - '\u2021', - '\u02C6', - '\u2030', - '\u0160', - '\u2039', - '\u0152', - '\u008D', - '\u017D', - '\u008F', - '\u0090', - '\u2018', - '\u2019', - '\u201C', - '\u201D', - '\u2022', - '\u2013', - '\u2014', - '\u02DC', - '\u2122', - '\u0161', - '\u203A', - '\u0153', - '\u009D', - '\u017E', - '\u0178', // Last entry is 0x9F. - // 0x00->'\uFFFD' is handled programmatically. - // 0x0D->'\u000D' is a no-op. -} |