diff options
Diffstat (limited to 'modules/mahonia/utf16.go')
-rw-r--r-- | modules/mahonia/utf16.go | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/modules/mahonia/utf16.go b/modules/mahonia/utf16.go new file mode 100644 index 00000000..3ec7b770 --- /dev/null +++ b/modules/mahonia/utf16.go @@ -0,0 +1,170 @@ +package mahonia + +import ( + "unicode/utf16" +) + +func init() { + for i := 0; i < len(utf16Charsets); i++ { + RegisterCharset(&utf16Charsets[i]) + } +} + +var utf16Charsets = []Charset{ + { + Name: "UTF-16", + NewDecoder: func() Decoder { + var decodeRune Decoder + return func(p []byte) (c rune, size int, status Status) { + if decodeRune == nil { + // haven't read the BOM yet + if len(p) < 2 { + status = NO_ROOM + return + } + + switch { + case p[0] == 0xfe && p[1] == 0xff: + decodeRune = decodeUTF16beRune + return 0, 2, STATE_ONLY + case p[0] == 0xff && p[1] == 0xfe: + decodeRune = decodeUTF16leRune + return 0, 2, STATE_ONLY + default: + decodeRune = decodeUTF16beRune + } + } + + return decodeRune(p) + } + }, + NewEncoder: func() Encoder { + wroteBOM := false + return func(p []byte, c rune) (size int, status Status) { + if !wroteBOM { + if len(p) < 2 { + status = NO_ROOM + return + } + + p[0] = 0xfe + p[1] = 0xff + wroteBOM = true + return 2, STATE_ONLY + } + + return encodeUTF16beRune(p, c) + } + }, + }, + { + Name: "UTF-16BE", + NewDecoder: func() Decoder { return decodeUTF16beRune }, + NewEncoder: func() Encoder { return encodeUTF16beRune }, + }, + { + Name: "UTF-16LE", + NewDecoder: func() Decoder { return decodeUTF16leRune }, + NewEncoder: func() Encoder { return encodeUTF16leRune }, + }, +} + +func decodeUTF16beRune(p []byte) (r rune, size int, status Status) { + if len(p) < 2 { + status = NO_ROOM + return + } + + c := rune(p[0])<<8 + rune(p[1]) + + if utf16.IsSurrogate(c) { + if len(p) < 4 { + status = NO_ROOM + return + } + + c2 := rune(p[2])<<8 + rune(p[3]) + c = utf16.DecodeRune(c, c2) + + if c == 0xfffd { + return c, 2, INVALID_CHAR + } else { + return c, 4, SUCCESS + } + } + + return c, 2, SUCCESS +} + +func encodeUTF16beRune(p []byte, c rune) (size int, status Status) { + if c < 0x10000 { + if len(p) < 2 { + status = NO_ROOM + return + } + p[0] = byte(c >> 8) + p[1] = byte(c) + return 2, SUCCESS + } + + if len(p) < 4 { + status = NO_ROOM + return + } + s1, s2 := utf16.EncodeRune(c) + p[0] = byte(s1 >> 8) + p[1] = byte(s1) + p[2] = byte(s2 >> 8) + p[3] = byte(s2) + return 4, SUCCESS +} + +func decodeUTF16leRune(p []byte) (r rune, size int, status Status) { + if len(p) < 2 { + status = NO_ROOM + return + } + + c := rune(p[1])<<8 + rune(p[0]) + + if utf16.IsSurrogate(c) { + if len(p) < 4 { + status = NO_ROOM + return + } + + c2 := rune(p[3])<<8 + rune(p[2]) + c = utf16.DecodeRune(c, c2) + + if c == 0xfffd { + return c, 2, INVALID_CHAR + } else { + return c, 4, SUCCESS + } + } + + return c, 2, SUCCESS +} + +func encodeUTF16leRune(p []byte, c rune) (size int, status Status) { + if c < 0x10000 { + if len(p) < 2 { + status = NO_ROOM + return + } + p[1] = byte(c >> 8) + p[0] = byte(c) + return 2, SUCCESS + } + + if len(p) < 4 { + status = NO_ROOM + return + } + s1, s2 := utf16.EncodeRune(c) + p[1] = byte(s1 >> 8) + p[0] = byte(s1) + p[3] = byte(s2 >> 8) + p[2] = byte(s2) + return 4, SUCCESS +} |