aboutsummaryrefslogtreecommitdiff
path: root/modules/mahonia/gb18030.go
diff options
context:
space:
mode:
Diffstat (limited to 'modules/mahonia/gb18030.go')
-rw-r--r--modules/mahonia/gb18030.go156
1 files changed, 156 insertions, 0 deletions
diff --git a/modules/mahonia/gb18030.go b/modules/mahonia/gb18030.go
new file mode 100644
index 00000000..d2759250
--- /dev/null
+++ b/modules/mahonia/gb18030.go
@@ -0,0 +1,156 @@
+package mahonia
+
+import (
+ "sync"
+)
+
+// Converters for GB18030 encoding.
+
+func init() {
+ RegisterCharset(&Charset{
+ Name: "GB18030",
+ NewDecoder: func() Decoder {
+ gb18030Once.Do(buildGB18030Tables)
+ return decodeGB18030Rune
+ },
+ NewEncoder: func() Encoder {
+ gb18030Once.Do(buildGB18030Tables)
+ return encodeGB18030Rune
+ },
+ })
+}
+
+func decodeGB18030Rune(p []byte) (r rune, size int, status Status) {
+ if len(p) == 0 {
+ status = NO_ROOM
+ return
+ }
+
+ b := p[0]
+ if b < 128 {
+ return rune(b), 1, SUCCESS
+ }
+
+ if len(p) < 2 {
+ status = NO_ROOM
+ return
+ }
+
+ if p[0] < 0x81 || p[0] > 0xfe {
+ return 0xfffd, 1, INVALID_CHAR
+ }
+
+ if p[1] >= 0x40 {
+ // 2-byte character
+ c := uint16(p[0])<<8 + uint16(p[1])
+ r = rune(gbkToUnicode[c])
+ if r == 0 {
+ r = gbkToUnicodeExtra[c]
+ }
+
+ if r != 0 {
+ return r, 2, SUCCESS
+ }
+ } else if p[1] >= 0x30 {
+ // 4-byte character
+ if len(p) < 4 {
+ return 0, 0, NO_ROOM
+ }
+ if p[2] < 0x81 || p[2] > 0xfe || p[3] < 0x30 || p[3] > 0x39 {
+ return 0xfffd, 1, INVALID_CHAR
+ }
+
+ code := uint32(p[0])<<24 + uint32(p[1])<<16 + uint32(p[2])<<8 + uint32(p[3])
+ lin := gb18030Linear(code)
+
+ if lin <= maxGB18030Linear {
+ r = rune(gb18030LinearToUnicode[lin])
+ if r != 0 {
+ return r, 4, SUCCESS
+ }
+ }
+
+ for _, rng := range gb18030Ranges {
+ if lin >= rng.firstGB && lin <= rng.lastGB {
+ return rng.firstRune + rune(lin) - rune(rng.firstGB), 4, SUCCESS
+ }
+ }
+ }
+
+ return 0xfffd, 1, INVALID_CHAR
+}
+
+func encodeGB18030Rune(p []byte, r rune) (size int, status Status) {
+ if len(p) == 0 {
+ status = NO_ROOM
+ return
+ }
+
+ if r < 128 {
+ p[0] = byte(r)
+ return 1, SUCCESS
+ }
+
+ if len(p) < 2 {
+ status = NO_ROOM
+ return
+ }
+
+ var c uint16
+ if r < 0x10000 {
+ c = unicodeToGBK[r]
+ } else {
+ c = unicodeToGBKExtra[r]
+ }
+
+ if c != 0 {
+ p[0] = byte(c >> 8)
+ p[1] = byte(c)
+ return 2, SUCCESS
+ }
+
+ if len(p) < 4 {
+ return 0, NO_ROOM
+ }
+
+ if r < 0x10000 {
+ f := unicodeToGB18030[r]
+ if f != 0 {
+ p[0] = byte(f >> 24)
+ p[1] = byte(f >> 16)
+ p[2] = byte(f >> 8)
+ p[3] = byte(f)
+ return 4, SUCCESS
+ }
+ }
+
+ for _, rng := range gb18030Ranges {
+ if r >= rng.firstRune && r <= rng.lastRune {
+ lin := rng.firstGB + uint32(r) - uint32(rng.firstRune)
+ p[0] = byte(lin/(10*126*10)) + 0x81
+ p[1] = byte(lin/(126*10)%10) + 0x30
+ p[2] = byte(lin/10%126) + 0x81
+ p[3] = byte(lin%10) + 0x30
+ return 4, SUCCESS
+ }
+ }
+
+ p[0] = 0x1a
+ return 1, INVALID_CHAR
+}
+
+var gb18030Once sync.Once
+
+// Mapping from gb18039Linear values to Unicode.
+var gb18030LinearToUnicode []uint16
+
+var unicodeToGB18030 []uint32
+
+func buildGB18030Tables() {
+ gb18030LinearToUnicode = make([]uint16, maxGB18030Linear+1)
+ unicodeToGB18030 = make([]uint32, 65536)
+ for _, data := range gb18030Data {
+ gb18030LinearToUnicode[gb18030Linear(data.gb18030)] = data.unicode
+ unicodeToGB18030[data.unicode] = data.gb18030
+ }
+}