diff options
Diffstat (limited to 'modules/mahonia/gb18030.go')
-rw-r--r-- | modules/mahonia/gb18030.go | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/modules/mahonia/gb18030.go b/modules/mahonia/gb18030.go new file mode 100644 index 00000000..d2759250 --- /dev/null +++ b/modules/mahonia/gb18030.go @@ -0,0 +1,156 @@ +package mahonia + +import ( + "sync" +) + +// Converters for GB18030 encoding. + +func init() { + RegisterCharset(&Charset{ + Name: "GB18030", + NewDecoder: func() Decoder { + gb18030Once.Do(buildGB18030Tables) + return decodeGB18030Rune + }, + NewEncoder: func() Encoder { + gb18030Once.Do(buildGB18030Tables) + return encodeGB18030Rune + }, + }) +} + +func decodeGB18030Rune(p []byte) (r rune, size int, status Status) { + if len(p) == 0 { + status = NO_ROOM + return + } + + b := p[0] + if b < 128 { + return rune(b), 1, SUCCESS + } + + if len(p) < 2 { + status = NO_ROOM + return + } + + if p[0] < 0x81 || p[0] > 0xfe { + return 0xfffd, 1, INVALID_CHAR + } + + if p[1] >= 0x40 { + // 2-byte character + c := uint16(p[0])<<8 + uint16(p[1]) + r = rune(gbkToUnicode[c]) + if r == 0 { + r = gbkToUnicodeExtra[c] + } + + if r != 0 { + return r, 2, SUCCESS + } + } else if p[1] >= 0x30 { + // 4-byte character + if len(p) < 4 { + return 0, 0, NO_ROOM + } + if p[2] < 0x81 || p[2] > 0xfe || p[3] < 0x30 || p[3] > 0x39 { + return 0xfffd, 1, INVALID_CHAR + } + + code := uint32(p[0])<<24 + uint32(p[1])<<16 + uint32(p[2])<<8 + uint32(p[3]) + lin := gb18030Linear(code) + + if lin <= maxGB18030Linear { + r = rune(gb18030LinearToUnicode[lin]) + if r != 0 { + return r, 4, SUCCESS + } + } + + for _, rng := range gb18030Ranges { + if lin >= rng.firstGB && lin <= rng.lastGB { + return rng.firstRune + rune(lin) - rune(rng.firstGB), 4, SUCCESS + } + } + } + + return 0xfffd, 1, INVALID_CHAR +} + +func encodeGB18030Rune(p []byte, r rune) (size int, status Status) { + if len(p) == 0 { + status = NO_ROOM + return + } + + if r < 128 { + p[0] = byte(r) + return 1, SUCCESS + } + + if len(p) < 2 { + status = NO_ROOM + return + } + + var c uint16 + if r < 0x10000 { + c = unicodeToGBK[r] + } else { + c = unicodeToGBKExtra[r] + } + + if c != 0 { + p[0] = byte(c >> 8) + p[1] = byte(c) + return 2, SUCCESS + } + + if len(p) < 4 { + return 0, NO_ROOM + } + + if r < 0x10000 { + f := unicodeToGB18030[r] + if f != 0 { + p[0] = byte(f >> 24) + p[1] = byte(f >> 16) + p[2] = byte(f >> 8) + p[3] = byte(f) + return 4, SUCCESS + } + } + + for _, rng := range gb18030Ranges { + if r >= rng.firstRune && r <= rng.lastRune { + lin := rng.firstGB + uint32(r) - uint32(rng.firstRune) + p[0] = byte(lin/(10*126*10)) + 0x81 + p[1] = byte(lin/(126*10)%10) + 0x30 + p[2] = byte(lin/10%126) + 0x81 + p[3] = byte(lin%10) + 0x30 + return 4, SUCCESS + } + } + + p[0] = 0x1a + return 1, INVALID_CHAR +} + +var gb18030Once sync.Once + +// Mapping from gb18039Linear values to Unicode. +var gb18030LinearToUnicode []uint16 + +var unicodeToGB18030 []uint32 + +func buildGB18030Tables() { + gb18030LinearToUnicode = make([]uint16, maxGB18030Linear+1) + unicodeToGB18030 = make([]uint32, 65536) + for _, data := range gb18030Data { + gb18030LinearToUnicode[gb18030Linear(data.gb18030)] = data.unicode + unicodeToGB18030[data.unicode] = data.gb18030 + } +} |