aboutsummaryrefslogtreecommitdiff
path: root/modules/mahonia/kuten.go
diff options
context:
space:
mode:
Diffstat (limited to 'modules/mahonia/kuten.go')
-rw-r--r--modules/mahonia/kuten.go88
1 files changed, 88 insertions, 0 deletions
diff --git a/modules/mahonia/kuten.go b/modules/mahonia/kuten.go
new file mode 100644
index 00000000..24929666
--- /dev/null
+++ b/modules/mahonia/kuten.go
@@ -0,0 +1,88 @@
+package mahonia
+
+import (
+ "sync"
+ "unicode/utf8"
+)
+
+// A kutenTable holds the data for a double-byte character set, arranged by ku
+// (区, zone) and ten (点, position). These can be converted to various actual
+// encoding schemes.
+type kutenTable struct {
+ // Data[ku][ten] is the unicode value for the character at that zone and
+ // position.
+ Data [94][94]uint16
+
+ // FromUnicode holds the ku and ten for each Unicode code point.
+ // It is not available until Reverse() has been called.
+ FromUnicode [][2]byte
+
+ // once is used to synchronize the generation of FromUnicode.
+ once sync.Once
+}
+
+// Reverse generates FromUnicode.
+func (t *kutenTable) Reverse() {
+ t.once.Do(func() {
+ t.FromUnicode = make([][2]byte, 65536)
+ for ku := range t.Data {
+ for ten, unicode := range t.Data[ku] {
+ t.FromUnicode[unicode] = [2]byte{byte(ku), byte(ten)}
+ }
+ }
+ })
+}
+
+// DecodeLow decodes a character from an encoding that does not have the high
+// bit set.
+func (t *kutenTable) DecodeLow(p []byte) (c rune, size int, status Status) {
+ if len(p) < 2 {
+ return 0, 0, NO_ROOM
+ }
+ ku := p[0] - 0x21
+ ten := p[1] - 0x21
+ if ku > 93 || ten > 93 {
+ return utf8.RuneError, 1, INVALID_CHAR
+ }
+ u := t.Data[ku][ten]
+ if u == 0 {
+ return utf8.RuneError, 1, INVALID_CHAR
+ }
+ return rune(u), 2, SUCCESS
+}
+
+// DecodeHigh decodes a character from an encoding that has the high bit set.
+func (t *kutenTable) DecodeHigh(p []byte) (c rune, size int, status Status) {
+ if len(p) < 2 {
+ return 0, 0, NO_ROOM
+ }
+ ku := p[0] - 0xa1
+ ten := p[1] - 0xa1
+ if ku > 93 || ten > 93 {
+ return utf8.RuneError, 1, INVALID_CHAR
+ }
+ u := t.Data[ku][ten]
+ if u == 0 {
+ return utf8.RuneError, 1, INVALID_CHAR
+ }
+ return rune(u), 2, SUCCESS
+}
+
+// EncodeHigh encodes a character in an encoding that has the high bit set.
+func (t *kutenTable) EncodeHigh(p []byte, c rune) (size int, status Status) {
+ if len(p) < 2 {
+ return 0, NO_ROOM
+ }
+ if c > 0xffff {
+ p[0] = '?'
+ return 1, INVALID_CHAR
+ }
+ kuten := t.FromUnicode[c]
+ if kuten == [2]byte{0, 0} && c != rune(t.Data[0][0]) {
+ p[0] = '?'
+ return 1, INVALID_CHAR
+ }
+ p[0] = kuten[0] + 0xa1
+ p[1] = kuten[1] + 0xa1
+ return 2, SUCCESS
+}