diff options
author | Toni Uhlig <matzeton@googlemail.com> | 2021-10-01 14:21:33 +0200 |
---|---|---|
committer | Toni Uhlig <matzeton@googlemail.com> | 2021-10-01 14:21:33 +0200 |
commit | d071b4177c1a3897f4682e245046a45f362b6ac5 (patch) | |
tree | e8af69a24b2b97758d4e7f12a65a93185b9b5890 /scripts/build_punct_map.py |
Squashed 'deps/md4c/' content from commit 7f05330
git-subtree-dir: deps/md4c
git-subtree-split: 7f0533068b4319d8cb3d0f33ca9aa66a857734a6
Diffstat (limited to 'scripts/build_punct_map.py')
-rw-r--r-- | scripts/build_punct_map.py | 66 |
1 files changed, 66 insertions, 0 deletions
#!/usr/bin/env python3
"""Generate the C ``PUNCT_MAP`` array from ``unicode/DerivedGeneralCategory.txt``.

The data file is read from the ``unicode`` directory next to this script.
Every code point whose category is one of ``PUNCT_CATEGORIES`` is collected,
consecutive code points are merged into ranges, and the result is written to
standard output as a C array initializer made of ``R(first,last)`` and
``S(codepoint)`` entries (presumably macros defined by the consuming C code).
"""

import os
import sys
import textwrap


# Categories (second field of each data line) that are collected.
PUNCT_CATEGORIES = frozenset(["Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"])

# Maximum width of one line of the generated initializer.
WRAP_WIDTH = 110

# Indentation put in front of every line of the generated initializer.
# NOTE(review): whitespace appears to be collapsed in the reviewed copy of this
# script; confirm the indent width against upstream (likely four spaces).
WRAP_INDENT = " "


def parse_punct_codepoints(lines, categories=PUNCT_CATEGORIES):
    """Return the sorted, de-duplicated code points in the given categories.

    Args:
        lines: Iterable of text lines of the form ``XXXX ; Cat`` or
            ``XXXX..YYYY ; Cat`` (hexadecimal code points). Anything after
            ``#`` is a comment; blank lines are skipped.
        categories: Container of category names to keep.

    Returns:
        A list of integer code points in ascending order without duplicates.

    Raises:
        ValueError: If a non-empty line does not consist of exactly two
            ``;``-separated fields or a code point is not valid hexadecimal.
    """
    codepoints = set()
    for line in lines:
        line = line.partition("#")[0].strip()
        if not line:
            continue

        char_range, category = (field.strip() for field in line.split(";"))
        if category not in categories:
            continue

        first, delim, last = char_range.partition("..")
        codepoint0 = int(first, 16)
        # A line without ".." describes a single code point.
        codepoint1 = int(last, 16) if delim else codepoint0
        codepoints.update(range(codepoint0, codepoint1 + 1))

    # The set drops duplicates so overlapping input lines cannot yield
    # overlapping records later on.
    return sorted(codepoints)


def build_records(codepoints):
    """Collapse code points into ``R(first,last)`` / ``S(codepoint)`` records.

    Args:
        codepoints: Sequence of integer code points in strictly ascending
            order.

    Returns:
        A list of strings, one per run of consecutive code points: ``R(...)``
        for a run of two or more, ``S(...)`` for an isolated code point.
    """
    records = []
    count = len(codepoints)
    index0 = 0
    while index0 < count:
        # Advance index1 to one past the end of the run starting at index0.
        index1 = index0 + 1
        while index1 < count and codepoints[index1] == codepoints[index1 - 1] + 1:
            index1 += 1

        if index1 - index0 > 1:
            # Range of codepoints
            records.append("R(0x{:04x},0x{:04x})".format(codepoints[index0],
                                                         codepoints[index1 - 1]))
        else:
            # Single codepoint
            records.append("S(0x{:04x})".format(codepoints[index0]))

        index0 = index1
    return records


def format_punct_map(records):
    """Return the C source text of the ``PUNCT_MAP`` array for ``records``.

    The records are joined with ``", "`` and wrapped to ``WRAP_WIDTH`` columns,
    each line prefixed with ``WRAP_INDENT``.
    """
    body = "\n".join(textwrap.wrap(", ".join(records), WRAP_WIDTH,
                                   initial_indent=WRAP_INDENT,
                                   subsequent_indent=WRAP_INDENT))
    return "static const unsigned PUNCT_MAP[] = {\n" + body + "\n};\n\n"


def main():
    """Read the data file next to this script and print the generated table."""
    self_path = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(self_path, "unicode", "DerivedGeneralCategory.txt")

    # The context manager closes the file even if a line fails to parse; the
    # explicit encoding keeps decoding independent of the process locale.
    with open(data_path, "r", encoding="utf-8") as f:
        codepoints = parse_punct_codepoints(f)

    sys.stdout.write(format_punct_map(build_records(codepoints)))


if __name__ == "__main__":
    main()