mirror of
https://github.com/python/cpython.git
synced 2024-11-23 09:54:58 +08:00
bpo-40328: Add tool for generating cjk mapping headers (GH-19602)
This commit is contained in:
parent
2d8757758d
commit
113feb3ec2
@ -0,0 +1 @@
|
||||
Add tools for generating mapping headers for CJKCodecs.
|
@ -1,8 +1,6 @@
|
||||
To generate or modify mapping headers
|
||||
-------------------------------------
|
||||
Mapping headers are imported from CJKCodecs as pre-generated form.
|
||||
If you need to tweak or add something on it, please look at tools/
|
||||
subdirectory of CJKCodecs' distribution.
|
||||
Mapping headers are generated from Tools/unicode/genmap_*.py
|
||||
|
||||
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
// AUTO-GENERATED FILE FROM genmap_schinese.py: DO NOT EDIT
|
||||
static const ucs2_t __gb2312_decmap[7482] = {
|
||||
12288,12289,12290,12539,713,711,168,12291,12293,8213,65374,8214,8230,8216,
|
||||
8217,8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,
|
||||
|
@ -1,3 +1,4 @@
|
||||
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
|
||||
#define JISX0213_ENCPAIRS 46
|
||||
#ifdef EXTERN_JISX0213_PAIR
|
||||
static const struct widedbcs_index *jisx0213_pair_decmap;
|
||||
|
@ -1,3 +1,4 @@
|
||||
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
|
||||
static const ucs2_t __jisx0208_decmap[6956] = {
|
||||
12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,
|
||||
65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,
|
||||
|
@ -1,3 +1,4 @@
|
||||
// AUTO-GENERATED FILE FROM genmap_korean.py: DO NOT EDIT
|
||||
static const ucs2_t __ksx1001_decmap[8264] = {
|
||||
12288,12289,12290,183,8229,8230,168,12291,173,8213,8741,65340,8764,8216,8217,
|
||||
8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,12304,
|
||||
@ -3249,3 +3250,4 @@ __cp949_encmap+31959,0,255},{__cp949_encmap+32215,0,255},{__cp949_encmap+32471
|
||||
__cp949_encmap+32891,0,11},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{__cp949_encmap+
|
||||
32903,1,230},
|
||||
};
|
||||
|
||||
|
251
Tools/unicode/genmap_japanese.py
Normal file
251
Tools/unicode/genmap_japanese.py
Normal file
@ -0,0 +1,251 @@
|
||||
#
|
||||
# genmap_japanese.py: Japanese Codecs Map Generator
|
||||
#
|
||||
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
|
||||
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
|
||||
#
|
||||
import os
|
||||
|
||||
from genmap_support import *
|
||||
|
||||
# Inclusive (low, high) byte bounds passed to DecodeMapWriter.update_decode_map
# by main(): *_C1 bounds the high byte of a code, *_C2 bounds the low byte.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)

JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)

JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)

# The CP932 extension is emitted as three separate windows.
CP932P0_C1 = (0x81, 0x81)  # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87)  # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc)  # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Upstream locations of the mapping tables; main() hands them to
# open_mapping_file together with the expected local path.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
|
||||
|
||||
|
||||
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into five decode maps.

    *fo* is any iterable of text lines.  Everything after '#' is ignored and
    lines with fewer than two fields are skipped.  The first field looks like
    ``3-2121`` (level digit, separator, hex code) and the second like
    ``U+3000`` (one code point) or ``U+304B+309A`` (body + modifier pair).

    Returns ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``:
    the first four are ``{high_byte: {low_byte: value}}``; for the ``_2``
    maps the stored value is the code point minus 0x20000.  ``decmap3_pair``
    is ``{body: {high_byte: {low_byte: modifier}}}``.

    Raises ValueError("invalid map") when a row names a level other than
    3 or 4, a code point outside the BMP / U+2xxxx ranges, or a pair on a
    level other than 3.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_by_level = {3: decmap3, 4: decmap4}
    plane2_by_level = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # Plain int() parsing: the table is data, it must never be evaluated.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_by_level.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_by_level.get(level)
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching m, so a bad row reports "invalid map"
        # instead of failing with AttributeError on None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
|
||||
|
||||
|
||||
def main():
    """Generate mappings_jp.h and mappings_jisx0213_pair.h.

    Reads JIS0208.TXT, JIS0212.TXT, CP932.TXT and jisx0213-2004-std.txt from
    python-mappings/ (relative to the current directory), derives the decode
    and encode tables and writes both headers into the current directory.

    Raises SystemExit when the JIS X 0213 table does not map 0x2124 to
    U+FF0C, and ValueError when JIS X 0208 and JIS X 0213 plane 1 disagree.
    """
    # The with-statement closes all four tables as soon as they are parsed.
    with open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208) as jisx0208file, \
            open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212) as jisx0212file, \
            open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932) as cp932file, \
            open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004) as jisx0213file:
        print("Loading Mapping File...")

        # JIS0208.TXT is read twice: column 0 and column 1 are two different
        # native codes for the same Unicode value in column 2.
        sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
        jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
        jisx0212decmap = loadmap(jisx0212file)
        cp932decmap = loadmap(cp932file)
        jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # First native code seen for a Unicode value wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Keep only the CP932 encodings that differ from the Shift-JIS ones.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: plane-1 cells identical to JIS X 0208 are deleted
    # from jis3decmap while walking it.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if code in jis3_pairdecmap:
                # The row must exist before the cell is written; without the
                # setdefault this depended on the order of the input table.
                jisx0213bmpencmap.setdefault(code >> 8, {})
                jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
            elif c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if jisx0208decmap[c1][c2] != code:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
                del jis3decmap[c1][c2]
                if not jis3decmap[c1]:
                    del jis3decmap[c1]
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Sorted by (body, modifier, native code).
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")


if __name__ == '__main__':
    main()
|
62
Tools/unicode/genmap_korean.py
Normal file
62
Tools/unicode/genmap_korean.py
Normal file
@ -0,0 +1,62 @@
|
||||
#
|
||||
# genmap_korean.py: Korean Codecs Map Generator
|
||||
#
|
||||
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
|
||||
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
|
||||
#
|
||||
import os
|
||||
|
||||
from genmap_support import *
|
||||
|
||||
|
||||
# Inclusive (low, high) byte bounds passed to DecodeMapWriter.update_decode_map
# by main(): *_C1 bounds the high byte of a code, *_C2 bounds the low byte.
KSX1001_C1 = (0x21, 0x7e)
KSX1001_C2 = (0x21, 0x7e)

# The UHC extension table is emitted as two windows.
UHCL1_C1 = (0x81, 0xa0)
UHCL1_C2 = (0x41, 0xfe)
UHCL2_C1 = (0xa1, 0xfe)
UHCL2_C2 = (0x41, 0xa0)

# Upstream location of the table; main() hands it to open_mapping_file
# together with the expected local path.
MAPPINGS_CP949 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT'
|
||||
|
||||
|
||||
def main():
    """Generate mappings_kr.h from python-mappings/CP949.TXT.

    Codes whose two bytes are both >= 0xa1 go into the KS X 1001 decode map
    with the high bit of each byte cleared; every other code goes into the
    UHC decode map unchanged.  A single encode map covers both sets.  The
    header is written into the current directory.
    """
    # The with-statement closes the table once it has been parsed.
    with open_mapping_file('python-mappings/CP949.TXT', MAPPINGS_CP949) as mapfile:
        print("Loading Mapping File...")
        decmap = loadmap(mapfile)

    uhcdecmap, ksx1001decmap, cp949encmap = {}, {}, {}
    for c1, c2map in decmap.items():
        for c2, code in c2map.items():
            if c1 >= 0xa1 and c2 >= 0xa1:
                ksx1001decmap.setdefault(c1 & 0x7f, {})
                ksx1001decmap[c1 & 0x7f][c2 & 0x7f] = code
                cp949encmap.setdefault(code >> 8, {})
                # MSB unset
                cp949encmap[code >> 8][code & 0xFF] = (c1 << 8 | c2) & 0x7f7f
            else:
                # uhc
                uhcdecmap.setdefault(c1, {})
                uhcdecmap[c1][c2] = code
                cp949encmap.setdefault(code >> 8, {})
                # MSB set
                cp949encmap[code >> 8][code & 0xFF] = (c1 << 8 | c2)

    with open('mappings_kr.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating KS X 1001 decode map...")
        writer = DecodeMapWriter(fp, "ksx1001", ksx1001decmap)
        writer.update_decode_map(KSX1001_C1, KSX1001_C2)
        writer.generate()

        print("Generating UHC decode map...")
        writer = DecodeMapWriter(fp, "cp949ext", uhcdecmap)
        writer.update_decode_map(UHCL1_C1, UHCL1_C2)
        writer.update_decode_map(UHCL2_C1, UHCL2_C2)
        writer.generate()

        print("Generating CP949 (includes KS X 1001) encode map...")
        writer = EncodeMapWriter(fp, "cp949", cp949encmap)
        writer.generate()

    print("Done!")


if __name__ == '__main__':
    main()
|
149
Tools/unicode/genmap_schinese.py
Normal file
149
Tools/unicode/genmap_schinese.py
Normal file
@ -0,0 +1,149 @@
|
||||
#
|
||||
# genmap_schinese.py: Simplified Chinese Codecs Map Generator
|
||||
#
|
||||
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
|
||||
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
|
||||
#
|
||||
import os
|
||||
import re
|
||||
|
||||
from genmap_support import *
|
||||
|
||||
|
||||
# Inclusive (low, high) byte bounds passed to DecodeMapWriter.update_decode_map
# by main(): *_C1 bounds the high byte of a code, *_C2 bounds the low byte.
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)

# The GBK extension table is emitted as two windows.
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)

# The GB18030 extension table is emitted as five windows, P1..P5.
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# Upstream locations of the tables; main() hands them to open_mapping_file
# together with the expected local path.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
|
||||
|
||||
# One assignment element: u = four upper-case hex digits, b = the native
# bytes as space-separated upper-case hex.
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Split the assignments read from *fo* into a table and a linear list.

    *fo* only needs a ``read()`` method returning the whole document as text.

    Returns ``(m, gbuni)``.  ``m`` is ``{byte0: {byte1: code_point}}`` for
    every assignment whose native form is exactly two bytes.  ``gbuni`` is
    the ascending list of all code points in 0..0xFFFF, surrogates
    (0xD800-0xDFFF) excluded, that have no one- or two-byte assignment.

    Raises KeyError if a one- or two-byte assignment names a code point that
    was already removed or that lies in the surrogate range.
    """
    two_byte = {}
    # Start from every non-surrogate BMP code point and strike out the ones
    # that get a short native form.
    remaining = set(range(0xd800)) | set(range(0xe000, 0x10000))

    for uni_hex, native_hex in re_gb18030ass.findall(fo.read()):
        code_point = int(uni_hex, 16)
        native = [int(byte, 16) for byte in native_hex.split()]
        if len(native) > 2:
            continue
        remaining.remove(code_point)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            two_byte.setdefault(native[0], {})[native[1]] = code_point

    return two_byte, sorted(remaining)
|
||||
|
||||
def main():
    """Generate mappings_cn.h from the GB2312, CP936 and GB18030 tables.

    Reads GB2312.TXT, CP936.TXT and gb-18030-2000.xml from python-mappings/
    (relative to the current directory) and writes the header into the
    current directory.  The GBK table keeps only what GB2312 does not
    already cover, and the GB18030 extension keeps only what CP936 does not
    cover.  A KeyError is raised when one table lacks a cell the next
    smaller one defines.
    """
    print("Loading Mapping File...")
    # The with-statement closes all three inputs once they are parsed.
    with open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312) as gb2312map, \
            open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936) as cp936map, \
            open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030) as gb18030map:
        gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
        gbkdecmap = loadmap(cp936map)
        gb2312decmap = loadmap(gb2312map)

    # Drop from the GB18030 extension every cell CP936 defines.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]
    # Drop from GBK every cell that GB2312 defines identically; the GBK
    # code is the GB2312 code with the high bit of both bytes set.
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    gb2312_gbkencmap, gb18030encmap = {}, {}
    for c1, m in gbkdecmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB set
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB unset
    for c1, m in gb18030decmap.items():
        for c2, code in m.items():
            gb18030encmap.setdefault(code >> 8, {})
            gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        # Windows P1..P5, in order, named explicitly rather than looked up
        # by eval() on a formatted name.
        gb18030_windows = (
            (GB18030EXTP1_C1, GB18030EXTP1_C2),
            (GB18030EXTP2_C1, GB18030EXTP2_C2),
            (GB18030EXTP3_C1, GB18030EXTP3_C2),
            (GB18030EXTP4_C1, GB18030EXTP4_C2),
            (GB18030EXTP5_C1, GB18030EXTP5_C2),
        )
        for c1range, c2range in gb18030_windows:
            writer.update_decode_map(c1range, c2range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        # Each entry is [first, last, base]; the leading sentinel gives the
        # first real code point something to compare against.
        ranges = [[-1, -1, -1]]
        gblinnum = 0
        # NOTE(review): the column alignment inside this literal could not be
        # recovered from the reviewed copy; only the C tokens are certain.
        # Confirm the spacing against the checked-in mappings_cn.h.
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4   first, last;
    DBCHAR       base;
} gb18030_to_unibmp_ranges[] = {
""")

        for uni in gb18030unilinear:
            if uni == ranges[-1][1] + 1:
                ranges[-1][1] = uni
            else:
                ranges.append([uni, uni, gblinnum])
            # Running count of code points seen so far.
            gblinnum += 1

        filler = BufferedFiller()
        for first, last, base in ranges[1:]:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminator entry: its base is one past the last linear index.
        filler.write('{', '0,', '0,', str(
            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
        filler.printout(fp)

    print("Done!")


if __name__ == '__main__':
    main()
|
198
Tools/unicode/genmap_support.py
Normal file
198
Tools/unicode/genmap_support.py
Normal file
@ -0,0 +1,198 @@
|
||||
#
|
||||
# genmap_support.py: Multibyte Codec Map Generator
|
||||
#
|
||||
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
|
||||
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
|
||||
#
|
||||
|
||||
|
||||
class BufferedFiller:
    """Pack string tokens into output lines of at most *column* characters.

    Tokens are joined without separators.  A token that would push the
    current line past the limit starts a new line instead; a single token
    longer than the limit is rejected.  ``len(filler)`` is the number of
    ``write()`` calls made so far.
    """

    def __init__(self, column=78):
        self.column = column
        self.buffered = []  # completed lines, without trailing newline
        self.cline = []     # tokens of the line being built
        self.clen = 0       # total length of the tokens in cline
        self.count = 0      # write() calls so far; never reset

    def write(self, *data):
        """Append each token in *data*, wrapping to a new line when needed.

        Raises ValueError("token is too long") for a token wider than the
        column limit; tokens before it in the same call stay appended.
        """
        for token in data:
            width = len(token)
            if width > self.column:
                raise ValueError("token is too long")
            if self.clen + width > self.column:
                self.flush()
            self.cline.append(token)
            self.clen += width
        self.count += 1

    def flush(self):
        """Move the line being built, if any, to the completed lines."""
        if self.cline:
            self.buffered.append(''.join(self.cline))
            self.clen = 0
            del self.cline[:]

    def printout(self, fp):
        """Write every line to *fp*, newline-terminated, then forget them."""
        self.flush()
        for text in self.buffered:
            fp.write(f'{text}\n')
        del self.buffered[:]

    def __len__(self):
        return self.count
|
||||
|
||||
|
||||
class DecodeMapWriter:
    """Emit a C decode table plus its 256-entry index for one code set.

    *decode_map* is ``{c1: {c2: value}}``.  The writer annotates the inner
    dictionaries in place: rows selected by update_decode_map() gain the
    string keys ``prefix``, ``'min'``, ``'max'`` and ``'midx'``.
    """

    filler_class = BufferedFiller

    def __init__(self, fp, prefix, decode_map):
        self.fp = fp
        self.prefix = prefix
        self.decode_map = decode_map
        self.filler = self.filler_class()

    def update_decode_map(self, c1range, c2range, onlymask=(), wide=0):
        """Queue the table cells for every row inside the given window.

        *c1range* and *c2range* are inclusive ``(low, high)`` pairs.  Rows
        absent from the map, rows excluded by a non-empty *onlymask*, and
        rows with no cell inside *c2range* are skipped.  For each remaining
        row the cells from its lowest to its highest populated c2 are
        queued, with ``U,`` standing in for gaps.  *wide* is accepted but
        not used.
        """
        window = range(c2range[0], c2range[1] + 1)

        for c1 in range(c1range[0], c1range[1] + 1):
            if c1 not in self.decode_map:
                continue
            if onlymask and c1 not in onlymask:
                continue
            row = self.decode_map[c1]
            populated = [c2 for c2 in window if c2 in row]
            if not populated:
                continue

            low, high = populated[0], populated[-1]
            row[self.prefix] = True
            row['min'] = low
            row['max'] = high
            # Offset of this row's first cell within the flat C array.
            row['midx'] = len(self.filler)

            for c2 in range(low, high + 1):
                if c2 in row:
                    self.filler.write('%d,' % row[c2])
                else:
                    self.filler.write('U,')

    def generate(self, wide=False):
        """Write the flat value array, then the 256-entry index array.

        With *wide* true the element and index types are ``Py_UCS4`` and
        ``struct widedbcs_index`` instead of ``ucs2_t`` and
        ``struct dbcs_index``.
        """
        if wide:
            self.fp.write(f"static const Py_UCS4 __{self.prefix}_decmap[{len(self.filler)}] = {{\n")
        else:
            self.fp.write(f"static const ucs2_t __{self.prefix}_decmap[{len(self.filler)}] = {{\n")

        self.filler.printout(self.fp)
        self.fp.write("};\n\n")

        if wide:
            self.fp.write(f"static const struct widedbcs_index {self.prefix}_decmap[256] = {{\n")
        else:
            self.fp.write(f"static const struct dbcs_index {self.prefix}_decmap[256] = {{\n")

        for i in range(256):
            # Only rows annotated by update_decode_map() get a real entry.
            if i in self.decode_map and self.prefix in self.decode_map[i]:
                row = self.decode_map[i]
                self.filler.write("{", "__%s_decmap" % self.prefix, "+", "%d" % row['midx'],
                                  ",", "%d," % row['min'], "%d" % row['max'], "},")
            else:
                self.filler.write("{", "0,", "0,", "0", "},")

        self.filler.printout(self.fp)
        self.fp.write("};\n\n")
|
||||
|
||||
|
||||
class EncodeMapWriter:
    """Emit a C encode table plus its 256-entry index for one code set.

    *encode_map* is ``{high_byte: {low_byte: value}}`` where a value is an
    int or a tuple.  buildmap() annotates each non-empty inner dictionary
    in place with the string keys ``prefix``, ``'min'``, ``'max'`` and
    ``'midx'``.
    """

    filler_class = BufferedFiller
    elemtype = 'DBCHAR'
    indextype = 'struct unim_index'

    def __init__(self, fp, prefix, encode_map):
        self.fp = fp
        self.prefix = prefix
        self.encode_map = encode_map
        self.filler = self.filler_class()

    def generate(self):
        """Queue all cells, then write both C arrays."""
        self.buildmap()
        self.printmap()

    def buildmap(self):
        """Queue one cell per low byte between each row's lowest and highest.

        Raises ValueError for a cell that is neither an int nor a tuple.
        """
        for c1 in range(0, 256):
            if c1 not in self.encode_map:
                continue
            row = self.encode_map[c1]
            populated = sorted(row)
            if not populated:
                continue

            low, high = populated[0], populated[-1]
            row[self.prefix] = True
            row['min'] = low
            row['max'] = high
            # Offset of this row's first cell within the flat C array.
            row['midx'] = len(self.filler)

            for c2 in range(low, high + 1):
                if c2 not in row:
                    self.write_nochar()
                    continue
                cell = row[c2]
                if isinstance(cell, int):
                    self.write_char(cell)
                elif isinstance(cell, tuple):
                    self.write_multic(cell)
                else:
                    raise ValueError

    def write_nochar(self):
        """Queue the marker for a low byte with no mapping."""
        self.filler.write('N,')

    def write_multic(self, point):
        """Queue the marker for a tuple-valued cell; *point* is not used."""
        self.filler.write('M,')

    def write_char(self, point):
        """Queue an int cell as decimal."""
        self.filler.write(str(point) + ',')

    def printmap(self):
        """Write the flat value array, then the 256-entry index array."""
        self.fp.write(f"static const {self.elemtype} __{self.prefix}_encmap[{len(self.filler)}] = {{\n")
        self.filler.printout(self.fp)
        self.fp.write("};\n\n")
        self.fp.write(f"static const {self.indextype} {self.prefix}_encmap[256] = {{\n")

        for i in range(256):
            # Only rows annotated by buildmap() get a real entry.
            if i not in self.encode_map or self.prefix not in self.encode_map[i]:
                self.filler.write("{", "0,", "0,", "0", "},")
                continue
            row = self.encode_map[i]
            self.filler.write("{", "__%s_encmap" % self.prefix, "+",
                              "%d" % row['midx'], ",",
                              "%d," % row['min'],
                              "%d" % row['max'], "},")

        self.filler.printout(self.fp)
        self.fp.write("};\n\n")
|
||||
|
||||
|
||||
def open_mapping_file(path, source):
    """Open the mapping table at *path* for reading in text mode.

    If the file cannot be opened, exit with a message naming *source*
    (the place the table is expected to come from) instead of a traceback.
    The caller owns the returned file object.
    """
    try:
        return open(path)
    except IOError:
        raise SystemExit(f'{source} is needed')
|
||||
|
||||
|
||||
def print_autogen(fo, source):
    """Write the one-line "do not edit" banner naming *source* to *fo*."""
    banner = f'// AUTO-GENERATED FILE FROM {source}: DO NOT EDIT\n'
    fo.write(banner)
|
||||
|
||||
|
||||
def loadmap(fo, natcol=0, unicol=1, sbcs=0):
    """Load a decode map from a whitespace-separated mapping table.

    *fo* is a seekable text file; it is rewound first, so the same file can
    be loaded repeatedly with different columns.  Everything after '#' on a
    line is ignored, and lines with fewer than two fields are skipped.

    *natcol* and *unicol* are the field indexes of the native code and of
    the Unicode value.  Both fields are integer literals in Python notation
    (for example ``0x8140``).  Native codes below 0x100 are dropped unless
    *sbcs* is true.

    Returns ``{native >> 8: {native & 0xff: unicode_value}}``.

    Raises ValueError when a selected field is not an integer literal and
    IndexError when a line has too few fields for the selected columns.
    """
    print("Loading from", fo)
    fo.seek(0, 0)
    decmap = {}
    for line in fo:
        fields = line.split('#', 1)[0].split()
        if len(fields) < 2:
            continue

        # Parse as numbers only: base 0 accepts the same integer literals
        # eval() did, without executing whatever else the file may contain.
        loc = int(fields[natcol], 0)
        uni = int(fields[unicol], 0)
        if loc >= 0x100 or sbcs:
            decmap.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap
|
7515
Tools/unicode/python-mappings/GB2312.TXT
Normal file
7515
Tools/unicode/python-mappings/GB2312.TXT
Normal file
File diff suppressed because it is too large
Load Diff
271
Tools/unicode/python-mappings/diff/jisx0213-2000-std.txt.diff
Normal file
271
Tools/unicode/python-mappings/diff/jisx0213-2000-std.txt.diff
Normal file
@ -0,0 +1,271 @@
|
||||
--- jisx0213-2000-std.txt.orig Tue Apr 16 23:32:38 2002
|
||||
+++ jisx0213-2000-std.txt Wed Jun 16 14:49:05 2004
|
||||
@@ -23,21 +23,21 @@
|
||||
3-2121 U+3000 # IDEOGRAPHIC SPACE
|
||||
3-2122 U+3001 # IDEOGRAPHIC COMMA
|
||||
3-2123 U+3002 # IDEOGRAPHIC FULL STOP
|
||||
-3-2124 U+002C # COMMA Fullwidth: U+FF0C
|
||||
-3-2125 U+002E # FULL STOP Fullwidth: U+FF0E
|
||||
+3-2124 U+FF0C # COMMA Fullwidth: U+FF0C
|
||||
+3-2125 U+FF0E # FULL STOP Fullwidth: U+FF0E
|
||||
3-2126 U+30FB # KATAKANA MIDDLE DOT
|
||||
-3-2127 U+003A # COLON Fullwidth: U+FF1A
|
||||
-3-2128 U+003B # SEMICOLON Fullwidth: U+FF1B
|
||||
-3-2129 U+003F # QUESTION MARK Fullwidth: U+FF1F
|
||||
-3-212A U+0021 # EXCLAMATION MARK Fullwidth: U+FF01
|
||||
+3-2127 U+FF1A # COLON Fullwidth: U+FF1A
|
||||
+3-2128 U+FF1B # SEMICOLON Fullwidth: U+FF1B
|
||||
+3-2129 U+FF1F # QUESTION MARK Fullwidth: U+FF1F
|
||||
+3-212A U+FF01 # EXCLAMATION MARK Fullwidth: U+FF01
|
||||
3-212B U+309B # KATAKANA-HIRAGANA VOICED SOUND MARK
|
||||
3-212C U+309C # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
3-212D U+00B4 # ACUTE ACCENT
|
||||
-3-212E U+0060 # GRAVE ACCENT Fullwidth: U+FF40
|
||||
+3-212E U+FF40 # GRAVE ACCENT Fullwidth: U+FF40
|
||||
3-212F U+00A8 # DIAERESIS
|
||||
-3-2130 U+005E # CIRCUMFLEX ACCENT Fullwidth: U+FF3E
|
||||
-3-2131 U+203E # OVERLINE Windows: U+FFE3
|
||||
-3-2132 U+005F # LOW LINE Fullwidth: U+FF3F
|
||||
+3-2130 U+FF3E # CIRCUMFLEX ACCENT Fullwidth: U+FF3E
|
||||
+3-2131 U+FFE3 # OVERLINE Windows: U+FFE3
|
||||
+3-2132 U+FF3F # LOW LINE Fullwidth: U+FF3F
|
||||
3-2133 U+30FD # KATAKANA ITERATION MARK
|
||||
3-2134 U+30FE # KATAKANA VOICED ITERATION MARK
|
||||
3-2135 U+309D # HIRAGANA ITERATION MARK
|
||||
@@ -48,27 +48,27 @@
|
||||
3-213A U+3006 # IDEOGRAPHIC CLOSING MARK
|
||||
3-213B U+3007 # IDEOGRAPHIC NUMBER ZERO
|
||||
3-213C U+30FC # KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
-3-213D U+2014 # EM DASH Windows: U+2015
|
||||
+3-213D U+2015 # EM DASH Windows: U+2015
|
||||
3-213E U+2010 # HYPHEN
|
||||
-3-213F U+002F # SOLIDUS Fullwidth: U+FF0F
|
||||
+3-213F U+FF0F # SOLIDUS Fullwidth: U+FF0F
|
||||
3-2140 U+005C # REVERSE SOLIDUS Fullwidth: U+FF3C
|
||||
3-2141 U+301C # WAVE DASH Windows: U+FF5E
|
||||
3-2142 U+2016 # DOUBLE VERTICAL LINE Windows: U+2225
|
||||
-3-2143 U+007C # VERTICAL LINE Fullwidth: U+FF5C
|
||||
+3-2143 U+FF5C # VERTICAL LINE Fullwidth: U+FF5C
|
||||
3-2144 U+2026 # HORIZONTAL ELLIPSIS
|
||||
3-2145 U+2025 # TWO DOT LEADER
|
||||
3-2146 U+2018 # LEFT SINGLE QUOTATION MARK
|
||||
3-2147 U+2019 # RIGHT SINGLE QUOTATION MARK
|
||||
3-2148 U+201C # LEFT DOUBLE QUOTATION MARK
|
||||
3-2149 U+201D # RIGHT DOUBLE QUOTATION MARK
|
||||
-3-214A U+0028 # LEFT PARENTHESIS Fullwidth: U+FF08
|
||||
-3-214B U+0029 # RIGHT PARENTHESIS Fullwidth: U+FF09
|
||||
+3-214A U+FF08 # LEFT PARENTHESIS Fullwidth: U+FF08
|
||||
+3-214B U+FF09 # RIGHT PARENTHESIS Fullwidth: U+FF09
|
||||
3-214C U+3014 # LEFT TORTOISE SHELL BRACKET
|
||||
3-214D U+3015 # RIGHT TORTOISE SHELL BRACKET
|
||||
-3-214E U+005B # LEFT SQUARE BRACKET Fullwidth: U+FF3B
|
||||
-3-214F U+005D # RIGHT SQUARE BRACKET Fullwidth: U+FF3D
|
||||
-3-2150 U+007B # LEFT CURLY BRACKET Fullwidth: U+FF5B
|
||||
-3-2151 U+007D # RIGHT CURLY BRACKET Fullwidth: U+FF5D
|
||||
+3-214E U+FF3B # LEFT SQUARE BRACKET Fullwidth: U+FF3B
|
||||
+3-214F U+FF3D # RIGHT SQUARE BRACKET Fullwidth: U+FF3D
|
||||
+3-2150 U+FF5B # LEFT CURLY BRACKET Fullwidth: U+FF5B
|
||||
+3-2151 U+FF5D # RIGHT CURLY BRACKET Fullwidth: U+FF5D
|
||||
3-2152 U+3008 # LEFT ANGLE BRACKET
|
||||
3-2153 U+3009 # RIGHT ANGLE BRACKET
|
||||
3-2154 U+300A # LEFT DOUBLE ANGLE BRACKET
|
||||
@@ -79,15 +79,15 @@
|
||||
3-2159 U+300F # RIGHT WHITE CORNER BRACKET
|
||||
3-215A U+3010 # LEFT BLACK LENTICULAR BRACKET
|
||||
3-215B U+3011 # RIGHT BLACK LENTICULAR BRACKET
|
||||
-3-215C U+002B # PLUS SIGN Fullwidth: U+FF0B
|
||||
+3-215C U+FF0B # PLUS SIGN Fullwidth: U+FF0B
|
||||
3-215D U+2212 # MINUS SIGN Windows: U+FF0D
|
||||
3-215E U+00B1 # PLUS-MINUS SIGN
|
||||
3-215F U+00D7 # MULTIPLICATION SIGN
|
||||
3-2160 U+00F7 # DIVISION SIGN
|
||||
-3-2161 U+003D # EQUALS SIGN Fullwidth: U+FF1D
|
||||
+3-2161 U+FF1D # EQUALS SIGN Fullwidth: U+FF1D
|
||||
3-2162 U+2260 # NOT EQUAL TO
|
||||
-3-2163 U+003C # LESS-THAN SIGN Fullwidth: U+FF1C
|
||||
-3-2164 U+003E # GREATER-THAN SIGN Fullwidth: U+FF1E
|
||||
+3-2163 U+FF1C # LESS-THAN SIGN Fullwidth: U+FF1C
|
||||
+3-2164 U+FF1E # GREATER-THAN SIGN Fullwidth: U+FF1E
|
||||
3-2165 U+2266 # LESS-THAN OVER EQUAL TO
|
||||
3-2166 U+2267 # GREATER-THAN OVER EQUAL TO
|
||||
3-2167 U+221E # INFINITY
|
||||
@@ -98,15 +98,15 @@
|
||||
3-216C U+2032 # PRIME
|
||||
3-216D U+2033 # DOUBLE PRIME
|
||||
3-216E U+2103 # DEGREE CELSIUS
|
||||
-3-216F U+00A5 # YEN SIGN Windows: U+FFE5
|
||||
-3-2170 U+0024 # DOLLAR SIGN Fullwidth: U+FF04
|
||||
+3-216F U+FFE5 # YEN SIGN Windows: U+FFE5
|
||||
+3-2170 U+FF04 # DOLLAR SIGN Fullwidth: U+FF04
|
||||
3-2171 U+00A2 # CENT SIGN Windows: U+FFE0
|
||||
3-2172 U+00A3 # POUND SIGN Windows: U+FFE1
|
||||
-3-2173 U+0025 # PERCENT SIGN Fullwidth: U+FF05
|
||||
-3-2174 U+0023 # NUMBER SIGN Fullwidth: U+FF03
|
||||
-3-2175 U+0026 # AMPERSAND Fullwidth: U+FF06
|
||||
-3-2176 U+002A # ASTERISK Fullwidth: U+FF0A
|
||||
-3-2177 U+0040 # COMMERCIAL AT Fullwidth: U+FF20
|
||||
+3-2173 U+FF05 # PERCENT SIGN Fullwidth: U+FF05
|
||||
+3-2174 U+FF03 # NUMBER SIGN Fullwidth: U+FF03
|
||||
+3-2175 U+FF06 # AMPERSAND Fullwidth: U+FF06
|
||||
+3-2176 U+FF0A # ASTERISK Fullwidth: U+FF0A
|
||||
+3-2177 U+FF20 # COMMERCIAL AT Fullwidth: U+FF20
|
||||
3-2178 U+00A7 # SECTION SIGN
|
||||
3-2179 U+2606 # WHITE STAR
|
||||
3-217A U+2605 # BLACK STAR
|
||||
@@ -128,9 +128,9 @@
|
||||
3-222C U+2191 # UPWARDS ARROW
|
||||
3-222D U+2193 # DOWNWARDS ARROW
|
||||
3-222E U+3013 # GETA MARK
|
||||
-3-222F U+0027 # APOSTROPHE Fullwidth: U+FF07
|
||||
-3-2230 U+0022 # QUOTATION MARK [2000] Fullwidth: U+FF02
|
||||
-3-2231 U+002D # HYPHEN-MINUS [2000] Fullwidth: U+FF0D
|
||||
+3-222F U+FF07 # APOSTROPHE Fullwidth: U+FF07
|
||||
+3-2230 U+FF02 # QUOTATION MARK [2000] Fullwidth: U+FF02
|
||||
+3-2231 U+FF0D # HYPHEN-MINUS [2000] Fullwidth: U+FF0D
|
||||
3-2232 U+007E # TILDE [2000] Fullwidth: U+FF5E
|
||||
3-2233 U+3033 # VERTICAL KANA REPEAT MARK UPPER HALF [2000]
|
||||
3-2234 U+3034 # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF [2000]
|
||||
@@ -223,16 +223,16 @@
|
||||
3-232D U+21E9 # DOWNWARDS WHITE ARROW [2000]
|
||||
3-232E U+2934 # ARROW POINTING RIGHTWARDS THEN CURVING UPWARDS [2000] [Unicode3.2]
|
||||
3-232F U+2935 # ARROW POINTING RIGHTWARDS THEN CURVING DOWNWARDS [2000] [Unicode3.2]
|
||||
-3-2330 U+0030 # DIGIT ZERO Fullwidth: U+FF10
|
||||
-3-2331 U+0031 # DIGIT ONE Fullwidth: U+FF11
|
||||
-3-2332 U+0032 # DIGIT TWO Fullwidth: U+FF12
|
||||
-3-2333 U+0033 # DIGIT THREE Fullwidth: U+FF13
|
||||
-3-2334 U+0034 # DIGIT FOUR Fullwidth: U+FF14
|
||||
-3-2335 U+0035 # DIGIT FIVE Fullwidth: U+FF15
|
||||
-3-2336 U+0036 # DIGIT SIX Fullwidth: U+FF16
|
||||
-3-2337 U+0037 # DIGIT SEVEN Fullwidth: U+FF17
|
||||
-3-2338 U+0038 # DIGIT EIGHT Fullwidth: U+FF18
|
||||
-3-2339 U+0039 # DIGIT NINE Fullwidth: U+FF19
|
||||
+3-2330 U+FF10 # DIGIT ZERO Fullwidth: U+FF10
|
||||
+3-2331 U+FF11 # DIGIT ONE Fullwidth: U+FF11
|
||||
+3-2332 U+FF12 # DIGIT TWO Fullwidth: U+FF12
|
||||
+3-2333 U+FF13 # DIGIT THREE Fullwidth: U+FF13
|
||||
+3-2334 U+FF14 # DIGIT FOUR Fullwidth: U+FF14
|
||||
+3-2335 U+FF15 # DIGIT FIVE Fullwidth: U+FF15
|
||||
+3-2336 U+FF16 # DIGIT SIX Fullwidth: U+FF16
|
||||
+3-2337 U+FF17 # DIGIT SEVEN Fullwidth: U+FF17
|
||||
+3-2338 U+FF18 # DIGIT EIGHT Fullwidth: U+FF18
|
||||
+3-2339 U+FF19 # DIGIT NINE Fullwidth: U+FF19
|
||||
3-233A U+29BF # CIRCLED BULLET [2000] [Unicode3.2]
|
||||
3-233B U+25C9 # FISHEYE [2000]
|
||||
3-233C U+303D # PART ALTERNATION MARK [2000] [Unicode3.2]
|
||||
@@ -240,64 +240,64 @@
|
||||
3-233E U+FE45 # SESAME DOT [2000] [Unicode3.2]
|
||||
3-233F U+25E6 # WHITE BULLET [2000]
|
||||
3-2340 U+2022 # BULLET [2000]
|
||||
-3-2341 U+0041 # LATIN CAPITAL LETTER A Fullwidth: U+FF21
|
||||
-3-2342 U+0042 # LATIN CAPITAL LETTER B Fullwidth: U+FF22
|
||||
-3-2343 U+0043 # LATIN CAPITAL LETTER C Fullwidth: U+FF23
|
||||
-3-2344 U+0044 # LATIN CAPITAL LETTER D Fullwidth: U+FF24
|
||||
-3-2345 U+0045 # LATIN CAPITAL LETTER E Fullwidth: U+FF25
|
||||
-3-2346 U+0046 # LATIN CAPITAL LETTER F Fullwidth: U+FF26
|
||||
-3-2347 U+0047 # LATIN CAPITAL LETTER G Fullwidth: U+FF27
|
||||
-3-2348 U+0048 # LATIN CAPITAL LETTER H Fullwidth: U+FF28
|
||||
-3-2349 U+0049 # LATIN CAPITAL LETTER I Fullwidth: U+FF29
|
||||
-3-234A U+004A # LATIN CAPITAL LETTER J Fullwidth: U+FF2A
|
||||
-3-234B U+004B # LATIN CAPITAL LETTER K Fullwidth: U+FF2B
|
||||
-3-234C U+004C # LATIN CAPITAL LETTER L Fullwidth: U+FF2C
|
||||
-3-234D U+004D # LATIN CAPITAL LETTER M Fullwidth: U+FF2D
|
||||
-3-234E U+004E # LATIN CAPITAL LETTER N Fullwidth: U+FF2E
|
||||
-3-234F U+004F # LATIN CAPITAL LETTER O Fullwidth: U+FF2F
|
||||
-3-2350 U+0050 # LATIN CAPITAL LETTER P Fullwidth: U+FF30
|
||||
-3-2351 U+0051 # LATIN CAPITAL LETTER Q Fullwidth: U+FF31
|
||||
-3-2352 U+0052 # LATIN CAPITAL LETTER R Fullwidth: U+FF32
|
||||
-3-2353 U+0053 # LATIN CAPITAL LETTER S Fullwidth: U+FF33
|
||||
-3-2354 U+0054 # LATIN CAPITAL LETTER T Fullwidth: U+FF34
|
||||
-3-2355 U+0055 # LATIN CAPITAL LETTER U Fullwidth: U+FF35
|
||||
-3-2356 U+0056 # LATIN CAPITAL LETTER V Fullwidth: U+FF36
|
||||
-3-2357 U+0057 # LATIN CAPITAL LETTER W Fullwidth: U+FF37
|
||||
-3-2358 U+0058 # LATIN CAPITAL LETTER X Fullwidth: U+FF38
|
||||
-3-2359 U+0059 # LATIN CAPITAL LETTER Y Fullwidth: U+FF39
|
||||
-3-235A U+005A # LATIN CAPITAL LETTER Z Fullwidth: U+FF3A
|
||||
+3-2341 U+FF21 # LATIN CAPITAL LETTER A Fullwidth: U+FF21
|
||||
+3-2342 U+FF22 # LATIN CAPITAL LETTER B Fullwidth: U+FF22
|
||||
+3-2343 U+FF23 # LATIN CAPITAL LETTER C Fullwidth: U+FF23
|
||||
+3-2344 U+FF24 # LATIN CAPITAL LETTER D Fullwidth: U+FF24
|
||||
+3-2345 U+FF25 # LATIN CAPITAL LETTER E Fullwidth: U+FF25
|
||||
+3-2346 U+FF26 # LATIN CAPITAL LETTER F Fullwidth: U+FF26
|
||||
+3-2347 U+FF27 # LATIN CAPITAL LETTER G Fullwidth: U+FF27
|
||||
+3-2348 U+FF28 # LATIN CAPITAL LETTER H Fullwidth: U+FF28
|
||||
+3-2349 U+FF29 # LATIN CAPITAL LETTER I Fullwidth: U+FF29
|
||||
+3-234A U+FF2A # LATIN CAPITAL LETTER J Fullwidth: U+FF2A
|
||||
+3-234B U+FF2B # LATIN CAPITAL LETTER K Fullwidth: U+FF2B
|
||||
+3-234C U+FF2C # LATIN CAPITAL LETTER L Fullwidth: U+FF2C
|
||||
+3-234D U+FF2D # LATIN CAPITAL LETTER M Fullwidth: U+FF2D
|
||||
+3-234E U+FF2E # LATIN CAPITAL LETTER N Fullwidth: U+FF2E
|
||||
+3-234F U+FF2F # LATIN CAPITAL LETTER O Fullwidth: U+FF2F
|
||||
+3-2350 U+FF30 # LATIN CAPITAL LETTER P Fullwidth: U+FF30
|
||||
+3-2351 U+FF31 # LATIN CAPITAL LETTER Q Fullwidth: U+FF31
|
||||
+3-2352 U+FF32 # LATIN CAPITAL LETTER R Fullwidth: U+FF32
|
||||
+3-2353 U+FF33 # LATIN CAPITAL LETTER S Fullwidth: U+FF33
|
||||
+3-2354 U+FF34 # LATIN CAPITAL LETTER T Fullwidth: U+FF34
|
||||
+3-2355 U+FF35 # LATIN CAPITAL LETTER U Fullwidth: U+FF35
|
||||
+3-2356 U+FF36 # LATIN CAPITAL LETTER V Fullwidth: U+FF36
|
||||
+3-2357 U+FF37 # LATIN CAPITAL LETTER W Fullwidth: U+FF37
|
||||
+3-2358 U+FF38 # LATIN CAPITAL LETTER X Fullwidth: U+FF38
|
||||
+3-2359 U+FF39 # LATIN CAPITAL LETTER Y Fullwidth: U+FF39
|
||||
+3-235A U+FF3A # LATIN CAPITAL LETTER Z Fullwidth: U+FF3A
|
||||
3-235B U+2213 # MINUS-OR-PLUS SIGN [2000]
|
||||
3-235C U+2135 # ALEF SYMBOL [2000]
|
||||
3-235D U+210F # PLANCK CONSTANT OVER TWO PI [2000]
|
||||
3-235E U+33CB # SQUARE HP [2000]
|
||||
3-235F U+2113 # SCRIPT SMALL L [2000]
|
||||
3-2360 U+2127 # INVERTED OHM SIGN [2000]
|
||||
-3-2361 U+0061 # LATIN SMALL LETTER A Fullwidth: U+FF41
|
||||
-3-2362 U+0062 # LATIN SMALL LETTER B Fullwidth: U+FF42
|
||||
-3-2363 U+0063 # LATIN SMALL LETTER C Fullwidth: U+FF43
|
||||
-3-2364 U+0064 # LATIN SMALL LETTER D Fullwidth: U+FF44
|
||||
-3-2365 U+0065 # LATIN SMALL LETTER E Fullwidth: U+FF45
|
||||
-3-2366 U+0066 # LATIN SMALL LETTER F Fullwidth: U+FF46
|
||||
-3-2367 U+0067 # LATIN SMALL LETTER G Fullwidth: U+FF47
|
||||
-3-2368 U+0068 # LATIN SMALL LETTER H Fullwidth: U+FF48
|
||||
-3-2369 U+0069 # LATIN SMALL LETTER I Fullwidth: U+FF49
|
||||
-3-236A U+006A # LATIN SMALL LETTER J Fullwidth: U+FF4A
|
||||
-3-236B U+006B # LATIN SMALL LETTER K Fullwidth: U+FF4B
|
||||
-3-236C U+006C # LATIN SMALL LETTER L Fullwidth: U+FF4C
|
||||
-3-236D U+006D # LATIN SMALL LETTER M Fullwidth: U+FF4D
|
||||
-3-236E U+006E # LATIN SMALL LETTER N Fullwidth: U+FF4E
|
||||
-3-236F U+006F # LATIN SMALL LETTER O Fullwidth: U+FF4F
|
||||
-3-2370 U+0070 # LATIN SMALL LETTER P Fullwidth: U+FF50
|
||||
-3-2371 U+0071 # LATIN SMALL LETTER Q Fullwidth: U+FF51
|
||||
-3-2372 U+0072 # LATIN SMALL LETTER R Fullwidth: U+FF52
|
||||
-3-2373 U+0073 # LATIN SMALL LETTER S Fullwidth: U+FF53
|
||||
-3-2374 U+0074 # LATIN SMALL LETTER T Fullwidth: U+FF54
|
||||
-3-2375 U+0075 # LATIN SMALL LETTER U Fullwidth: U+FF55
|
||||
-3-2376 U+0076 # LATIN SMALL LETTER V Fullwidth: U+FF56
|
||||
-3-2377 U+0077 # LATIN SMALL LETTER W Fullwidth: U+FF57
|
||||
-3-2378 U+0078 # LATIN SMALL LETTER X Fullwidth: U+FF58
|
||||
-3-2379 U+0079 # LATIN SMALL LETTER Y Fullwidth: U+FF59
|
||||
-3-237A U+007A # LATIN SMALL LETTER Z Fullwidth: U+FF5A
|
||||
+3-2361 U+FF41 # LATIN SMALL LETTER A Fullwidth: U+FF41
|
||||
+3-2362 U+FF42 # LATIN SMALL LETTER B Fullwidth: U+FF42
|
||||
+3-2363 U+FF43 # LATIN SMALL LETTER C Fullwidth: U+FF43
|
||||
+3-2364 U+FF44 # LATIN SMALL LETTER D Fullwidth: U+FF44
|
||||
+3-2365 U+FF45 # LATIN SMALL LETTER E Fullwidth: U+FF45
|
||||
+3-2366 U+FF46 # LATIN SMALL LETTER F Fullwidth: U+FF46
|
||||
+3-2367 U+FF47 # LATIN SMALL LETTER G Fullwidth: U+FF47
|
||||
+3-2368 U+FF48 # LATIN SMALL LETTER H Fullwidth: U+FF48
|
||||
+3-2369 U+FF49 # LATIN SMALL LETTER I Fullwidth: U+FF49
|
||||
+3-236A U+FF4A # LATIN SMALL LETTER J Fullwidth: U+FF4A
|
||||
+3-236B U+FF4B # LATIN SMALL LETTER K Fullwidth: U+FF4B
|
||||
+3-236C U+FF4C # LATIN SMALL LETTER L Fullwidth: U+FF4C
|
||||
+3-236D U+FF4D # LATIN SMALL LETTER M Fullwidth: U+FF4D
|
||||
+3-236E U+FF4E # LATIN SMALL LETTER N Fullwidth: U+FF4E
|
||||
+3-236F U+FF4F # LATIN SMALL LETTER O Fullwidth: U+FF4F
|
||||
+3-2370 U+FF50 # LATIN SMALL LETTER P Fullwidth: U+FF50
|
||||
+3-2371 U+FF51 # LATIN SMALL LETTER Q Fullwidth: U+FF51
|
||||
+3-2372 U+FF52 # LATIN SMALL LETTER R Fullwidth: U+FF52
|
||||
+3-2373 U+FF53 # LATIN SMALL LETTER S Fullwidth: U+FF53
|
||||
+3-2374 U+FF54 # LATIN SMALL LETTER T Fullwidth: U+FF54
|
||||
+3-2375 U+FF55 # LATIN SMALL LETTER U Fullwidth: U+FF55
|
||||
+3-2376 U+FF56 # LATIN SMALL LETTER V Fullwidth: U+FF56
|
||||
+3-2377 U+FF57 # LATIN SMALL LETTER W Fullwidth: U+FF57
|
||||
+3-2378 U+FF58 # LATIN SMALL LETTER X Fullwidth: U+FF58
|
||||
+3-2379 U+FF59 # LATIN SMALL LETTER Y Fullwidth: U+FF59
|
||||
+3-237A U+FF5A # LATIN SMALL LETTER Z Fullwidth: U+FF5A
|
||||
3-237B U+30A0 # KATAKANA-HIRAGANA DOUBLE HYPHEN [2000] [Unicode3.2]
|
||||
3-237C U+2013 # EN DASH [2000]
|
||||
3-237D U+29FA # DOUBLE PLUS [2000] [Unicode3.2]
|
351
Tools/unicode/python-mappings/diff/jisx0213-2004-std.txt.diff
Normal file
351
Tools/unicode/python-mappings/diff/jisx0213-2004-std.txt.diff
Normal file
@ -0,0 +1,351 @@
|
||||
--- jisx0213-2000-std.txt.orig Tue Apr 16 23:32:38 2002
|
||||
+++ jisx0213-2004-std.txt Thu Jul 8 11:51:54 2004
|
||||
@@ -1,6 +1,6 @@
|
||||
-## JIS X 0213:2000 vs Unicode mapping table
|
||||
+## JIS X 0213:2004 vs Unicode mapping table
|
||||
##
|
||||
-## Date: 16 Apr 2002 13:09:49 GMT
|
||||
+## Date: 7 Jul 2004 13:09:49 GMT
|
||||
## License:
|
||||
## Copyright (C) 2001 earthian@tama.or.jp, All Rights Reserved.
|
||||
## Copyright (C) 2001 I'O, All Rights Reserved.
|
||||
@@ -23,21 +23,21 @@
|
||||
3-2121 U+3000 # IDEOGRAPHIC SPACE
|
||||
3-2122 U+3001 # IDEOGRAPHIC COMMA
|
||||
3-2123 U+3002 # IDEOGRAPHIC FULL STOP
|
||||
-3-2124 U+002C # COMMA Fullwidth: U+FF0C
|
||||
-3-2125 U+002E # FULL STOP Fullwidth: U+FF0E
|
||||
+3-2124 U+FF0C # COMMA Fullwidth: U+FF0C
|
||||
+3-2125 U+FF0E # FULL STOP Fullwidth: U+FF0E
|
||||
3-2126 U+30FB # KATAKANA MIDDLE DOT
|
||||
-3-2127 U+003A # COLON Fullwidth: U+FF1A
|
||||
-3-2128 U+003B # SEMICOLON Fullwidth: U+FF1B
|
||||
-3-2129 U+003F # QUESTION MARK Fullwidth: U+FF1F
|
||||
-3-212A U+0021 # EXCLAMATION MARK Fullwidth: U+FF01
|
||||
+3-2127 U+FF1A # COLON Fullwidth: U+FF1A
|
||||
+3-2128 U+FF1B # SEMICOLON Fullwidth: U+FF1B
|
||||
+3-2129 U+FF1F # QUESTION MARK Fullwidth: U+FF1F
|
||||
+3-212A U+FF01 # EXCLAMATION MARK Fullwidth: U+FF01
|
||||
3-212B U+309B # KATAKANA-HIRAGANA VOICED SOUND MARK
|
||||
3-212C U+309C # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
3-212D U+00B4 # ACUTE ACCENT
|
||||
-3-212E U+0060 # GRAVE ACCENT Fullwidth: U+FF40
|
||||
+3-212E U+FF40 # GRAVE ACCENT Fullwidth: U+FF40
|
||||
3-212F U+00A8 # DIAERESIS
|
||||
-3-2130 U+005E # CIRCUMFLEX ACCENT Fullwidth: U+FF3E
|
||||
-3-2131 U+203E # OVERLINE Windows: U+FFE3
|
||||
-3-2132 U+005F # LOW LINE Fullwidth: U+FF3F
|
||||
+3-2130 U+FF3E # CIRCUMFLEX ACCENT Fullwidth: U+FF3E
|
||||
+3-2131 U+FFE3 # OVERLINE Windows: U+FFE3
|
||||
+3-2132 U+FF3F # LOW LINE Fullwidth: U+FF3F
|
||||
3-2133 U+30FD # KATAKANA ITERATION MARK
|
||||
3-2134 U+30FE # KATAKANA VOICED ITERATION MARK
|
||||
3-2135 U+309D # HIRAGANA ITERATION MARK
|
||||
@@ -48,27 +48,27 @@
|
||||
3-213A U+3006 # IDEOGRAPHIC CLOSING MARK
|
||||
3-213B U+3007 # IDEOGRAPHIC NUMBER ZERO
|
||||
3-213C U+30FC # KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
-3-213D U+2014 # EM DASH Windows: U+2015
|
||||
+3-213D U+2015 # EM DASH Windows: U+2015
|
||||
3-213E U+2010 # HYPHEN
|
||||
-3-213F U+002F # SOLIDUS Fullwidth: U+FF0F
|
||||
+3-213F U+FF0F # SOLIDUS Fullwidth: U+FF0F
|
||||
3-2140 U+005C # REVERSE SOLIDUS Fullwidth: U+FF3C
|
||||
3-2141 U+301C # WAVE DASH Windows: U+FF5E
|
||||
3-2142 U+2016 # DOUBLE VERTICAL LINE Windows: U+2225
|
||||
-3-2143 U+007C # VERTICAL LINE Fullwidth: U+FF5C
|
||||
+3-2143 U+FF5C # VERTICAL LINE Fullwidth: U+FF5C
|
||||
3-2144 U+2026 # HORIZONTAL ELLIPSIS
|
||||
3-2145 U+2025 # TWO DOT LEADER
|
||||
3-2146 U+2018 # LEFT SINGLE QUOTATION MARK
|
||||
3-2147 U+2019 # RIGHT SINGLE QUOTATION MARK
|
||||
3-2148 U+201C # LEFT DOUBLE QUOTATION MARK
|
||||
3-2149 U+201D # RIGHT DOUBLE QUOTATION MARK
|
||||
-3-214A U+0028 # LEFT PARENTHESIS Fullwidth: U+FF08
|
||||
-3-214B U+0029 # RIGHT PARENTHESIS Fullwidth: U+FF09
|
||||
+3-214A U+FF08 # LEFT PARENTHESIS Fullwidth: U+FF08
|
||||
+3-214B U+FF09 # RIGHT PARENTHESIS Fullwidth: U+FF09
|
||||
3-214C U+3014 # LEFT TORTOISE SHELL BRACKET
|
||||
3-214D U+3015 # RIGHT TORTOISE SHELL BRACKET
|
||||
-3-214E U+005B # LEFT SQUARE BRACKET Fullwidth: U+FF3B
|
||||
-3-214F U+005D # RIGHT SQUARE BRACKET Fullwidth: U+FF3D
|
||||
-3-2150 U+007B # LEFT CURLY BRACKET Fullwidth: U+FF5B
|
||||
-3-2151 U+007D # RIGHT CURLY BRACKET Fullwidth: U+FF5D
|
||||
+3-214E U+FF3B # LEFT SQUARE BRACKET Fullwidth: U+FF3B
|
||||
+3-214F U+FF3D # RIGHT SQUARE BRACKET Fullwidth: U+FF3D
|
||||
+3-2150 U+FF5B # LEFT CURLY BRACKET Fullwidth: U+FF5B
|
||||
+3-2151 U+FF5D # RIGHT CURLY BRACKET Fullwidth: U+FF5D
|
||||
3-2152 U+3008 # LEFT ANGLE BRACKET
|
||||
3-2153 U+3009 # RIGHT ANGLE BRACKET
|
||||
3-2154 U+300A # LEFT DOUBLE ANGLE BRACKET
|
||||
@@ -79,15 +79,15 @@
|
||||
3-2159 U+300F # RIGHT WHITE CORNER BRACKET
|
||||
3-215A U+3010 # LEFT BLACK LENTICULAR BRACKET
|
||||
3-215B U+3011 # RIGHT BLACK LENTICULAR BRACKET
|
||||
-3-215C U+002B # PLUS SIGN Fullwidth: U+FF0B
|
||||
+3-215C U+FF0B # PLUS SIGN Fullwidth: U+FF0B
|
||||
3-215D U+2212 # MINUS SIGN Windows: U+FF0D
|
||||
3-215E U+00B1 # PLUS-MINUS SIGN
|
||||
3-215F U+00D7 # MULTIPLICATION SIGN
|
||||
3-2160 U+00F7 # DIVISION SIGN
|
||||
-3-2161 U+003D # EQUALS SIGN Fullwidth: U+FF1D
|
||||
+3-2161 U+FF1D # EQUALS SIGN Fullwidth: U+FF1D
|
||||
3-2162 U+2260 # NOT EQUAL TO
|
||||
-3-2163 U+003C # LESS-THAN SIGN Fullwidth: U+FF1C
|
||||
-3-2164 U+003E # GREATER-THAN SIGN Fullwidth: U+FF1E
|
||||
+3-2163 U+FF1C # LESS-THAN SIGN Fullwidth: U+FF1C
|
||||
+3-2164 U+FF1E # GREATER-THAN SIGN Fullwidth: U+FF1E
|
||||
3-2165 U+2266 # LESS-THAN OVER EQUAL TO
|
||||
3-2166 U+2267 # GREATER-THAN OVER EQUAL TO
|
||||
3-2167 U+221E # INFINITY
|
||||
@@ -98,15 +98,15 @@
|
||||
3-216C U+2032 # PRIME
|
||||
3-216D U+2033 # DOUBLE PRIME
|
||||
3-216E U+2103 # DEGREE CELSIUS
|
||||
-3-216F U+00A5 # YEN SIGN Windows: U+FFE5
|
||||
-3-2170 U+0024 # DOLLAR SIGN Fullwidth: U+FF04
|
||||
+3-216F U+FFE5 # YEN SIGN Windows: U+FFE5
|
||||
+3-2170 U+FF04 # DOLLAR SIGN Fullwidth: U+FF04
|
||||
3-2171 U+00A2 # CENT SIGN Windows: U+FFE0
|
||||
3-2172 U+00A3 # POUND SIGN Windows: U+FFE1
|
||||
-3-2173 U+0025 # PERCENT SIGN Fullwidth: U+FF05
|
||||
-3-2174 U+0023 # NUMBER SIGN Fullwidth: U+FF03
|
||||
-3-2175 U+0026 # AMPERSAND Fullwidth: U+FF06
|
||||
-3-2176 U+002A # ASTERISK Fullwidth: U+FF0A
|
||||
-3-2177 U+0040 # COMMERCIAL AT Fullwidth: U+FF20
|
||||
+3-2173 U+FF05 # PERCENT SIGN Fullwidth: U+FF05
|
||||
+3-2174 U+FF03 # NUMBER SIGN Fullwidth: U+FF03
|
||||
+3-2175 U+FF06 # AMPERSAND Fullwidth: U+FF06
|
||||
+3-2176 U+FF0A # ASTERISK Fullwidth: U+FF0A
|
||||
+3-2177 U+FF20 # COMMERCIAL AT Fullwidth: U+FF20
|
||||
3-2178 U+00A7 # SECTION SIGN
|
||||
3-2179 U+2606 # WHITE STAR
|
||||
3-217A U+2605 # BLACK STAR
|
||||
@@ -128,9 +128,9 @@
|
||||
3-222C U+2191 # UPWARDS ARROW
|
||||
3-222D U+2193 # DOWNWARDS ARROW
|
||||
3-222E U+3013 # GETA MARK
|
||||
-3-222F U+0027 # APOSTROPHE Fullwidth: U+FF07
|
||||
-3-2230 U+0022 # QUOTATION MARK [2000] Fullwidth: U+FF02
|
||||
-3-2231 U+002D # HYPHEN-MINUS [2000] Fullwidth: U+FF0D
|
||||
+3-222F U+FF07 # APOSTROPHE Fullwidth: U+FF07
|
||||
+3-2230 U+FF02 # QUOTATION MARK [2000] Fullwidth: U+FF02
|
||||
+3-2231 U+FF0D # HYPHEN-MINUS [2000] Fullwidth: U+FF0D
|
||||
3-2232 U+007E # TILDE [2000] Fullwidth: U+FF5E
|
||||
3-2233 U+3033 # VERTICAL KANA REPEAT MARK UPPER HALF [2000]
|
||||
3-2234 U+3034 # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF [2000]
|
||||
@@ -223,16 +223,16 @@
|
||||
3-232D U+21E9 # DOWNWARDS WHITE ARROW [2000]
|
||||
3-232E U+2934 # ARROW POINTING RIGHTWARDS THEN CURVING UPWARDS [2000] [Unicode3.2]
|
||||
3-232F U+2935 # ARROW POINTING RIGHTWARDS THEN CURVING DOWNWARDS [2000] [Unicode3.2]
|
||||
-3-2330 U+0030 # DIGIT ZERO Fullwidth: U+FF10
|
||||
-3-2331 U+0031 # DIGIT ONE Fullwidth: U+FF11
|
||||
-3-2332 U+0032 # DIGIT TWO Fullwidth: U+FF12
|
||||
-3-2333 U+0033 # DIGIT THREE Fullwidth: U+FF13
|
||||
-3-2334 U+0034 # DIGIT FOUR Fullwidth: U+FF14
|
||||
-3-2335 U+0035 # DIGIT FIVE Fullwidth: U+FF15
|
||||
-3-2336 U+0036 # DIGIT SIX Fullwidth: U+FF16
|
||||
-3-2337 U+0037 # DIGIT SEVEN Fullwidth: U+FF17
|
||||
-3-2338 U+0038 # DIGIT EIGHT Fullwidth: U+FF18
|
||||
-3-2339 U+0039 # DIGIT NINE Fullwidth: U+FF19
|
||||
+3-2330 U+FF10 # DIGIT ZERO Fullwidth: U+FF10
|
||||
+3-2331 U+FF11 # DIGIT ONE Fullwidth: U+FF11
|
||||
+3-2332 U+FF12 # DIGIT TWO Fullwidth: U+FF12
|
||||
+3-2333 U+FF13 # DIGIT THREE Fullwidth: U+FF13
|
||||
+3-2334 U+FF14 # DIGIT FOUR Fullwidth: U+FF14
|
||||
+3-2335 U+FF15 # DIGIT FIVE Fullwidth: U+FF15
|
||||
+3-2336 U+FF16 # DIGIT SIX Fullwidth: U+FF16
|
||||
+3-2337 U+FF17 # DIGIT SEVEN Fullwidth: U+FF17
|
||||
+3-2338 U+FF18 # DIGIT EIGHT Fullwidth: U+FF18
|
||||
+3-2339 U+FF19 # DIGIT NINE Fullwidth: U+FF19
|
||||
3-233A U+29BF # CIRCLED BULLET [2000] [Unicode3.2]
|
||||
3-233B U+25C9 # FISHEYE [2000]
|
||||
3-233C U+303D # PART ALTERNATION MARK [2000] [Unicode3.2]
|
||||
@@ -240,64 +240,64 @@
|
||||
3-233E U+FE45 # SESAME DOT [2000] [Unicode3.2]
|
||||
3-233F U+25E6 # WHITE BULLET [2000]
|
||||
3-2340 U+2022 # BULLET [2000]
|
||||
-3-2341 U+0041 # LATIN CAPITAL LETTER A Fullwidth: U+FF21
|
||||
-3-2342 U+0042 # LATIN CAPITAL LETTER B Fullwidth: U+FF22
|
||||
-3-2343 U+0043 # LATIN CAPITAL LETTER C Fullwidth: U+FF23
|
||||
-3-2344 U+0044 # LATIN CAPITAL LETTER D Fullwidth: U+FF24
|
||||
-3-2345 U+0045 # LATIN CAPITAL LETTER E Fullwidth: U+FF25
|
||||
-3-2346 U+0046 # LATIN CAPITAL LETTER F Fullwidth: U+FF26
|
||||
-3-2347 U+0047 # LATIN CAPITAL LETTER G Fullwidth: U+FF27
|
||||
-3-2348 U+0048 # LATIN CAPITAL LETTER H Fullwidth: U+FF28
|
||||
-3-2349 U+0049 # LATIN CAPITAL LETTER I Fullwidth: U+FF29
|
||||
-3-234A U+004A # LATIN CAPITAL LETTER J Fullwidth: U+FF2A
|
||||
-3-234B U+004B # LATIN CAPITAL LETTER K Fullwidth: U+FF2B
|
||||
-3-234C U+004C # LATIN CAPITAL LETTER L Fullwidth: U+FF2C
|
||||
-3-234D U+004D # LATIN CAPITAL LETTER M Fullwidth: U+FF2D
|
||||
-3-234E U+004E # LATIN CAPITAL LETTER N Fullwidth: U+FF2E
|
||||
-3-234F U+004F # LATIN CAPITAL LETTER O Fullwidth: U+FF2F
|
||||
-3-2350 U+0050 # LATIN CAPITAL LETTER P Fullwidth: U+FF30
|
||||
-3-2351 U+0051 # LATIN CAPITAL LETTER Q Fullwidth: U+FF31
|
||||
-3-2352 U+0052 # LATIN CAPITAL LETTER R Fullwidth: U+FF32
|
||||
-3-2353 U+0053 # LATIN CAPITAL LETTER S Fullwidth: U+FF33
|
||||
-3-2354 U+0054 # LATIN CAPITAL LETTER T Fullwidth: U+FF34
|
||||
-3-2355 U+0055 # LATIN CAPITAL LETTER U Fullwidth: U+FF35
|
||||
-3-2356 U+0056 # LATIN CAPITAL LETTER V Fullwidth: U+FF36
|
||||
-3-2357 U+0057 # LATIN CAPITAL LETTER W Fullwidth: U+FF37
|
||||
-3-2358 U+0058 # LATIN CAPITAL LETTER X Fullwidth: U+FF38
|
||||
-3-2359 U+0059 # LATIN CAPITAL LETTER Y Fullwidth: U+FF39
|
||||
-3-235A U+005A # LATIN CAPITAL LETTER Z Fullwidth: U+FF3A
|
||||
+3-2341 U+FF21 # LATIN CAPITAL LETTER A Fullwidth: U+FF21
|
||||
+3-2342 U+FF22 # LATIN CAPITAL LETTER B Fullwidth: U+FF22
|
||||
+3-2343 U+FF23 # LATIN CAPITAL LETTER C Fullwidth: U+FF23
|
||||
+3-2344 U+FF24 # LATIN CAPITAL LETTER D Fullwidth: U+FF24
|
||||
+3-2345 U+FF25 # LATIN CAPITAL LETTER E Fullwidth: U+FF25
|
||||
+3-2346 U+FF26 # LATIN CAPITAL LETTER F Fullwidth: U+FF26
|
||||
+3-2347 U+FF27 # LATIN CAPITAL LETTER G Fullwidth: U+FF27
|
||||
+3-2348 U+FF28 # LATIN CAPITAL LETTER H Fullwidth: U+FF28
|
||||
+3-2349 U+FF29 # LATIN CAPITAL LETTER I Fullwidth: U+FF29
|
||||
+3-234A U+FF2A # LATIN CAPITAL LETTER J Fullwidth: U+FF2A
|
||||
+3-234B U+FF2B # LATIN CAPITAL LETTER K Fullwidth: U+FF2B
|
||||
+3-234C U+FF2C # LATIN CAPITAL LETTER L Fullwidth: U+FF2C
|
||||
+3-234D U+FF2D # LATIN CAPITAL LETTER M Fullwidth: U+FF2D
|
||||
+3-234E U+FF2E # LATIN CAPITAL LETTER N Fullwidth: U+FF2E
|
||||
+3-234F U+FF2F # LATIN CAPITAL LETTER O Fullwidth: U+FF2F
|
||||
+3-2350 U+FF30 # LATIN CAPITAL LETTER P Fullwidth: U+FF30
|
||||
+3-2351 U+FF31 # LATIN CAPITAL LETTER Q Fullwidth: U+FF31
|
||||
+3-2352 U+FF32 # LATIN CAPITAL LETTER R Fullwidth: U+FF32
|
||||
+3-2353 U+FF33 # LATIN CAPITAL LETTER S Fullwidth: U+FF33
|
||||
+3-2354 U+FF34 # LATIN CAPITAL LETTER T Fullwidth: U+FF34
|
||||
+3-2355 U+FF35 # LATIN CAPITAL LETTER U Fullwidth: U+FF35
|
||||
+3-2356 U+FF36 # LATIN CAPITAL LETTER V Fullwidth: U+FF36
|
||||
+3-2357 U+FF37 # LATIN CAPITAL LETTER W Fullwidth: U+FF37
|
||||
+3-2358 U+FF38 # LATIN CAPITAL LETTER X Fullwidth: U+FF38
|
||||
+3-2359 U+FF39 # LATIN CAPITAL LETTER Y Fullwidth: U+FF39
|
||||
+3-235A U+FF3A # LATIN CAPITAL LETTER Z Fullwidth: U+FF3A
|
||||
3-235B U+2213 # MINUS-OR-PLUS SIGN [2000]
|
||||
3-235C U+2135 # ALEF SYMBOL [2000]
|
||||
3-235D U+210F # PLANCK CONSTANT OVER TWO PI [2000]
|
||||
3-235E U+33CB # SQUARE HP [2000]
|
||||
3-235F U+2113 # SCRIPT SMALL L [2000]
|
||||
3-2360 U+2127 # INVERTED OHM SIGN [2000]
|
||||
-3-2361 U+0061 # LATIN SMALL LETTER A Fullwidth: U+FF41
|
||||
-3-2362 U+0062 # LATIN SMALL LETTER B Fullwidth: U+FF42
|
||||
-3-2363 U+0063 # LATIN SMALL LETTER C Fullwidth: U+FF43
|
||||
-3-2364 U+0064 # LATIN SMALL LETTER D Fullwidth: U+FF44
|
||||
-3-2365 U+0065 # LATIN SMALL LETTER E Fullwidth: U+FF45
|
||||
-3-2366 U+0066 # LATIN SMALL LETTER F Fullwidth: U+FF46
|
||||
-3-2367 U+0067 # LATIN SMALL LETTER G Fullwidth: U+FF47
|
||||
-3-2368 U+0068 # LATIN SMALL LETTER H Fullwidth: U+FF48
|
||||
-3-2369 U+0069 # LATIN SMALL LETTER I Fullwidth: U+FF49
|
||||
-3-236A U+006A # LATIN SMALL LETTER J Fullwidth: U+FF4A
|
||||
-3-236B U+006B # LATIN SMALL LETTER K Fullwidth: U+FF4B
|
||||
-3-236C U+006C # LATIN SMALL LETTER L Fullwidth: U+FF4C
|
||||
-3-236D U+006D # LATIN SMALL LETTER M Fullwidth: U+FF4D
|
||||
-3-236E U+006E # LATIN SMALL LETTER N Fullwidth: U+FF4E
|
||||
-3-236F U+006F # LATIN SMALL LETTER O Fullwidth: U+FF4F
|
||||
-3-2370 U+0070 # LATIN SMALL LETTER P Fullwidth: U+FF50
|
||||
-3-2371 U+0071 # LATIN SMALL LETTER Q Fullwidth: U+FF51
|
||||
-3-2372 U+0072 # LATIN SMALL LETTER R Fullwidth: U+FF52
|
||||
-3-2373 U+0073 # LATIN SMALL LETTER S Fullwidth: U+FF53
|
||||
-3-2374 U+0074 # LATIN SMALL LETTER T Fullwidth: U+FF54
|
||||
-3-2375 U+0075 # LATIN SMALL LETTER U Fullwidth: U+FF55
|
||||
-3-2376 U+0076 # LATIN SMALL LETTER V Fullwidth: U+FF56
|
||||
-3-2377 U+0077 # LATIN SMALL LETTER W Fullwidth: U+FF57
|
||||
-3-2378 U+0078 # LATIN SMALL LETTER X Fullwidth: U+FF58
|
||||
-3-2379 U+0079 # LATIN SMALL LETTER Y Fullwidth: U+FF59
|
||||
-3-237A U+007A # LATIN SMALL LETTER Z Fullwidth: U+FF5A
|
||||
+3-2361 U+FF41 # LATIN SMALL LETTER A Fullwidth: U+FF41
|
||||
+3-2362 U+FF42 # LATIN SMALL LETTER B Fullwidth: U+FF42
|
||||
+3-2363 U+FF43 # LATIN SMALL LETTER C Fullwidth: U+FF43
|
||||
+3-2364 U+FF44 # LATIN SMALL LETTER D Fullwidth: U+FF44
|
||||
+3-2365 U+FF45 # LATIN SMALL LETTER E Fullwidth: U+FF45
|
||||
+3-2366 U+FF46 # LATIN SMALL LETTER F Fullwidth: U+FF46
|
||||
+3-2367 U+FF47 # LATIN SMALL LETTER G Fullwidth: U+FF47
|
||||
+3-2368 U+FF48 # LATIN SMALL LETTER H Fullwidth: U+FF48
|
||||
+3-2369 U+FF49 # LATIN SMALL LETTER I Fullwidth: U+FF49
|
||||
+3-236A U+FF4A # LATIN SMALL LETTER J Fullwidth: U+FF4A
|
||||
+3-236B U+FF4B # LATIN SMALL LETTER K Fullwidth: U+FF4B
|
||||
+3-236C U+FF4C # LATIN SMALL LETTER L Fullwidth: U+FF4C
|
||||
+3-236D U+FF4D # LATIN SMALL LETTER M Fullwidth: U+FF4D
|
||||
+3-236E U+FF4E # LATIN SMALL LETTER N Fullwidth: U+FF4E
|
||||
+3-236F U+FF4F # LATIN SMALL LETTER O Fullwidth: U+FF4F
|
||||
+3-2370 U+FF50 # LATIN SMALL LETTER P Fullwidth: U+FF50
|
||||
+3-2371 U+FF51 # LATIN SMALL LETTER Q Fullwidth: U+FF51
|
||||
+3-2372 U+FF52 # LATIN SMALL LETTER R Fullwidth: U+FF52
|
||||
+3-2373 U+FF53 # LATIN SMALL LETTER S Fullwidth: U+FF53
|
||||
+3-2374 U+FF54 # LATIN SMALL LETTER T Fullwidth: U+FF54
|
||||
+3-2375 U+FF55 # LATIN SMALL LETTER U Fullwidth: U+FF55
|
||||
+3-2376 U+FF56 # LATIN SMALL LETTER V Fullwidth: U+FF56
|
||||
+3-2377 U+FF57 # LATIN SMALL LETTER W Fullwidth: U+FF57
|
||||
+3-2378 U+FF58 # LATIN SMALL LETTER X Fullwidth: U+FF58
|
||||
+3-2379 U+FF59 # LATIN SMALL LETTER Y Fullwidth: U+FF59
|
||||
+3-237A U+FF5A # LATIN SMALL LETTER Z Fullwidth: U+FF5A
|
||||
3-237B U+30A0 # KATAKANA-HIRAGANA DOUBLE HYPHEN [2000] [Unicode3.2]
|
||||
3-237C U+2013 # EN DASH [2000]
|
||||
3-237D U+29FA # DOUBLE PLUS [2000] [Unicode3.2]
|
||||
@@ -1242,7 +1242,7 @@
|
||||
3-2D7C # <reserved> Windows: U+222A
|
||||
3-2D7D U+2756 # BLACK DIAMOND MINUS WHITE X [2000]
|
||||
3-2D7E U+261E # WHITE RIGHT POINTING INDEX [2000]
|
||||
-3-2E21 # <reserved>
|
||||
+3-2E21 U+4FF1 # <cjk> [2004]
|
||||
3-2E22 U+2000B # <cjk> [2000] [Unicode3.1] Private: U+F780
|
||||
3-2E23 U+3402 # <cjk> [2000]
|
||||
3-2E24 U+4E28 # <cjk> [2000]
|
||||
@@ -1429,7 +1429,7 @@
|
||||
3-2F7B U+218BD # <cjk> [2000] [Unicode3.1] Private: U+F78F
|
||||
3-2F7C U+5B19 # <cjk> [2000]
|
||||
3-2F7D U+5B25 # <cjk> [2000]
|
||||
-3-2F7E # <reserved>
|
||||
+3-2F7E U+525D # <cjk> [2004]
|
||||
3-3021 U+4E9C # <cjk>
|
||||
3-3022 U+5516 # <cjk>
|
||||
3-3023 U+5A03 # <cjk>
|
||||
@@ -4395,7 +4395,7 @@
|
||||
3-4F51 U+6E7E # <cjk>
|
||||
3-4F52 U+7897 # <cjk>
|
||||
3-4F53 U+8155 # <cjk>
|
||||
-3-4F54 # <reserved>
|
||||
+3-4F54 U+20B9F # <cjk> [2004]
|
||||
3-4F55 U+5B41 # <cjk> [2000]
|
||||
3-4F56 U+5B56 # <cjk> [2000]
|
||||
3-4F57 U+5B7D # <cjk> [2000]
|
||||
@@ -4437,7 +4437,7 @@
|
||||
3-4F7B U+5DA7 # <cjk> [2000]
|
||||
3-4F7C U+5DB8 # <cjk> [2000]
|
||||
3-4F7D U+5DCB # <cjk> [2000]
|
||||
-3-4F7E # <reserved>
|
||||
+3-4F7E U+541E # <cjk> [2004]
|
||||
3-5021 U+5F0C # <cjk>
|
||||
3-5022 U+4E10 # <cjk>
|
||||
3-5023 U+4E15 # <cjk>
|
||||
@@ -7828,7 +7828,7 @@
|
||||
3-7424 U+7464 # <cjk> [1983]
|
||||
3-7425 U+51DC # <cjk> [1990]
|
||||
3-7426 U+7199 # <cjk> [1990]
|
||||
-3-7427 # <reserved>
|
||||
+3-7427 U+5653 # <cjk> [2004]
|
||||
3-7428 U+5DE2 # <cjk> [2000]
|
||||
3-7429 U+5E14 # <cjk> [2000]
|
||||
3-742A U+5E18 # <cjk> [2000]
|
||||
@@ -8851,11 +8851,11 @@
|
||||
3-7E77 U+9F94 # <cjk> [2000]
|
||||
3-7E78 U+9F97 # <cjk> [2000]
|
||||
3-7E79 U+9FA2 # <cjk> [2000]
|
||||
-3-7E7A # <reserved>
|
||||
-3-7E7B # <reserved>
|
||||
-3-7E7C # <reserved>
|
||||
-3-7E7D # <reserved>
|
||||
-3-7E7E # <reserved>
|
||||
+3-7E7A U+59F8 # <cjk> [2004]
|
||||
+3-7E7B U+5C5B # <cjk> [2004]
|
||||
+3-7E7C U+5E77 # <cjk> [2004]
|
||||
+3-7E7D U+7626 # <cjk> [2004]
|
||||
+3-7E7E U+7E6B # <cjk> [2004]
|
||||
4-2121 U+20089 # <cjk> [2000] [Unicode3.1] Private: U+F7D1
|
||||
4-2122 U+4E02 # <cjk> [2000]
|
||||
4-2123 U+4E0F # <cjk> [2000]
|
||||
@@ -11138,7 +11138,7 @@
|
||||
4-7D38 U+9B10 # <cjk> [2000]
|
||||
4-7D39 U+9B12 # <cjk> [2000]
|
||||
4-7D3A U+9B16 # <cjk> [2000]
|
||||
-4-7D3B U+9B1D # <cjk> [2000]
|
||||
+4-7D3B U+9B1C # <cjk> [2000]
|
||||
4-7D3C U+9B2B # <cjk> [2000]
|
||||
4-7D3D U+9B33 # <cjk> [2000]
|
||||
4-7D3E U+9B3D # <cjk> [2000]
|
30917
Tools/unicode/python-mappings/gb-18030-2000.xml
Normal file
30917
Tools/unicode/python-mappings/gb-18030-2000.xml
Normal file
File diff suppressed because it is too large
Load Diff
11294
Tools/unicode/python-mappings/jisx0213-2004-std.txt
Normal file
11294
Tools/unicode/python-mappings/jisx0213-2004-std.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user