Add more tests for CP5022{0,1,2} text conversion

This commit is contained in:
Alex Dowad 2021-08-14 21:02:29 +02:00
parent e3f6a9fbfe
commit a0415b22ab

View File

@ -34,6 +34,7 @@ function shiftJISDecode($bytes) {
/* Read in table of all characters in CP932 charset */
$cp932Chars = array(); /* CP932 -> UTF-16BE */
$nonInvertible = array();
$fromUnicode = array();
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r+');
while ($line = fgets($fp, 256)) {
if ($line[0] == '#')
@ -43,10 +44,12 @@ while ($line = fgets($fp, 256)) {
if ($bytes < 256)
continue;
if ($bytes >= 0xFA00) {
if (isset($fromUnicode[$codepoint])) {
$nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
} else {
$cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
$fromUnicode[$codepoint] = $bytes;
}
}
}
@ -63,21 +66,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
}
}
/* There are 396 Unicode codepoints which are non-invertible in CP932
* (multiple CP932 byte sequences map to the same codepoint) */
/* Walk every two-byte code from 0xED00 through 0xEEFF. The lookup key is
 * pack('n', shiftJISDecode($i)); when $cp932Chars has an entry under that key,
 * the entry is copied into $nonInvertible and removed from $cp932Chars.
 * Codes with no entry are skipped by the isset() guard. */
for ($i = 0xED00; $i <= 0xEEFF; $i++) {
$bytes = pack('n', shiftJISDecode($i));
if (isset($cp932Chars[$bytes])) {
$nonInvertible[$bytes] = $cp932Chars[$bytes];
unset($cp932Chars[$bytes]); // will test these separately
}
}
/* Nine further codes are moved the same way, but with no isset() guard:
 * each of them is expected to already have an entry in $cp932Chars. */
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C] as $i) {
$bytes = pack('n', shiftJISDecode($i));
$nonInvertible[$bytes] = $cp932Chars[$bytes];
unset($cp932Chars[$bytes]); // will test these separately
}
/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = array(); /* JISX0201 -> UTF-16BE */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r+');
@ -89,6 +77,18 @@ while ($line = fgets($fp, 256)) {
$jisx0201Chars[chr($byte)] = pack('n', $codepoint);
}
/* Read in table of all characters in JISX-0212 charset.
 *
 * Each data line is matched against "0x<hex>\t0x<hex>"; the first number is
 * packed as a 16-bit big-endian key and the second as the 16-bit big-endian
 * value. Lines starting with '#' are skipped, and lines that do not yield two
 * numbers are ignored.
 *
 * The file is only read, so it is opened with mode 'r'. Mode 'r+' also asks
 * for write access and fails when the data file is not writable. */
$jisx0212Chars = array();
$fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r');
/* Compare against false explicitly so that a falsy-looking line (e.g. a final
 * "0" without a newline) does not end the loop early */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
        continue;
    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
        $jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
    }
}
/* The table is fully loaded; release the file handle */
fclose($fp);
/* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
* differ in a number of places from the table provided by the Unicode Consortium */
$cp932Chars["\x21\x41"] = "\x30\x1C"; /* WAVE DASH instead of FULLWIDTH TILDE */
@ -151,6 +151,10 @@ for ($i = 0x80; $i < 256; $i++) {
testInvalid("\x0F" . chr($i), "\x00%", 'CP50222');
}
// Switch back to ASCII after a multibyte character
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50221', false);
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50222', false);
echo "ASCII support OK\n";
/* All valid JIS X 0201 characters
@ -164,6 +168,7 @@ foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
testValid($jisx0201, $utf16BE, 'CP50220', false);
testValid($jisx0201, $utf16BE, 'CP50221', false);
testValid($jisx0201, $utf16BE, 'CP50222', false);
convertValidString($utf16BE, "\x0E" . chr(ord($jisx0201) - 0x80) . "\x0F", 'UTF-16BE', 'CP50222', false);
} else { /* Latin */
testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50220', $utf16BE > "\x00\x80");
testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50221', $utf16BE > "\x00\x80");
@ -182,6 +187,11 @@ for ($i = 0x80; $i < 256; $i++) {
testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50222');
}
/* Go from JIS X 0201 to ASCII or JIS X 0208 */
/* Every case starts with the UTF-16BE bytes FF 61 (U+FF61) followed by one
 * more character, and lists the CP50222 byte string to compare against.
 * NOTE(review): convertValidString() is defined outside this view; its
 * arguments are presumed to be (input, expected output, source encoding,
 * target encoding, optional flag) -- confirm against the helper. */
/* U+FF61 then 'A': expected bytes are 0E 21 0F followed by a plain 'A' */
convertValidString("\xFF\x61\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
/* U+FF61 then U+2225: expected bytes are 0E 21 0F, then ESC $ B 21 42, then ESC ( B */
convertValidString("\xFF\x61\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
/* U+FF61 then U+203E: expected bytes are 0E 21 0F, then ESC ( J 7E, then ESC ( B.
 * Unlike the two cases above, no trailing `false` argument is passed here. */
convertValidString("\xFF\x61\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');
echo "JIS X 0201 support OK\n";
/* All valid CP932 characters */
@ -196,6 +206,15 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50222', false);
}
/* Conversions which are supported from Unicode to CP5022x only, not in the
 * reverse direction. Each pair holds the UTF-16BE input and the two bytes
 * that are expected between the ESC $ B and ESC ( B escape sequences. */
$oneWayConversions = [
    ["\x22\x25", "\x21\x42"],
    ["\xFF\x0D", "\x21\x5D"],
    ["\xFF\xE0", "\x21\x71"],
    ["\xFF\xE1", "\x21\x72"],
    ["\xFF\xE2", "\x22\x4C"],
];
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    foreach ($oneWayConversions as [$unicodeBytes, $jisBytes]) {
        $expected = "\x1B\$B" . $jisBytes . "\x1B(B";
        convertValidString($unicodeBytes, $expected, 'UTF-16BE', $encoding, false);
    }
}
/* All invalid 2-byte CP932 characters */
for ($i = 0x21; $i <= 0x97; $i++) {
for ($j = 0; $j < 256; $j++) {
@ -215,8 +234,36 @@ for ($i = 0x21; $i <= 0x97; $i++) {
testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');
}
/* Test alternative escape sequence to select CP932 */
testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);
echo "CP932 support OK\n";
/* All JIS X 0212 test input is prefixed with the escape sequence ESC $ ( D,
 * and every case is run for CP50220, CP50221 and CP50222, in that order. */
$jisx0212Escape = "\x1B\$(D";
$cp5022xVariants = ['CP50220', 'CP50221', 'CP50222'];

/* Every entry of the JIS X 0212 table is passed to testValid() together with
 * its UTF-16BE value */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
    foreach ($cp5022xVariants as $variant) {
        testValid($jisx0212Escape . $jisx0212, $utf16BE, $variant, false);
    }
}

/* Every two-byte sequence (first byte 0x21..0x97, any second byte) which is
 * absent from the table is passed to testInvalid(), expecting "\x00%" */
for ($i = 0x21; $i <= 0x97; $i++) {
    for ($j = 0; $j < 256; $j++) {
        $testString = chr($i) . chr($j);
        if (isset($jisx0212Chars[$testString])) {
            continue;
        }
        foreach ($cp5022xVariants as $variant) {
            testInvalid($jisx0212Escape . $testString, "\x00%", $variant);
        }
    }
}

/* A lone first byte with no second byte after it is passed to testInvalid()
 * in the same way */
for ($i = 0x21; $i <= 0x97; $i++) {
    $truncated = $jisx0212Escape . chr($i);
    foreach ($cp5022xVariants as $variant) {
        testInvalid($truncated, "\x00%", $variant);
    }
}

echo "JIS X 0212 support OK\n";
/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana */
$fullwidthKatakana = array(
0xFF61 => 0x2123, /* Ideographic full stop */
@ -310,6 +357,7 @@ echo "Long error markers OK\n";
ASCII support OK
JIS X 0201 support OK
CP932 support OK
JIS X 0212 support OK
Folding of fullwidth katakana for CP50220 OK
Invalid Unicode is flagged when converting to CP5022x
Long error markers OK