diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4 index d316bddf3a7..6f2f0c534ed 100644 --- a/ext/mbstring/config.m4 +++ b/ext/mbstring/config.m4 @@ -96,6 +96,12 @@ int main() { return foo(10, "", 3.14); } oniguruma/reggnu.c oniguruma/regparse.c oniguruma/regenc.c + oniguruma/regext.c + oniguruma/regsyntax.c + oniguruma/regtrav.c + oniguruma/regversion.c + oniguruma/st.c + oniguruma/enc/unicode.c oniguruma/enc/ascii.c oniguruma/enc/utf8.c oniguruma/enc/euc_jp.c @@ -120,6 +126,10 @@ int main() { return foo(10, "", 3.14); } oniguruma/enc/koi8.c oniguruma/enc/koi8_r.c oniguruma/enc/big5.c + oniguruma/enc/utf16_be.c + oniguruma/enc/utf16_le.c + oniguruma/enc/utf32_be.c + oniguruma/enc/utf32_le.c ]) fi ]) @@ -164,6 +174,7 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [ libmbfl/filters/mbfilter_iso8859_13.c libmbfl/filters/mbfilter_iso8859_14.c libmbfl/filters/mbfilter_iso8859_15.c + libmbfl/filters/mbfilter_iso8859_16.c libmbfl/filters/mbfilter_iso8859_2.c libmbfl/filters/mbfilter_iso8859_3.c libmbfl/filters/mbfilter_iso8859_4.c @@ -261,3 +272,5 @@ if test "$PHP_MBSTRING" != "no"; then PHP_MBSTRING_SETUP_LIBMBFL PHP_MBSTRING_EXTENSION fi + +# vim600: sts=2 sw=2 et diff --git a/ext/mbstring/config.w32 b/ext/mbstring/config.w32 index 8484b462317..665f4b4ba03 100644 --- a/ext/mbstring/config.w32 +++ b/ext/mbstring/config.w32 @@ -24,13 +24,14 @@ if (PHP_MBSTRING == "yes") { mbfilter_euc_jp.c mbfilter_euc_jp_win.c mbfilter_euc_kr.c \ mbfilter_euc_tw.c mbfilter_htmlent.c mbfilter_hz.c mbfilter_iso2022_kr.c \ mbfilter_iso8859_1.c mbfilter_iso8859_10.c mbfilter_iso8859_13.c \ - mbfilter_iso8859_14.c mbfilter_iso8859_15.c mbfilter_iso8859_2.c \ - mbfilter_iso8859_3.c mbfilter_iso8859_4.c mbfilter_iso8859_5.c \ - mbfilter_iso8859_6.c mbfilter_iso8859_7.c mbfilter_iso8859_8.c \ - mbfilter_iso8859_9.c mbfilter_jis.c mbfilter_koi8r.c mbfilter_qprint.c \ - mbfilter_sjis.c mbfilter_ucs2.c mbfilter_ucs4.c mbfilter_uhc.c \ - mbfilter_utf16.c mbfilter_utf32.c mbfilter_utf7.c 
mbfilter_utf7imap.c \ - mbfilter_utf8.c mbfilter_uuencode.c", "mbstring"); + mbfilter_iso8859_14.c mbfilter_iso8859_15.c mbfilter_iso8859_16.c \ + mbfilter_iso8859_2.c mbfilter_iso8859_3.c mbfilter_iso8859_4.c \ + mbfilter_iso8859_5.c mbfilter_iso8859_6.c mbfilter_iso8859_7.c \ + mbfilter_iso8859_8.c mbfilter_iso8859_9.c mbfilter_jis.c \ + mbfilter_koi8r.c mbfilter_qprint.c mbfilter_sjis.c mbfilter_ucs2.c \ + mbfilter_ucs4.c mbfilter_uhc.c mbfilter_utf16.c mbfilter_utf32.c \ + mbfilter_utf7.c mbfilter_utf7imap.c mbfilter_utf8.c \ + mbfilter_uuencode.c", "mbstring"); ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \ mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \ @@ -51,13 +52,15 @@ if (PHP_MBSTRING == "yes") { AC_DEFINE('HAVE_STDARG_PROTOTYPES', 1, 'have stdarg.h'); AC_DEFINE('HAVE_MBREGEX', 1); ADD_SOURCES("ext/mbstring/oniguruma", "regcomp.c regerror.c \ - regenc.c regexec.c reggnu.c regparse.c regposerr.c", "mbstring"); + regenc.c regexec.c reggnu.c regparse.c regposerr.c \ + regext.c regsyntax.c regtrav.c regversion.c st.c", "mbstring"); ADD_SOURCES("ext/mbstring/oniguruma/enc", "ascii.c big5.c \ euc_jp.c euc_kr.c euc_tw.c iso8859_1.c iso8859_2.c \ iso8859_3.c iso8859_4.c iso8859_5.c iso8859_6.c \ iso8859_7.c iso8859_8.c iso8859_9.c iso8859_10.c \ iso8859_11.c iso8859_13.c iso8859_14.c iso8859_15.c iso8859_16.c \ - koi8.c koi8_r.c sjis.c utf8.c", "mbstring"); + koi8.c koi8_r.c sjis.c utf8.c unicode.c utf16_be.c utf16_le.c \ + utf32_be.c utf32_le.c", "mbstring"); ADD_SOURCES("ext/mbstring", "php_mbregex.c", "mbstring"); } } diff --git a/ext/mbstring/libmbfl/filters/Makefile.am b/ext/mbstring/libmbfl/filters/Makefile.am index d3e5cfd257e..aab009b9d5d 100644 --- a/ext/mbstring/libmbfl/filters/Makefile.am +++ b/ext/mbstring/libmbfl/filters/Makefile.am @@ -1,5 +1,104 @@ -EXTRA_DIST=Makefile.bcc32 +EXTRA_DIST=Makefile.bcc32 mk_sb_tbl.awk noinst_LTLIBRARIES=libmbfl_filters.la INCLUDES=-I../mbfl 
libmbfl_filters_la_LDFLAGS=-version-info $(SHLIB_VERSION) -libmbfl_filters_la_SOURCES=mbfilter_cp936.c mbfilter_hz.c mbfilter_euc_tw.c mbfilter_big5.c mbfilter_euc_jp.c mbfilter_jis.c mbfilter_iso8859_1.c mbfilter_iso8859_2.c mbfilter_cp1252.c mbfilter_cp1251.c mbfilter_ascii.c mbfilter_iso8859_3.c mbfilter_iso8859_4.c mbfilter_iso8859_5.c mbfilter_iso8859_6.c mbfilter_iso8859_7.c mbfilter_iso8859_8.c mbfilter_iso8859_9.c mbfilter_iso8859_10.c mbfilter_iso8859_13.c mbfilter_iso8859_14.c mbfilter_iso8859_15.c mbfilter_htmlent.c mbfilter_byte2.c mbfilter_byte4.c mbfilter_uuencode.c mbfilter_base64.c mbfilter_sjis.c mbfilter_7bit.c mbfilter_qprint.c mbfilter_ucs4.c mbfilter_ucs2.c mbfilter_utf32.c mbfilter_utf16.c mbfilter_utf8.c mbfilter_utf7.c mbfilter_utf7imap.c mbfilter_euc_jp_win.c mbfilter_cp932.c mbfilter_euc_cn.c mbfilter_euc_kr.c mbfilter_uhc.c mbfilter_iso2022_kr.c mbfilter_cp866.c mbfilter_koi8r.c html_entities.c cp932_table.h html_entities.h mbfilter_7bit.h mbfilter_ascii.h mbfilter_base64.h mbfilter_big5.h mbfilter_byte2.h mbfilter_byte4.h mbfilter_cp1251.h mbfilter_cp1252.h mbfilter_cp866.h mbfilter_cp932.h mbfilter_cp936.h mbfilter_euc_cn.h mbfilter_euc_jp.h mbfilter_euc_jp_win.h mbfilter_euc_kr.h mbfilter_euc_tw.h mbfilter_htmlent.h mbfilter_hz.h mbfilter_iso2022_kr.h mbfilter_iso8859_1.h mbfilter_iso8859_10.h mbfilter_iso8859_13.h mbfilter_iso8859_14.h mbfilter_iso8859_15.h mbfilter_iso8859_2.h mbfilter_iso8859_3.h mbfilter_iso8859_4.h mbfilter_iso8859_5.h mbfilter_iso8859_6.h mbfilter_iso8859_7.h mbfilter_iso8859_8.h mbfilter_iso8859_9.h mbfilter_jis.h mbfilter_koi8r.h mbfilter_qprint.h mbfilter_sjis.h mbfilter_ucs2.h mbfilter_ucs4.h mbfilter_uhc.h mbfilter_utf16.h mbfilter_utf32.h mbfilter_utf7.h mbfilter_utf7imap.h mbfilter_utf8.h mbfilter_uuencode.h unicode_prop.h unicode_table_big5.h unicode_table_cns11643.h unicode_table_cp1251.h unicode_table_cp1252.h unicode_table_cp866.h unicode_table_cp932_ext.h unicode_table_cp936.h 
unicode_table_iso8859_10.h unicode_table_iso8859_13.h unicode_table_iso8859_14.h unicode_table_iso8859_15.h unicode_table_iso8859_2.h unicode_table_iso8859_3.h unicode_table_iso8859_4.h unicode_table_iso8859_5.h unicode_table_iso8859_6.h unicode_table_iso8859_7.h unicode_table_iso8859_8.h unicode_table_iso8859_9.h unicode_table_jis.h unicode_table_koi8r.h unicode_table_uhc.h +libmbfl_filters_la_SOURCES=mbfilter_cp936.c mbfilter_hz.c mbfilter_euc_tw.c mbfilter_big5.c mbfilter_euc_jp.c mbfilter_jis.c mbfilter_iso8859_1.c mbfilter_iso8859_2.c mbfilter_cp1252.c mbfilter_cp1251.c mbfilter_ascii.c mbfilter_iso8859_3.c mbfilter_iso8859_4.c mbfilter_iso8859_5.c mbfilter_iso8859_6.c mbfilter_iso8859_7.c mbfilter_iso8859_8.c mbfilter_iso8859_9.c mbfilter_iso8859_10.c mbfilter_iso8859_13.c mbfilter_iso8859_14.c mbfilter_iso8859_15.c mbfilter_iso8859_16.c mbfilter_htmlent.c mbfilter_byte2.c mbfilter_byte4.c mbfilter_uuencode.c mbfilter_base64.c mbfilter_sjis.c mbfilter_7bit.c mbfilter_qprint.c mbfilter_ucs4.c mbfilter_ucs2.c mbfilter_utf32.c mbfilter_utf16.c mbfilter_utf8.c mbfilter_utf7.c mbfilter_utf7imap.c mbfilter_euc_jp_win.c mbfilter_cp932.c mbfilter_euc_cn.c mbfilter_euc_kr.c mbfilter_uhc.c mbfilter_iso2022_kr.c mbfilter_cp866.c mbfilter_koi8r.c html_entities.c cp932_table.h html_entities.h mbfilter_7bit.h mbfilter_ascii.h mbfilter_base64.h mbfilter_big5.h mbfilter_byte2.h mbfilter_byte4.h mbfilter_cp1251.h mbfilter_cp1252.h mbfilter_cp866.h mbfilter_cp932.h mbfilter_cp936.h mbfilter_euc_cn.h mbfilter_euc_jp.h mbfilter_euc_jp_win.h mbfilter_euc_kr.h mbfilter_euc_tw.h mbfilter_htmlent.h mbfilter_hz.h mbfilter_iso2022_kr.h mbfilter_iso8859_1.h mbfilter_iso8859_10.h mbfilter_iso8859_13.h mbfilter_iso8859_14.h mbfilter_iso8859_15.h mbfilter_iso8859_16.h mbfilter_iso8859_2.h mbfilter_iso8859_3.h mbfilter_iso8859_4.h mbfilter_iso8859_5.h mbfilter_iso8859_6.h mbfilter_iso8859_7.h mbfilter_iso8859_8.h mbfilter_iso8859_9.h mbfilter_jis.h mbfilter_koi8r.h mbfilter_qprint.h 
mbfilter_sjis.h mbfilter_ucs2.h mbfilter_ucs4.h mbfilter_uhc.h mbfilter_utf16.h mbfilter_utf32.h mbfilter_utf7.h mbfilter_utf7imap.h mbfilter_utf8.h mbfilter_uuencode.h unicode_prop.h unicode_table_big5.h unicode_table_cns11643.h unicode_table_cp1251.h unicode_table_cp1252.h unicode_table_cp866.h unicode_table_cp932_ext.h unicode_table_cp936.h unicode_table_iso8859_10.h unicode_table_iso8859_13.h unicode_table_iso8859_14.h unicode_table_iso8859_15.h unicode_table_iso8859_16.h unicode_table_iso8859_2.h unicode_table_iso8859_3.h unicode_table_iso8859_4.h unicode_table_iso8859_5.h unicode_table_iso8859_6.h unicode_table_iso8859_7.h unicode_table_iso8859_8.h unicode_table_iso8859_9.h unicode_table_jis.h unicode_table_koi8r.h unicode_table_uhc.h + +mbfilter_iso8859_2.c: unicode_table_iso8859_2.h + +mbfilter_iso8859_3.c: unicode_table_iso8859_3.h + +mbfilter_iso8859_4.c: unicode_table_iso8859_4.h + +mbfilter_iso8859_5.c: unicode_table_iso8859_5.h + +mbfilter_iso8859_6.c: unicode_table_iso8859_6.h + +mbfilter_iso8859_7.c: unicode_table_iso8859_7.h + +mbfilter_iso8859_8.c: unicode_table_iso8859_8.h + +mbfilter_iso8859_9.c: unicode_table_iso8859_9.h + +mbfilter_iso8859_10.c: unicode_table_iso8859_10.h + +mbfilter_iso8859_11.c: unicode_table_iso8859_11.h + +mbfilter_iso8859_13.c: unicode_table_iso8859_13.h + +mbfilter_iso8859_14.c: unicode_table_iso8859_14.h + +mbfilter_iso8859_15.c: unicode_table_iso8859_15.h + +mbfilter_iso8859_16.c: unicode_table_iso8859_16.h + +8859-1.TXT 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT 8859-6.TXT \ +8859-7.TXT 8859-8.TXT 8859-9.TXT 8859-10.TXT 8859-11.TXT 8859-13.TXT \ +8859-14.TXT 8859-15.TXT 8859-16.TXT: + $(FETCH_VIA_FTP) ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/$@ + +unicode_table_iso8859_1.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_1_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_1_H -f mk_sb_tbl.awk 8859-1.TXT > $@ + +unicode_table_iso8859_2.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_2_ucs_table \ + -v
IFNDEF_NAME=UNICODE_TABLE_ISO8859_2_H -f mk_sb_tbl.awk 8859-2.TXT > $@ + +unicode_table_iso8859_3.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_3_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_3_H -f mk_sb_tbl.awk 8859-3.TXT > $@ + +unicode_table_iso8859_4.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_4_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_4_H -f mk_sb_tbl.awk 8859-4.TXT > $@ + +unicode_table_iso8859_5.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_5_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_5_H -f mk_sb_tbl.awk 8859-5.TXT > $@ + +unicode_table_iso8859_6.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_6_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_6_H -f mk_sb_tbl.awk 8859-6.TXT > $@ + +unicode_table_iso8859_7.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_7_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_7_H -f mk_sb_tbl.awk 8859-7.TXT > $@ + +unicode_table_iso8859_8.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_8_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_8_H -f mk_sb_tbl.awk 8859-8.TXT > $@ + +unicode_table_iso8859_9.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_9_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_9_H -f mk_sb_tbl.awk 8859-9.TXT > $@ + +unicode_table_iso8859_10.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_10_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_10_H -f mk_sb_tbl.awk 8859-10.TXT > $@ + +unicode_table_iso8859_11.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_11_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_11_H -f mk_sb_tbl.awk 8859-11.TXT > $@ + +unicode_table_iso8859_13.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_13_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_13_H -f mk_sb_tbl.awk 8859-13.TXT > $@ + +unicode_table_iso8859_14.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_14_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_14_H -f mk_sb_tbl.awk 8859-14.TXT > $@ + +unicode_table_iso8859_15.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_15_ucs_table \ + -v 
IFNDEF_NAME=UNICODE_TABLE_ISO8859_15_H -f mk_sb_tbl.awk 8859-15.TXT > $@ + +unicode_table_iso8859_16.h: mk_sb_tbl.awk + $(AWK) -v TABLE_NAME=iso8859_16_ucs_table \ + -v IFNDEF_NAME=UNICODE_TABLE_ISO8859_16_H -f mk_sb_tbl.awk 8859-16.TXT > $@ + +unidata: 8859-1.TXT 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT 8859-6.TXT \ +8859-7.TXT 8859-8.TXT 8859-9.TXT 8859-10.TXT 8859-11.TXT 8859-13.TXT \ +8859-14.TXT 8859-15.TXT 8859-16.TXT + +.PHONY: unidata diff --git a/ext/mbstring/libmbfl/filters/Makefile.bcc32 b/ext/mbstring/libmbfl/filters/Makefile.bcc32 index 4219ed9061d..32bd161093c 100644 --- a/ext/mbstring/libmbfl/filters/Makefile.bcc32 +++ b/ext/mbstring/libmbfl/filters/Makefile.bcc32 @@ -1,6 +1,6 @@ !include ..\rules.mak.bcc32 INCLUDES=$(INCLUDES) -I../mbfl -OBJS=mbfilter_cp936.obj mbfilter_hz.obj mbfilter_euc_tw.obj mbfilter_big5.obj mbfilter_euc_jp.obj mbfilter_jis.obj mbfilter_iso8859_1.obj mbfilter_iso8859_2.obj mbfilter_cp1252.obj mbfilter_cp1251.obj mbfilter_ascii.obj mbfilter_iso8859_3.obj mbfilter_iso8859_4.obj mbfilter_iso8859_5.obj mbfilter_iso8859_6.obj mbfilter_iso8859_7.obj mbfilter_iso8859_8.obj mbfilter_iso8859_9.obj mbfilter_iso8859_10.obj mbfilter_iso8859_13.obj mbfilter_iso8859_14.obj mbfilter_iso8859_15.obj mbfilter_htmlent.obj mbfilter_byte2.obj mbfilter_byte4.obj mbfilter_uuencode.obj mbfilter_base64.obj mbfilter_sjis.obj mbfilter_7bit.obj mbfilter_qprint.obj mbfilter_ucs4.obj mbfilter_ucs2.obj mbfilter_utf32.obj mbfilter_utf16.obj mbfilter_utf8.obj mbfilter_utf7.obj mbfilter_utf7imap.obj mbfilter_euc_jp_win.obj mbfilter_cp932.obj mbfilter_euc_cn.obj mbfilter_euc_kr.obj mbfilter_uhc.obj mbfilter_iso2022_kr.obj mbfilter_cp866.obj mbfilter_koi8r.obj html_entities.obj +OBJS=mbfilter_cp936.obj mbfilter_hz.obj mbfilter_euc_tw.obj mbfilter_big5.obj mbfilter_euc_jp.obj mbfilter_jis.obj mbfilter_iso8859_1.obj mbfilter_iso8859_2.obj mbfilter_cp1252.obj mbfilter_cp1251.obj mbfilter_ascii.obj mbfilter_iso8859_3.obj mbfilter_iso8859_4.obj
mbfilter_iso8859_5.obj mbfilter_iso8859_6.obj mbfilter_iso8859_7.obj mbfilter_iso8859_8.obj mbfilter_iso8859_9.obj mbfilter_iso8859_10.obj mbfilter_iso8859_13.obj mbfilter_iso8859_14.obj mbfilter_iso8859_15.obj mbfilter_iso8859_16.obj mbfilter_htmlent.obj mbfilter_byte2.obj mbfilter_byte4.obj mbfilter_uuencode.obj mbfilter_base64.obj mbfilter_sjis.obj mbfilter_7bit.obj mbfilter_qprint.obj mbfilter_ucs4.obj mbfilter_ucs2.obj mbfilter_utf32.obj mbfilter_utf16.obj mbfilter_utf8.obj mbfilter_utf7.obj mbfilter_utf7imap.obj mbfilter_euc_jp_win.obj mbfilter_cp932.obj mbfilter_euc_cn.obj mbfilter_euc_kr.obj mbfilter_uhc.obj mbfilter_iso2022_kr.obj mbfilter_cp866.obj mbfilter_koi8r.obj html_entities.obj all: $(OBJS) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.c b/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.c new file mode 100755 index 00000000000..8f12feb47e8 --- /dev/null +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.c @@ -0,0 +1,136 @@ +/* + * "streamable kanji code filter and converter" + * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. + * + * LICENSE NOTICES + * + * This file is part of "streamable kanji code filter and converter", + * which is distributed under the terms of GNU Lesser General Public + * License (version 2) as published by the Free Software Foundation. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with "streamable kanji code filter and converter"; + * if not, write to the Free Software Foundation, Inc., 59 Temple Place, + * Suite 330, Boston, MA 02111-1307 USA + * + * The author of this file: + * + */ +/* + * The source code included in this files was separated from mbfilter.c + * by moriyoshi koizumi on 4 dec 2002. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "mbfilter.h" +#include "mbfilter_iso8859_16.h" +#include "unicode_table_iso8859_16.h" + +static const char *mbfl_encoding_8859_16_aliases[] = {"ISO_8859-16", NULL}; + +const mbfl_encoding mbfl_encoding_8859_16 = { + mbfl_no_encoding_8859_16, + "ISO-8859-16", + "ISO-8859-16", + (const char *(*)[])&mbfl_encoding_8859_16_aliases, + NULL, + MBFL_ENCTYPE_SBCS +}; + +const struct mbfl_identify_vtbl vtbl_identify_8859_16 = { + mbfl_no_encoding_8859_16, + mbfl_filt_ident_common_ctor, + mbfl_filt_ident_common_dtor, + mbfl_filt_ident_true +}; + +const struct mbfl_convert_vtbl vtbl_8859_16_wchar = { + mbfl_no_encoding_8859_16, + mbfl_no_encoding_wchar, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_8859_16_wchar, + mbfl_filt_conv_common_flush +}; + +const struct mbfl_convert_vtbl vtbl_wchar_8859_16 = { + mbfl_no_encoding_wchar, + mbfl_no_encoding_8859_16, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_wchar_8859_16, + mbfl_filt_conv_common_flush +}; + +#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) + +/* + * ISO-8859-16 => wchar + */ +int mbfl_filt_conv_8859_16_wchar(int c, mbfl_convert_filter *filter) +{ + int s; + + if (c >= 0 && c < 0xa0) { + s = c; + } else if (c >= 0xa0 && c < 0x100) { + s = iso8859_16_ucs_table[c - 0xa0]; + if (s <= 0) { + s = c; + s &= MBFL_WCSPLANE_MASK; + s |= MBFL_WCSPLANE_8859_16; + } + } else { + s = c; + s &= MBFL_WCSGROUP_MASK; + s |= MBFL_WCSGROUP_THROUGH; + } + + 
CK((*filter->output_function)(s, filter->data)); + + return c; +} + +/* + * wchar => ISO-8859-16 + */ +int mbfl_filt_conv_wchar_8859_16(int c, mbfl_convert_filter *filter) +{ + int s, n; + + if (c >= 0 && c < 0xa0) { + s = c; + } else { + s = -1; + n = 95; + while (n >= 0) { + if (c == iso8859_16_ucs_table[n]) { + s = 0xa0 + n; + break; + } + n--; + } + if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_8859_16) { + s = c & MBFL_WCSPLANE_MASK; + } + } + + if (s >= 0) { + CK((*filter->output_function)(s, filter->data)); + } else { + if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { + CK(mbfl_filt_conv_illegal_output(c, filter)); + } + } + + return c; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.h b/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.h new file mode 100755 index 00000000000..a5e2d2fbef6 --- /dev/null +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso8859_16.h @@ -0,0 +1,23 @@ +/* + * COPYRIGHT NOTICE + * + * This file is a portion of "streamable kanji code filter and converter" + * library, which is distributed under GNU Lesser General Public License + * version 2.1. 
+ * + */ + +#ifndef MBFL_MBFILTER_ISO8859_16_H +#define MBFL_MBFILTER_ISO8859_16_H + +#include "mbfilter.h" + +extern const mbfl_encoding mbfl_encoding_8859_16; +extern const struct mbfl_identify_vtbl vtbl_identify_8859_16; +extern const struct mbfl_convert_vtbl vtbl_8859_16_wchar; +extern const struct mbfl_convert_vtbl vtbl_wchar_8859_16; + +int mbfl_filt_conv_8859_16_wchar(int c, mbfl_convert_filter *filter); +int mbfl_filt_conv_wchar_8859_16(int c, mbfl_convert_filter *filter); + +#endif /* MBFL_MBFILTER_ISO8859_16_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 9b18a1cb2b9..188d088ed0a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -155,11 +155,30 @@ int mbfl_filt_conv_qprintenc_flush(mbfl_convert_filter *filter) */ int mbfl_filt_conv_qprintdec(int c, mbfl_convert_filter *filter) { - int n; + int n, m; + + static int hex2code_map[] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; switch (filter->status) { case 1: - if ((c >= 0x30 && c <= 0x39) || (c >= 0x41 && c <= 0x46)) { /* 0 - 9 or A - F */ + if (hex2code_map[c & 0xff] >= 0) { filter->cache = c; filter->status = 2; } else if (c == 0x0d) { /* soft line feed */ @@ -173,21 +192,13 @@ int mbfl_filt_conv_qprintdec(int c, mbfl_convert_filter *filter) } break; case 2: - n = filter->cache; - if (n >= 0x30 && n <= 0x39) { /* '0' - '9' */ - n -= 48; /* 48 = '0' */ - } else { - n -= 55; /* 55 = 'A' - 10 */ - } - n <<= 4; - if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */ - n += (c - 48); - } else if (c >= 0x41 && c <= 0x46) { /* 'A' - 'F' */ - n += (c - 55); - } else { + m = hex2code_map[c & 0xff]; + if (m < 0) { CK((*filter->output_function)(0x3d, filter->data)); /* '=' */ CK((*filter->output_function)(filter->cache, filter->data)); n = c; + } else { + n = hex2code_map[filter->cache] << 4 | m; } CK((*filter->output_function)(n, filter->data)); filter->status = 0; diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_10.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_10.h index 0de0d17eb47..48b1301b729 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_10.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_10.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! */ #ifndef UNICODE_TABLE_ISO8859_10_H -#define UNICODE_TABLE_ISO8859_10_H - -static const unsigned short iso8859_10_ucs_table[] = { - 0x00A0,0x0104,0x0112,0x0122,0x0124,0x0128,0x0136,0x00A7, - 0x013B,0x0110,0x0160,0x0166,0x017D,0x00AD,0x016A,0x014A, - 0x00B0,0x0105,0x0113,0x0123,0x012B,0x0129,0x0137,0x00B7, - 0x013C,0x0111,0x0161,0x0167,0x017E,0x2015,0x016B,0x014B, - 0x0100,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x012E, - 0x010C,0x00C9,0x0118,0x00CB,0x0116,0x00CD,0x00CE,0x00CF, - 0x00D0,0x0145,0x014C,0x00D3,0x00D4,0x00D5,0x00D6,0x0168, - 0x00D8,0x0172,0x00DA,0x00DB,0x00DC,0x00DD,0x00DE,0x00DF, - 0x0101,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x012F, - 0x010D,0x00E9,0x0119,0x00EB,0x0117,0x00ED,0x00EE,0x00EF, - 0x00F0,0x0146,0x014D,0x00F3,0x00F4,0x00F5,0x00F6,0x0169, - 0x00F8,0x0173,0x00FA,0x00FB,0x00FC,0x00FD,0x00FE,0x0138 +static const unsigned int iso8859_10_ucs_table[] = { + 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, + 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, + 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, + 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, + 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, + 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, + 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, + 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, + 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, + 0x00f0, 
0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, + 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138 }; - #endif /* UNICODE_TABLE_ISO8859_10_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_13.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_13.h index ee69310bdb7..e7991f8e284 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_13.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_13.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_13_H -#define UNICODE_TABLE_ISO8859_13_H - -static const unsigned short iso8859_13_ucs_table[] = { - 0x00A0,0x201D,0x00A2,0x00A3,0x00A4,0x201E,0x00A6,0x00A7, - 0x00D8,0x00A9,0x0156,0x00AB,0x00AC,0x00AD,0x00AE,0x00C6, - 0x00B0,0x00B1,0x00B2,0x00B3,0x201C,0x00B5,0x00B6,0x00B7, - 0x00F8,0x00B9,0x0157,0x00BB,0x00BC,0x00BD,0x00BE,0x00E6, - 0x0104,0x012E,0x0100,0x0106,0x00C4,0x00C5,0x0118,0x0112, - 0x010C,0x00C9,0x0179,0x0116,0x0122,0x0136,0x012A,0x013B, - 0x0160,0x0143,0x0145,0x00D3,0x014C,0x00D5,0x00D6,0x00D7, - 0x0172,0x0141,0x015A,0x016A,0x00DC,0x017B,0x017D,0x00DF, - 0x0105,0x012F,0x0101,0x0107,0x00E4,0x00E5,0x0119,0x0113, - 0x010D,0x00E9,0x017A,0x0117,0x0123,0x0137,0x012B,0x013C, - 0x0161,0x0144,0x0146,0x00F3,0x014D,0x00F5,0x00F6,0x00F7, - 0x0173,0x0142,0x015B,0x016B,0x00FC,0x017C,0x017E,0x2019 +static const unsigned int iso8859_13_ucs_table[] = { + 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, + 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, + 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, + 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, + 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, + 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, + 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, + 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, + 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, + 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, + 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019 }; - #endif /* UNICODE_TABLE_ISO8859_13_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_14.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_14.h index f0633963ff2..8c0bfe133ae 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_14.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_14.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_14_H -#define UNICODE_TABLE_ISO8859_14_H - -static const unsigned short iso8859_14_ucs_table[] = { - 0x00A0,0x1E02,0x1E03,0x00A3,0x010A,0x010B,0x1E0A,0x00A7, - 0x1E80,0x00A9,0x1E82,0x1E0B,0x1EF2,0x00AD,0x00AE,0x0178, - 0x1E1E,0x1E1F,0x0120,0x0121,0x1E40,0x1E41,0x00B6,0x1E56, - 0x1E81,0x1E57,0x1E83,0x1E60,0x1EF3,0x1E84,0x1E85,0x1E61, - 0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7, - 0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF, - 0x0174,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x1E6A, - 0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x00DD,0x0176,0x00DF, - 0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7, - 0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF, - 0x0175,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x1E6B, - 0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x00FD,0x0177,0x00FF +static const unsigned int iso8859_14_ucs_table[] = { + 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, + 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, + 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, + 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, + 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, + 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff }; - #endif /* UNICODE_TABLE_ISO8859_14_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_15.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_15.h index ab2f1a82b63..82743da2b5e 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_15.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_15.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_15_H -#define UNICODE_TABLE_ISO8859_15_H - -static const unsigned short iso8859_15_ucs_table[] = { - 0x00A0,0x00A1,0x00A2,0x00A3,0x20AC,0x00A5,0x0160,0x00A7, - 0x0161,0x00A9,0x00AA,0x00AB,0x00AC,0x00AD,0x00AE,0x00AF, - 0x00B0,0x00B1,0x00B2,0x00B3,0x017D,0x00B5,0x00B6,0x00B7, - 0x017E,0x00B9,0x00BA,0x00BB,0x0152,0x0153,0x0178,0x00BF, - 0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7, - 0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF, - 0x00D0,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x00D7, - 0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x00DD,0x00DE,0x00DF, - 0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7, - 0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF, - 0x00F0,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x00F7, - 0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x00FD,0x00FE,0x00FF +static const unsigned int iso8859_15_ucs_table[] = { + 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, + 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, + 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, + 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, + 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }; - #endif /* UNICODE_TABLE_ISO8859_15_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_16.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_16.h new file mode 100644 index 00000000000..256865fd616 --- /dev/null +++ b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_16.h @@ -0,0 +1,17 
@@ +/* This file is automatically generated. Do not edit! */ +#ifndef UNICODE_TABLEISO8859_16_H +static const unsigned int iso8859_16_ucs_table[] = { + 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, + 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, + 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, + 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, + 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, + 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, + 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, + 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, + 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff +}; +#endif /* UNICODE_TABLEISO8859_16_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_2.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_2.h index 148fdbfff52..ab1ca6eea83 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_2.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_2.h @@ -1,45 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! */ #ifndef UNICODE_TABLE_ISO8859_2_H -#define UNICODE_TABLE_ISO8859_2_H - -static const unsigned short iso8859_2_ucs_table[] = { - 0x00A0,0x0104,0x02D8,0x0141,0x00A4,0x013D,0x015A,0x00A7, - 0x00A8,0x0160,0x015E,0x0164,0x0179,0x00AD,0x017D,0x017B, - 0x00B0,0x0105,0x02DB,0x0142,0x00B4,0x013E,0x015B,0x02C7, - 0x00B8,0x0161,0x015F,0x0165,0x017A,0x02DD,0x017E,0x017C, - 0x0154,0x00C1,0x00C2,0x0102,0x00C4,0x0139,0x0106,0x00C7, - 0x010C,0x00C9,0x0118,0x00CB,0x011A,0x00CD,0x00CE,0x010E, - 0x0110,0x0143,0x0147,0x00D3,0x00D4,0x0150,0x00D6,0x00D7, - 0x0158,0x016E,0x00DA,0x0170,0x00DC,0x00DD,0x0162,0x00DF, - 0x0155,0x00E1,0x00E2,0x0103,0x00E4,0x013A,0x0107,0x00E7, - 0x010D,0x00E9,0x0119,0x00EB,0x011B,0x00ED,0x00EE,0x010F, - 0x0111,0x0144,0x0148,0x00F3,0x00F4,0x0151,0x00F6,0x00F7, - 0x0159,0x016F,0x00FA,0x0171,0x00FC,0x00FD,0x0163,0x02D9 +static const unsigned int iso8859_2_ucs_table[] = { + 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, + 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, + 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, + 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, + 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, + 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, + 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, + 0x0111, 
0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 }; - - #endif /* UNICODE_TABLE_ISO8859_2_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_3.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_3.h index 40aa5d635a1..f0501787e3d 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_3.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_3.h @@ -1,43 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_3_H -#define UNICODE_TABLE_ISO8859_3_H - -static const unsigned short iso8859_3_ucs_table[] = { - 0x00A0,0x0126,0x02D8,0x00A3,0x00A4,0x0000,0x0124,0x00A7, - 0x00A8,0x0130,0x015E,0x011E,0x0134,0x00AD,0x0000,0x017B, - 0x00B0,0x0127,0x00B2,0x00B3,0x00B4,0x00B5,0x0125,0x00B7, - 0x00B8,0x0131,0x015F,0x011F,0x0135,0x00BD,0x0000,0x017C, - 0x00C0,0x00C1,0x00C2,0x0000,0x00C4,0x010A,0x0108,0x00C7, - 0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF, - 0x0000,0x00D1,0x00D2,0x00D3,0x00D4,0x0120,0x00D6,0x00D7, - 0x011C,0x00D9,0x00DA,0x00DB,0x00DC,0x016C,0x015C,0x00DF, - 0x00E0,0x00E1,0x00E2,0x0000,0x00E4,0x010B,0x0109,0x00E7, - 0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF, - 0x0000,0x00F1,0x00F2,0x00F3,0x00F4,0x0121,0x00F6,0x00F7, - 0x011D,0x00F9,0x00FA,0x00FB,0x00FC,0x016D,0x015D,0x02D9 +static const unsigned int iso8859_3_ucs_table[] = { + 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, + 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, + 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, + 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, + 0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, + 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, + 0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, + 0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, + 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9 }; - #endif /* UNICODE_TABLE_ISO8859_3_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_4.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_4.h index 4aaa276c00d..01f90fb3e9d 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_4.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_4.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_4_H -#define UNICODE_TABLE_ISO8859_4_H - -static const unsigned short iso8859_4_ucs_table[] = { - 0x00A0,0x0104,0x0138,0x0156,0x00A4,0x0128,0x013B,0x00A7, - 0x00A8,0x0160,0x0112,0x0122,0x0166,0x00AD,0x017D,0x00AF, - 0x00B0,0x0105,0x02DB,0x0157,0x00B4,0x0129,0x013C,0x02C7, - 0x00B8,0x0161,0x0113,0x0123,0x0167,0x014A,0x017E,0x014B, - 0x0100,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x012E, - 0x010C,0x00C9,0x0118,0x00CB,0x0116,0x00CD,0x00CE,0x012A, - 0x0110,0x0145,0x014C,0x0136,0x00D4,0x00D5,0x00D6,0x00D7, - 0x00D8,0x0172,0x00DA,0x00DB,0x00DC,0x0168,0x016A,0x00DF, - 0x0101,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x012F, - 0x010D,0x00E9,0x0119,0x00EB,0x0117,0x00ED,0x00EE,0x012B, - 0x0111,0x0146,0x014D,0x0137,0x00F4,0x00F5,0x00F6,0x00F7, - 0x00F8,0x0173,0x00FA,0x00FB,0x00FC,0x0169,0x016B,0x02D9 +static const unsigned int iso8859_4_ucs_table[] = { + 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, + 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, + 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, + 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, + 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, + 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, + 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, + 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, + 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, + 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9 }; - #endif /* UNICODE_TABLE_ISO8859_4_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_5.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_5.h index 58fc0e2adaa..70c9f38c5e8 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_5.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_5.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_5_H -#define UNICODE_TABLE_ISO8859_5_H - -static const unsigned short iso8859_5_ucs_table[] = { - 0x00A0,0x0401,0x0402,0x0403,0x0404,0x0405,0x0406,0x0407, - 0x0408,0x0409,0x040A,0x040B,0x040C,0x00AD,0x040E,0x040F, - 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417, - 0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,0x041F, - 0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427, - 0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,0x042F, - 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437, - 0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E,0x043F, - 0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447, - 0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,0x044E,0x044F, - 0x2116,0x0451,0x0452,0x0453,0x0454,0x0455,0x0456,0x0457, - 0x0458,0x0459,0x045A,0x045B,0x045C,0x00A7,0x045E,0x045F +static const unsigned int iso8859_5_ucs_table[] = { + 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, + 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f }; - #endif /* UNICODE_TABLE_ISO8859_5_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_6.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_6.h index 9dba3ad895a..c9d66243370 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_6.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_6.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_6_H -#define UNICODE_TABLE_ISO8859_6_H - -static const unsigned short iso8859_6_ucs_table[] = { - 0x00A0,0x0000,0x0000,0x0000,0x00A4,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x060C,0x00AD,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x061B,0x0000,0x0000,0x0000,0x061F, - 0x0000,0x0621,0x0622,0x0623,0x0624,0x0625,0x0626,0x0627, - 0x0628,0x0629,0x062A,0x062B,0x062C,0x062D,0x062E,0x062F, - 0x0630,0x0631,0x0632,0x0633,0x0634,0x0635,0x0636,0x0637, - 0x0638,0x0639,0x063A,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0640,0x0641,0x0642,0x0643,0x0644,0x0645,0x0646,0x0647, - 0x0648,0x0649,0x064A,0x064B,0x064C,0x064D,0x064E,0x064F, - 0x0650,0x0651,0x0652,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x060C,0x00AD,0x0000,0x0000 +static const unsigned int iso8859_6_ucs_table[] = { + 0x00a0, 0x0000, 0x0000, 0x0000, 0x00a4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x060c, 0x00ad, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x061b, 0x0000, 0x0000, 0x0000, 0x061f, + 0x0000, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, + 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, + 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, + 0x0650, 0x0651, 0x0652, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; - #endif /* UNICODE_TABLE_ISO8859_6_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_7.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_7.h index 0dfde9bb84e..4961c30f928 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_7.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_7.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_7_H -#define UNICODE_TABLE_ISO8859_7_H - -static const unsigned short iso8859_7_ucs_table[] = { - 0x00A0,0x2018,0x2019,0x00A3,0x0000,0x0000,0x00A6,0x00A7, - 0x00A8,0x00A9,0x0000,0x00AB,0x00AC,0x00AD,0x0000,0x2015, - 0x00B0,0x00B1,0x00B2,0x00B3,0x0384,0x0385,0x0386,0x00B7, - 0x0388,0x0389,0x038A,0x00BB,0x038C,0x00BD,0x038E,0x038F, - 0x0390,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397, - 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F, - 0x03A0,0x03A1,0x0000,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7, - 0x03A8,0x03A9,0x03AA,0x03AB,0x03AC,0x03AD,0x03AE,0x03AF, - 0x03B0,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7, - 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF, - 0x03C0,0x03C1,0x03C2,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7, - 0x03C8,0x03C9,0x03CA,0x03CB,0x03CC,0x03CD,0x03CE,0x0000 +static const unsigned int iso8859_7_ucs_table[] = { + 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, 0x0000, 0x2015, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, + 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, + 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, + 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0x0000 }; - #endif /* UNICODE_TABLE_ISO8859_7_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_8.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_8.h index 756004f913e..3a52badb36b 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_8.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_8.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_8_H -#define UNICODE_TABLE_ISO8859_8_H - -static const unsigned short iso8859_8_ucs_table[] = { - 0x00A0,0x0000,0x00A2,0x00A3,0x00A4,0x00A5,0x00A6,0x00A7, - 0x00A8,0x00A9,0x00D7,0x00AB,0x00AC,0x00AD,0x00AE,0x203E, - 0x00B0,0x00B1,0x00B2,0x00B3,0x00B4,0x00B5,0x00B6,0x00B7, - 0x00B8,0x00B9,0x00F7,0x00BB,0x00BC,0x00BD,0x00BE,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2017, - 0x05D0,0x05D1,0x05D2,0x05D3,0x05D4,0x05D5,0x05D6,0x05D7, - 0x05D8,0x05D9,0x05DA,0x05DB,0x05DC,0x05DD,0x05DE,0x05DF, - 0x05E0,0x05E1,0x05E2,0x05E3,0x05E4,0x05E5,0x05E6,0x05E7, - 0x05E8,0x05E9,0x05EA,0x0000,0x0000,0x0000,0x0000,0x0000 +static const unsigned int iso8859_8_ucs_table[] = { + 0x00a0, 0x0000, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2017, + 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, + 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, + 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x0000, 0x0000, 0x200e, 0x200f, 0x0000 }; - #endif /* UNICODE_TABLE_ISO8859_8_H */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_9.h b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_9.h index f39e83f45ee..eaac8c6723d 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_iso8859_9.h +++ 
b/ext/mbstring/libmbfl/filters/unicode_table_iso8859_9.h @@ -1,44 +1,17 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The authors of this file: PHP3 internationalization team - * You can contact the primary author 金本 茂 . - * - */ - +/* This file is automatically generated. Do not edit! 
*/ #ifndef UNICODE_TABLE_ISO8859_9_H -#define UNICODE_TABLE_ISO8859_9_H - -static const unsigned short iso8859_9_ucs_table[] = { - 0x00A0,0x00A1,0x00A2,0x00A3,0x00A4,0x00A5,0x00A6,0x00A7, - 0x00A8,0x00A9,0x00AA,0x00AB,0x00AC,0x00AD,0x00AE,0x00AF, - 0x00B0,0x00B1,0x00B2,0x00B3,0x00B4,0x00B5,0x00B6,0x00B7, - 0x00B8,0x00B9,0x00BA,0x00BB,0x00BC,0x00BD,0x00BE,0x00BF, - 0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7, - 0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF, - 0x011E,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x00D7, - 0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x0130,0x015E,0x00DF, - 0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7, - 0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF, - 0x011F,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x00F7, - 0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x0131,0x015F,0x00FF +static const unsigned int iso8859_9_ucs_table[] = { + 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, + 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, + 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff }; - #endif /* UNICODE_TABLE_ISO8859_9_H */ diff --git a/ext/mbstring/libmbfl/libmbfl.dsp b/ext/mbstring/libmbfl/libmbfl.dsp index bf3fe065571..e041df02c53 100644 --- a/ext/mbstring/libmbfl/libmbfl.dsp +++ b/ext/mbstring/libmbfl/libmbfl.dsp @@ -199,6 +199,10 @@ SOURCE=.\filters\mbfilter_iso8859_15.c # End Source File 
# Begin Source File +SOURCE=.\filters\mbfilter_iso8859_16.c +# End Source File +# Begin Source File + SOURCE=.\filters\mbfilter_iso8859_2.c # End Source File # Begin Source File @@ -500,6 +504,10 @@ SOURCE=.\filters\mbfilter_iso8859_15.h # End Source File # Begin Source File +SOURCE=.\filters\mbfilter_iso8859_16.h +# End Source File +# Begin Source File + SOURCE=.\filters\mbfilter_iso8859_2.h # End Source File # Begin Source File @@ -708,6 +716,10 @@ SOURCE=.\filters\unicode_table_iso8859_15.h # End Source File # Begin Source File +SOURCE=.\filters\unicode_table_iso8859_16.h +# End Source File +# Begin Source File + SOURCE=.\filters\unicode_table_iso8859_2.h # End Source File # Begin Source File diff --git a/ext/mbstring/libmbfl/libmbfl.sln b/ext/mbstring/libmbfl/libmbfl.sln new file mode 100755 index 00000000000..f49f0c0d868 --- /dev/null +++ b/ext/mbstring/libmbfl/libmbfl.sln @@ -0,0 +1,21 @@ +Microsoft Visual Studio Solution File, Format Version 7.00 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libmbfl", "libmbfl.vcproj", "{B3636594-A785-4270-A765-8EAE922B5207}" +EndProject +Global + GlobalSection(SolutionConfiguration) = preSolution + ConfigName.0 = Debug + ConfigName.1 = Release + EndGlobalSection + GlobalSection(ProjectDependencies) = postSolution + EndGlobalSection + GlobalSection(ProjectConfiguration) = postSolution + {B3636594-A785-4270-A765-8EAE922B5207}.Debug.ActiveCfg = Debug|Win32 + {B3636594-A785-4270-A765-8EAE922B5207}.Debug.Build.0 = Debug|Win32 + {B3636594-A785-4270-A765-8EAE922B5207}.Release.ActiveCfg = Release|Win32 + {B3636594-A785-4270-A765-8EAE922B5207}.Release.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + EndGlobalSection + GlobalSection(ExtensibilityAddIns) = postSolution + EndGlobalSection +EndGlobal diff --git a/ext/mbstring/libmbfl/libmbfl.vcproj b/ext/mbstring/libmbfl/libmbfl.vcproj new file mode 100755 index 00000000000..29e0af0a270 --- /dev/null +++ 
b/ext/mbstring/libmbfl/libmbfl.vcproj diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 01e4787a962..ea2ca5632d5 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -1345,7 +1345,6 @@ mbfl_strcut( } -#include /* * strwidth */ @@ -1435,6 +1434,7 @@ collector_strimwidth(int c, void* data) default: if (pc->outchar >= pc->from) { pc->outwidth += (is_fullwidth(c) ? 2: 1); + if (pc->outwidth > pc->width) { if (pc->status == 0) { pc->endpos = pc->device.pos; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h index d907512ec20..d20b3ceea96 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h @@ -74,6 +74,7 @@ #define MBFL_WCSPLANE_CP1251 0x70f70000 #define MBFL_WCSPLANE_CP866 0x70f80000 #define MBFL_WCSPLANE_KOI8R 0x70f90000 +#define MBFL_WCSPLANE_8859_16 0x70fa0000 /* 00h - FFh */ #define MBFL_WCSGROUP_MASK 0xffffff #define MBFL_WCSGROUP_UCS4MAX 0x70000000 #define MBFL_WCSGROUP_WCHARMAX 0x78000000 diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index df279336ce2..21fb6319db7 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -80,6 +80,7 @@ #include "filters/mbfilter_iso8859_13.h" #include "filters/mbfilter_iso8859_14.h" #include "filters/mbfilter_iso8859_15.h" +#include "filters/mbfilter_iso8859_16.h" #include "filters/mbfilter_base64.h" #include "filters/mbfilter_qprint.h" #include "filters/mbfilter_uuencode.h" @@ -163,6 +164,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = { &mbfl_encoding_8859_13, &mbfl_encoding_8859_14, &mbfl_encoding_8859_15, + &mbfl_encoding_8859_16, &mbfl_encoding_euc_cn, &mbfl_encoding_cp936, &mbfl_encoding_hz, @@ -295,7 +297,3 @@ mbfl_is_support_encoding(const char *name) return 1; } } - - - - diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index c99af89a020..c9b51dd3608 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -97,6 +97,7 @@ enum mbfl_no_encoding { mbfl_no_encoding_cp1251, mbfl_no_encoding_cp866, mbfl_no_encoding_koi8r, + mbfl_no_encoding_8859_16, mbfl_no_encoding_charset_max }; diff --git a/ext/mbstring/oniguruma/COPYING b/ext/mbstring/oniguruma/COPYING index 7913cbf23f8..ed3fa53b253 100644 --- a/ext/mbstring/oniguruma/COPYING +++ b/ext/mbstring/oniguruma/COPYING @@ -6,7 +6,7 @@ this of Ruby follows the license of Ruby. It follows the BSD license in the case of the one except for it. /*- - * Copyright (c) 2002 K.Kosako + * Copyright (c) 2002-2004 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/ext/mbstring/oniguruma/HISTORY b/ext/mbstring/oniguruma/HISTORY index a7a817dd2de..65ef03f50fa 100644 --- a/ext/mbstring/oniguruma/HISTORY +++ b/ext/mbstring/oniguruma/HISTORY @@ -1,18 +1,479 @@ History +2005/02/19: Version 3.7.0 + +2005/02/19: [test] success in ruby 1.9.0 (2005-02-19) [i386-cygwin]. +2005/02/19: [new] (thanks Minero Aoki) + add onig_region_set(). +2005/02/19: [API] change onig_region_init() to extern. +2005/02/19: [dist] remove reggnu.c from MANIFEST-RUBY. + remove reggnu.c from make 19. +2005/02/19: [dist] update doc/API and doc/API.ja. +2005/02/19: [test] success in ruby 1.9.0 (2005-02-19) [i386-cygwin]. +2005/02/19: [impl] (thanks Alexey Zakhlestin) + change UChar* to const UChar* in oniguruma.h, + regenc.h and regparse.h. +2005/02/13: [impl] change UChar* to const UChar* in oniguruma.h and + onigposix.h and st.h. +2005/02/12: [test] success in ruby 1.9.0 (2005-02-11) [i386-cygwin]. +2005/02/12: [bug] (thanks nobu) [ruby-dev:25676] + type_cclass_hash() fix overrun. +2005/02/09: [test] success in ruby 1.9.0 (2005-02-09) [i686-linux]. +2005/02/09: [spec] add RE_OPTION_FIND_NOT_EMPTY etc.. to oniggnu.h. 
+2005/02/09: [dist] remove hash.c.patch. +2005/02/07: [impl] remove re_mbctab, mbctab_ascii etc... + (USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY) + +2005/02/04: Version 3.6.0 + +2005/02/04: [test] success in ruby 1.9.0 (2005-02-04) [i686-linux]. +2005/02/01: [bug] add key_free() call to st_free_table(). +2005/02/01: [new] add onig_get_default_ambig_flag() and + onig_set_default_ambig_flag(). +2005/02/01: [dist] update MANIFEST-RUBY. +2005/01/31: [test] success in ruby 1.9.0 (2005-01-29) [i686-linux]. +2005/01/31: [spec] remove ONIGENC_AMBIGUOUS_MATCH_COMPOUND + from ONIGENC_AMBIGUOUS_MATCH_DEFAULT. +2005/01/31: [dist] update Makefile.in (make 19). +2005/01/29: [memo] (thanks Kazuo Saito) + Oniguruma 3.5.4 was merged to Ruby 1.9.0. +2005/01/28: [impl] (thanks UK-taniyama) + add extern "C" { } directive to oniguruma.h, oniggnu.h + and onigposix.h for C++. +2005/01/25: [impl] remove nested function call for xxx_code_to_mbclen(). + (euc_kr.c, euc_tw.c, big5.c) + +2005/01/19: Version 3.5.4 + +2005/01/19: [test] success in ruby 1.9.0 (2005-01-05) [i686-linux]. +2005/01/19: [bug] (thanks Isao Sonobe) + callback function argument name_end of onig_foreach_name() + was wrong. + name key of name table should be null terminated for + character encoding length. + add strdup_with_null(), rename onig_strdup() to k_strdup(). + use e->name_len in i_names(). +2005/01/17: [impl] (thanks UK-taniyama) + add HAVE_SYS_TYPES_H to config.h.in. + +2005/01/13: Version 3.5.3 + +2005/01/13: [test] success in ruby 1.9.0 (2005-01-05) [i686-linux]. +2005/01/13: [bug] ignore case match bug. + ex. /s+/iu.match("SSSSS") ==> [4..5] + fix OP_EXACT1_IC, OP_EXACTN_IC process. +2005/01/13: [bug] (thanks Isao Sonobe) + ignore case match bug. + ex. /is/iu.match("ss") fail. + fix str_lower_case_match() etc. + +2005/01/05: Version 3.5.2 + +2005/01/05: [test] success in ruby 1.9.0 (2005-01-05) [i686-linux]. +2005/01/05: [test] success in ruby 1.9.0 (2004-12-16) [i686-linux]. 
+2005/01/05: [bug] (thanks Isao Sonobe) + ignore case match bug. + ex. /s+/iu.match("sssss") ==> [4..5] + fix OP_EXACT1_IC, OP_EXACTN_IC process. +2005/01/05: [bug] (thanks Isao Sonobe) + group name table should be renumbered. + add onig_renumber_name_table(). +2004/12/24: [dist] remove file onigcmpt200.h. + +2004/12/17: Version 3.5.1 + +2004/12/17: [dist] add INSTALL-RUBY to archive. +2004/12/16: [test] success in ruby 1.9.0 (2004-12-16) [i686-linux]. +2004/12/16: [dist] update hash.c.patch. +2004/12/15: [bug] (thanks matz) + char > 127 should be casted to unsigned char. (utf8.c) +2004/12/13: [impl] add HAVE_PROTOTYPES and HAVE_STDARG_PROTOTYPES definition + to oniguruma.h in the case __cplusplus. +2004/12/06: [dist] update doc/RE and doc/RE.ja. +2004/12/03: [impl] (thanks nobu) + st.h fix prototype for C++. + +2004/12/03: Version 3.5.0 + +2004/12/02: [test] success in ruby 1.9.0 (2004-12-02) [i686-linux]. +2004/12/01: [test] success in ruby 1.9.0 (2004-12-01) [i386-mswin32]. +2004/12/01: [dist] add make targets 19 and 19up to win32/Makefile. +2004/12/01: [test] success in ruby 1.9.0 (2004-12-01) [i386-cygwin]. +2004/12/01: [test] success in ruby 1.9.0 (2004-12-01) [i686-linux]. +2004/12/01: [impl] double cast for escape warning in Cygwin. + (HashDataType* )((void* )(&e)) in regparse.c +2004/12/01: [test] success in ruby 1.9.0 (2004-11-30) [i686-linux]. +2004/12/01: [tune] change implementation of clear_opt_map_info(). + (which was 10-16% cost in gprof result for my test program) +2004/12/01: [dist] remove regex.c from distribution files. +2004/11/30: [memo] remove targets 16 and 18 from Makefile.in. +2004/11/30: [test] success in ruby 1.9.0 (2004-11-30) [i686-linux]. +2004/11/30: [inst] add "cp -p st.[ch] st.[ch].ruby_orig" to "make 19". +2004/11/30: [tune] map_position_value() return 20 if code is 0 + and minimum enclen > 1. +2004/11/30: [test] success in ruby 1.9.0 (2004-11-29) [i686-linux]. 
+2004/11/30: [impl] minor changes for multi-thread in regexec.c and regcomp.c. +2004/11/30: [impl] change THREAD_PASS_LIMIT_COUNT value from 10 to 8. +2004/11/30: [impl] add THREAD_ATOMIC_XXX to FreeNodeList access in regparse.c +2004/11/29: [impl] add USE_MULTI_THREAD_SYSTEM. +2004/11/29: [memo] add hash.c.patch to CVS. +2004/11/29: [dist] change mail address to 'sndgk393 AT ...' +2004/11/29: [dist] add -s option (silent mode) to test.rb. +2004/11/29: [tune] change THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS value + from 20 to 8. +2004/11/29: [inst] add make target "19up". +2004/11/29: [dist] change Oniguruma Home Page URL. +2004/11/29: [impl] remove onig_is_in_code_range_array(). +2004/11/29: [dist] fix doc/RE and RE.ja (character types). +2004/11/26: [dist] fix win32/Makefile. +2004/11/26: [dist] fix doc/RE and RE.ja (multibyte character types). +2004/11/26: [impl] add onig_free_shared_cclass_table(). +2004/11/26: [impl] move definition USE_UNICODE_FULL_RANGE_CTYPE to regenc.h. +2004/11/26: [impl] add opcode OP_CCLASS_NODE. +2004/11/26: [impl] move definition of CClassNode to regint.h. +2004/11/26: [impl] add type PointerType in regint.h. +2004/11/25: [impl] remove ONIGENC_CTYPE_MOD_NOT. +2004/11/25: [impl] rename onig_node_new_cclass_by_codepoint_range to + node_new_cclass_by_codepoint_range. +2004/11/25: [impl] remove get_type_cc_node method from OnigEncodingType. +2004/11/25: [impl] move implementation of shared char-class from enc/*.c + to regparse.c. +2004/11/25: [dist] add hash.c.patch for Ruby 1.9 hash.c change. +2004/11/22: [impl] change utf8_get_type_node(). +2004/11/22: [impl] add ONIGENC_CTYPE_MOD_NOT. +2004/11/22: [bug] (thanks MIYAMUKO Katsuyuki) + ruby make test fail in HP-UX B.11.23 ia64. + should use tok->u.code instead of tok->u.c in + the case of TK_CODE_POINT. +2004/11/19: [bug] (thanks Yoshida Masato) + invalid multibyte code causes segmentation fault. + ex. 
/[\xFF-\xFF]/u +2004/11/19: [bug] (thanks Yoshida Masato) + illegal check in char-class range in UTF-8. + ex. s = "[\xC2\xA0-\xC3\xBE]" + p(Regexp.new(s, nil, "u") =~ "\xC3\xBE") +2004/11/18: [impl] add onig_node_new_cclass_by_codepoint_range(). +2004/11/18: [impl] remove OnigCodePointRange type. (use OnigCodePoint[].) +2004/11/17: [bug] (thanks nobu) + abort in "a".gsub(/a\Z/, "") + fix ONIGENC_STEP_BACK() argument in onig_search(). +2004/11/16: [impl] add key2 member to st_table_entry in st.[ch]. + change API of st for non-null terminated string key. +2004/11/16: [impl] add get_type_cc_node method to OnigEncodingType. +2004/11/15: [impl] add st.h and st.c from Ruby 1.9. + use st-hash always. +2004/11/12: [impl] change menber 'not' of CClassNode to 'flags'. + add flags FLAG_CCLASS_NOT and FLAG_CCLASS_SHARE. +2004/11/12: [impl] add onig_is_in_code_range_array() to enc/unicode.c. +2004/11/12: [impl] fix CRWord in enc/unicode.c and MBWord in enc/utf8.c. +2004/11/11: [bug] fix enc/utf8.c. + size 0 array initializer was compile error in VC++. +2004/11/09: [inst] (thanks Hiroki YAGITA) + change installed file mode to 0644. +2004/11/09: [bug] (thanks UK-taniyama) + wrong definitions GET_RELADDR_INC(), GET_ABSADDR_INC() + etc... (NOT PLATFORM_UNALIGNED_WORD_ACCESS) +2004/11/09: [impl] type cast in regexec() for remove compile time warning. + (WIN32, regposix.c) +2004/11/08: [spec] fix Unicode character types. + 0x00ad (soft hyphen) should be [:cntrl:] and [:space:] type. + [0x0009..0x000d], 0x0085 should be [:print:] type. + 0x00ad should not be [:punct:] type. +2004/11/08: [inst] fix Makefile.in. (for make ctest/ptest/testcu) +2004/11/06: [impl] (thanks Kazuo Saito) + too many alternatives pattern causes core dump. + change implementation of onig_node_free(). +2004/11/05: [spec] rename ONIGERR_END_PATTERN_AT_BACKSLASH to + ONIGERR_END_PATTERN_AT_ESCAPE. +2004/11/05: [impl] (thanks matz) + escape compile time warnings for x86-64 Linux. 
+ StackIndex type int -> long +2004/11/05: [memo] (thanks Kazuo Saito) + Oniguruma 3.4.0 was merged to Ruby 1.9.0. + +2004/10/30: Version 3.4.0 + +2004/10/30: [test] success in ruby 1.9.0 (2004-09-24) [i686-linux]. +2004/10/30: [new] add hexadecimal digit char type. (\h, \H) + syntax: ONIG_SYN_OP2_ESC_H_XDIGIT +2004/10/30: [bug] (thanks Guy Decoux) + reluctant infinite repeat bug. + ex. /^[a-z]{2,}?$/.match("aaa") fail. + fix OP_REPEAT_INC_NG process in match_at(). + +2004/10/18: Version 3.3.1 + +2004/10/18: [test] success in ruby 1.9.0 (2004-09-24) [i686-linux]. +2004/10/18: [impl] (thanks Imai Yasumasa) + enclose #include <malloc.h> by #ifndef __BORLANDC__. +2004/10/18: [bug] (thanks Imai Yasumasa) + memory access violation in select_opt_exact_info(). +2004/09/25: [dist] fix doc/API and doc/API.ja. +2004/09/25: [bug] fix OP_SEMI_END_BUF process in match_at() for + the case USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + is not defined. + +2004/09/17: Version 3.3.0 + +2004/09/17: [dist] add COPYING to program source files. +2004/09/17: [test] success in ruby 1.9.0 (2004-07-23) [i686-linux]. +2004/09/17: [bug] (thanks Isao Sonobe) + memory access violations in xxx_mbc_enc_len(), + and xxx_mbc_to_normalize() and + xxx_left_adjust_char_head(). + add string range check in match_at() and onig_search(). +2004/09/08: [dist] change mail address format.(kosako AT sofnec ...) + +2004/09/04: Version 3.2.9 + +2004/09/04: [test] success in ruby 1.9.0 (2004-07-23) [i686-linux]. +2004/09/04: [bug] (thanks Bob Kerstetter and Richard Koch) + search fail in ignore case mode. + fix str_lower_case_match(). +2004/09/04: [inst] (thanks Isao Sonobe) + clear sample directory in 'make clean'. +2004/09/04: [bug] fix ONIGENC_AMBIGUOUS_MATCH_COMPOUND/ASCII/NONASCII + meanings in XXXXX_mbc_to_normalize() and + XXXXX_is_mbc_ambiguous(). +2004/08/28: [bug] fix ONIGENC_AMBIGUOUS_MATCH_COMPOUND/ASCII/NONASCII + meanings in iso_8859_XX_mbc_to_normalize() and + iso_8859_XX_is_mbc_ambiguous(). 
+ +2004/08/24: Version 3.2.8 + +2004/08/24: [test] success in ruby 1.9.0 (2004-07-23) [i686-linux]. +2004/08/24: [spec] add ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY. + /a{n}?/ == /(?:a{n})?/ +2004/08/24: [dist] fix doc/RE and doc/RE.ja. +2004/08/24: [bug] (thanks starfish) + memory leak in set_optimize_exact_info(). + +2004/08/21: Version 3.2.7 + +2004/08/21: [test] success in ruby 1.8.2 (2004-07-28) [i686-linux]. + (1.8.2 preview2) +2004/08/21: [test] success in ruby 1.9.0 (2004-07-23) [i686-linux]. +2004/08/21: [bug] (thanks Isao Sonobe) (thanks kage) + memory access violation in bm_search_notrev(). + (forgotten to merge from 2.X) + +2004/07/24: Version 3.2.6 + +2004/07/24: [test] success in ruby 1.9.0 (2004-07-23) [i686-linux]. +2004/07/24: [test] success in ruby 1.8.2 (2004-07-16) [i686-linux]. +2004/07/24: [bug] fix warnings for regexec.c. (gcc 2.91.66) +2004/07/24: [memo] change version control system from Subversion + to CVS 1.11.17. +2004/07/20: [bug] (thanks Isao Sonobe) + illegal result in negative character class in ignore case + mode. fix pair-ambig-codes process in parse_exp(). + ex. /[^a]/i.match("A") +2004/07/20: [bug] (thanks Isao Sonobe) + undefined bytecode error happens in UTF-16BE etc.. + compile_length_cclass_node() was not consistent with + compile_cclass_node(). + +2004/07/01: Version 3.2.5 + +2004/07/01: [test] success in ruby 1.8.2 (2004-06-23) [i686-linux]. +2004/07/01: [new] add onig_get_syntax_{op,op2,behavior,options}. +2004/07/01: [bug] (thanks Isao Sonobe) + invalid result in onig_capture_tree_traverse(). + fix make_capture_history_tree(). + +2004/06/29: Version 3.2.4 + +2004/06/29: [test] success in ruby 1.8.2 (2004-06-23) [i686-linux]. +2004/06/29: [new] (thanks Isao Sonobe) + add onig_number_of_captures(). + +2004/06/25: Version 3.2.3 + +2004/06/25: [test] success in ruby 1.8.2 (2004-06-23) [i686-linux]. +2004/06/25: [bug] (thanks Isao Sonobe) + invalid result in onig_capture_tree_traverse(). + fix make_capture_history_tree(). 
+ +2004/06/24: Version 3.2.2 + +2004/06/24: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. +2004/06/24: [test] success in ruby 1.8.0 (2003-08-08) [i386-mswin32]. +2004/06/24: [test] success in ruby 1.8.2 (2004-06-23) [i686-linux]. +2004/06/24: [new] (thanks Isao Sonobe) + add onig_number_of_capture_histories(). +2004/06/24: [bug] (thanks Isao Sonobe) + invalid char position match in UTF-16 and UTF-32. + add onigenc_always_false_is_allowed_reverse_match(). + +2004/06/17: Version 3.2.1 + +2004/06/17: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. +2004/06/17: [test] success in ruby 1.8.0 (2003-08-08) [i386-mswin32]. +2004/06/17: [test] success in ruby 1.8.2 (2004-05-18) [i686-linux]. +2004/06/17: [impl] should not use OP_REPEAT for (...)? even if target size + is long. +2004/06/17: [bug] (thanks nobu) [ruby-dev:23703] + should use STACK_AT() instead of stkp in OP_REPEAT_INC. + add IN_VAR_REPEAT flag in setup_tree(). +2004/06/16: [impl] change select_opt_exact_info() to use ByteValTable[]. +2004/06/16: [impl] change map_position_value() table values. +2004/06/14: [impl] (thanks John Carter) + RelAddrType, AbsAddrType and LengthType change + from short int to int type for the very long string match. +2004/06/14: [bug] (thanks Greg A. Woods) + fix nmatch argument of regexec() is smaller than + reg->num_mem + 1 case. (POSIX API) +2004/06/14: [spec] (thanks Greg A. Woods) + set pmatch to NULL if nmatch is 0 in regexec(). (POSIX API) + +2004/06/10: Version 3.2.0 + +2004/06/10: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. +2004/06/10: [test] success in ruby 1.9.0 (2004-05-27) [i386-mswin32]. +2004/06/10: [test] success in ruby 1.8.2 (2004-05-18) [i686-linux]. +2004/06/10: [dist] add README.ja. +2004/06/10: [new] add onig_copy_encoding(). +2004/06/10: [API] add encoding argument to onig_set_meta_char(). + add meta_char_table member to OnigEncodingType. +2004/06/08: [dist] add doc/API.ja. 
+2004/06/07: [API] add num_of_elements member to OnigCompileInfo. +2004/05/29: [memo] (thanks Kazuo Saito) + Oniguruma 3.1.0 was merged to Ruby 1.9.0. +2004/05/26: [impl] rename NST_SIMPLE_REPEAT to NST_STOP_BT_SIMPLE_REPEAT. +2004/05/26: [impl] doesn't need to check that target's simple repeat-ness + for EFFECT_MEMORY type node in setup_tree(). + +2004/05/25: Version 3.1.0 + +2004/05/25: [test] success in ruby 1.8.0 (2003-08-08) [i386-mswin32]. +2004/05/25: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. +2004/05/25: [test] success in ruby 1.9.0 (2004-05-23) [i686-linux]. +2004/05/25: [test] success in ruby 1.8.2 (2004-05-18) [i686-linux]. +2004/05/25: [bug] (thanks Masahiro Sakai) [ruby-dev:23560] + ruby -ruri -ve 'URI::ABS_URI =~ + "http://example.org/Andr\xC3\xA9"' + nested STK_REPEAT type stack can't backtrack repeat_stk[]. + add OP_REPEAT_INC_SG and OP_REPEAT_INC_NG_SG. +2004/05/25: [new] support UTF-32LE. (ONIG_ENCODING_UTF32_LE) +2004/05/25: [new] support UTF-32BE. (ONIG_ENCODING_UTF32_BE) +2004/05/24: [impl] divide enc/utf16.c to utf16_be.c and utf16_le.c. +2004/05/24: [impl] add enc/unicode.c. +2004/05/24: [API] change calling sequences of onig_new_deluxe() and + onig_recompile_deluxe(). + define OnigCompileInfo type. +2004/05/21: [impl] perform ensure process for rb_trap_exec() in match_at(). + add onig_exec_trap() and CHECK_INTERRUPT_IN_MATCH_AT. +2004/05/21: [impl] add regex status check to onig_match(). +2004/05/21: [new] add onig_get_capture_tree() and + onig_capture_tree_traverse(). +2004/05/20: [spec] (thanks Isao Sonobe) + capture history return capture data tree. + (see sample/listcap.c) +2004/05/19: [bug] (thanks Simon Strandgaard) + Control-C does not work in matching process on Ruby. + add calling of CHECK_INTERRUPT into match_at(). + ex. /<(?:[^">]+|"[^"]*")+>/.match('') +2004/05/19: [bug] (thanks Simon Strandgaard) + define virtual codepoint values for invalid encoding + byte 0xfe and 0xff in UTF-8. + ex. 
/\w+/u.match("%a\xffb\xfec%") ==> "a" +2004/05/19: [spec] (thanks Simon Strandgaard) + too big backref number should be treated as a sequence of + an octal char and number digits. + ex. /b\3777\c/.match("b\3777\c") +2004/05/17: [spec] rename encoding names "UTF-16 BE" and "UTF-16 LE" + to "UTF-16BE" and "UTF-16LE". +2004/05/17: [impl] move ismbchar() and mbclen() from oniguruma.h to oniggnu.h. +2004/05/17: [impl] rename onigenc_single_byte_is_allowed_reverse_match() to + onigenc_always_true_is_allowed_reverse_match(). + +2004/05/14: Version 3.0.0 + +2004/05/14: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. +2004/05/14: [test] success in ruby 1.9.0 (2004-05-14) [i686-linux]. +2004/05/14: [test] success in ruby 1.8.0 (2003-08-08) [i386-mswin32]. + (* need to edit parse.y: + register int c; ---> int c; in yylex()) +2004/05/14: [impl] add regext.c. +2004/05/14: [spec] KOI8 is not included in library archive by default setup. +2004/05/14: [impl] implementation changes are completed for all encoding files. +2004/05/12: [impl] add divide_ambig_string_node(). + ambiguous string is divided and normalized before + optimization and compilation process. +2004/05/11: [dist] remove INSTALL-RUBY from distribution. +2004/04/28: [memo] (thanks Kazuo Saito) + Oniguruma 2.2.8 was merged to Ruby 1.9.0. +2004/04/26: [spec] change value DEFAULT_MATCH_STACK_LIMIT_SIZE = 0 : unlimited +2004/04/26: [new] add onig_get_match_stack_limit_size() and + onig_set_match_stack_limit_size(). +2004/04/26: [bug] add error check to re.c.181.patch and re.c.168.patch. +2004/04/23: [impl] remove ctype_support_level from OnigEncodingType. +2004/04/22: [spec] allow the range from single byte char to multibyte char in + character class for implementation reason. + ex. /[a-\xbb\xcc]/ in EUC-JP encoding. +2004/04/21: [impl] remove max_enc_len_by_first_byte() from OnigEncodingType. +2004/04/20: [new] add onig_copyright(). +2004/04/20: [impl] add regversion.c. 
+2004/04/15: [new] add onig_get_ambig_flag(). +2004/04/14: [bug] (thanks Isao Sonobe) + undefined bytecode error happens if ONIG_OPTION_FIND_LONGEST + is setted. + should finish matching process if find-condition + is fail at OP_END in match_at(). +2004/04/12: [impl] add ambig_flag to regex_t. +2004/04/09: [impl] move onig_set_meta_char() to regsyntax.c. +2004/04/09: [bug] (thanks HIROSE Masaaki) fix onig_version(). +2004/04/08: [impl] add regsyntax.c. +2004/04/07: [new] support UTF-16 LE. (ONIG_ENCODING_UTF16_LE) +2004/04/05: [impl] add ONIGENC_CTYPE_NEWLINE. +2004/04/05: [memo] (thanks Kazuo Saito) + Oniguruma 2.2.6 was merged to Ruby 1.9.0. +2004/04/02: [memo] Version 2.2.6 was released. +2004/03/26: [new] support UTF-16 BE. (ONIG_ENCODING_UTF16_BE) +2004/03/25: [spec] support non 8-bit encodings. +2004/03/16: [memo] 2.X branch for 8-bit encodings only. + +2004/03/16: Version 2.2.5 + +2004/03/16: [test] success in ruby 1.8.0 (2003-08-08) [i386-mswin32]. +2004/03/16: [test] success in ruby 1.9.0 (2004-02-24) [i686-linux]. +2004/03/16: [impl] add property name to error message of + ONIGERR_INVALID_CHAR_PROPERTY_NAME. +2004/03/16: [spec] allow prefix 'Is' for \p{...} in ONIG_SYNTAX_PERL. + add syntax op. ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS. +2004/03/15: [dist] add sample/syntax.c. +2004/03/15: [spec] support NOT op. in char property. \p{^...}, \P{^...}. + add syntax op. ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT. +2004/03/15: [spec] rename ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY to + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY. +2004/03/10: [impl] move ONIGERR_XXX from regenc.h to oniguruma.h, + rename ONIGERR_XXX to ONIGENCERR_XXX in regenc.h. +2004/03/08: [impl] (thanks eban) + replace defined(__CYGWIN__) to defined(__GNUC__). +2004/03/08: [bug] (thanks eban) [ruby-dev:23172] + need to separate initialization for bcc32. +2004/03/06: [memo] (thanks Kazuo Saito) + Oniguruma 2.2.4 was merged to Ruby 1.9.0. 
+2004/03/05: [API] change second argument type of onig_set_meta_char() + from unsigned int to OnigCodePoint. +2004/03/05: [dist] (thanks Kazuo Saito) + add MANIFEST-RUBY. + 2004/03/04: Version 2.2.4 2004/03/04: [impl] (thanks Moriyoshi Koizumi) - fix many warning in Win32 VC++ with /W3 option. + fix many warnings in Win32 VC++ with /W3 option. 2004/03/02: Version 2.2.3 2004/03/02: [bug] (thanks Isao Sonobe) return invalid capture region value if capture history is used. (OP_MEMORY_END_PUSH_REC bug) - ex. /\\g<p>(?@<p>\\(\\g<s>\\)){0}(?<s>(?:\\g<p>)*|){0}/ + ex. /\g<p>(?@<p>\(\g<s>\)){0}(?<s>(?:\g<p>)*|){0}/ .match("((())())") -2004/03/02: [impl] add :nodoc: to onig_stat_print() for RDoc. +2004/03/02: [impl] (thanks Kazuo Saito) + add :nodoc: to onig_stat_print() for RDoc. 2004/03/02: [impl] don't use ONIG_SOURCE_IS_WRAPPED. 2004/02/27: Version 2.2.2 @@ -91,14 +552,14 @@ History (0x80 - 0xff is not ASCII) 2004/01/23: [new] support ISO-8859-2. (ONIG_ENCODING_ISO_8859_2) 2004/01/23: [dist] add enc/isotable.c. -2004/01/22; [new] support EUC-TW. (ONIG_ENCODING_EUC_TW) +2004/01/22: [new] support EUC-TW. (ONIG_ENCODING_EUC_TW) 2004/01/22: [bug] definition of GET_ALIGNMENT_PAD_SIZE() and ALIGNMENT_RIGHT() was wrong. type casting should be unsigned int, not int. 2004/01/22: [impl] add defined(__x86_64) || defined(__x86_64__) to unaligned word access condition. (AMD64 ?) 2004/01/21: [dist] rename enc/eucjp.c to enc/euc_jp.c. -2004/01/21; [new] support EUC-KR. (ONIG_ENCODING_EUC_KR) +2004/01/21: [new] support EUC-KR. (ONIG_ENCODING_EUC_KR) 2004/01/20: [test] success in ruby 1.8.0 (2003-08-08) [i386-cygwin]. 2004/01/20: [dist] change Makefile.in. 2004/01/20: [spec] add \p{...}, \P{...} in char class. 
@@ -883,5 +1344,16 @@ History [test: test] [memo: memo] -- + +svn mkdir http://localhost/repos/branches -m "" +svn mkdir http://localhost/repos/branches/oniguruma -m "" +svn copy http://localhost/repos/trunk/oniguruma http://localhost/repos/branches/oniguruma/2.X -m "branch for 8-bit encodings only" + svn copy http://localhost/repos/trunk/oniguruma http://localhost/repos/tags/oniguruma/X.X.X -m "onigdXXXXXXXX" + + +cvs history -T + + +cvs rtag "VERSION_X_X_X" oniguruma diff --git a/ext/mbstring/oniguruma/README b/ext/mbstring/oniguruma/README index 3880423f03b..dc4fb3b64b8 100644 --- a/ext/mbstring/oniguruma/README +++ b/ext/mbstring/oniguruma/README @@ -1,7 +1,8 @@ -README 2004/02/25 +README 2005/02/04 -Oniguruma ---- (C) K.Kosako +Oniguruma ---- (C) K.Kosako +http://www.geocities.jp/kosako3/oniguruma/ http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/oniguruma/ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ @@ -11,45 +12,38 @@ for every regular expression object can be specified. Supported character encodings: - ASCII, UTF-8, + ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, EUC-JP, EUC-TW, EUC-KR, EUC-CN, - Shift_JIS, Big5, KOI8, KOI8-R, + Shift_JIS, Big5, KOI8-R, KOI8 (*), ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 - -There are two ways of using of it in this program. - - * Built-in regular expression engine of Ruby - * C library (supported APIs: GNU regex, POSIX, Oniguruma native) - +* KOI8 is not included in library archive by default setup. + (need to edit Makefile if you want to use it.) ------------------------------------------------------------ Install -(A) Install into Ruby - - See INSTALL-RUBY. - - (character encodings: ASCII, UTF-8, EUC-JP, Shift_JIS) - - -(B) Install C library - - (B-1) Unix and Cygwin platform + Case 1: Unix and Cygwin platform 1. ./configure 2. make 3. 
make install - (* uninstall: make uninstall) + library file: libonig.a - * test (ASCII/EUC-JP) - 4. make ctest + test (ASCII/EUC-JP) + + make ctest + + uninstall + + make uninstall - (B-2) Win32 platform (VC++) + + Case 2: Win32 platform (VC++) 1. copy win32\Makefile Makefile 2. copy win32\config.h config.h @@ -77,6 +71,16 @@ Regular Expressions See doc/RE (or doc/RE.ja for Japanese). +Usage + + Include oniguruma.h in your program. (native API) + See doc/API for native API. + + If you want to use static link library(onig_s.lib) in Win32, + add option -DONIG_EXTERN=extern to C compiler. + + + Sample Programs sample/simple.c example of the minimum (native API) @@ -86,14 +90,12 @@ Sample Programs sample/posix.c POSIX API sample. sample/sql.c example of the variable meta characters. (SQL-like pattern matching) + sample/syntax.c Perl and Java syntax test. Source Files oniguruma.h Oniguruma API header file. (public) - oniggnu.h GNU regex API header file. (public) - onigcmpt200.h Oniguruma API backward compatibility header file. (public) - (for 2.0.0 or more older version) regenc.h character encodings framework header file. regint.h internal definitions @@ -101,17 +103,31 @@ Source Files regcomp.c compiling and optimization functions regenc.c character encodings framework. regerror.c error message function - regex.c source files wrapper for Ruby + regext.c extended API functions. (deluxe version API) regexec.c search and match functions regparse.c parsing functions. + regsyntax.c pattern syntax functions and built-in syntax definitions. + regtrav.c capture history tree data traverse functions. + regversion.c version info function. + st.h hash table functions header file + st.c hash table functions + + oniggnu.h GNU regex API header file. (public) reggnu.c GNU regex API functions onigposix.h POSIX API header file. (public) regposerr.c POSIX error message function. - regposix.c POSIX functions. + regposix.c POSIX API functions. enc/mktable.c character type table generator. 
enc/ascii.c ASCII encoding. + enc/euc_jp.c EUC-JP encoding. + enc/euc_tw.c EUC-TW encoding. + enc/euc_kr.c EUC-KR, EUC-CN encoding. + enc/sjis.c Shift_JIS encoding. + enc/big5.c Big5 encoding. + enc/koi8.c KOI8 encoding. + enc/koi8_r.c KOI8-R encoding. enc/iso8859_1.c ISO-8859-1 encoding. (Latin-1) enc/iso8859_2.c ISO-8859-2 encoding. (Latin-2) enc/iso8859_3.c ISO-8859-3 encoding. (Latin-3) @@ -128,18 +144,19 @@ Source Files enc/iso8859_15.c ISO-8859-15 encoding. (Latin-9 or West European with Euro) enc/iso8859_16.c ISO-8859-16 encoding. (Latin-10 or South-Eastern European with Euro) - enc/utf8.c UTF-8 encoding. - enc/euc_jp.c EUC-JP encoding. - enc/euc_tw.c EUC-TW encoding. - enc/euc_kr.c EUC-KR, EUC-CN encoding. - enc/sjis.c Shift_JIS encoding. - enc/koi8.c KOI8 encoding. - enc/koi8_r.c KOI8-R encoding. - enc/big5.c Big5 encoding. + enc/utf8.c UTF-8 encoding. + enc/utf16_be.c UTF-16BE encoding. + enc/utf16_le.c UTF-16LE encoding. + enc/utf32_be.c UTF-32BE encoding. + enc/utf32_le.c UTF-32LE encoding. + enc/unicode.c Unicode information data. + + win32/Makefile Makefile for Win32 (VC++) + win32/config.h config.h for Win32 -API differences with Japanized GNU regex(version 0.12) of Ruby +API differences with Japanized GNU regex(version 0.12) of Ruby 1.8/1.6 + re_compile_fastmap() is removed. + re_recompile_pattern() is added. @@ -148,18 +165,17 @@ API differences with Japanized GNU regex(version 0.12) of Ruby ToDo - 1 support 16-bit encodings. (UTF-16) - 2 different encoding pattern with target. - (ex. ASCII/UTF-16, UTF-16 BE and UTF-16 LE) - 3 add enc/name.c (onigenc_get_enc_by_name(name)) - - ? transmission stopper. (return ONIG_STOP from match_at()) - ? implement syntax behavior ONIG_SYN_CONTEXT_INDEP_ANCHORS. - ? better acess to hash table (st.c). - non null-terminated key version st_lookup(). - ? grep-like tool 'onigrep'. - ? return parse tree of regexp pattern to application. - ?? /a{n}?/ should be interpreted as /(?:a{n})?/. - ?? 
\h hexadecimal digit char ([0-9a-fA-F]), \H not \h. + ? ignore case in full code point range of Unicode. + ? Unicode Property. + ? ambig-flag Katakana <-> Hiragana. + ? add ONIG_OPTION_NOTBOS/NOTEOS. (\A, \z, \Z) + ? add ONIG_SYNTAX_ASIS. + ?? \X (== \PM\pM*) + ?? implement syntax behavior ONIG_SYN_CONTEXT_INDEP_ANCHORS. + ?? variable line separator. + ?? transmission stopper. (return ONIG_STOP from match_at()) and I'm thankful to Akinori MUSHA. + + +Mail Address: K.Kosako diff --git a/ext/mbstring/oniguruma/README.ja b/ext/mbstring/oniguruma/README.ja new file mode 100644 index 00000000000..44553abfefd --- /dev/null +++ b/ext/mbstring/oniguruma/README.ja @@ -0,0 +1,177 @@ +README.ja 2005/02/04 + +µ´¼Ö ---- (C) K.Kosako + +http://www.geocities.jp/kosako3/oniguruma/ +http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/oniguruma/ +http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ + +µ´¼Ö¤ÏÀµµ¬É½¸½¥é¥¤¥Ö¥é¥ê¤Ç¤¢¤ë¡£ +¤³¤Î¥é¥¤¥Ö¥é¥ê¤ÎÆÃĹ¤Ï¡¢¤½¤ì¤¾¤ì¤ÎÀµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤´¤È¤Ë +ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°¤ò»ØÄê¤Ç¤­¤ë¤³¤È¤Ç¤¢¤ë¡£ + +¥µ¥Ý¡¼¥È¤·¤Æ¤¤¤ëʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°: + + ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, + EUC-JP, EUC-TW, EUC-KR, EUC-CN, + Shift_JIS, Big5, KOI8-R, KOI8 (*), + ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, + ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, + ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 + +* KOI8¤Ï¥Ç¥Õ¥©¥ë¥È¤Î¥»¥Ã¥È¥¢¥Ã¥×¤Ç¤Ï¥é¥¤¥Ö¥é¥ê¤ÎÃæ¤Ë´Þ¤Þ¤ì¤Ê¤¤¡£ + (ɬÍפǤ¢¤ì¤ÐMakefile¤òÊÔ½¸¤¹¤ë¤³¤È) +------------------------------------------------------------ + +¥¤¥ó¥¹¥È¡¼¥ë + + ¥±¡¼¥¹£±: Unix¤ÈCygwin´Ä¶­ + + 1. ./configure + 2. make + 3. make install + + ¥é¥¤¥Ö¥é¥ê¥Õ¥¡¥¤¥ë: libonig.a + + Æ°ºî¥Æ¥¹¥È (ASCII/EUC-JP) + + make ctest + + ¥¢¥ó¥¤¥ó¥¹¥È¡¼¥ë + + make uninstall + + + + ¥±¡¼¥¹£²: Win32(VC++)´Ä¶­ + + 1. copy win32\Makefile Makefile + 2. copy win32\config.h config.h + 3. nmake + + onig_s.lib: static link library + onig.dll: dynamic link library + + * Æ°ºî¥Æ¥¹¥È (ASCII/Shift_JIS) + 4. 
copy win32\testc.c testc.c + 5. nmake ctest + + +¥é¥¤¥»¥ó¥¹ + + ¤³¤Î¥½¥Õ¥È¥¦¥§¥¢¤¬Ruby¤È°ì½ï¤Ë»ÈÍѤޤ¿¤ÏÇÛÉÛ¤µ¤ì¤ë¾ì¹ç¤Ë¤Ï¡¢ + Ruby¤Î¥é¥¤¥»¥ó¥¹¤Ë½¾¤¦¡£ + ¤½¤ì°Ê³°¤Î¾ì¹ç¤Ë¤Ï¡¢BSD¥é¥¤¥»¥ó¥¹¤Ë½¾¤¦¡£ + + +Àµµ¬É½¸½ + + doc/RE.ja¤ò»²¾È + + +»ÈÍÑÊýË¡ + + »ÈÍѤ¹¤ë¥×¥í¥°¥é¥à¤Ç¡¢oniguruma.h¤ò¥¤¥ó¥¯¥ë¡¼¥É¤¹¤ë(Native API¤Î¾ì¹ç)¡£ + Native API¤Ë¤Ä¤¤¤Æ¤Ï¡¢doc/API.ja¤ò»²¾È¡£ + + Win32¤Ç¥¹¥¿¥Æ¥£¥Ã¥¯¥ê¥ó¥¯¥é¥¤¥Ö¥é¥ê(onig_s.lib)¤ò¥ê¥ó¥¯¤¹¤ë¾ì¹ç¤Ë¤Ï¡¢ + ¥³¥ó¥Ñ¥¤¥ë¤¹¤ë¤È¤­¤Ë -DONIG_EXTERN=extern ¤ò¥³¥ó¥Ñ¥¤¥ë°ú¿ô¤ËÄɲ乤뤳¤È¡£ + + +»ÈÍÑÎã¥×¥í¥°¥é¥à + + sample/simple.c ºÇ¾®Îã (native API) + sample/names.c ̾Á°ÉÕ¤­¥°¥ë¡¼¥×¥³¡¼¥ë¥Ð¥Ã¥¯»ÈÍÑÎã + sample/encode.c ´ö¤Ä¤«¤Îʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°»ÈÍÑÎã + sample/listcap.c Êá³ÍÍúÎòµ¡Ç½¤Î»ÈÍÑÎã + sample/posix.c POSIX API»ÈÍÑÎã + sample/sql.c ²ÄÊѥ᥿ʸ»úµ¡Ç½»ÈÍÑÎã (SQL-like ¥Ñ¥¿¡¼¥ó) + sample/syntax.c Perl¤ÈJavaʸˡ¤Î¥Æ¥¹¥È + + +¥½¡¼¥¹¥Õ¥¡¥¤¥ë + + oniguruma.h µ´¼ÖAPI¥Ø¥Ã¥À (¸ø³«) + + regenc.h ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°ÏÈÁȤߥإåÀ + regint.h ÆâÉôÀë¸À + regparse.h regparse.c¤Èregcomp.c¤Î¤¿¤á¤ÎÆâÉôÀë¸À + regcomp.c ¥³¥ó¥Ñ¥¤¥ë¡¢ºÇŬ²½´Ø¿ô + regenc.c ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°ÏÈÁÈ¤ß + regerror.c ¥¨¥é¡¼¥á¥Ã¥»¡¼¥¸´Ø¿ô + regext.c ³ÈÄ¥API´Ø¿ô + regexec.c ¸¡º÷¡¢¾È¹ç´Ø¿ô + regparse.c Àµµ¬É½¸½¥Ñ¥¿¡¼¥ó²òÀÏ´Ø¿ô + regsyntax.c Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ´Ø¿ô¡¢Áȹþ¤ßʸˡÄêµÁ + regtrav.c Êá³ÍÍúÎòÌÚ½ä²ó´Ø¿ô + regversion.c ÈǾðÊó´Ø¿ô + st.h ¥Ï¥Ã¥·¥å¥Æ¡¼¥Ö¥ë´Ø¿ôÀë¸À + st.c ¥Ï¥Ã¥·¥å¥Æ¡¼¥Ö¥ë´Ø¿ô + + oniggnu.h GNU regex API¥Ø¥Ã¥À (¸ø³«) + reggnu.c GNU regex API´Ø¿ô + + onigposix.h POSIX API¥Ø¥Ã¥À (¸ø³«) + regposerr.c POSIX API¥¨¥é¡¼¥á¥Ã¥»¡¼¥¸´Ø¿ô + regposix.c POSIX API´Ø¿ô + + enc/mktable.c ʸ»ú¥¿¥¤¥×¥Æ¡¼¥Ö¥ëÀ¸À®¥×¥í¥°¥é¥à + enc/ascii.c ASCII ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/euc_jp.c EUC-JP ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/euc_tw.c EUC-TW ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/euc_kr.c EUC-KR, EUC-CN ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/sjis.c Shift_JIS ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/big5.c Big5 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/koi8.c KOI8 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/koi8_r.c KOI8-R ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/iso8859_1.c ISO-8859-1 (Latin-1) + enc/iso8859_2.c ISO-8859-2 (Latin-2) + 
enc/iso8859_3.c ISO-8859-3 (Latin-3) + enc/iso8859_4.c ISO-8859-4 (Latin-4) + enc/iso8859_5.c ISO-8859-5 (Cyrillic) + enc/iso8859_6.c ISO-8859-6 (Arabic) + enc/iso8859_7.c ISO-8859-7 (Greek) + enc/iso8859_8.c ISO-8859-8 (Hebrew) + enc/iso8859_9.c ISO-8859-9 (Latin-5 ¤Þ¤¿¤Ï Turkish) + enc/iso8859_10.c ISO-8859-10 (Latin-6 ¤Þ¤¿¤Ï Nordic) + enc/iso8859_11.c ISO-8859-11 (Thai) + enc/iso8859_13.c ISO-8859-13 (Latin-7 ¤Þ¤¿¤Ï Baltic Rim) + enc/iso8859_14.c ISO-8859-14 (Latin-8 ¤Þ¤¿¤Ï Celtic) + enc/iso8859_15.c ISO-8859-15 (Latin-9 ¤Þ¤¿¤Ï West European with Euro) + enc/iso8859_16.c ISO-8859-16 + (Latin-10 ¤Þ¤¿¤Ï South-Eastern European with Euro) + enc/utf8.c UTF-8 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/utf16_be.c UTF-16BE ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/utf16_le.c UTF-16LE ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/utf32_be.c UTF-32BE ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/utf32_le.c UTF-32LE ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/unicode.c Unicode¾ðÊó + + win32/Makefile Win32ÍÑ Makefile (for VC++) + win32/config.h Win32ÍÑ config.h + + + +Ruby 1.8/1.6¤ÎÆüËܸ첽GNU regex¤È¤ÎAPI¤Î°ã¤¤ + + + re_compile_fastmap() ¤Ïºï½ü¤µ¤ì¤¿¡£ + + re_recompile_pattern() ¤¬Äɲ䵤줿¡£ + + re_alloc_pattern() ¤¬Äɲ䵤줿¡£ + + +»Ä·ï + + ? UnicodeÁ´¥³¡¼¥É¥Ý¥¤¥ó¥ÈÎΰè¤Ç¤ÎÂçʸ»ú¾®Ê¸»ú¾È¹ç + ? Unicode¥×¥í¥Ñ¥Æ¥£ + ? ambig-flag Katakana <-> Hiragana + ? ONIG_OPTION_NOTBOS/NOTEOSÄɲà (\A, \z, \Z) + ? ONIG_SYNTAX_ASISÄɲà + ?? \X (== \PM\pM*) + ?? ʸˡÍ×ÁÇ ONIG_SYN_CONTEXT_INDEP_ANCHORS¤Î¼ÂÁõ + ?? ²þ¹Ôʸ»ú(ʸ»úÎó)¤òÊѹ¹¤Ç¤­¤ë + ?? ¸¡º÷°ÌÃÖ°ÜÆ°Ää»ß±é»»»Ò (match_at()¤«¤éONIG_STOP¤òÊÖ¤¹) + +and I'm thankful to Akinori MUSHA. + + +Mail Address: K.Kosako diff --git a/ext/mbstring/oniguruma/config.h.in b/ext/mbstring/oniguruma/config.h.in index 1a59a45dc07..5ca2056fb39 100644 --- a/ext/mbstring/oniguruma/config.h.in +++ b/ext/mbstring/oniguruma/config.h.in @@ -49,6 +49,9 @@ /* Define if you have the header file. */ #undef HAVE_STRINGS_H +/* Define if you have the header file. */ +#undef HAVE_SYS_TYPES_H + /* Define if you have the header file. 
*/ #undef HAVE_SYS_TIME_H diff --git a/ext/mbstring/oniguruma/enc/ascii.c b/ext/mbstring/oniguruma/enc/ascii.c index 44cc78f77c3..64be21d7fff 100644 --- a/ext/mbstring/oniguruma/enc/ascii.c +++ b/ext/mbstring/oniguruma/enc/ascii.c @@ -1,14 +1,36 @@ /********************************************************************** - ascii.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" static int -ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) +ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); @@ -17,38 +39,29 @@ ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingASCII = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "US-ASCII", /* name */ 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - ascii_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + ascii_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/big5.c b/ext/mbstring/oniguruma/enc/big5.c index 8aad7f53547..763872e963a 100644 --- a/ext/mbstring/oniguruma/enc/big5.c +++ b/ext/mbstring/oniguruma/enc/big5.c @@ -1,14 +1,61 @@ /********************************************************************** - big5.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +static int EncLen_BIG5[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +big5_mbc_enc_len(const UChar* p) +{ + return EncLen_BIG5[*p]; +} + static OnigCodePoint -big5_mbc_to_code(UChar* p, UChar* end) +big5_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_BIG5, p, end); } @@ -20,15 +67,23 @@ big5_code_to_mbc(OnigCodePoint code, UChar *buf) } static int 
-big5_mbc_to_lower(UChar* p, UChar* lower) +big5_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) { - return onigenc_mbn_mbc_to_lower(ONIG_ENCODING_BIG5, p, lower); + return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_BIG5, flag, + pp, end, lower); } static int -big5_code_is_ctype(OnigCodePoint code, unsigned int ctype) +big5_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) { - return onigenc_mb2_code_is_ctype(ONIG_ENCODING_BIG5, code, ctype); + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_BIG5, flag, pp, end); +} + +static int +big5_is_code_ctype(OnigCodePoint code, unsigned int ctype) +{ + return onigenc_mb2_is_code_ctype(ONIG_ENCODING_BIG5, code, ctype); } static const char BIG5_CAN_BE_TRAIL_TABLE[256] = { @@ -50,16 +105,16 @@ static const char BIG5_CAN_BE_TRAIL_TABLE[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; -#define BIG5_ISMB_FIRST(byte) (OnigEncodingBIG5.len_table[byte] > 1) +#define BIG5_ISMB_FIRST(byte) (EncLen_BIG5[byte] > 1) #define BIG5_ISMB_TRAIL(byte) BIG5_CAN_BE_TRAIL_TABLE[(byte)] static UChar* -big5_left_adjust_char_head(UChar* start, UChar* s) +big5_left_adjust_char_head(const UChar* start, const UChar* s) { - UChar *p; + const UChar *p; int len; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; if (BIG5_ISMB_TRAIL(*p)) { @@ -70,53 +125,44 @@ big5_left_adjust_char_head(UChar* start, UChar* s) } } } - len = enc_len(ONIG_ENCODING_BIG5, *p); - if (p + len > s) return p; + len = enc_len(ONIG_ENCODING_BIG5, p); + if (p + len > s) return (UChar* )p; p += len; - return p + ((s - p) & ~1); + return (UChar* )(p + ((s - p) & ~1)); } static int -big5_is_allowed_reverse_match(UChar* s, UChar* end) +big5_is_allowed_reverse_match(const UChar* s, const UChar* end) { - UChar c = *s; + const UChar c = *s; return (BIG5_ISMB_TRAIL(c) ? 
FALSE : TRUE); } OnigEncodingType OnigEncodingBIG5 = { + big5_mbc_enc_len, + "Big5", /* name */ + 2, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "Big5", /* name */ - 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, big5_mbc_to_code, onigenc_mb2_code_to_mbclen, big5_code_to_mbc, - big5_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - big5_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + big5_mbc_to_normalize, + big5_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + big5_is_code_ctype, + onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, - big5_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + big5_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/euc_jp.c b/ext/mbstring/oniguruma/enc/euc_jp.c index 848016ba5a0..5f13e33eb4c 100644 --- a/ext/mbstring/oniguruma/enc/euc_jp.c +++ b/ext/mbstring/oniguruma/enc/euc_jp.c @@ -1,23 +1,69 @@ /********************************************************************** - euc_jp.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) +static int EncLen_EUCJP[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +eucjp_mbc_enc_len(const UChar* p) +{ + return EncLen_EUCJP[*p]; +} + static OnigCodePoint -eucjp_mbc_to_code(UChar* p, UChar* end) +eucjp_mbc_to_code(const UChar* p, const UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(ONIG_ENCODING_EUC_JP, c); - n = 
c; + len = enc_len(ONIG_ENCODING_EUC_JP, p); + n = (OnigCodePoint )*p++; if (len == 1) return n; for (i = 1; i < len; i++) { @@ -31,11 +77,13 @@ eucjp_mbc_to_code(UChar* p, UChar* end) static int eucjp_code_to_mbclen(OnigCodePoint code) { - if ((code & 0xff0000) != 0) return 3; + if (ONIGENC_IS_CODE_ASCII(code)) return 1; + else if ((code & 0xff0000) != 0) return 3; else if ((code & 0xff00) != 0) return 2; - else return 1; + else return 0; } +#if 0 static int eucjp_code_to_mbc_first(OnigCodePoint code) { @@ -43,27 +91,16 @@ eucjp_code_to_mbc_first(OnigCodePoint code) if ((code & 0xff0000) != 0) { first = (code >> 16) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 3) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; - */ } else if ((code & 0xff00) != 0) { first = (code >> 8) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 2) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; - */ } else { - /* - if (enc_len(ONIG_ENCODING_EUC_JP, code) != 1) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; - */ return (int )code; } return first; } +#endif static int eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) @@ -75,44 +112,57 @@ eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(ONIG_ENCODING_EUC_JP, buf[0]) != (p - buf)) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; + if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf)) + return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -eucjp_mbc_to_lower(UChar* p, UChar* lower) +eucjp_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { int len; + const UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_EUC_JP, *p); + len = enc_len(ONIG_ENCODING_EUC_JP, p); if (lower != p) { - /* 
memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype) +eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end); +} + +static int +eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = eucjp_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_EUC_JP, first) > 1 ? TRUE : FALSE); - } + else + return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -125,28 +175,28 @@ eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype) } static UChar* -eucjp_left_adjust_char_head(UChar* start, UChar* s) +eucjp_left_adjust_char_head(const UChar* start, const UChar* s) { - /* Assumed in this encoding, - mb-trail bytes don't mix with single bytes. + /* In this encoding + mb-trail bytes doesn't mix with single bytes. 
*/ - UChar *p; + const UChar *p; int len; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; while (!eucjp_islead(*p) && p > start) p--; - len = enc_len(ONIG_ENCODING_EUC_JP, *p); - if (p + len > s) return p; + len = enc_len(ONIG_ENCODING_EUC_JP, p); + if (p + len > s) return (UChar* )p; p += len; - return p + ((s - p) & ~1); + return (UChar* )(p + ((s - p) & ~1)); } static int -eucjp_is_allowed_reverse_match(UChar* s, UChar* end) +eucjp_is_allowed_reverse_match(const UChar* s, const UChar* end) { - UChar c = *s; + const UChar c = *s; if (c <= 0x7e || c == 0x8e || c == 0x8f) return TRUE; else @@ -154,38 +204,29 @@ eucjp_is_allowed_reverse_match(UChar* s, UChar* end) } OnigEncodingType OnigEncodingEUC_JP = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 - }, + eucjp_mbc_enc_len, "EUC-JP", /* name */ - 3, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 3, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, eucjp_mbc_to_code, eucjp_code_to_mbclen, eucjp_code_to_mbc, - eucjp_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - eucjp_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + eucjp_mbc_to_normalize, + eucjp_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + eucjp_is_code_ctype, + onigenc_not_support_get_ctype_code_range, eucjp_left_adjust_char_head, - eucjp_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + eucjp_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/euc_kr.c b/ext/mbstring/oniguruma/enc/euc_kr.c index 1a0fc996dcd..c1e83b7e660 100644 --- a/ext/mbstring/oniguruma/enc/euc_kr.c +++ b/ext/mbstring/oniguruma/enc/euc_kr.c @@ -1,14 +1,61 @@ /********************************************************************** - euc_kr.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +static int EncLen_EUCKR[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +euckr_mbc_enc_len(const UChar* p) +{ + return EncLen_EUCKR[*p]; +} + static OnigCodePoint -euckr_mbc_to_code(UChar* p, UChar* end) +euckr_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_KR, p, end); } @@ -20,117 +67,107 @@ euckr_code_to_mbc(OnigCodePoint code, UChar *buf) } static int 
-euckr_mbc_to_lower(UChar* p, UChar* lower) +euckr_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) { - return onigenc_mbn_mbc_to_lower(ONIG_ENCODING_EUC_KR, p, lower); + return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_EUC_KR, flag, + pp, end, lower); } static int -euckr_code_is_ctype(OnigCodePoint code, unsigned int ctype) +euckr_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) { - return onigenc_mb2_code_is_ctype(ONIG_ENCODING_EUC_KR, code, ctype); + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_KR, flag, pp, end); +} + +static int +euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype) +{ + return onigenc_mb2_is_code_ctype(ONIG_ENCODING_EUC_KR, code, ctype); } #define euckr_islead(c) ((c) < 0xa1 || (c) == 0xff) static UChar* -euckr_left_adjust_char_head(UChar* start, UChar* s) +euckr_left_adjust_char_head(const UChar* start, const UChar* s) { /* Assumed in this encoding, mb-trail bytes don't mix with single bytes. 
*/ - UChar *p; + const UChar *p; int len; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; while (!euckr_islead(*p) && p > start) p--; - len = enc_len(ONIG_ENCODING_EUC_KR, *p); - if (p + len > s) return p; + len = enc_len(ONIG_ENCODING_EUC_KR, p); + if (p + len > s) return (UChar* )p; p += len; - return p + ((s - p) & ~1); + return (UChar* )(p + ((s - p) & ~1)); } static int -euckr_is_allowed_reverse_match(UChar* s, UChar* end) +euckr_is_allowed_reverse_match(const UChar* s, const UChar* end) { - UChar c = *s; + const UChar c = *s; if (c <= 0x7e) return TRUE; else return FALSE; } OnigEncodingType OnigEncodingEUC_KR = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 - }, + euckr_mbc_enc_len, "EUC-KR", /* name */ - 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 2, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, onigenc_mb2_code_to_mbclen, euckr_code_to_mbc, - euckr_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - euckr_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + euckr_mbc_to_normalize, + euckr_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + euckr_is_code_ctype, + onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, - euckr_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + euckr_is_allowed_reverse_match }; /* Same with OnigEncodingEUC_KR except the name */ OnigEncodingType OnigEncodingEUC_CN = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 - }, + euckr_mbc_enc_len, "EUC-CN", /* name */ - 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 2, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint 
)ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, onigenc_mb2_code_to_mbclen, euckr_code_to_mbc, - euckr_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - euckr_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + euckr_mbc_to_normalize, + euckr_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + euckr_is_code_ctype, + onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, - euckr_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + euckr_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/euc_tw.c b/ext/mbstring/oniguruma/enc/euc_tw.c index b39a9a5b7ac..4e5851a4515 100644 --- a/ext/mbstring/oniguruma/enc/euc_tw.c +++ b/ext/mbstring/oniguruma/enc/euc_tw.c @@ -1,14 +1,61 @@ /********************************************************************** - euc_tw.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +static int EncLen_EUCTW[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +euctw_mbc_enc_len(const UChar* p) +{ + return EncLen_EUCTW[*p]; +} + static OnigCodePoint -euctw_mbc_to_code(UChar* p, UChar* end) +euctw_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_TW, p, end); } @@ -20,79 +67,78 @@ euctw_code_to_mbc(OnigCodePoint code, UChar *buf) } static int 
-euctw_mbc_to_lower(UChar* p, UChar* lower) +euctw_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) { - return onigenc_mbn_mbc_to_lower(ONIG_ENCODING_EUC_TW, p, lower); + return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_EUC_TW, flag, + pp, end, lower); } static int -euctw_code_is_ctype(OnigCodePoint code, unsigned int ctype) +euctw_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) { - return onigenc_mb4_code_is_ctype(ONIG_ENCODING_EUC_TW, code, ctype); + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_TW, flag, pp, end); +} + +static int +euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype) +{ + return onigenc_mb4_is_code_ctype(ONIG_ENCODING_EUC_TW, code, ctype); } #define euctw_islead(c) (((c) < 0xa1 && (c) != 0x8e) || (c) == 0xff) static UChar* -euctw_left_adjust_char_head(UChar* start, UChar* s) +euctw_left_adjust_char_head(const UChar* start, const UChar* s) { /* Assumed in this encoding, mb-trail bytes don't mix with single bytes. 
*/ - UChar *p; + const UChar *p; int len; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; while (!euctw_islead(*p) && p > start) p--; - len = enc_len(ONIG_ENCODING_EUC_TW, *p); - if (p + len > s) return p; + len = enc_len(ONIG_ENCODING_EUC_TW, p); + if (p + len > s) return (UChar* )p; p += len; - return p + ((s - p) & ~1); + return (UChar* )(p + ((s - p) & ~1)); } static int -euctw_is_allowed_reverse_match(UChar* s, UChar* end) +euctw_is_allowed_reverse_match(const UChar* s, const UChar* end) { - UChar c = *s; + const UChar c = *s; if (c <= 0x7e) return TRUE; else return FALSE; } OnigEncodingType OnigEncodingEUC_TW = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 - }, + euctw_mbc_enc_len, "EUC-TW", /* name */ - 4, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 4, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, euctw_mbc_to_code, onigenc_mb4_code_to_mbclen, euctw_code_to_mbc, - euctw_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - euctw_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + euctw_mbc_to_normalize, + euctw_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + euctw_is_code_ctype, + onigenc_not_support_get_ctype_code_range, euctw_left_adjust_char_head, - euctw_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + euctw_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_1.c b/ext/mbstring/oniguruma/enc/iso8859_1.c index 662f0e2c079..53ad52ee13d 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_1.c +++ b/ext/mbstring/oniguruma/enc/iso8859_1.c @@ -1,112 +1,145 @@ /********************************************************************** - iso8859_1.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" -#define ENC_ISO_8859_1_TO_LOWER_CASE(c) EncISO_8859_1_ToLowerCaseTable[c] #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ ((EncISO_8859_1_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_1_ToLowerCaseTable[256] = { - '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', - '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', - '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', - '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', - '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', - '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', - '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', - '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', - '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', - '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', - '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', - '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', - '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', - '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', - '\160', '\161', '\162', '\163', 
'\164', '\165', '\166', '\167', - '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', - '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', - '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', - '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', - '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', - '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', - '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', - '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', - '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', - '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', - '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', - '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327', - '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337', - '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', - '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', - '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', - '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' -}; - static unsigned short EncISO_8859_1_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 
0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x00d0, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0871, 0x00d0, 0x0050, 0x00d0, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0850, 0x0850, 0x0050, 0x0871, 0x0050, 0x00d0, - 0x0050, 0x0850, 0x0871, 0x00d0, 0x0850, 0x0850, 0x0850, 0x00d0, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_1_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_1_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_1_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + 
ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_1_mbc_is_case_ambig(UChar* p) +iso_8859_1_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_1_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba)) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; + } } - return (v != 0 ? TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_1_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ + if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba)) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? 
TRUE : FALSE); + } + return FALSE; } static int -iso_8859_1_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_1_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_1_CTYPE(code, ctype); @@ -115,38 +148,31 @@ iso_8859_1_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingISO_8859_1 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-1", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_1_mbc_to_lower, - iso_8859_1_mbc_is_case_ambig, - iso_8859_1_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_1_mbc_to_normalize, + iso_8859_1_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_1_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_10.c b/ext/mbstring/oniguruma/enc/iso8859_10.c index ac493037144..a9331cebf35 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_10.c +++ b/ext/mbstring/oniguruma/enc/iso8859_10.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_10.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_ISO_8859_10_TO_LOWER_CASE(c) EncISO_8859_10_ToLowerCaseTable[c] @@ -47,69 +69,114 @@ static UChar EncISO_8859_10_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_10_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 
0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0a51, 0x0a51, - 0x0050, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x00d0, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x00d0, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x14a2, 0x14a2, + 0x00a0, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x01a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x01a0, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_10_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_10_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } 
+ (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_10_mbc_is_case_ambig(UChar* p) +iso_8859_10_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_10_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. */ - if (*p == 0xdf) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; - } - else if (v != 0) { - return TRUE; + } } + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_10_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf is lower case letter, but can't convert. */ + if (*p == 0xdf) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? 
TRUE : FALSE); + } return FALSE; } static int -iso_8859_10_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_10_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_10_CTYPE(code, ctype); @@ -117,39 +184,144 @@ iso_8859_10_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_10_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xb1 }, + { 0xa2, 0xb2 }, + { 0xa3, 0xb3 }, + { 0xa4, 0xb4 }, + { 0xa5, 0xb5 }, + { 0xa6, 0xb6 }, + { 0xa8, 0xb8 }, + { 0xa9, 0xb9 }, + { 0xaa, 0xba }, + { 0xab, 0xbb }, + { 0xac, 0xbc }, + { 0xae, 0xbe }, + { 0xaf, 0xbf }, + + { 0xb1, 0xa1 }, + { 0xb2, 0xa2 }, + { 0xb3, 0xa3 }, + { 0xb4, 0xa4 }, + { 0xb5, 0xa5 }, + { 0xb6, 0xa6 }, + { 0xb8, 0xa8 }, + { 0xb9, 0xa9 }, + { 0xba, 0xaa }, + { 0xbb, 0xab }, + { 0xbc, 0xac }, + { 0xbe, 0xae }, + { 0xbf, 0xaf }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 
0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_10 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-10", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_10_mbc_to_lower, - iso_8859_10_mbc_is_case_ambig, - iso_8859_10_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_10_mbc_to_normalize, + iso_8859_10_is_mbc_ambiguous, + iso_8859_10_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_10_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_11.c b/ext/mbstring/oniguruma/enc/iso8859_11.c index ebe81d325fc..bb1098807ac 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_11.c +++ b/ext/mbstring/oniguruma/enc/iso8859_11.c @@ -1,52 +1,74 @@ /********************************************************************** - iso8859_11.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_IS_ISO_8859_11_CTYPE(code,ctype) \ ((EncISO_8859_11_CtypeTable[code] & ctype) != 0) static unsigned short EncISO_8859_11_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 
0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0000, 0x0000, 0x0000, 0x0000, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0000, 0x0000, 0x0000, 0x0000 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 
0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x0000, 0x0000, 0x0000, 0x0000, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x0000, 0x0000, 0x0000, 0x0000 }; static int -iso_8859_11_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_11_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_11_CTYPE(code, ctype); @@ -55,38 +77,29 @@ iso_8859_11_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingISO_8859_11 = { + onigenc_single_byte_mbc_enc_len, + "ISO-8859-11", /* name */ + 1, /* max enc length */ + 1, /* min enc length */ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE ), { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "ISO-8859-11", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - iso_8859_11_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + iso_8859_11_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_13.c b/ext/mbstring/oniguruma/enc/iso8859_13.c index 8de7251d2f0..827ca508e8b 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_13.c +++ 
b/ext/mbstring/oniguruma/enc/iso8859_13.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_13.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_13_TO_LOWER_CASE(c) EncISO_8859_13_ToLowerCaseTable[c] @@ -47,69 +69,114 @@ static UChar EncISO_8859_13_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_13_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x00d0, 0x0050, 0x0050, 0x0050, 0x00d0, 0x0050, 0x0050, - 0x0a51, 0x0050, 0x0a51, 0x00d0, 0x0050, 0x00d0, 0x0050, 0x0a51, - 0x0050, 0x0050, 0x0850, 0x0850, 0x00d0, 0x0871, 0x0050, 0x00d0, - 0x0871, 0x0850, 0x0871, 0x00d0, 0x0850, 0x0850, 0x0850, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x00d0 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x14a2, 0x00a0, 0x14a2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x14a2, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x01a0, 0x10e2, 0x00a0, 0x01a0, + 0x10e2, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 
0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x01a0 }; static int -iso_8859_13_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_13_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_13_mbc_is_case_ambig(UChar* p) +iso_8859_13_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_13_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. 
*/ - if (*p == 0xdf || *p == 0xb5) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; - } - else if (v != 0) { - return TRUE; + } } + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_13_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf, 0xb5 are lower case letter, but can't convert. */ + if (*p == 0xdf || *p == 0xb5) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } return FALSE; } static int -iso_8859_13_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_13_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_13_CTYPE(code, ctype); @@ -117,39 +184,114 @@ iso_8859_13_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_13_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 
0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_13 = { + onigenc_single_byte_mbc_enc_len, + "ISO-8859-13", /* name */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "ISO-8859-13", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_13_mbc_to_lower, - iso_8859_13_mbc_is_case_ambig, - iso_8859_13_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_13_mbc_to_normalize, + iso_8859_13_is_mbc_ambiguous, + iso_8859_13_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_13_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_14.c b/ext/mbstring/oniguruma/enc/iso8859_14.c index 333deeba320..4fe5ab29d1e 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_14.c +++ b/ext/mbstring/oniguruma/enc/iso8859_14.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_14.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_14_TO_LOWER_CASE(c) EncISO_8859_14_ToLowerCaseTable[c] @@ -47,69 +69,114 @@ static UChar EncISO_8859_14_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_14_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0871, 0x0050, 0x0a51, 0x0871, 0x0a51, 0x0050, - 0x0a51, 0x0050, 0x0a51, 0x0871, 0x0a51, 0x00d0, 0x0050, 0x0a51, - 0x0a51, 0x0871, 0x0a51, 0x0871, 0x0a51, 0x0871, 0x0050, 0x0a51, - 0x0871, 0x0871, 0x0871, 0x0a51, 0x0871, 0x0a51, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x10e2, 0x00a0, 0x14a2, 0x10e2, 0x14a2, 0x00a0, + 0x14a2, 0x00a0, 0x14a2, 0x10e2, 0x14a2, 0x01a0, 0x00a0, 0x14a2, + 0x14a2, 0x10e2, 0x14a2, 0x10e2, 0x14a2, 0x10e2, 0x00a0, 0x14a2, + 0x10e2, 0x10e2, 0x10e2, 0x14a2, 0x10e2, 0x14a2, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 
0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_14_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_14_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_14_mbc_is_case_ambig(UChar* p) +iso_8859_14_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_14_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. 
*/ - if (*p == 0xdf) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; - } - else if (v != 0) { - return TRUE; + } } + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_14_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf is lower case letter, but can't convert. */ + if (*p == 0xdf) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } return FALSE; } static int -iso_8859_14_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_14_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_14_CTYPE(code, ctype); @@ -117,39 +184,144 @@ iso_8859_14_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_14_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xa2 }, + { 0xa2, 0xa1 }, + { 0xa4, 0xa5 }, + { 0xa5, 0xa4 }, + { 0xa6, 0xab }, + { 0xa8, 0xb8 }, + { 0xaa, 0xba }, + { 0xab, 0xa6 }, + { 0xac, 0xbc }, + { 0xaf, 0xff }, + + { 0xb0, 0xb1 }, + { 0xb1, 0xb0 }, + { 0xb2, 0xb3 }, + { 0xb3, 0xb2 }, + { 0xb4, 0xb5 }, + { 0xb5, 0xb4 }, + { 0xb7, 0xb9 }, + { 0xb8, 0xa8 }, + { 0xb9, 0xb7 }, + { 0xba, 0xaa }, + { 0xbb, 0xbf }, + { 0xbc, 0xac }, + { 0xbd, 0xbe }, + { 0xbe, 0xbd }, + { 0xbf, 0xbb }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 
0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde }, + { 0xff, 0xaf } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_14 = { + onigenc_single_byte_mbc_enc_len, + "ISO-8859-14", /* name */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "ISO-8859-14", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_14_mbc_to_lower, - iso_8859_14_mbc_is_case_ambig, - iso_8859_14_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_14_mbc_to_normalize, + iso_8859_14_is_mbc_ambiguous, + iso_8859_14_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_14_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_15.c b/ext/mbstring/oniguruma/enc/iso8859_15.c index 49cb266058b..1a8bd7b4c5b 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_15.c +++ b/ext/mbstring/oniguruma/enc/iso8859_15.c @@ -1,15 +1,33 @@ /********************************************************************** - iso8859_15.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - 
**********************************************************************/ -#include "regenc.h" +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ -/* reference - http://en.wikipedia.org/wiki/ISO_8859-15 -*/ +#include "regenc.h" #define ENC_ISO_8859_15_TO_LOWER_CASE(c) EncISO_8859_15_ToLowerCaseTable[c] #define ENC_IS_ISO_8859_15_CTYPE(code,ctype) \ @@ -51,65 +69,114 @@ static UChar EncISO_8859_15_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_15_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x00d0, 0x0050, 0x0050, 0x0050, 0x0050, 0x0a51, 0x0050, - 0x0871, 0x0050, 0x0871, 0x00d0, 0x0050, 0x00d0, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0850, 0x0850, 0x0a51, 0x0871, 0x0050, 0x00d0, - 0x0871, 0x0850, 0x0871, 0x00d0, 0x0a51, 0x0871, 0x0a51, 0x00d0, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 
0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x14a2, 0x00a0, + 0x10e2, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x14a2, 0x10e2, 0x00a0, 0x01a0, + 0x10e2, 0x10a0, 
0x10e2, 0x01a0, 0x14a2, 0x10e2, 0x14a2, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_15_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_15_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_15_mbc_is_case_ambig(UChar* p) +iso_8859_15_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_15_CtypeTable[*p] - & (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. 
*/ - if (*p == 0xdf || *p == 0xaa || *p == 0xb5 || *p == 0xba) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; + } } - return (v != 0 ? TRUE : FALSE); + + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_15_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf etc.. are lower case letter, but can't convert. */ + if (*p == 0xdf || *p == 0xaa || *p == 0xb5 || *p == 0xba) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } static int -iso_8859_15_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_15_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_15_CTYPE(code, ctype); @@ -117,39 +184,124 @@ iso_8859_15_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_15_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa6, 0xa8 }, + { 0xa8, 0xa6 }, + + { 0xb4, 0xb8 }, + { 0xb8, 0xb4 }, + { 0xbc, 0xbd }, + { 0xbd, 0xbc }, + { 0xbe, 0xff }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 
0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde }, + { 0xff, 0xbe } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_15 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-15", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length 
*/ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_15_mbc_to_lower, - iso_8859_15_mbc_is_case_ambig, - iso_8859_15_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_15_mbc_to_normalize, + iso_8859_15_is_mbc_ambiguous, + iso_8859_15_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_15_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_16.c b/ext/mbstring/oniguruma/enc/iso8859_16.c index e59ea0f42c4..e283db17ccf 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_16.c +++ b/ext/mbstring/oniguruma/enc/iso8859_16.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_16.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_16_TO_LOWER_CASE(c) EncISO_8859_16_ToLowerCaseTable[c] @@ -47,69 +69,114 @@ static UChar EncISO_8859_16_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_16_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0871, 0x0a51, 0x0050, 0x00d0, 0x0a51, 0x0050, - 0x0871, 0x0050, 0x0a51, 0x00d0, 0x0a51, 0x00d0, 0x0871, 0x0a51, - 0x0050, 0x0050, 0x0a51, 0x0871, 0x0a51, 0x00d0, 0x0050, 0x00d0, - 0x0871, 0x0871, 0x0871, 0x00d0, 0x0a51, 0x0871, 0x0a51, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x10e2, 0x14a2, 0x00a0, 0x01a0, 0x14a2, 0x00a0, + 0x10e2, 0x00a0, 0x14a2, 0x01a0, 0x14a2, 0x01a0, 0x10e2, 0x14a2, + 0x00a0, 0x00a0, 0x14a2, 0x10e2, 0x14a2, 0x01a0, 0x00a0, 0x01a0, + 0x10e2, 0x10e2, 0x10e2, 0x01a0, 0x14a2, 0x10e2, 0x14a2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 
0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_16_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_16_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_16_mbc_is_case_ambig(UChar* p) +iso_8859_16_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_16_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. 
*/ - if (*p == 0xdf) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; - } - else if (v != 0) { - return TRUE; + } } + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_16_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf is lower case letter, but can't convert. */ + if (*p == 0xdf) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } return FALSE; } static int -iso_8859_16_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_16_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_16_CTYPE(code, ctype); @@ -117,39 +184,138 @@ iso_8859_16_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_16_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xa2 }, + { 0xa2, 0xa1 }, + { 0xa3, 0xb3 }, + { 0xa6, 0xa8 }, + { 0xa8, 0xa6 }, + { 0xaa, 0xba }, + { 0xac, 0xae }, + { 0xae, 0xac }, + { 0xaf, 0xbf }, + + { 0xb2, 0xb9 }, + { 0xb3, 0xa3 }, + { 0xb4, 0xb8 }, + { 0xb8, 0xb4 }, + { 0xb9, 0xb2 }, + { 0xba, 0xaa }, + { 0xbc, 0xbd }, + { 0xbd, 0xbc }, + { 0xbe, 0xff }, + { 0xbf, 0xaf }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 
0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde }, + { 0xff, 0xbe } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_16 = { + onigenc_single_byte_mbc_enc_len, + "ISO-8859-16", /* name */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "ISO-8859-16", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_16_mbc_to_lower, - iso_8859_16_mbc_is_case_ambig, - iso_8859_16_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_16_mbc_to_normalize, + iso_8859_16_is_mbc_ambiguous, + iso_8859_16_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_16_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_2.c b/ext/mbstring/oniguruma/enc/iso8859_2.c index 3d6b0a6573d..e86415b9c96 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_2.c +++ b/ext/mbstring/oniguruma/enc/iso8859_2.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_2.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_2_TO_LOWER_CASE(c) EncISO_8859_2_ToLowerCaseTable[c] @@ -47,66 +69,218 @@ static UChar EncISO_8859_2_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_2_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0050, 0x0a51, 0x0050, 0x0a51, 0x0a51, 0x0050, - 0x0050, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0a51, 0x0a51, - 0x0050, 0x0871, 0x0050, 0x0871, 0x0050, 0x0871, 0x0871, 0x0050, - 0x0050, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x00a0, 0x14a2, 0x00a0, 0x14a2, 0x14a2, 0x00a0, + 0x00a0, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x14a2, 0x14a2, + 0x00a0, 0x10e2, 0x00a0, 0x10e2, 0x00a0, 0x10e2, 0x10e2, 0x00a0, + 0x00a0, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0 }; static int -iso_8859_2_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_2_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_2_mbc_is_case_ambig(UChar* p) +iso_8859_2_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_2_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. */ - if (*p == 0xdf) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; + } } - return (v != 0 ? 
TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_2_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf is lower case letter, but can't convert. */ + if (*p == 0xdf) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } static int -iso_8859_2_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_2_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xb1 }, + { 0xa3, 0xb3 }, + { 0xa5, 0xb5 }, + { 0xa6, 0xb6 }, + { 0xa9, 0xb9 }, + { 0xaa, 0xba }, + { 0xab, 0xbb }, + { 0xac, 0xbc }, + { 0xae, 0xbe }, + { 0xaf, 0xbf }, + + { 0xb1, 0xa1 }, + { 0xb3, 0xa3 }, + { 0xb5, 0xa5 }, + { 0xb6, 0xa6 }, + { 0xb9, 0xa9 }, + { 0xba, 0xaa }, + { 0xbb, 0xab }, + { 0xbc, 0xac }, + { 0xbe, 0xae }, + { 0xbf, 0xaf }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + 
{ 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + +static int +iso_8859_2_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_2_CTYPE(code, ctype); @@ -115,38 +289,31 @@ iso_8859_2_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingISO_8859_2 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-2", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_2_mbc_to_lower, - iso_8859_2_mbc_is_case_ambig, - iso_8859_2_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_2_mbc_to_normalize, + iso_8859_2_is_mbc_ambiguous, + iso_8859_2_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_2_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_3.c b/ext/mbstring/oniguruma/enc/iso8859_3.c index 37a3089aabd..76d2bec8a87 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_3.c +++ b/ext/mbstring/oniguruma/enc/iso8859_3.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_3.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_ISO_8859_3_TO_LOWER_CASE(c) EncISO_8859_3_ToLowerCaseTable[c] @@ -47,66 +69,114 @@ static UChar EncISO_8859_3_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_3_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 
0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0050, 0x0050, 0x0050, 0x0000, 0x0a51, 0x0050, - 0x0050, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0000, 0x0a51, - 0x0050, 0x0871, 0x0850, 0x0850, 0x0050, 0x0871, 0x0871, 0x00d0, - 0x0050, 0x0871, 0x0871, 0x0871, 0x0871, 0x0850, 0x0000, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0000, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0000, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0000, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0000, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 
0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x00a0, 0x00a0, 0x00a0, 0x0000, 0x14a2, 0x00a0, + 0x00a0, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x0000, 0x14a2, + 0x00a0, 0x10e2, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x10e2, 0x01a0, + 0x00a0, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x11a0, 0x0000, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x0000, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x0000, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x0000, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x0000, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0 }; static int -iso_8859_3_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_3_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) 
!= 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_3_mbc_is_case_ambig(UChar* p) +iso_8859_3_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_3_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. */ - if (*p == 0xdf || *p == 0xb5) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; + } } - return (v != 0 ? TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_3_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v & ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf and 0xb5 are lower case letters, but can't convert. */ + if (*p == 0xdf || *p == 0xb5) + return FALSE; + else + return TRUE; + } + + return (v != 0 ?
TRUE : FALSE); + } + return FALSE; } static int -iso_8859_3_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_3_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_3_CTYPE(code, ctype); @@ -114,39 +184,125 @@ iso_8859_3_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_3_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xb1 }, + { 0xa6, 0xb6 }, + { 0xa9, 0xb9 }, + { 0xaa, 0xba }, + { 0xab, 0xbb }, + { 0xac, 0xbc }, + { 0xaf, 0xbf }, + { 0xb1, 0xa1 }, + { 0xb6, 0xa6 }, + { 0xb9, 0xa9 }, + { 0xba, 0xaa }, + { 0xbb, 0xab }, + { 0xbc, 0xac }, + { 0xbf, 0xaf }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 
0; +} + OnigEncodingType OnigEncodingISO_8859_3 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-3", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_3_mbc_to_lower, - iso_8859_3_mbc_is_case_ambig, - iso_8859_3_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_3_mbc_to_normalize, + iso_8859_3_is_mbc_ambiguous, + iso_8859_3_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_3_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_4.c b/ext/mbstring/oniguruma/enc/iso8859_4.c index 897eec957fd..7569006725c 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_4.c +++ b/ext/mbstring/oniguruma/enc/iso8859_4.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_4.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_ISO_8859_4_TO_LOWER_CASE(c) EncISO_8859_4_ToLowerCaseTable[c] @@ -47,69 +69,114 @@ static UChar EncISO_8859_4_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_4_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 
0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0871, 0x0a51, 0x0050, 0x0a51, 0x0a51, 0x0050, - 0x0050, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0a51, 0x0050, - 0x0050, 0x0871, 0x0050, 0x0871, 0x0050, 0x0871, 0x0871, 0x0050, - 0x0050, 0x0871, 0x0871, 0x0871, 0x0871, 0x0a51, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x10e2, 0x14a2, 0x00a0, 0x14a2, 0x14a2, 0x00a0, + 0x00a0, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x14a2, 0x00a0, + 0x00a0, 0x10e2, 0x00a0, 0x10e2, 0x00a0, 0x10e2, 0x10e2, 0x00a0, + 0x00a0, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x14a2, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0 }; static int -iso_8859_4_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_4_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + 
(*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_4_mbc_is_case_ambig(UChar* p) +iso_8859_4_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_4_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf is lower case letter, but can't convert. */ - if (*p == 0xdf || *p == 0xa2) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; - } - else if (v != 0) { - return TRUE; + } } + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_4_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v & ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf and 0xa2 are lower case letters, but can't convert. */ + if (*p == 0xdf || *p == 0xa2) + return FALSE; + else + return TRUE; + } + + return (v != 0 ?
TRUE : FALSE); + } return FALSE; } static int -iso_8859_4_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_4_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_4_CTYPE(code, ctype); @@ -117,39 +184,134 @@ iso_8859_4_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_4_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xb1 }, + { 0xa3, 0xb3 }, + { 0xa5, 0xb5 }, + { 0xa6, 0xb6 }, + { 0xa9, 0xb9 }, + { 0xaa, 0xba }, + { 0xab, 0xbb }, + { 0xac, 0xbc }, + { 0xae, 0xbe }, + + { 0xb1, 0xa1 }, + { 0xb3, 0xa3 }, + { 0xb5, 0xa5 }, + { 0xb6, 0xa6 }, + { 0xb9, 0xa9 }, + { 0xba, 0xaa }, + { 0xbb, 0xab }, + { 0xbc, 0xac }, + { 0xbe, 0xae }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 
52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_4 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-4", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_4_mbc_to_lower, - iso_8859_4_mbc_is_case_ambig, - iso_8859_4_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_4_mbc_to_normalize, + iso_8859_4_is_mbc_ambiguous, + iso_8859_4_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_4_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_5.c b/ext/mbstring/oniguruma/enc/iso8859_5.c index 34e0f9db3ee..2f7677b3e7a 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_5.c +++ b/ext/mbstring/oniguruma/enc/iso8859_5.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_5.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_ISO_8859_5_TO_LOWER_CASE(c) EncISO_8859_5_ToLowerCaseTable[c] @@ -47,58 +69,80 @@ static UChar EncISO_8859_5_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_5_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 
0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0050, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x00a0, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, 0x10e2, 0x10e2 }; static int -iso_8859_5_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_5_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_5_mbc_is_case_ambig(UChar* p) +iso_8859_5_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_5_CtypeTable[*p] & - 
(ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - return (v != 0 ? TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_5_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } static int -iso_8859_5_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_5_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_5_CTYPE(code, ctype); @@ -106,39 +150,147 @@ iso_8859_5_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_5_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xa1, 0xf1 }, + { 0xa2, 0xf2 }, + { 0xa3, 0xf3 }, + { 0xa4, 0xf4 }, + { 0xa5, 0xf5 }, + { 0xa6, 0xf6 }, + { 0xa7, 0xf7 }, + { 0xa8, 0xf8 }, + { 0xa9, 0xf9 }, + { 0xaa, 0xfa }, + { 0xab, 0xfb }, + { 0xac, 0xfc }, + { 0xae, 0xfe }, + { 0xaf, 0xff }, + + { 0xb0, 0xd0 }, + { 0xb1, 0xd1 }, + { 0xb2, 0xd2 }, + { 0xb3, 0xd3 }, + { 0xb4, 0xd4 }, + { 0xb5, 0xd5 }, + { 0xb6, 0xd6 }, + { 0xb7, 0xd7 }, + { 0xb8, 0xd8 }, + { 0xb9, 0xd9 }, + { 0xba, 0xda }, + { 0xbb, 0xdb }, + { 0xbc, 0xdc }, + { 0xbd, 0xdd }, + { 0xbe, 0xdf }, + { 0xbf, 0xdf }, + + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xb0 }, + { 0xd1, 0xb1 }, + { 0xd2, 0xb2 }, + { 0xd3, 0xb3 }, + { 0xd4, 0xb4 }, + { 0xd5, 0xb5 }, + { 0xd6, 0xb6 }, + { 0xd7, 0xb7 }, + { 0xd8, 0xb8 }, + { 0xd9, 0xb9 }, + { 0xda, 0xba }, + { 0xdb, 0xbb }, + { 0xdc, 0xbc }, + { 0xdd, 0xbd }, + { 0xde, 0xbe }, + { 0xdf, 
0xbf }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf1, 0xa1 }, + { 0xf2, 0xa2 }, + { 0xf3, 0xa3 }, + { 0xf4, 0xa4 }, + { 0xf5, 0xa5 }, + { 0xf6, 0xa6 }, + { 0xf7, 0xa7 }, + { 0xf8, 0xa8 }, + { 0xf9, 0xa9 }, + { 0xfa, 0xaa }, + { 0xfb, 0xab }, + { 0xfc, 0xac }, + { 0xfe, 0xae }, + { 0xff, 0xaf } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_5 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-5", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + 
ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_5_mbc_to_lower, - iso_8859_5_mbc_is_case_ambig, - iso_8859_5_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_5_mbc_to_normalize, + iso_8859_5_is_mbc_ambiguous, + iso_8859_5_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + iso_8859_5_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_6.c b/ext/mbstring/oniguruma/enc/iso8859_6.c index 956e982878a..0fcb9e8b836 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_6.c +++ b/ext/mbstring/oniguruma/enc/iso8859_6.c @@ -1,52 +1,74 @@ /********************************************************************** - iso8859_6.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_IS_ISO_8859_6_CTYPE(code,ctype) \ ((EncISO_8859_6_CtypeTable[code] & ctype) != 0) static unsigned short EncISO_8859_6_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 
0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0000, 0x0000, 0x0000, 0x0050, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x00d0, 0x00d0, 0x0000, 0x0000, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x0000, 0x0000, 0x0000, 0x00a0, 0x0000, 
0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x01a0, 0x01a0, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x00d0, 0x0000, 0x0000, 0x0000, 0x00d0, - 0x0000, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x01a0, 0x0000, 0x0000, 0x0000, 0x01a0, + 0x0000, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; static int -iso_8859_6_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_6_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_6_CTYPE(code, ctype); @@ -55,38 +77,29 @@ iso_8859_6_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingISO_8859_6 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-6", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - iso_8859_6_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + iso_8859_6_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_7.c b/ext/mbstring/oniguruma/enc/iso8859_7.c index 1ea225007fe..8b2cb9ec592 100644 --- 
a/ext/mbstring/oniguruma/enc/iso8859_7.c +++ b/ext/mbstring/oniguruma/enc/iso8859_7.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_7.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_7_TO_LOWER_CASE(c) EncISO_8859_7_ToLowerCaseTable[c] @@ -47,65 +69,87 @@ static UChar EncISO_8859_7_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_7_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x00d0, 0x00d0, 0x0050, 0x0000, 0x0000, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0000, 0x00d0, 0x0050, 0x00d0, 0x0000, 0x00d0, - 0x0050, 0x0050, 0x0850, 0x0850, 0x0050, 0x0050, 0x0a51, 0x00d0, - 0x0a51, 0x0a51, 0x0a51, 0x00d0, 0x0a51, 0x0850, 0x0a51, 0x0a51, - 0x0871, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0000, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0000 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x01a0, 0x00a0, 0x0000, 0x0000, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x0000, 0x01a0, 0x00a0, 0x01a0, 0x0000, 0x01a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x00a0, 0x14a2, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x01a0, 0x14a2, 0x10a0, 0x14a2, 0x14a2, + 0x10e2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x0000, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x0000 }; static int -iso_8859_7_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_7_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_7_mbc_is_case_ambig(UChar* p) +iso_8859_7_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_7_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - if (*p == 0xc0 || *p == 0xe0) - return FALSE; - else - return TRUE; + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_7_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + if (*p == 0xc0 || *p == 0xe0) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); } - - return (v != 0 ? 
TRUE : FALSE); + return FALSE; } static int -iso_8859_7_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_7_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_7_CTYPE(code, ctype); @@ -113,39 +157,122 @@ iso_8859_7_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_7_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xb6, 0xdc }, + { 0xb8, 0xdd }, + { 0xb9, 0xde }, + { 0xba, 0xdf }, + { 0xbc, 0xfc }, + { 0xbe, 0xfd }, + { 0xbf, 0xfe }, + + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xb6 }, + { 0xdd, 0xb8 }, + { 0xde, 0xb9 }, + { 0xdf, 0xba }, + + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xbc }, + { 0xfd, 0xbe }, + { 0xfe, 0xbf } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType 
OnigEncodingISO_8859_7 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-7", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_7_mbc_to_lower, - iso_8859_7_mbc_is_case_ambig, - iso_8859_7_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_7_mbc_to_normalize, + iso_8859_7_is_mbc_ambiguous, + iso_8859_7_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + iso_8859_7_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_8.c b/ext/mbstring/oniguruma/enc/iso8859_8.c index d87774a8327..3c95b9b1375 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_8.c +++ b/ext/mbstring/oniguruma/enc/iso8859_8.c @@ -1,52 +1,74 @@ /********************************************************************** - iso8859_8.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_IS_ISO_8859_8_CTYPE(code,ctype) \ ((EncISO_8859_8_CtypeTable[code] & ctype) != 0) static unsigned short EncISO_8859_8_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 
0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0000, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x00d0, 0x0050, 0x00d0, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0850, 0x0850, 0x0050, 0x0871, 0x0050, 0x00d0, - 0x0050, 0x0850, 0x0050, 0x00d0, 0x0850, 0x0850, 0x0850, 0x0000, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x0000, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 
0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x00a0, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x00d0, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, 0x0851, - 0x0851, 0x0851, 0x0851, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01a0, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, 0x10a2, + 0x10a2, 0x10a2, 0x10a2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; static int -iso_8859_8_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_8_CTYPE(code, ctype); @@ -55,38 +77,29 @@ iso_8859_8_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingISO_8859_8 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-8", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - iso_8859_8_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + iso_8859_8_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/iso8859_9.c b/ext/mbstring/oniguruma/enc/iso8859_9.c index 9638b929943..1b061ff6ea7 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_9.c +++ b/ext/mbstring/oniguruma/enc/iso8859_9.c @@ -1,10 +1,32 @@ /********************************************************************** - iso8859_9.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - 
**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" #define ENC_ISO_8859_9_TO_LOWER_CASE(c) EncISO_8859_9_ToLowerCaseTable[c] @@ -47,66 +69,114 @@ static UChar EncISO_8859_9_ToLowerCaseTable[256] = { }; static unsigned short EncISO_8859_9_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x00d0, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0871, 0x00d0, 0x0050, 0x00d0, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0850, 0x0850, 0x0050, 0x0871, 0x0050, 0x00d0, - 0x0050, 0x0850, 0x0871, 0x00d0, 0x0850, 0x0850, 0x0850, 0x00d0, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 
0x0a51, 0x0050, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; static int -iso_8859_9_mbc_to_lower(UChar* p, UChar* lower) +iso_8859_9_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + *lower = 0xdf; + (*pp) += 2; + return 1; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -iso_8859_9_mbc_is_case_ambig(UChar* p) +iso_8859_9_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - int v = (EncISO_8859_9_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - if ((v | ONIGENC_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. 
*/ - if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba)) - return FALSE; - else + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 1) { + if ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S'))) { + (*pp) += 2; + return TRUE; + } + } + + if (*p == 0xdf) { + (*pp)++; return TRUE; + } } - return (v != 0 ? TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncISO_8859_9_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + if ((v | ONIGENC_CTYPE_LOWER) != 0) { + /* 0xdf etc.. are lower case letter, but can't convert. */ + if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba)) + return FALSE; + else + return TRUE; + } + + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } static int -iso_8859_9_code_is_ctype(OnigCodePoint code, unsigned int ctype) +iso_8859_9_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_ISO_8859_9_CTYPE(code, ctype); @@ -114,39 +184,114 @@ iso_8859_9_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +iso_8859_9_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 
0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingISO_8859_9 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "ISO-8859-9", /* name */ - 1, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ 
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - iso_8859_9_mbc_to_lower, - iso_8859_9_mbc_is_case_ambig, - iso_8859_9_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + iso_8859_9_mbc_to_normalize, + iso_8859_9_is_mbc_ambiguous, + iso_8859_9_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + iso_8859_9_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_get_all_fold_match_code_ss_0xdf, - onigenc_get_fold_match_info_ss_0xdf + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/koi8.c b/ext/mbstring/oniguruma/enc/koi8.c index 9dba2641c96..84afa1afef3 100644 --- a/ext/mbstring/oniguruma/enc/koi8.c +++ b/ext/mbstring/oniguruma/enc/koi8.c @@ -1,10 +1,32 @@ /********************************************************************** - koi8.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_KOI8_TO_LOWER_CASE(c) EncKOI8_ToLowerCaseTable[c] @@ -47,58 +69,79 @@ static UChar EncKOI8_ToLowerCaseTable[256] = { }; static unsigned short EncKOI8_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 
0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, - 0x0142, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51 + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2 }; static int -koi8_mbc_to_lower(UChar* p, UChar* lower) +koi8_mbc_to_normalize(OnigAmbigType flag, + UChar** pp, UChar* end, UChar* lower) { - *lower = ENC_KOI8_TO_LOWER_CASE(*p); + UChar* p = *pp; + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_KOI8_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -koi8_mbc_is_case_ambig(UChar* p) +koi8_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) { - int v = (EncKOI8_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + UChar* p = *pp; - return ((v != 0) ? 
TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncKOI8_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } + static int -koi8_code_is_ctype(OnigCodePoint code, unsigned int ctype) +koi8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_KOI8_CTYPE(code, ctype); @@ -106,39 +149,116 @@ koi8_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +koi8_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + { 0xdf, 0xff }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfe, 0xde }, + { 0xff, 0xdf } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + 
return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingKOI8 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "KOI8", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - koi8_mbc_to_lower, - koi8_mbc_is_case_ambig, - koi8_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + koi8_mbc_to_normalize, + koi8_is_mbc_ambiguous, + koi8_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + koi8_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/koi8_r.c b/ext/mbstring/oniguruma/enc/koi8_r.c index 20e7d73a12b..7c626df6161 100644 --- a/ext/mbstring/oniguruma/enc/koi8_r.c +++ b/ext/mbstring/oniguruma/enc/koi8_r.c @@ -1,10 +1,32 @@ /********************************************************************** - koi8_r.c - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define ENC_KOI8_R_TO_LOWER_CASE(c) EncKOI8_R_ToLowerCaseTable[c] @@ -47,58 +69,78 @@ static UChar EncKOI8_R_ToLowerCaseTable[256] = { }; static unsigned short EncKOI8_R_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 
0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, - 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0142, 0x0050, 0x0050, 0x0850, 0x00d0, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0871, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0a51, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, 0x0050, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, 0x0871, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, - 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51, 0x0a51 + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x20a0, 0x20a0, 0x20a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x20a0, 0x31a0, + 0x20a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x20a0, 0x21a0, 0x20a0, 0x2008, + 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x0284, 0x00a0, 0x00a0, 0x10a0, 0x01a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x10e2, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x14a2, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2 }; static int -koi8_r_mbc_to_lower(UChar* p, UChar* lower) +koi8_r_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + const UChar* p = *pp; + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } static int -koi8_r_mbc_is_case_ambig(UChar* p) +koi8_r_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) { - int v = (EncKOI8_R_CtypeTable[*p] & - (ONIGENC_CTYPE_UPPER | 
ONIGENC_CTYPE_LOWER)); + const UChar* p = *pp; - return ((v != 0) ? TRUE : FALSE); + (*pp)++; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + int v = (EncKOI8_R_CtypeTable[*p] & + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + return (v != 0 ? TRUE : FALSE); + } + return FALSE; } static int -koi8_r_code_is_ctype(OnigCodePoint code, unsigned int ctype) +koi8_r_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) return ENC_IS_KOI8_R_CTYPE(code, ctype); @@ -106,39 +148,116 @@ koi8_r_code_is_ctype(OnigCodePoint code, unsigned int ctype) return FALSE; } +static int +koi8_r_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd7, 0xf7 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + { 0xdf, 0xff }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf7, 0xd7 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfe, 0xde }, + { 0xff, 0xdf } + }; + + if (flag 
== ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return 52; + } + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + OnigEncodingType OnigEncodingKOI8_R = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "KOI8-R", /* name */ - 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* max enc length */ + 1, /* min enc length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - koi8_r_mbc_to_lower, - koi8_r_mbc_is_case_ambig, - koi8_r_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + koi8_r_mbc_to_normalize, + koi8_r_is_mbc_ambiguous, + koi8_r_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + koi8_r_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/mktable.c b/ext/mbstring/oniguruma/enc/mktable.c index 67e9a61313b..6b9ef4c5b5b 100644 --- a/ext/mbstring/oniguruma/enc/mktable.c +++ b/ext/mbstring/oniguruma/enc/mktable.c @@ -1,32 +1,55 @@ /********************************************************************** - mktable.c - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include #define NOT_RUBY #include "regenc.h" -#define ISO_8859_1 0 -#define ISO_8859_2 1 -#define ISO_8859_3 2 -#define ISO_8859_4 3 -#define ISO_8859_5 4 -#define ISO_8859_6 5 -#define ISO_8859_7 6 -#define ISO_8859_8 7 -#define ISO_8859_9 8 -#define ISO_8859_10 9 -#define ISO_8859_11 10 -#define ISO_8859_13 11 -#define ISO_8859_14 12 -#define ISO_8859_15 13 -#define ISO_8859_16 14 -#define KOI8 15 -#define KOI8_R 16 +#define UNICODE_ISO_8859_1 0 +#define ISO_8859_1 1 +#define ISO_8859_2 2 +#define ISO_8859_3 3 +#define ISO_8859_4 4 +#define ISO_8859_5 5 +#define ISO_8859_6 6 +#define ISO_8859_7 7 +#define ISO_8859_8 8 +#define ISO_8859_9 9 +#define ISO_8859_10 10 +#define ISO_8859_11 11 +#define ISO_8859_13 12 +#define ISO_8859_14 13 +#define ISO_8859_15 14 +#define ISO_8859_16 15 +#define KOI8 16 +#define KOI8_R 17 typedef struct { int num; @@ -34,6 +57,7 @@ typedef struct { } ENC_INFO; static ENC_INFO Info[] = { + { UNICODE_ISO_8859_1, "UNICODE_ISO_8859_1" }, { ISO_8859_1, "ISO_8859_1" }, { ISO_8859_2, "ISO_8859_2" }, { ISO_8859_3, "ISO_8859_3" }, @@ -60,6 +84,7 @@ static int IsAlpha(int enc, int c) if (c >= 0x61 && c <= 0x7a) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case 
ISO_8859_1: case ISO_8859_9: if (c == 0xaa) return 1; @@ -232,6 +257,7 @@ static int IsBlank(int enc, int c) if (c == 0x09 || c == 0x20) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: @@ -267,6 +293,9 @@ static int IsCntrl(int enc, int c) if (c >= 0x00 && c <= 0x1F) return 1; switch (enc) { + case UNICODE_ISO_8859_1: + if (c == 0xad) return 1; + /* fall */ case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: @@ -286,6 +315,7 @@ static int IsCntrl(int enc, int c) if (c >= 0x7f && c <= 0x9F) return 1; break; + case KOI8_R: if (c == 0x7f) return 1; break; @@ -308,6 +338,7 @@ static int IsGraph(int enc, int c) if (c >= 0x21 && c <= 0x7e) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case ISO_8859_1: case ISO_8859_2: case ISO_8859_4: @@ -376,6 +407,7 @@ static int IsLower(int enc, int c) if (c >= 0x61 && c <= 0x7a) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case ISO_8859_1: case ISO_8859_9: if (c == 0xaa) return 1; @@ -504,6 +536,10 @@ static int IsPrint(int enc, int c) if (c >= 0x20 && c <= 0x7e) return 1; switch (enc) { + case UNICODE_ISO_8859_1: + if (c >= 0x09 && c <= 0x0d) return 1; + if (c == 0x85) return 1; + /* fall */ case ISO_8859_1: case ISO_8859_2: case ISO_8859_4: @@ -572,11 +608,11 @@ static int IsPrint(int enc, int c) static int IsPunct(int enc, int c) { -#ifndef BY_UNICODE_PROPERTY - if (c == 0x24 || c == 0x2b || c == 0x5e || c == 0x60 || - c == 0x7c || c == 0x7e) return 1; - if (c >= 0x3c && c <= 0x3e) return 1; -#endif + if (enc == UNICODE_ISO_8859_1) { + if (c == 0x24 || c == 0x2b || c == 0x5e || c == 0x60 || + c == 0x7c || c == 0x7e) return 1; + if (c >= 0x3c && c <= 0x3e) return 1; + } if (c >= 0x21 && c <= 0x23) return 1; if (c >= 0x25 && c <= 0x2a) return 1; @@ -592,9 +628,11 @@ static int IsPunct(int enc, int c) case ISO_8859_1: case ISO_8859_9: case ISO_8859_15: + if (c == 0xad) return 1; + /* fall */ + case UNICODE_ISO_8859_1: if (c == 0xa1) return 1; if (c == 0xab) return 1; 
- if (c == 0xad) return 1; if (c == 0xb7) return 1; if (c == 0xbb) return 1; if (c == 0xbf) return 1; @@ -675,6 +713,9 @@ static int IsSpace(int enc, int c) if (c == 0x20) return 1; switch (enc) { + case UNICODE_ISO_8859_1: + if (c == 0x85) return 1; + /* fall */ case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: @@ -710,6 +751,7 @@ static int IsUpper(int enc, int c) if (c >= 0x41 && c <= 0x5a) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case ISO_8859_1: case ISO_8859_9: if (c >= 0xc0 && c <= 0xd6) return 1; @@ -844,6 +886,7 @@ static int IsWord(int enc, int c) if (c >= 0x61 && c <= 0x7a) return 1; switch (enc) { + case UNICODE_ISO_8859_1: case ISO_8859_1: case ISO_8859_9: if (c == 0xaa) return 1; @@ -1019,6 +1062,12 @@ static int IsAscii(int enc, int c) return 0; } +static int IsNewline(int enc, int c) +{ + if (c == 0x0a) return 1; + return 0; +} + static int exec(FILE* fp, ENC_INFO* einfo) { #define NCOL 8 @@ -1032,19 +1081,20 @@ static int exec(FILE* fp, ENC_INFO* einfo) for (c = 0; c < 256; c++) { val = 0; - if (IsAlpha (enc, c)) val |= ONIGENC_CTYPE_ALPHA; - if (IsBlank (enc, c)) val |= ONIGENC_CTYPE_BLANK; - if (IsCntrl (enc, c)) val |= ONIGENC_CTYPE_CNTRL; - if (IsDigit (enc, c)) val |= ONIGENC_CTYPE_DIGIT; - if (IsGraph (enc, c)) val |= ONIGENC_CTYPE_GRAPH; - if (IsLower (enc, c)) val |= ONIGENC_CTYPE_LOWER; - if (IsPrint (enc, c)) val |= ONIGENC_CTYPE_PRINT; - if (IsPunct (enc, c)) val |= ONIGENC_CTYPE_PUNCT; - if (IsSpace (enc, c)) val |= ONIGENC_CTYPE_SPACE; - if (IsUpper (enc, c)) val |= ONIGENC_CTYPE_UPPER; - if (IsXDigit(enc, c)) val |= ONIGENC_CTYPE_XDIGIT; - if (IsWord (enc, c)) val |= ONIGENC_CTYPE_WORD; - if (IsAscii (enc, c)) val |= ONIGENC_CTYPE_ASCII; + if (IsNewline(enc, c)) val |= ONIGENC_CTYPE_NEWLINE; + if (IsAlpha (enc, c)) val |= ONIGENC_CTYPE_ALPHA; + if (IsBlank (enc, c)) val |= ONIGENC_CTYPE_BLANK; + if (IsCntrl (enc, c)) val |= ONIGENC_CTYPE_CNTRL; + if (IsDigit (enc, c)) val |= ONIGENC_CTYPE_DIGIT; + if (IsGraph (enc, c)) val 
|= ONIGENC_CTYPE_GRAPH; + if (IsLower (enc, c)) val |= ONIGENC_CTYPE_LOWER; + if (IsPrint (enc, c)) val |= ONIGENC_CTYPE_PRINT; + if (IsPunct (enc, c)) val |= ONIGENC_CTYPE_PUNCT; + if (IsSpace (enc, c)) val |= ONIGENC_CTYPE_SPACE; + if (IsUpper (enc, c)) val |= ONIGENC_CTYPE_UPPER; + if (IsXDigit(enc, c)) val |= ONIGENC_CTYPE_XDIGIT; + if (IsWord (enc, c)) val |= ONIGENC_CTYPE_WORD; + if (IsAscii (enc, c)) val |= ONIGENC_CTYPE_ASCII; if (c % NCOL == 0) fputs(" ", fp); fprintf(fp, "0x%04x", val); diff --git a/ext/mbstring/oniguruma/enc/sjis.c b/ext/mbstring/oniguruma/enc/sjis.c index 8485910e696..e13407bccfd 100644 --- a/ext/mbstring/oniguruma/enc/sjis.c +++ b/ext/mbstring/oniguruma/enc/sjis.c @@ -1,12 +1,53 @@ /********************************************************************** - sjis.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +static int EncLen_SJIS[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 +}; + static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,17 +67,39 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }; -#define SJIS_ISMB_FIRST(byte) (OnigEncodingSJIS.len_table[byte] > 1) +#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] +static int +sjis_mbc_enc_len(const UChar* p) +{ + return EncLen_SJIS[*p]; +} + +extern int +sjis_code_to_mbclen(OnigCodePoint code) +{ + if (code < 
256) { + if (EncLen_SJIS[(int )code] == 1) + return 1; + else + return 0; + } + else if (code <= 0xffff) { + return 2; + } + else + return 0; +} + static OnigCodePoint -sjis_mbc_to_code(UChar* p, UChar* end) +sjis_mbc_to_code(const UChar* p, const UChar* end) { int c, i, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_SJIS, p); c = *p++; - len = enc_len(ONIG_ENCODING_SJIS, c); n = c; if (len == 1) return n; @@ -57,43 +120,58 @@ sjis_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 0 - if (enc_len(ONIG_ENCODING_SJIS, buf[0]) != (p - buf)) + if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf)) return REGERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -sjis_mbc_to_lower(UChar* p, UChar* lower) +sjis_mbc_to_normalize(OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { - int len; + const UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_SJIS, *p); + int len = enc_len(ONIG_ENCODING_SJIS, p); + if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -sjis_code_is_ctype(OnigCodePoint code, unsigned int ctype) +sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); + +} + +static int +sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_SJIS, first) > 1 ? TRUE : FALSE); + return (sjis_code_to_mbclen(code) > 1 ? 
TRUE : FALSE); } ctype &= ~ONIGENC_CTYPE_WORD; @@ -107,12 +185,12 @@ sjis_code_is_ctype(OnigCodePoint code, unsigned int ctype) } static UChar* -sjis_left_adjust_char_head(UChar* start, UChar* s) +sjis_left_adjust_char_head(const UChar* start, const UChar* s) { - UChar *p; + const UChar *p; int len; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; if (SJIS_ISMB_TRAIL(*p)) { @@ -123,52 +201,43 @@ sjis_left_adjust_char_head(UChar* start, UChar* s) } } } - len = enc_len(ONIG_ENCODING_SJIS, *p); - if (p + len > s) return p; + len = enc_len(ONIG_ENCODING_SJIS, p); + if (p + len > s) return (UChar* )p; p += len; - return p + ((s - p) & ~1); + return (UChar* )(p + ((s - p) & ~1)); } static int -sjis_is_allowed_reverse_match(UChar* s, UChar* end) +sjis_is_allowed_reverse_match(const UChar* s, const UChar* end) { - UChar c = *s; + const UChar c = *s; return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE); } OnigEncodingType OnigEncodingSJIS = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 - }, + sjis_mbc_enc_len, "Shift_JIS", /* name */ 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ 
+ ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, sjis_mbc_to_code, - onigenc_mb2_code_to_mbclen, + sjis_code_to_mbclen, sjis_code_to_mbc, - sjis_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - sjis_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + sjis_mbc_to_normalize, + sjis_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + sjis_is_code_ctype, + onigenc_not_support_get_ctype_code_range, sjis_left_adjust_char_head, - sjis_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + sjis_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/enc/unicode.c b/ext/mbstring/oniguruma/enc/unicode.c new file mode 100644 index 00000000000..e3be9450a51 --- /dev/null +++ b/ext/mbstring/oniguruma/enc/unicode.c @@ -0,0 +1,3400 @@ +/********************************************************************** + unicode.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regenc.h" + + +unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 
0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 +}; + +static OnigCodePoint CRAlnum[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 414, +#else + 9, +#endif + 0x0030, 0x0039, + 0x0041, 0x005a, + 0x0061, 0x007a, + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, + 0x00f8, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 
0x05c1, 0x05c2, + 0x05c4, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x0669, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06fc, + 0x06ff, 0x06ff, + 0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0966, 0x096f, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09f1, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b6f, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bef, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 
0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e50, 0x0e59, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f20, 0x0f29, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1049, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1369, 0x1371, + 0x13a0, 0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 
0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x17e0, 0x17e9, + 0x180b, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1946, 0x196d, + 0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x3005, 0x3006, + 0x302a, 0x302f, + 0x3031, 0x3035, + 0x303b, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30fa, + 0x30fc, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff10, 0xff19, + 0xff21, 0xff3a, + 0xff41, 0xff5a, + 0xff66, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10300, 0x1031e, + 0x10330, 0x10349, + 0x10380, 0x1039d, + 0x10400, 0x1049d, 
+ 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRAlnum */ + +static OnigCodePoint CRAlpha[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 396, +#else + 8, +#endif + 0x0041, 0x005a, + 0x0061, 0x007a, + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, + 0x00f8, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 0x05c1, 0x05c2, + 0x05c4, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06ef, + 0x06fa, 0x06fc, + 0x06ff, 0x06ff, + 
0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09f0, 0x09f1, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a70, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d82, 0x0d83, + 
0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x13a0, 0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x180b, 0x180d, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1950, 0x196d, + 0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 
0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x3005, 0x3006, + 0x302a, 0x302f, + 0x3031, 0x3035, + 0x303b, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30fa, + 0x30fc, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff21, 0xff3a, + 0xff41, 0xff5a, + 0xff66, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10300, 0x1031e, + 0x10330, 0x10349, + 0x10380, 0x1039d, + 0x10400, 0x1049d, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 
0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRAlpha */ + +static OnigCodePoint CRBlank[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 9, +#else + 3, +#endif + 0x0009, 0x0009, + 0x0020, 0x0020, + 0x00a0, 0x00a0 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x1680, 0x1680, + 0x180e, 0x180e, + 0x2000, 0x200a, + 0x202f, 0x202f, + 0x205f, 0x205f, + 0x3000, 0x3000 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRBlank */ + +static OnigCodePoint CRCntrl[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 19, +#else + 3, +#endif + 0x0000, 0x001f, + 0x007f, 0x009f, + 0x00ad, 0x00ad +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0600, 0x0603, + 0x06dd, 0x06dd, + 0x070f, 0x070f, + 0x17b4, 0x17b5, + 0x200b, 0x200f, + 0x202a, 0x202e, + 0x2060, 0x2063, + 0x206a, 0x206f, + 0xd800, 0xf8ff, + 0xfeff, 0xfeff, + 0xfff9, 0xfffb, + 0x1d173, 0x1d17a, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRCntrl */ + +static OnigCodePoint CRDigit[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 23, +#else + 1, +#endif + 0x0030, 0x0039 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0660, 0x0669, + 0x06f0, 0x06f9, + 0x0966, 0x096f, + 0x09e6, 0x09ef, + 0x0a66, 0x0a6f, + 0x0ae6, 0x0aef, + 0x0b66, 0x0b6f, + 0x0be7, 0x0bef, + 0x0c66, 0x0c6f, + 0x0ce6, 0x0cef, + 0x0d66, 0x0d6f, + 0x0e50, 0x0e59, + 0x0ed0, 0x0ed9, + 0x0f20, 0x0f29, + 0x1040, 0x1049, + 0x1369, 0x1371, + 0x17e0, 0x17e9, + 0x1810, 0x1819, + 0x1946, 0x194f, + 0xff10, 0xff19, + 0x104a0, 0x104a9, + 0x1d7ce, 0x1d7ff +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRDigit 
*/ + +static OnigCodePoint CRGraph[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 405, +#else + 2, +#endif + 0x0021, 0x007e, + 0x00a1, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x0357, + 0x035d, 0x036f, + 0x0374, 0x0375, + 0x037a, 0x037a, + 0x037e, 0x037e, + 0x0384, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03fb, + 0x0400, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x055f, + 0x0561, 0x0587, + 0x0589, 0x058a, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f4, + 0x0600, 0x0603, + 0x060c, 0x0615, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x070d, + 0x070f, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0970, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09fa, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0af1, 0x0af1, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, 
+ 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bfa, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df4, + 0x0e01, 0x0e3a, + 0x0e3f, 0x0e5b, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fbe, 0x0fcc, + 0x0fcf, 0x0fcf, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x10fb, 0x10fb, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 
0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1361, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x1676, + 0x1681, 0x169c, + 0x16a0, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1736, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17dd, + 0x17e0, 0x17e9, + 0x17f0, 0x17f9, + 0x1800, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1940, 0x1940, + 0x1944, 0x196d, + 0x1970, 0x1974, + 0x19e0, 0x19ff, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fc4, + 0x1fc6, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fdd, 0x1fef, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffe, + 0x200b, 0x2027, + 0x202a, 0x202e, + 0x2030, 0x2054, + 0x2057, 0x2057, + 0x2060, 0x2063, + 0x206a, 0x2071, + 0x2074, 0x208e, + 0x20a0, 0x20b1, + 0x20d0, 0x20ea, + 0x2100, 0x213b, + 0x213d, 0x214b, + 0x2153, 0x2183, + 0x2190, 0x23d0, + 0x2400, 0x2426, + 0x2440, 0x244a, + 0x2460, 0x2617, + 0x2619, 0x267d, + 0x2680, 0x2691, + 0x26a0, 0x26a1, + 0x2701, 0x2704, + 0x2706, 0x2709, + 0x270c, 0x2727, + 0x2729, 0x274b, + 0x274d, 0x274d, + 0x274f, 0x2752, + 0x2756, 0x2756, + 0x2758, 0x275e, + 0x2761, 0x2794, + 0x2798, 0x27af, + 0x27b1, 0x27be, + 0x27d0, 0x27eb, + 0x27f0, 0x2b0d, + 0x2e80, 0x2e99, + 0x2e9b, 0x2ef3, + 0x2f00, 0x2fd5, + 0x2ff0, 0x2ffb, + 0x3001, 0x303f, + 0x3041, 0x3096, + 0x3099, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3190, 0x31b7, + 0x31f0, 0x321e, + 0x3220, 0x3243, + 0x3250, 0x327d, + 0x327f, 0x32fe, + 0x3300, 0x4db5, + 0x4dc0, 0x9fa5, + 0xa000, 0xa48c, + 0xa490, 0xa4c6, + 0xac00, 0xd7a3, + 0xe000, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb36, + 
0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3f, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfd, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe30, 0xfe52, + 0xfe54, 0xfe66, + 0xfe68, 0xfe6b, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xfeff, 0xfeff, + 0xff01, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0xffe0, 0xffe6, + 0xffe8, 0xffee, + 0xfff9, 0xfffd, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10100, 0x10102, + 0x10107, 0x10133, + 0x10137, 0x1013f, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x1039f, 0x1039f, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d000, 0x1d0f5, + 0x1d100, 0x1d126, + 0x1d12a, 0x1d1dd, + 0x1d300, 0x1d356, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xe0100, 0xe01ef, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRGraph */ + +static OnigCodePoint CRLower[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 424, +#else + 6, +#endif + 0x0061, 0x007a, + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00df, 0x00f6, + 0x00f8, 0x00ff +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0101, 0x0101, + 0x0103, 0x0103, + 0x0105, 0x0105, + 0x0107, 0x0107, + 0x0109, 0x0109, + 0x010b, 0x010b, + 0x010d, 0x010d, + 
0x010f, 0x010f, + 0x0111, 0x0111, + 0x0113, 0x0113, + 0x0115, 0x0115, + 0x0117, 0x0117, + 0x0119, 0x0119, + 0x011b, 0x011b, + 0x011d, 0x011d, + 0x011f, 0x011f, + 0x0121, 0x0121, + 0x0123, 0x0123, + 0x0125, 0x0125, + 0x0127, 0x0127, + 0x0129, 0x0129, + 0x012b, 0x012b, + 0x012d, 0x012d, + 0x012f, 0x012f, + 0x0131, 0x0131, + 0x0133, 0x0133, + 0x0135, 0x0135, + 0x0137, 0x0138, + 0x013a, 0x013a, + 0x013c, 0x013c, + 0x013e, 0x013e, + 0x0140, 0x0140, + 0x0142, 0x0142, + 0x0144, 0x0144, + 0x0146, 0x0146, + 0x0148, 0x0149, + 0x014b, 0x014b, + 0x014d, 0x014d, + 0x014f, 0x014f, + 0x0151, 0x0151, + 0x0153, 0x0153, + 0x0155, 0x0155, + 0x0157, 0x0157, + 0x0159, 0x0159, + 0x015b, 0x015b, + 0x015d, 0x015d, + 0x015f, 0x015f, + 0x0161, 0x0161, + 0x0163, 0x0163, + 0x0165, 0x0165, + 0x0167, 0x0167, + 0x0169, 0x0169, + 0x016b, 0x016b, + 0x016d, 0x016d, + 0x016f, 0x016f, + 0x0171, 0x0171, + 0x0173, 0x0173, + 0x0175, 0x0175, + 0x0177, 0x0177, + 0x017a, 0x017a, + 0x017c, 0x017c, + 0x017e, 0x0180, + 0x0183, 0x0183, + 0x0185, 0x0185, + 0x0188, 0x0188, + 0x018c, 0x018d, + 0x0192, 0x0192, + 0x0195, 0x0195, + 0x0199, 0x019b, + 0x019e, 0x019e, + 0x01a1, 0x01a1, + 0x01a3, 0x01a3, + 0x01a5, 0x01a5, + 0x01a8, 0x01a8, + 0x01aa, 0x01ab, + 0x01ad, 0x01ad, + 0x01b0, 0x01b0, + 0x01b4, 0x01b4, + 0x01b6, 0x01b6, + 0x01b9, 0x01ba, + 0x01bd, 0x01bf, + 0x01c6, 0x01c6, + 0x01c9, 0x01c9, + 0x01cc, 0x01cc, + 0x01ce, 0x01ce, + 0x01d0, 0x01d0, + 0x01d2, 0x01d2, + 0x01d4, 0x01d4, + 0x01d6, 0x01d6, + 0x01d8, 0x01d8, + 0x01da, 0x01da, + 0x01dc, 0x01dd, + 0x01df, 0x01df, + 0x01e1, 0x01e1, + 0x01e3, 0x01e3, + 0x01e5, 0x01e5, + 0x01e7, 0x01e7, + 0x01e9, 0x01e9, + 0x01eb, 0x01eb, + 0x01ed, 0x01ed, + 0x01ef, 0x01f0, + 0x01f3, 0x01f3, + 0x01f5, 0x01f5, + 0x01f9, 0x01f9, + 0x01fb, 0x01fb, + 0x01fd, 0x01fd, + 0x01ff, 0x01ff, + 0x0201, 0x0201, + 0x0203, 0x0203, + 0x0205, 0x0205, + 0x0207, 0x0207, + 0x0209, 0x0209, + 0x020b, 0x020b, + 0x020d, 0x020d, + 0x020f, 0x020f, + 0x0211, 0x0211, + 0x0213, 0x0213, + 0x0215, 0x0215, + 
0x0217, 0x0217, + 0x0219, 0x0219, + 0x021b, 0x021b, + 0x021d, 0x021d, + 0x021f, 0x021f, + 0x0221, 0x0221, + 0x0223, 0x0223, + 0x0225, 0x0225, + 0x0227, 0x0227, + 0x0229, 0x0229, + 0x022b, 0x022b, + 0x022d, 0x022d, + 0x022f, 0x022f, + 0x0231, 0x0231, + 0x0233, 0x0236, + 0x0250, 0x02af, + 0x0390, 0x0390, + 0x03ac, 0x03ce, + 0x03d0, 0x03d1, + 0x03d5, 0x03d7, + 0x03d9, 0x03d9, + 0x03db, 0x03db, + 0x03dd, 0x03dd, + 0x03df, 0x03df, + 0x03e1, 0x03e1, + 0x03e3, 0x03e3, + 0x03e5, 0x03e5, + 0x03e7, 0x03e7, + 0x03e9, 0x03e9, + 0x03eb, 0x03eb, + 0x03ed, 0x03ed, + 0x03ef, 0x03f3, + 0x03f5, 0x03f5, + 0x03f8, 0x03f8, + 0x03fb, 0x03fb, + 0x0430, 0x045f, + 0x0461, 0x0461, + 0x0463, 0x0463, + 0x0465, 0x0465, + 0x0467, 0x0467, + 0x0469, 0x0469, + 0x046b, 0x046b, + 0x046d, 0x046d, + 0x046f, 0x046f, + 0x0471, 0x0471, + 0x0473, 0x0473, + 0x0475, 0x0475, + 0x0477, 0x0477, + 0x0479, 0x0479, + 0x047b, 0x047b, + 0x047d, 0x047d, + 0x047f, 0x047f, + 0x0481, 0x0481, + 0x048b, 0x048b, + 0x048d, 0x048d, + 0x048f, 0x048f, + 0x0491, 0x0491, + 0x0493, 0x0493, + 0x0495, 0x0495, + 0x0497, 0x0497, + 0x0499, 0x0499, + 0x049b, 0x049b, + 0x049d, 0x049d, + 0x049f, 0x049f, + 0x04a1, 0x04a1, + 0x04a3, 0x04a3, + 0x04a5, 0x04a5, + 0x04a7, 0x04a7, + 0x04a9, 0x04a9, + 0x04ab, 0x04ab, + 0x04ad, 0x04ad, + 0x04af, 0x04af, + 0x04b1, 0x04b1, + 0x04b3, 0x04b3, + 0x04b5, 0x04b5, + 0x04b7, 0x04b7, + 0x04b9, 0x04b9, + 0x04bb, 0x04bb, + 0x04bd, 0x04bd, + 0x04bf, 0x04bf, + 0x04c2, 0x04c2, + 0x04c4, 0x04c4, + 0x04c6, 0x04c6, + 0x04c8, 0x04c8, + 0x04ca, 0x04ca, + 0x04cc, 0x04cc, + 0x04ce, 0x04ce, + 0x04d1, 0x04d1, + 0x04d3, 0x04d3, + 0x04d5, 0x04d5, + 0x04d7, 0x04d7, + 0x04d9, 0x04d9, + 0x04db, 0x04db, + 0x04dd, 0x04dd, + 0x04df, 0x04df, + 0x04e1, 0x04e1, + 0x04e3, 0x04e3, + 0x04e5, 0x04e5, + 0x04e7, 0x04e7, + 0x04e9, 0x04e9, + 0x04eb, 0x04eb, + 0x04ed, 0x04ed, + 0x04ef, 0x04ef, + 0x04f1, 0x04f1, + 0x04f3, 0x04f3, + 0x04f5, 0x04f5, + 0x04f9, 0x04f9, + 0x0501, 0x0501, + 0x0503, 0x0503, + 0x0505, 0x0505, + 0x0507, 0x0507, + 
0x0509, 0x0509, + 0x050b, 0x050b, + 0x050d, 0x050d, + 0x050f, 0x050f, + 0x0561, 0x0587, + 0x1d00, 0x1d2b, + 0x1d62, 0x1d6b, + 0x1e01, 0x1e01, + 0x1e03, 0x1e03, + 0x1e05, 0x1e05, + 0x1e07, 0x1e07, + 0x1e09, 0x1e09, + 0x1e0b, 0x1e0b, + 0x1e0d, 0x1e0d, + 0x1e0f, 0x1e0f, + 0x1e11, 0x1e11, + 0x1e13, 0x1e13, + 0x1e15, 0x1e15, + 0x1e17, 0x1e17, + 0x1e19, 0x1e19, + 0x1e1b, 0x1e1b, + 0x1e1d, 0x1e1d, + 0x1e1f, 0x1e1f, + 0x1e21, 0x1e21, + 0x1e23, 0x1e23, + 0x1e25, 0x1e25, + 0x1e27, 0x1e27, + 0x1e29, 0x1e29, + 0x1e2b, 0x1e2b, + 0x1e2d, 0x1e2d, + 0x1e2f, 0x1e2f, + 0x1e31, 0x1e31, + 0x1e33, 0x1e33, + 0x1e35, 0x1e35, + 0x1e37, 0x1e37, + 0x1e39, 0x1e39, + 0x1e3b, 0x1e3b, + 0x1e3d, 0x1e3d, + 0x1e3f, 0x1e3f, + 0x1e41, 0x1e41, + 0x1e43, 0x1e43, + 0x1e45, 0x1e45, + 0x1e47, 0x1e47, + 0x1e49, 0x1e49, + 0x1e4b, 0x1e4b, + 0x1e4d, 0x1e4d, + 0x1e4f, 0x1e4f, + 0x1e51, 0x1e51, + 0x1e53, 0x1e53, + 0x1e55, 0x1e55, + 0x1e57, 0x1e57, + 0x1e59, 0x1e59, + 0x1e5b, 0x1e5b, + 0x1e5d, 0x1e5d, + 0x1e5f, 0x1e5f, + 0x1e61, 0x1e61, + 0x1e63, 0x1e63, + 0x1e65, 0x1e65, + 0x1e67, 0x1e67, + 0x1e69, 0x1e69, + 0x1e6b, 0x1e6b, + 0x1e6d, 0x1e6d, + 0x1e6f, 0x1e6f, + 0x1e71, 0x1e71, + 0x1e73, 0x1e73, + 0x1e75, 0x1e75, + 0x1e77, 0x1e77, + 0x1e79, 0x1e79, + 0x1e7b, 0x1e7b, + 0x1e7d, 0x1e7d, + 0x1e7f, 0x1e7f, + 0x1e81, 0x1e81, + 0x1e83, 0x1e83, + 0x1e85, 0x1e85, + 0x1e87, 0x1e87, + 0x1e89, 0x1e89, + 0x1e8b, 0x1e8b, + 0x1e8d, 0x1e8d, + 0x1e8f, 0x1e8f, + 0x1e91, 0x1e91, + 0x1e93, 0x1e93, + 0x1e95, 0x1e9b, + 0x1ea1, 0x1ea1, + 0x1ea3, 0x1ea3, + 0x1ea5, 0x1ea5, + 0x1ea7, 0x1ea7, + 0x1ea9, 0x1ea9, + 0x1eab, 0x1eab, + 0x1ead, 0x1ead, + 0x1eaf, 0x1eaf, + 0x1eb1, 0x1eb1, + 0x1eb3, 0x1eb3, + 0x1eb5, 0x1eb5, + 0x1eb7, 0x1eb7, + 0x1eb9, 0x1eb9, + 0x1ebb, 0x1ebb, + 0x1ebd, 0x1ebd, + 0x1ebf, 0x1ebf, + 0x1ec1, 0x1ec1, + 0x1ec3, 0x1ec3, + 0x1ec5, 0x1ec5, + 0x1ec7, 0x1ec7, + 0x1ec9, 0x1ec9, + 0x1ecb, 0x1ecb, + 0x1ecd, 0x1ecd, + 0x1ecf, 0x1ecf, + 0x1ed1, 0x1ed1, + 0x1ed3, 0x1ed3, + 0x1ed5, 0x1ed5, + 0x1ed7, 0x1ed7, + 0x1ed9, 0x1ed9, + 
0x1edb, 0x1edb, + 0x1edd, 0x1edd, + 0x1edf, 0x1edf, + 0x1ee1, 0x1ee1, + 0x1ee3, 0x1ee3, + 0x1ee5, 0x1ee5, + 0x1ee7, 0x1ee7, + 0x1ee9, 0x1ee9, + 0x1eeb, 0x1eeb, + 0x1eed, 0x1eed, + 0x1eef, 0x1eef, + 0x1ef1, 0x1ef1, + 0x1ef3, 0x1ef3, + 0x1ef5, 0x1ef5, + 0x1ef7, 0x1ef7, + 0x1ef9, 0x1ef9, + 0x1f00, 0x1f07, + 0x1f10, 0x1f15, + 0x1f20, 0x1f27, + 0x1f30, 0x1f37, + 0x1f40, 0x1f45, + 0x1f50, 0x1f57, + 0x1f60, 0x1f67, + 0x1f70, 0x1f7d, + 0x1f80, 0x1f87, + 0x1f90, 0x1f97, + 0x1fa0, 0x1fa7, + 0x1fb0, 0x1fb4, + 0x1fb6, 0x1fb7, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fc7, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fd7, + 0x1fe0, 0x1fe7, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ff7, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x210a, 0x210a, + 0x210e, 0x210f, + 0x2113, 0x2113, + 0x212f, 0x212f, + 0x2134, 0x2134, + 0x2139, 0x2139, + 0x213d, 0x213d, + 0x2146, 0x2149, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xff41, 0xff5a, + 0x10428, 0x1044f, + 0x1d41a, 0x1d433, + 0x1d44e, 0x1d454, + 0x1d456, 0x1d467, + 0x1d482, 0x1d49b, + 0x1d4b6, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d4cf, + 0x1d4ea, 0x1d503, + 0x1d51e, 0x1d537, + 0x1d552, 0x1d56b, + 0x1d586, 0x1d59f, + 0x1d5ba, 0x1d5d3, + 0x1d5ee, 0x1d607, + 0x1d622, 0x1d63b, + 0x1d656, 0x1d66f, + 0x1d68a, 0x1d6a3, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6e1, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d71b, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d755, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d78f, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRLower */ + +static OnigCodePoint CRPrint[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 405, +#else + 4, +#endif + 0x0009, 0x000d, + 0x0020, 0x007e, + 0x0085, 0x0085, + 0x00a0, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x0357, + 0x035d, 0x036f, + 0x0374, 0x0375, + 0x037a, 0x037a, + 0x037e, 0x037e, + 0x0384, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03fb, + 0x0400, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 
0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x055f, + 0x0561, 0x0587, + 0x0589, 0x058a, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f4, + 0x0600, 0x0603, + 0x060c, 0x0615, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x070d, + 0x070f, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0970, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09fa, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0af1, 0x0af1, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bfa, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 
0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df4, + 0x0e01, 0x0e3a, + 0x0e3f, 0x0e5b, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fbe, 0x0fcc, + 0x0fcf, 0x0fcf, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x10fb, 0x10fb, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1361, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x1676, + 0x1680, 0x169c, + 0x16a0, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1736, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17dd, + 0x17e0, 0x17e9, + 
0x17f0, 0x17f9, + 0x1800, 0x180e, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1940, 0x1940, + 0x1944, 0x196d, + 0x1970, 0x1974, + 0x19e0, 0x19ff, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fc4, + 0x1fc6, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fdd, 0x1fef, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffe, + 0x2000, 0x2054, + 0x2057, 0x2057, + 0x205f, 0x2063, + 0x206a, 0x2071, + 0x2074, 0x208e, + 0x20a0, 0x20b1, + 0x20d0, 0x20ea, + 0x2100, 0x213b, + 0x213d, 0x214b, + 0x2153, 0x2183, + 0x2190, 0x23d0, + 0x2400, 0x2426, + 0x2440, 0x244a, + 0x2460, 0x2617, + 0x2619, 0x267d, + 0x2680, 0x2691, + 0x26a0, 0x26a1, + 0x2701, 0x2704, + 0x2706, 0x2709, + 0x270c, 0x2727, + 0x2729, 0x274b, + 0x274d, 0x274d, + 0x274f, 0x2752, + 0x2756, 0x2756, + 0x2758, 0x275e, + 0x2761, 0x2794, + 0x2798, 0x27af, + 0x27b1, 0x27be, + 0x27d0, 0x27eb, + 0x27f0, 0x2b0d, + 0x2e80, 0x2e99, + 0x2e9b, 0x2ef3, + 0x2f00, 0x2fd5, + 0x2ff0, 0x2ffb, + 0x3000, 0x303f, + 0x3041, 0x3096, + 0x3099, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3190, 0x31b7, + 0x31f0, 0x321e, + 0x3220, 0x3243, + 0x3250, 0x327d, + 0x327f, 0x32fe, + 0x3300, 0x4db5, + 0x4dc0, 0x9fa5, + 0xa000, 0xa48c, + 0xa490, 0xa4c6, + 0xac00, 0xd7a3, + 0xe000, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3f, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfd, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe30, 0xfe52, + 0xfe54, 0xfe66, + 0xfe68, 0xfe6b, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xfeff, 0xfeff, + 0xff01, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0xffe0, 0xffe6, + 0xffe8, 0xffee, + 0xfff9, 0xfffd, + 0x10000, 0x1000b, + 
0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10100, 0x10102, + 0x10107, 0x10133, + 0x10137, 0x1013f, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x1039f, 0x1039f, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d000, 0x1d0f5, + 0x1d100, 0x1d126, + 0x1d12a, 0x1d1dd, + 0x1d300, 0x1d356, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xe0100, 0xe01ef, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRPrint */ + +static OnigCodePoint CRPunct[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 86, +#else + 14, +#endif + 0x0021, 0x0023, + 0x0025, 0x002a, + 0x002c, 0x002f, + 0x003a, 0x003b, + 0x003f, 0x0040, + 0x005b, 0x005d, + 0x005f, 0x005f, + 0x007b, 0x007b, + 0x007d, 0x007d, + 0x00a1, 0x00a1, + 0x00ab, 0x00ab, + 0x00b7, 0x00b7, + 0x00bb, 0x00bb, + 0x00bf, 0x00bf +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x037e, 0x037e, + 0x0387, 0x0387, + 0x055a, 0x055f, + 0x0589, 0x058a, + 0x05be, 0x05be, + 0x05c0, 0x05c0, + 0x05c3, 0x05c3, + 0x05f3, 0x05f4, + 0x060c, 0x060d, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x066a, 0x066d, + 0x06d4, 0x06d4, + 0x0700, 0x070d, + 0x0964, 0x0965, + 0x0970, 0x0970, + 0x0df4, 0x0df4, + 0x0e4f, 0x0e4f, + 0x0e5a, 0x0e5b, + 0x0f04, 0x0f12, + 0x0f3a, 0x0f3d, + 0x0f85, 0x0f85, + 0x104a, 0x104f, + 0x10fb, 0x10fb, + 0x1361, 0x1368, + 
0x166d, 0x166e, + 0x169b, 0x169c, + 0x16eb, 0x16ed, + 0x1735, 0x1736, + 0x17d4, 0x17d6, + 0x17d8, 0x17da, + 0x1800, 0x180a, + 0x1944, 0x1945, + 0x2010, 0x2027, + 0x2030, 0x2043, + 0x2045, 0x2051, + 0x2053, 0x2054, + 0x2057, 0x2057, + 0x207d, 0x207e, + 0x208d, 0x208e, + 0x2329, 0x232a, + 0x23b4, 0x23b6, + 0x2768, 0x2775, + 0x27e6, 0x27eb, + 0x2983, 0x2998, + 0x29d8, 0x29db, + 0x29fc, 0x29fd, + 0x3001, 0x3003, + 0x3008, 0x3011, + 0x3014, 0x301f, + 0x3030, 0x3030, + 0x303d, 0x303d, + 0x30a0, 0x30a0, + 0x30fb, 0x30fb, + 0xfd3e, 0xfd3f, + 0xfe30, 0xfe52, + 0xfe54, 0xfe61, + 0xfe63, 0xfe63, + 0xfe68, 0xfe68, + 0xfe6a, 0xfe6b, + 0xff01, 0xff03, + 0xff05, 0xff0a, + 0xff0c, 0xff0f, + 0xff1a, 0xff1b, + 0xff1f, 0xff20, + 0xff3b, 0xff3d, + 0xff3f, 0xff3f, + 0xff5b, 0xff5b, + 0xff5d, 0xff5d, + 0xff5f, 0xff65, + 0x10100, 0x10101, + 0x1039f, 0x1039f +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRPunct */ + +static OnigCodePoint CRSpace[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 11, +#else + 4, +#endif + 0x0009, 0x000d, + 0x0020, 0x0020, + 0x0085, 0x0085, + 0x00a0, 0x00a0 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x1680, 0x1680, + 0x180e, 0x180e, + 0x2000, 0x200a, + 0x2028, 0x2029, + 0x202f, 0x202f, + 0x205f, 0x205f, + 0x3000, 0x3000 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRSpace */ + +static OnigCodePoint CRUpper[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 421, +#else + 3, +#endif + 0x0041, 0x005a, + 0x00c0, 0x00d6, + 0x00d8, 0x00de +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0100, 0x0100, + 0x0102, 0x0102, + 0x0104, 0x0104, + 0x0106, 0x0106, + 0x0108, 0x0108, + 0x010a, 0x010a, + 0x010c, 0x010c, + 0x010e, 0x010e, + 0x0110, 0x0110, + 0x0112, 0x0112, + 0x0114, 0x0114, + 0x0116, 0x0116, + 0x0118, 0x0118, + 0x011a, 0x011a, + 0x011c, 0x011c, + 0x011e, 0x011e, + 0x0120, 0x0120, + 0x0122, 0x0122, + 0x0124, 0x0124, + 0x0126, 0x0126, + 0x0128, 0x0128, + 0x012a, 0x012a, + 0x012c, 0x012c, + 0x012e, 0x012e, + 0x0130, 0x0130, + 0x0132, 0x0132, + 0x0134, 
0x0134, + 0x0136, 0x0136, + 0x0139, 0x0139, + 0x013b, 0x013b, + 0x013d, 0x013d, + 0x013f, 0x013f, + 0x0141, 0x0141, + 0x0143, 0x0143, + 0x0145, 0x0145, + 0x0147, 0x0147, + 0x014a, 0x014a, + 0x014c, 0x014c, + 0x014e, 0x014e, + 0x0150, 0x0150, + 0x0152, 0x0152, + 0x0154, 0x0154, + 0x0156, 0x0156, + 0x0158, 0x0158, + 0x015a, 0x015a, + 0x015c, 0x015c, + 0x015e, 0x015e, + 0x0160, 0x0160, + 0x0162, 0x0162, + 0x0164, 0x0164, + 0x0166, 0x0166, + 0x0168, 0x0168, + 0x016a, 0x016a, + 0x016c, 0x016c, + 0x016e, 0x016e, + 0x0170, 0x0170, + 0x0172, 0x0172, + 0x0174, 0x0174, + 0x0176, 0x0176, + 0x0178, 0x0179, + 0x017b, 0x017b, + 0x017d, 0x017d, + 0x0181, 0x0182, + 0x0184, 0x0184, + 0x0186, 0x0187, + 0x0189, 0x018b, + 0x018e, 0x0191, + 0x0193, 0x0194, + 0x0196, 0x0198, + 0x019c, 0x019d, + 0x019f, 0x01a0, + 0x01a2, 0x01a2, + 0x01a4, 0x01a4, + 0x01a6, 0x01a7, + 0x01a9, 0x01a9, + 0x01ac, 0x01ac, + 0x01ae, 0x01af, + 0x01b1, 0x01b3, + 0x01b5, 0x01b5, + 0x01b7, 0x01b8, + 0x01bc, 0x01bc, + 0x01c4, 0x01c4, + 0x01c7, 0x01c7, + 0x01ca, 0x01ca, + 0x01cd, 0x01cd, + 0x01cf, 0x01cf, + 0x01d1, 0x01d1, + 0x01d3, 0x01d3, + 0x01d5, 0x01d5, + 0x01d7, 0x01d7, + 0x01d9, 0x01d9, + 0x01db, 0x01db, + 0x01de, 0x01de, + 0x01e0, 0x01e0, + 0x01e2, 0x01e2, + 0x01e4, 0x01e4, + 0x01e6, 0x01e6, + 0x01e8, 0x01e8, + 0x01ea, 0x01ea, + 0x01ec, 0x01ec, + 0x01ee, 0x01ee, + 0x01f1, 0x01f1, + 0x01f4, 0x01f4, + 0x01f6, 0x01f8, + 0x01fa, 0x01fa, + 0x01fc, 0x01fc, + 0x01fe, 0x01fe, + 0x0200, 0x0200, + 0x0202, 0x0202, + 0x0204, 0x0204, + 0x0206, 0x0206, + 0x0208, 0x0208, + 0x020a, 0x020a, + 0x020c, 0x020c, + 0x020e, 0x020e, + 0x0210, 0x0210, + 0x0212, 0x0212, + 0x0214, 0x0214, + 0x0216, 0x0216, + 0x0218, 0x0218, + 0x021a, 0x021a, + 0x021c, 0x021c, + 0x021e, 0x021e, + 0x0220, 0x0220, + 0x0222, 0x0222, + 0x0224, 0x0224, + 0x0226, 0x0226, + 0x0228, 0x0228, + 0x022a, 0x022a, + 0x022c, 0x022c, + 0x022e, 0x022e, + 0x0230, 0x0230, + 0x0232, 0x0232, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x038f, + 0x0391, 
0x03a1, + 0x03a3, 0x03ab, + 0x03d2, 0x03d4, + 0x03d8, 0x03d8, + 0x03da, 0x03da, + 0x03dc, 0x03dc, + 0x03de, 0x03de, + 0x03e0, 0x03e0, + 0x03e2, 0x03e2, + 0x03e4, 0x03e4, + 0x03e6, 0x03e6, + 0x03e8, 0x03e8, + 0x03ea, 0x03ea, + 0x03ec, 0x03ec, + 0x03ee, 0x03ee, + 0x03f4, 0x03f4, + 0x03f7, 0x03f7, + 0x03f9, 0x03fa, + 0x0400, 0x042f, + 0x0460, 0x0460, + 0x0462, 0x0462, + 0x0464, 0x0464, + 0x0466, 0x0466, + 0x0468, 0x0468, + 0x046a, 0x046a, + 0x046c, 0x046c, + 0x046e, 0x046e, + 0x0470, 0x0470, + 0x0472, 0x0472, + 0x0474, 0x0474, + 0x0476, 0x0476, + 0x0478, 0x0478, + 0x047a, 0x047a, + 0x047c, 0x047c, + 0x047e, 0x047e, + 0x0480, 0x0480, + 0x048a, 0x048a, + 0x048c, 0x048c, + 0x048e, 0x048e, + 0x0490, 0x0490, + 0x0492, 0x0492, + 0x0494, 0x0494, + 0x0496, 0x0496, + 0x0498, 0x0498, + 0x049a, 0x049a, + 0x049c, 0x049c, + 0x049e, 0x049e, + 0x04a0, 0x04a0, + 0x04a2, 0x04a2, + 0x04a4, 0x04a4, + 0x04a6, 0x04a6, + 0x04a8, 0x04a8, + 0x04aa, 0x04aa, + 0x04ac, 0x04ac, + 0x04ae, 0x04ae, + 0x04b0, 0x04b0, + 0x04b2, 0x04b2, + 0x04b4, 0x04b4, + 0x04b6, 0x04b6, + 0x04b8, 0x04b8, + 0x04ba, 0x04ba, + 0x04bc, 0x04bc, + 0x04be, 0x04be, + 0x04c0, 0x04c1, + 0x04c3, 0x04c3, + 0x04c5, 0x04c5, + 0x04c7, 0x04c7, + 0x04c9, 0x04c9, + 0x04cb, 0x04cb, + 0x04cd, 0x04cd, + 0x04d0, 0x04d0, + 0x04d2, 0x04d2, + 0x04d4, 0x04d4, + 0x04d6, 0x04d6, + 0x04d8, 0x04d8, + 0x04da, 0x04da, + 0x04dc, 0x04dc, + 0x04de, 0x04de, + 0x04e0, 0x04e0, + 0x04e2, 0x04e2, + 0x04e4, 0x04e4, + 0x04e6, 0x04e6, + 0x04e8, 0x04e8, + 0x04ea, 0x04ea, + 0x04ec, 0x04ec, + 0x04ee, 0x04ee, + 0x04f0, 0x04f0, + 0x04f2, 0x04f2, + 0x04f4, 0x04f4, + 0x04f8, 0x04f8, + 0x0500, 0x0500, + 0x0502, 0x0502, + 0x0504, 0x0504, + 0x0506, 0x0506, + 0x0508, 0x0508, + 0x050a, 0x050a, + 0x050c, 0x050c, + 0x050e, 0x050e, + 0x0531, 0x0556, + 0x10a0, 0x10c5, + 0x1e00, 0x1e00, + 0x1e02, 0x1e02, + 0x1e04, 0x1e04, + 0x1e06, 0x1e06, + 0x1e08, 0x1e08, + 0x1e0a, 0x1e0a, + 0x1e0c, 0x1e0c, + 0x1e0e, 0x1e0e, + 0x1e10, 0x1e10, + 0x1e12, 0x1e12, + 0x1e14, 0x1e14, + 0x1e16, 
0x1e16, + 0x1e18, 0x1e18, + 0x1e1a, 0x1e1a, + 0x1e1c, 0x1e1c, + 0x1e1e, 0x1e1e, + 0x1e20, 0x1e20, + 0x1e22, 0x1e22, + 0x1e24, 0x1e24, + 0x1e26, 0x1e26, + 0x1e28, 0x1e28, + 0x1e2a, 0x1e2a, + 0x1e2c, 0x1e2c, + 0x1e2e, 0x1e2e, + 0x1e30, 0x1e30, + 0x1e32, 0x1e32, + 0x1e34, 0x1e34, + 0x1e36, 0x1e36, + 0x1e38, 0x1e38, + 0x1e3a, 0x1e3a, + 0x1e3c, 0x1e3c, + 0x1e3e, 0x1e3e, + 0x1e40, 0x1e40, + 0x1e42, 0x1e42, + 0x1e44, 0x1e44, + 0x1e46, 0x1e46, + 0x1e48, 0x1e48, + 0x1e4a, 0x1e4a, + 0x1e4c, 0x1e4c, + 0x1e4e, 0x1e4e, + 0x1e50, 0x1e50, + 0x1e52, 0x1e52, + 0x1e54, 0x1e54, + 0x1e56, 0x1e56, + 0x1e58, 0x1e58, + 0x1e5a, 0x1e5a, + 0x1e5c, 0x1e5c, + 0x1e5e, 0x1e5e, + 0x1e60, 0x1e60, + 0x1e62, 0x1e62, + 0x1e64, 0x1e64, + 0x1e66, 0x1e66, + 0x1e68, 0x1e68, + 0x1e6a, 0x1e6a, + 0x1e6c, 0x1e6c, + 0x1e6e, 0x1e6e, + 0x1e70, 0x1e70, + 0x1e72, 0x1e72, + 0x1e74, 0x1e74, + 0x1e76, 0x1e76, + 0x1e78, 0x1e78, + 0x1e7a, 0x1e7a, + 0x1e7c, 0x1e7c, + 0x1e7e, 0x1e7e, + 0x1e80, 0x1e80, + 0x1e82, 0x1e82, + 0x1e84, 0x1e84, + 0x1e86, 0x1e86, + 0x1e88, 0x1e88, + 0x1e8a, 0x1e8a, + 0x1e8c, 0x1e8c, + 0x1e8e, 0x1e8e, + 0x1e90, 0x1e90, + 0x1e92, 0x1e92, + 0x1e94, 0x1e94, + 0x1ea0, 0x1ea0, + 0x1ea2, 0x1ea2, + 0x1ea4, 0x1ea4, + 0x1ea6, 0x1ea6, + 0x1ea8, 0x1ea8, + 0x1eaa, 0x1eaa, + 0x1eac, 0x1eac, + 0x1eae, 0x1eae, + 0x1eb0, 0x1eb0, + 0x1eb2, 0x1eb2, + 0x1eb4, 0x1eb4, + 0x1eb6, 0x1eb6, + 0x1eb8, 0x1eb8, + 0x1eba, 0x1eba, + 0x1ebc, 0x1ebc, + 0x1ebe, 0x1ebe, + 0x1ec0, 0x1ec0, + 0x1ec2, 0x1ec2, + 0x1ec4, 0x1ec4, + 0x1ec6, 0x1ec6, + 0x1ec8, 0x1ec8, + 0x1eca, 0x1eca, + 0x1ecc, 0x1ecc, + 0x1ece, 0x1ece, + 0x1ed0, 0x1ed0, + 0x1ed2, 0x1ed2, + 0x1ed4, 0x1ed4, + 0x1ed6, 0x1ed6, + 0x1ed8, 0x1ed8, + 0x1eda, 0x1eda, + 0x1edc, 0x1edc, + 0x1ede, 0x1ede, + 0x1ee0, 0x1ee0, + 0x1ee2, 0x1ee2, + 0x1ee4, 0x1ee4, + 0x1ee6, 0x1ee6, + 0x1ee8, 0x1ee8, + 0x1eea, 0x1eea, + 0x1eec, 0x1eec, + 0x1eee, 0x1eee, + 0x1ef0, 0x1ef0, + 0x1ef2, 0x1ef2, + 0x1ef4, 0x1ef4, + 0x1ef6, 0x1ef6, + 0x1ef8, 0x1ef8, + 0x1f08, 0x1f0f, + 0x1f18, 0x1f1d, + 0x1f28, 
0x1f2f, + 0x1f38, 0x1f3f, + 0x1f48, 0x1f4d, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f5f, + 0x1f68, 0x1f6f, + 0x1fb8, 0x1fbb, + 0x1fc8, 0x1fcb, + 0x1fd8, 0x1fdb, + 0x1fe8, 0x1fec, + 0x1ff8, 0x1ffb, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210b, 0x210d, + 0x2110, 0x2112, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x2130, 0x2131, + 0x2133, 0x2133, + 0x213e, 0x213f, + 0x2145, 0x2145, + 0xff21, 0xff3a, + 0x10400, 0x10427, + 0x1d400, 0x1d419, + 0x1d434, 0x1d44d, + 0x1d468, 0x1d481, + 0x1d49c, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b5, + 0x1d4d0, 0x1d4e9, + 0x1d504, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d538, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d56c, 0x1d585, + 0x1d5a0, 0x1d5b9, + 0x1d5d4, 0x1d5ed, + 0x1d608, 0x1d621, + 0x1d63c, 0x1d655, + 0x1d670, 0x1d689, + 0x1d6a8, 0x1d6c0, + 0x1d6e2, 0x1d6fa, + 0x1d71c, 0x1d734, + 0x1d756, 0x1d76e, + 0x1d790, 0x1d7a8 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRUpper */ + +static OnigCodePoint CRXDigit[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 3, +#else + 3, +#endif + 0x0030, 0x0039, + 0x0041, 0x0046, + 0x0061, 0x0066 +}; + +static OnigCodePoint CRASCII[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 1, +#else + 1, +#endif + 0x0000, 0x007f +}; + +static OnigCodePoint CRWord[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 436, +#else + 12, +#endif + 0x0030, 0x0039, + 0x0041, 0x005a, + 0x005f, 0x005f, + 0x0061, 0x007a, + 0x00aa, 0x00aa, + 0x00b2, 0x00b3, + 0x00b5, 0x00b5, + 0x00b9, 0x00ba, + 0x00bc, 0x00be, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, +#ifndef USE_UNICODE_FULL_RANGE_CTYPE + 0x00f8, 0x7fffffff +#else /* not USE_UNICODE_FULL_RANGE_CTYPE */ + 0x00f8, 0x0236, + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 
0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 0x05c1, 0x05c2, + 0x05c4, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x0669, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06fc, + 0x06ff, 0x06ff, + 0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0966, 0x096f, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09f1, + 0x09f4, 0x09f9, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b6f, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 
0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bf2, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e50, 0x0e59, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f20, 0x0f33, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1049, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 
0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1369, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x16ee, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x17e0, 0x17e9, + 0x17f0, 0x17f9, + 0x180b, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1946, 0x196d, + 0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x203f, 0x2040, + 0x2054, 0x2054, + 0x2070, 0x2071, + 0x2074, 0x2079, + 0x207f, 0x2089, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x2153, 0x2183, + 0x2460, 0x249b, + 0x24ea, 0x24ff, + 0x2776, 0x2793, + 0x3005, 0x3007, + 0x3021, 0x302f, + 0x3031, 0x3035, + 0x3038, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3192, 0x3195, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3220, 0x3229, + 0x3251, 0x325f, + 0x3280, 0x3289, + 0x32b1, 0x32bf, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 
0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe33, 0xfe34, + 0xfe4d, 0xfe4f, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff10, 0xff19, + 0xff21, 0xff3a, + 0xff3f, 0xff3f, + 0xff41, 0xff5a, + 0xff65, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10107, 0x10133, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of CRWord */ + + +extern int +onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype) +{ + if (code < 256) { + return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); + } + +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + return 
onig_is_in_code_range((UChar* )CRAlpha, code); + break; + case ONIGENC_CTYPE_BLANK: + return onig_is_in_code_range((UChar* )CRBlank, code); + break; + case ONIGENC_CTYPE_CNTRL: + return onig_is_in_code_range((UChar* )CRCntrl, code); + break; + case ONIGENC_CTYPE_DIGIT: + return onig_is_in_code_range((UChar* )CRDigit, code); + break; + case ONIGENC_CTYPE_GRAPH: + return onig_is_in_code_range((UChar* )CRGraph, code); + break; + case ONIGENC_CTYPE_LOWER: + return onig_is_in_code_range((UChar* )CRLower, code); + break; + case ONIGENC_CTYPE_PRINT: + return onig_is_in_code_range((UChar* )CRPrint, code); + break; + case ONIGENC_CTYPE_PUNCT: + return onig_is_in_code_range((UChar* )CRPunct, code); + break; + case ONIGENC_CTYPE_SPACE: + return onig_is_in_code_range((UChar* )CRSpace, code); + break; + case ONIGENC_CTYPE_UPPER: + return onig_is_in_code_range((UChar* )CRUpper, code); + break; + case ONIGENC_CTYPE_XDIGIT: + /* CRXDigit stops at 0x0066 and only code >= 256 reaches this switch */ + return FALSE; + break; + case ONIGENC_CTYPE_WORD: + return onig_is_in_code_range((UChar* )CRWord, code); + break; + case ONIGENC_CTYPE_ASCII: + /* CRASCII stops at 0x007f and only code >= 256 reaches this switch */ + return FALSE; + break; + case ONIGENC_CTYPE_ALNUM: + return onig_is_in_code_range((UChar* )CRAlnum, code); + break; + + default: + /* ctype matched none of the cases above: the error code is returned as-is */ + return ONIGENCERR_TYPE_BUG; + break; + } + +#else + + /* without the full-range tables, a code >= 256 matches only when ctype includes ONIGENC_CTYPE_WORD */ + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + return TRUE; + } + return FALSE; +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +} + +/* + * Hand back the code-range table for one ctype. *sbr always receives the + * one-element { 0 } list and *mbr the matching CR* table; each CR* table + * starts with its number of ranges followed by (first, last) pairs. + * Returns 0, or ONIGENCERR_TYPE_BUG when ctype matches none of the cases. + */ +extern int +onigenc_unicode_get_ctype_code_range(int ctype, + OnigCodePoint* sbr[], OnigCodePoint* mbr[]) +{ + static OnigCodePoint EmptyRange[] = { 0 }; + + /* CR_SET(list): point *mbr at the given table */ +#define CR_SET(list) do { \ + *mbr = list; \ +} while (0) + + *sbr = EmptyRange; + + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + CR_SET(CRAlpha); + break; + case ONIGENC_CTYPE_BLANK: + CR_SET(CRBlank); + break; + case ONIGENC_CTYPE_CNTRL: + CR_SET(CRCntrl); + break; + case ONIGENC_CTYPE_DIGIT: + CR_SET(CRDigit); + break; + case ONIGENC_CTYPE_GRAPH: + CR_SET(CRGraph); + break; + case ONIGENC_CTYPE_LOWER: + CR_SET(CRLower); + break; + case
ONIGENC_CTYPE_PRINT: + CR_SET(CRPrint); + break; + case ONIGENC_CTYPE_PUNCT: + CR_SET(CRPunct); + break; + case ONIGENC_CTYPE_SPACE: + CR_SET(CRSpace); + break; + case ONIGENC_CTYPE_UPPER: + CR_SET(CRUpper); + break; + case ONIGENC_CTYPE_XDIGIT: + CR_SET(CRXDigit); + break; + case ONIGENC_CTYPE_WORD: + CR_SET(CRWord); + break; + case ONIGENC_CTYPE_ASCII: + CR_SET(CRASCII); + break; + case ONIGENC_CTYPE_ALNUM: + CR_SET(CRAlnum); + break; + + default: + return ONIGENCERR_TYPE_BUG; + break; + } + + return 0; +} diff --git a/ext/mbstring/oniguruma/enc/utf16_be.c b/ext/mbstring/oniguruma/enc/utf16_be.c new file mode 100755 index 00000000000..ad33ddbeeb3 --- /dev/null +++ b/ext/mbstring/oniguruma/enc/utf16_be.c @@ -0,0 +1,253 @@ +/********************************************************************** + utf16_be.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regenc.h" + +/* Classify the high-order byte of a 16-bit code unit; arguments are parenthesized so any expression can be passed. */ +#define UTF16_IS_SURROGATE_FIRST(c) ((c) >= 0xd8 && (c) <= 0xdb) +#define UTF16_IS_SURROGATE_SECOND(c) ((c) >= 0xdc && (c) <= 0xdf) + +/* Encoded length in bytes, indexed by the high-order byte of the first code unit: 4 for 0xd8-0xdb, otherwise 2. */ +static int EncLen_UTF16[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +/* Byte length of the character starting at p; p[0] is the byte that selects the table entry. */ +static int +utf16be_mbc_enc_len(const UChar* p) +{ + return EncLen_UTF16[*p]; +} + +/* Returns 1 when the two bytes at p are 0x00 0x0a and both lie before end, otherwise 0. */ +static int +utf16be_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p + 1 < end) { + if (*(p+1) == 0x0a && *p == 0x00) + return 1; + } + return 0; +} + +/* Decode the character at p into a code point; `end` is not consulted, so the caller must supply a complete character. */ +static OnigCodePoint +utf16be_mbc_to_code(const UChar* p, const UChar* end) +{ + OnigCodePoint code; + + if (UTF16_IS_SURROGATE_FIRST(*p)) { + code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16) + + ((((p[1] &
0x3f) << 2) + (p[2] - 0xdc)) << 8) + + p[3]; + } + else { + code = p[0] * 256 + p[1]; + } + return code; +} + +/* Number of bytes needed to encode `code`: 4 above 0xffff, otherwise 2. */ +static int +utf16be_code_to_mbclen(OnigCodePoint code) +{ + return (code > 0xffff ? 4 : 2); +} + +/* + * Encode `code` as UTF-16BE into buf and return the byte count (2 or 4). + * A surrogate pair is laid out so that utf16be_mbc_to_code() reads back the + * same value: byte 0 and the top two bits of byte 1 hold (plane - 1), the + * low six bits of byte 1 plus the low two bits of byte 2 hold bits 15..8 of + * code, and byte 3 holds bits 7..0. + */ +static int +utf16be_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar* p = buf; + + if (code > 0xffff) { + unsigned int plane, high; + + /* the decoder adds 1 to the stored plane bits, so store plane - 1 */ + plane = (code >> 16) - 1; + *p++ = (plane >> 2) + 0xd8; + high = (code & 0xff00) >> 8; + *p++ = ((plane & 0x03) << 6) + (high >> 2); + /* byte 1 only took high >> 2, so both remaining low bits go here */ + *p++ = (high & 0x03) + 0xdc; + *p = (UChar )(code & 0xff); + return 4; + } + else { + *p++ = (UChar )((code & 0xff00) >> 8); + *p++ = (UChar )(code & 0xff); + return 2; + } +} + +/* + * Write the normalized form of the character at *pp into lower, advance *pp + * past the consumed input and return the number of bytes written. Under + * ONIGENC_AMBIGUOUS_MATCH_COMPOUND, "ss" (and "SS" when ASCII_CASE is also + * set) is consumed as four bytes and written as 0x00 0xdf. + */ +static int +utf16be_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) +{ + const UChar* p = *pp; + + if (*p == 0) { + p++; + if (end > p + 2 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+2) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+2) == 'S'))) && + *(p+1) == 0) { + *lower++ = '\0'; + *lower = 0xdf; + (*pp) += 4; + return 2; + } + + *lower++ = '\0'; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp) += 2; + return 2; /* return byte length of converted char to lower */ + } + else { + int len; + len = EncLen_UTF16[*p]; + if (lower != p) { + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + +static int +utf16be_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ + const UChar* p = *pp; + + (*pp) += EncLen_UTF16[*p]; + + if (*p == 0) { + int c, v; + + p++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 2 && + ((*p == 's' && *(p+2) == 's') || +
((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+2) == 'S'))) && + *(p+1) == 0) { + (*pp) += 2; + return TRUE; + } + else if (*p == 0xdf) { + return TRUE; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + c = *p; + v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + + /* test v itself: OR-ing it with ONIGENC_CTYPE_LOWER made this branch unconditional */ + if (v != 0) { + /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ + if (c >= 0xaa && c <= 0xba) + return FALSE; + else + return TRUE; + } + /* c is neither UPPER nor LOWER, so it has no case variant */ + return FALSE; + } + } + + return FALSE; +} + +/* + * Move s back to the first byte of the character that contains it: align s + * to a two-byte unit counted from start, then step back one more unit when + * that unit is a trail surrogate and a unit precedes it. + */ +static UChar* +utf16be_left_adjust_char_head(const UChar* start, const UChar* s) +{ + if (s <= start) return (UChar* )s; + + if ((s - start) % 2 == 1) { + s--; + } + + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + s -= 2; + + return (UChar* )s; +} + +OnigEncodingType OnigEncodingUTF16_BE = { + utf16be_mbc_enc_len, + "UTF-16BE", /* name */ + 4, /* max byte length */ + 2, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?'
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + utf16be_is_mbc_newline, + utf16be_mbc_to_code, + utf16be_code_to_mbclen, + utf16be_code_to_mbc, + utf16be_mbc_to_normalize, + utf16be_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + onigenc_unicode_is_code_ctype, + onigenc_unicode_get_ctype_code_range, + utf16be_left_adjust_char_head, + onigenc_always_false_is_allowed_reverse_match +}; diff --git a/ext/mbstring/oniguruma/enc/utf16_le.c b/ext/mbstring/oniguruma/enc/utf16_le.c new file mode 100755 index 00000000000..db892dcd14d --- /dev/null +++ b/ext/mbstring/oniguruma/enc/utf16_le.c @@ -0,0 +1,248 @@ +/********************************************************************** + utf16_le.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regenc.h" + +/* Classify the high-order byte of a 16-bit code unit; arguments are parenthesized so expressions such as *(s+1) expand safely. */ +#define UTF16_IS_SURROGATE_FIRST(c) ((c) >= 0xd8 && (c) <= 0xdb) +#define UTF16_IS_SURROGATE_SECOND(c) ((c) >= 0xdc && (c) <= 0xdf) + +/* Encoded length in bytes, indexed by the high-order byte of the first code unit: 4 for 0xd8-0xdb, otherwise 2. */ +static int EncLen_UTF16[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +/* Number of bytes needed to encode `code`: 4 above 0xffff, otherwise 2. */ +static int +utf16le_code_to_mbclen(OnigCodePoint code) +{ + return (code > 0xffff ?
4 : 2); +} + +/* Byte length of the character starting at p; the second byte (high-order in little-endian) selects the table entry. */ +static int +utf16le_mbc_enc_len(const UChar* p) +{ + return EncLen_UTF16[*(p+1)]; +} + +/* Returns 1 when the two bytes at p are 0x0a 0x00 and both lie before end, otherwise 0. */ +static int +utf16le_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p + 1 < end) { + if (*p == 0x0a && *(p+1) == 0x00) + return 1; + } + return 0; +} + +/* Decode the character at p into a code point; `end` is not consulted, so the caller must supply a complete character. */ +static OnigCodePoint +utf16le_mbc_to_code(const UChar* p, const UChar* end) +{ + OnigCodePoint code; + UChar c0 = *p; + UChar c1 = *(p+1); + + if (UTF16_IS_SURROGATE_FIRST(c1)) { + code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16) + + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8) + + p[2]; + } + else { + code = c1 * 256 + p[0]; + } + return code; +} + +/* + * Encode `code` as UTF-16LE into buf and return the byte count (2 or 4). + * A surrogate pair is laid out so that utf16le_mbc_to_code() reads back the + * same value: byte 1 and the top two bits of byte 0 hold (plane - 1), the + * low six bits of byte 0 plus the low two bits of byte 3 hold bits 15..8 of + * code, and byte 2 holds bits 7..0. + */ +static int +utf16le_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar* p = buf; + + if (code > 0xffff) { + unsigned int plane, high; + + /* the decoder adds 1 to the stored plane bits, so store plane - 1 */ + plane = (code >> 16) - 1; + high = (code & 0xff00) >> 8; + + *p++ = ((plane & 0x03) << 6) + (high >> 2); + *p++ = (plane >> 2) + 0xd8; + *p++ = (UChar )(code & 0xff); + /* byte 0 only took high >> 2, so both remaining low bits go here */ + *p = (high & 0x03) + 0xdc; + return 4; + } + else { + *p++ = (UChar )(code & 0xff); + *p++ = (UChar )((code & 0xff00) >> 8); + return 2; + } +} + +/* + * Write the normalized form of the character at *pp into lower, advance *pp + * past the consumed input and return the number of bytes written. Under + * ONIGENC_AMBIGUOUS_MATCH_COMPOUND, "ss" (and "SS" when ASCII_CASE is also + * set) is consumed as four bytes and written as 0xdf 0x00. + */ +static int +utf16le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) +{ + const UChar* p = *pp; + + if (*(p+1) == 0) { + if (end > p + 3 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+2) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+2) == 'S'))) && + *(p+3) == 0) { + *lower++ = 0xdf; + *lower = '\0'; + (*pp) += 4; + return 2; + } + + *(lower+1) = '\0'; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp) += 2; + return 2; /* return byte length of converted char to lower */ + } + else { + int len = EncLen_UTF16[*(p+1)]; + if (lower != p) { + int i; + for (i = 0; i < len; i++) { + *lower++
= *p++; + } + } + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + +/* + * Advance *pp past the character it points at and report whether that + * character needs ambiguity handling under `flag`: a compound "ss"/"SS" + * (which also consumes the second letter), U+00DF under the COMPOUND flag, + * or a Latin-1 letter classified as UPPER or LOWER outside 0xaa..0xba. + */ +static int +utf16le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ + const UChar* p = *pp; + + (*pp) += EncLen_UTF16[*(p+1)]; + + if (*(p+1) == 0) { + int c, v; + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 3 && + ((*p == 's' && *(p+2) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+2) == 'S'))) && + *(p+3) == 0) { + (*pp) += 2; + return TRUE; + } + /* 0xdf is what "ss" normalizes to, so it is ambiguous too; utf16be_is_mbc_ambiguous() has the same branch */ + else if (*p == 0xdf) { + return TRUE; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + c = *p; + v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + /* test v itself: OR-ing it with ONIGENC_CTYPE_LOWER made this branch unconditional */ + if (v != 0) { + /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ + if (c >= 0xaa && c <= 0xba) + return FALSE; + else + return TRUE; + } + /* c is neither UPPER nor LOWER, so it has no case variant */ + return FALSE; + } + } + + return FALSE; +} + +/* + * Move s back to the first byte of the character that contains it: align s + * to a two-byte unit counted from start, then step back one more unit when + * that unit's second byte marks a trail surrogate and a unit precedes it. + */ +static UChar* +utf16le_left_adjust_char_head(const UChar* start, const UChar* s) +{ + if (s <= start) return (UChar* )s; + + if ((s - start) % 2 == 1) { + s--; + } + + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + s -= 2; + + return (UChar* )s; +} + +OnigEncodingType OnigEncodingUTF16_LE = { + utf16le_mbc_enc_len, + "UTF-16LE", /* name */ + 4, /* max byte length */ + 2, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?'
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + utf16le_is_mbc_newline, + utf16le_mbc_to_code, + utf16le_code_to_mbclen, + utf16le_code_to_mbc, + utf16le_mbc_to_normalize, + utf16le_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + onigenc_unicode_is_code_ctype, + onigenc_unicode_get_ctype_code_range, + utf16le_left_adjust_char_head, + onigenc_always_false_is_allowed_reverse_match +}; diff --git a/ext/mbstring/oniguruma/enc/utf32_be.c b/ext/mbstring/oniguruma/enc/utf32_be.c new file mode 100755 index 00000000000..60feb040b82 --- /dev/null +++ b/ext/mbstring/oniguruma/enc/utf32_be.c @@ -0,0 +1,208 @@ +/********************************************************************** + utf32_be.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regenc.h" + +static int +utf32be_mbc_enc_len(const UChar* p) +{ + return 4; +} + +static int +utf32be_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p + 3 < end) { + if (*(p+3) == 0x0a && *(p+2) == 0 && *(p+1) == 0 && *p == 0) + return 1; + } + return 0; +} + +static OnigCodePoint +utf32be_mbc_to_code(const UChar* p, const UChar* end) +{ /* Combine the four bytes at p, most significant first, into one code point; end is not consulted. */ + return (OnigCodePoint )((((OnigCodePoint )p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); /* widen p[0] before multiplying: in promoted int arithmetic a top byte >= 0x80 can overflow */ +} + +static int +utf32be_code_to_mbclen(OnigCodePoint code) +{ + return 4; +} + +static int +utf32be_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar* p = buf; + + *p++ = (UChar )((code & 0xff000000) >>24); + *p++ = (UChar )((code & 0xff0000) >>16); + *p++ = (UChar )((code & 0xff00) >> 8); + *p++ = (UChar ) (code & 0xff); + return 4; +} + +static int +utf32be_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) +{ + const UChar* p = *pp; + + if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { + p += 3; /* p now addresses the low-order byte of the current character */ + if (end > p + 4 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+4) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+4) == 'S'))) && + *(p+3) == 0 && *(p+2) == 0 && *(p+1) == 0) { + *lower++ = '\0'; + *lower++ = '\0'; + *lower++ = '\0'; + *lower = 0xdf; + (*pp) += 8; + return 4; + } + + *lower++ = '\0'; + *lower++ = '\0'; + *lower++ = '\0'; + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && +
ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp) += 4; + return 4; /* return byte length of converted char to lower */ + } + else { + int len = 4; + if (lower != p) { + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + +static int +utf32be_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ /* Advances *pp by 4 (by 8 on an 'ss'/'SS' compound match) and returns TRUE when the character at the old *pp is treated as ambiguous under flag, else FALSE. */ + const UChar* p = *pp; + + (*pp) += 4; + + if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { + int c, v; + + p += 3; /* p now addresses the low-order byte of the current character */ + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 4 && + ((*p == 's' && *(p+4) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+4) == 'S'))) && + *(p+3) == 0 && *(p+2) == 0 && *(p+1) == 0) { + (*pp) += 4; + return TRUE; + } + else if (*p == 0xdf) { + return TRUE; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + c = *p; + v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + if (v != 0) { /* only when the UPPER|LOWER ctype test matched c; OR-ing a flag constant into v made the previous test ignore v */ + /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ + if (c >= 0xaa && c <= 0xba) + return FALSE; + else + return TRUE; + } + return (v != 0 ?
TRUE : FALSE); + } + } + + return FALSE; +} + +static UChar* +utf32be_left_adjust_char_head(const UChar* start, const UChar* s) +{ /* Returns s moved back to a multiple of 4 bytes from start; s itself when s <= start. */ + int rem; + + if (s <= start) return (UChar* )s; + + rem = (s - start) % 4; /* byte offset of s inside its 4-byte unit */ + return (UChar* )(s - rem); +} + +OnigEncodingType OnigEncodingUTF32_BE = { + utf32be_mbc_enc_len, + "UTF-32BE", /* name */ + 4, /* max byte length */ + 4, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + utf32be_is_mbc_newline, + utf32be_mbc_to_code, + utf32be_code_to_mbclen, + utf32be_code_to_mbc, + utf32be_mbc_to_normalize, + utf32be_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + onigenc_unicode_is_code_ctype, + onigenc_unicode_get_ctype_code_range, + utf32be_left_adjust_char_head, + onigenc_always_false_is_allowed_reverse_match +}; diff --git a/ext/mbstring/oniguruma/enc/utf32_le.c b/ext/mbstring/oniguruma/enc/utf32_le.c new file mode 100755 index 00000000000..bba9689f761 --- /dev/null +++ b/ext/mbstring/oniguruma/enc/utf32_le.c @@ -0,0 +1,206 @@ +/********************************************************************** + utf32_le.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "regenc.h" + +static int +utf32le_mbc_enc_len(const UChar* p) +{ + return 4; +} + +static int +utf32le_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p + 3 < end) { + if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) + return 1; + } + return 0; +} + +static OnigCodePoint +utf32le_mbc_to_code(const UChar* p, const UChar* end) +{ /* Combine the four bytes at p, least significant first, into one code point; end is not consulted. */ + return (OnigCodePoint )((((OnigCodePoint )p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); /* widen p[3] before multiplying: in promoted int arithmetic a top byte >= 0x80 can overflow */ +} + +static int +utf32le_code_to_mbclen(OnigCodePoint code) +{ + return 4; +} + +static int +utf32le_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar* p = buf; + + *p++ = (UChar ) (code & 0xff); + *p++ = (UChar )((code & 0xff00) >> 8); + *p++ = (UChar )((code & 0xff0000) >>16); + *p++ = (UChar )((code & 0xff000000) >>24); + return 4; +} + +static int +utf32le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) +{ + const UChar* p = *pp; + + if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { + if (end > p + 7 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+4) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+4) == 'S'))) && + *(p+5) == 0 && *(p+6) == 0 && *(p+7) == 0) { + *lower++ = 0xdf; + *lower++ = '\0'; + *lower++ = '\0'; + *lower = '\0'; + (*pp) += 8; + return 4; + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + *lower++ = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + } + else { + *lower++ = *p; + } + *lower++ = '\0'; + *lower++ = '\0'; + *lower = '\0'; + + (*pp) += 4; + return 4; /* return byte length of converted char to lower */ + } + else { + int len = 4; + if (lower != p) { + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + +static int +utf32le_is_mbc_ambiguous(OnigAmbigType flag,
const UChar** pp, const UChar* end) +{ /* Advances *pp by 4 (by 8 on an 'ss'/'SS' compound match) and returns TRUE when the character at the old *pp is treated as ambiguous under flag, else FALSE. */ + const UChar* p = *pp; + + (*pp) += 4; + + if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { + int c, v; + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + if (end > p + 7 && + ((*p == 's' && *(p+4) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+4) == 'S'))) && + *(p+5) == 0 && *(p+6) == 0 && *(p+7) == 0) { + (*pp) += 4; + return TRUE; + } + else if (*p == 0xdf) { + return TRUE; + } + } + + if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + ONIGENC_IS_MBC_ASCII(p)) || + ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && + !ONIGENC_IS_MBC_ASCII(p))) { + c = *p; + v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, + (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); + if (v != 0) { /* only when the UPPER|LOWER ctype test matched c; OR-ing a flag constant into v made the previous test ignore v */ + /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ + if (c >= 0xaa && c <= 0xba) + return FALSE; + else + return TRUE; + } + return (v != 0 ? TRUE : FALSE); /* v is 0 here, so this yields FALSE */ + } + } + + return FALSE; +} + +static UChar* +utf32le_left_adjust_char_head(const UChar* start, const UChar* s) +{ + int rem; + + if (s <= start) return (UChar* )s; + + rem = (s - start) % 4; /* byte offset of s inside its 4-byte unit */ + return (UChar* )(s - rem); +} + +OnigEncodingType OnigEncodingUTF32_LE = { + utf32le_mbc_enc_len, + "UTF-32LE", /* name */ + 4, /* max byte length */ + 4, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?'
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + utf32le_is_mbc_newline, + utf32le_mbc_to_code, + utf32le_code_to_mbclen, + utf32le_code_to_mbc, + utf32le_mbc_to_normalize, + utf32le_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + onigenc_unicode_is_code_ctype, + onigenc_unicode_get_ctype_code_range, + utf32le_left_adjust_char_head, + onigenc_always_false_is_allowed_reverse_match +}; diff --git a/ext/mbstring/oniguruma/enc/utf8.c b/ext/mbstring/oniguruma/enc/utf8.c index 604cfac2ef9..592bebfe8f2 100644 --- a/ext/mbstring/oniguruma/enc/utf8.c +++ b/ext/mbstring/oniguruma/enc/utf8.c @@ -1,60 +1,78 @@ /********************************************************************** - utf8.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +#define USE_INVALID_CODE_SCHEME + +#ifdef USE_INVALID_CODE_SCHEME +/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ +#define INVALID_CODE_FE 0xfffffffe +#define INVALID_CODE_FF 0xffffffff +#define VALID_CODE_LIMIT 0x7fffffff +#endif + #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) -#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ - ((EncUnicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) - -static unsigned short EncUnicode_ISO_8859_1_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x1050, 0x1050, 0x1050, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x18d0, - 0x1050, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 
0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1850, 0x1850, 0x1050, 0x1871, 0x1050, 0x10d0, - 0x1050, 0x1850, 0x1871, 0x10d0, 0x1850, 0x1850, 0x1850, 0x10d0, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1050, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1050, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871 +static int EncLen_UTF8[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 }; 
+static int +utf8_mbc_enc_len(const UChar* p) +{ + return EncLen_UTF8[*p]; +} + static OnigCodePoint -utf8_mbc_to_code(UChar* p, UChar* end) +utf8_mbc_to_code(const UChar* p, const UChar* end) { int c, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_UTF8, p); c = *p++; - len = enc_len(ONIG_ENCODING_UTF8, c); if (len > 1) { len--; n = c & ((1 << (6 - len)) - 1); @@ -64,8 +82,14 @@ utf8_mbc_to_code(UChar* p, UChar* end) } return n; } - else + else { +#ifdef USE_INVALID_CODE_SCHEME + if (c > 0xfd) { + return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); + } +#endif return (OnigCodePoint )c; + } } static int @@ -81,8 +105,12 @@ utf8_code_to_mbclen(OnigCodePoint code) else if ((code & 0xffe00000) == 0) return 4; else if ((code & 0xfc000000) == 0) return 5; else if ((code & 0x80000000) == 0) return 6; +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) return 1; + else if (code == INVALID_CODE_FF) return 1; +#endif else - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } #if 0 @@ -103,7 +131,7 @@ utf8_code_to_mbc_first(OnigCodePoint code) else if ((code & 0x80000000) == 0) return ((code>>30) & 0x01) | 0xfc; else { - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } } } @@ -147,8 +175,18 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); } +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) { + *p = 0xfe; + return 1; + } + else if (code == INVALID_CODE_FF) { + *p = 0xff; + return 1; + } +#endif else { - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } *p++ = UTF8_TRAIL0(code); @@ -157,221 +195,3377 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) } static int -utf8_mbc_to_lower(UChar* p, UChar* lower) +utf8_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* lower) { - int len; + const UChar* p = *pp; - /* !!! 
U+0080 - U+00ff is treated by fold match. !!! */ if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } else { - len = enc_len(ONIG_ENCODING_UTF8, *p); + int len; + + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if (c <= (UChar )'\236' && /* upper */ + (flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c != (UChar )'\227') { + *lower++ = *p; + *lower = (UChar )(c + 32); + (*pp) += 2; + return 2; + } + } +#if 0 + else if (c == (UChar )'\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } +#endif + } + } + + len = enc_len(ONIG_ENCODING_UTF8, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -utf8_mbc_is_case_ambig(UChar* p) +utf8_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) { - /* !!! U+0080 - U+00ff ( 0x80[0xc2,0x80] - 0xff[0xc3,0xbf] ) - is treated by fold match. !!! 
*/ + const UChar* p = *pp; - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + if (ONIGENC_IS_MBC_ASCII(p)) { + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + (*pp) += 2; + return TRUE; + } - return FALSE; -} - -static int -utf8_code_is_ctype(OnigCodePoint code, unsigned int ctype) -{ - if (code < 256) { - return ENC_IS_ISO_8859_1_CTYPE(code, ctype); + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } } + else { + (*pp) += enc_len(ONIG_ENCODING_UTF8, p); - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - return TRUE; + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c <= (UChar )'\236') { /* upper */ + if (c == (UChar )'\227') return FALSE; + return TRUE; + } + else if (c >= (UChar )'\240' && c <= (UChar )'\276') { /* lower */ + if (c == (UChar )'\267') return FALSE; + return TRUE; + } + } + else if (c == (UChar )'\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + return TRUE; + } + } + } } return FALSE; } + +static OnigCodePoint EmptyRange[] = { 0 }; + +static OnigCodePoint SBAlnum[] = { + 3, + 0x0030, 0x0039, + 0x0041, 0x005a, + 0x0061, 0x007a +}; + +static OnigCodePoint MBAlnum[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 411, +#else + 6, +#endif + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, + 0x00f8, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 
0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 0x05c1, 0x05c2, + 0x05c4, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x0669, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06fc, + 0x06ff, 0x06ff, + 0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0966, 0x096f, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09f1, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b6f, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bef, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 
0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e50, 0x0e59, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f20, 0x0f29, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1049, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1369, 0x1371, + 0x13a0, 
0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x17e0, 0x17e9, + 0x180b, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1946, 0x196d, + 0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x3005, 0x3006, + 0x302a, 0x302f, + 0x3031, 0x3035, + 0x303b, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30fa, + 0x30fc, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff10, 0xff19, + 0xff21, 0xff3a, + 0xff41, 0xff5a, + 0xff66, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 
0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10300, 0x1031e, + 0x10330, 0x10349, + 0x10380, 0x1039d, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBAlnum */ + +static OnigCodePoint SBAlpha[] = { + 2, + 0x0041, 0x005a, + 0x0061, 0x007a +}; + +static OnigCodePoint MBAlpha[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 394, +#else + 6, +#endif + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, + 0x00f8, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 0x05c1, 0x05c2, + 0x05c4, 0x05c4, 
+ 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06ef, + 0x06fa, 0x06fc, + 0x06ff, 0x06ff, + 0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09f0, 0x09f1, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a70, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 
0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x13a0, 0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x180b, 0x180d, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1950, 0x196d, + 
0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x3005, 0x3006, + 0x302a, 0x302f, + 0x3031, 0x3035, + 0x303b, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30fa, + 0x30fc, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff21, 0xff3a, + 0xff41, 0xff5a, + 0xff66, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10300, 0x1031e, + 0x10330, 0x10349, + 0x10380, 0x1039d, + 0x10400, 0x1049d, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 
0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBAlpha */ + +static OnigCodePoint SBBlank[] = { + 2, + 0x0009, 0x0009, + 0x0020, 0x0020 +}; + +static OnigCodePoint MBBlank[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 7, +#else + 1, +#endif + 0x00a0, 0x00a0 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x1680, 0x1680, + 0x180e, 0x180e, + 0x2000, 0x200a, + 0x202f, 0x202f, + 0x205f, 0x205f, + 0x3000, 0x3000 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBBlank */ + +static OnigCodePoint SBCntrl[] = { + 2, + 0x0000, 0x001f, + 0x007f, 0x007f +}; + +static OnigCodePoint MBCntrl[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 18, +#else + 2, +#endif + 0x0080, 0x009f, + 0x00ad, 0x00ad +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0600, 0x0603, + 0x06dd, 0x06dd, + 0x070f, 0x070f, + 0x17b4, 0x17b5, + 0x200b, 0x200f, + 0x202a, 0x202e, + 0x2060, 0x2063, + 0x206a, 0x206f, + 0xd800, 0xf8ff, + 0xfeff, 0xfeff, + 0xfff9, 0xfffb, + 0x1d173, 0x1d17a, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBCntrl */ + +static OnigCodePoint SBDigit[] = { + 1, + 0x0030, 0x0039 +}; + +static OnigCodePoint MBDigit[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 22, +#else + 0 +#endif +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 0x0660, 0x0669, + 0x06f0, 0x06f9, + 0x0966, 0x096f, + 0x09e6, 0x09ef, + 0x0a66, 0x0a6f, + 0x0ae6, 0x0aef, + 
0x0b66, 0x0b6f, + 0x0be7, 0x0bef, + 0x0c66, 0x0c6f, + 0x0ce6, 0x0cef, + 0x0d66, 0x0d6f, + 0x0e50, 0x0e59, + 0x0ed0, 0x0ed9, + 0x0f20, 0x0f29, + 0x1040, 0x1049, + 0x1369, 0x1371, + 0x17e0, 0x17e9, + 0x1810, 0x1819, + 0x1946, 0x194f, + 0xff10, 0xff19, + 0x104a0, 0x104a9, + 0x1d7ce, 0x1d7ff +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBDigit */ + +static OnigCodePoint SBGraph[] = { + 1, + 0x0021, 0x007e +}; + +static OnigCodePoint MBGraph[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 404, +#else + 1, +#endif + 0x00a1, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x0357, + 0x035d, 0x036f, + 0x0374, 0x0375, + 0x037a, 0x037a, + 0x037e, 0x037e, + 0x0384, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03fb, + 0x0400, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x055f, + 0x0561, 0x0587, + 0x0589, 0x058a, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f4, + 0x0600, 0x0603, + 0x060c, 0x0615, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x070d, + 0x070f, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0970, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09fa, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 
0x0ae3, + 0x0ae6, 0x0aef, + 0x0af1, 0x0af1, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bfa, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df4, + 0x0e01, 0x0e3a, + 0x0e3f, 0x0e5b, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fbe, 0x0fcc, + 0x0fcf, 0x0fcf, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 
0x1039, + 0x1040, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x10fb, 0x10fb, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1361, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x1676, + 0x1681, 0x169c, + 0x16a0, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1736, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17dd, + 0x17e0, 0x17e9, + 0x17f0, 0x17f9, + 0x1800, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1940, 0x1940, + 0x1944, 0x196d, + 0x1970, 0x1974, + 0x19e0, 0x19ff, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fc4, + 0x1fc6, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fdd, 0x1fef, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffe, + 0x200b, 0x2027, + 0x202a, 0x202e, + 0x2030, 0x2054, + 0x2057, 0x2057, + 0x2060, 0x2063, + 0x206a, 0x2071, + 0x2074, 0x208e, + 0x20a0, 0x20b1, + 0x20d0, 0x20ea, + 0x2100, 0x213b, + 0x213d, 0x214b, + 0x2153, 0x2183, + 0x2190, 0x23d0, + 0x2400, 0x2426, + 0x2440, 0x244a, + 0x2460, 0x2617, + 0x2619, 0x267d, + 0x2680, 0x2691, + 0x26a0, 0x26a1, + 0x2701, 0x2704, + 0x2706, 0x2709, + 0x270c, 0x2727, + 0x2729, 0x274b, + 0x274d, 0x274d, + 0x274f, 0x2752, + 0x2756, 0x2756, + 0x2758, 0x275e, + 0x2761, 0x2794, + 0x2798, 0x27af, + 0x27b1, 0x27be, + 0x27d0, 0x27eb, + 0x27f0, 0x2b0d, + 0x2e80, 0x2e99, + 0x2e9b, 
0x2ef3, + 0x2f00, 0x2fd5, + 0x2ff0, 0x2ffb, + 0x3001, 0x303f, + 0x3041, 0x3096, + 0x3099, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3190, 0x31b7, + 0x31f0, 0x321e, + 0x3220, 0x3243, + 0x3250, 0x327d, + 0x327f, 0x32fe, + 0x3300, 0x4db5, + 0x4dc0, 0x9fa5, + 0xa000, 0xa48c, + 0xa490, 0xa4c6, + 0xac00, 0xd7a3, + 0xe000, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3f, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfd, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe30, 0xfe52, + 0xfe54, 0xfe66, + 0xfe68, 0xfe6b, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xfeff, 0xfeff, + 0xff01, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0xffe0, 0xffe6, + 0xffe8, 0xffee, + 0xfff9, 0xfffd, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10100, 0x10102, + 0x10107, 0x10133, + 0x10137, 0x1013f, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x1039f, 0x1039f, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d000, 0x1d0f5, + 0x1d100, 0x1d126, + 0x1d12a, 0x1d1dd, + 0x1d300, 0x1d356, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xe0100, 0xe01ef, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ 
+}; /* end of MBGraph */ + +static OnigCodePoint SBLower[] = { + 1, + 0x0061, 0x007a +}; + +static OnigCodePoint MBLower[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 423, +#else + 5, +#endif + 0x00aa, 0x00aa, + 0x00b5, 0x00b5, + 0x00ba, 0x00ba, + 0x00df, 0x00f6, + 0x00f8, 0x00ff +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0101, 0x0101, + 0x0103, 0x0103, + 0x0105, 0x0105, + 0x0107, 0x0107, + 0x0109, 0x0109, + 0x010b, 0x010b, + 0x010d, 0x010d, + 0x010f, 0x010f, + 0x0111, 0x0111, + 0x0113, 0x0113, + 0x0115, 0x0115, + 0x0117, 0x0117, + 0x0119, 0x0119, + 0x011b, 0x011b, + 0x011d, 0x011d, + 0x011f, 0x011f, + 0x0121, 0x0121, + 0x0123, 0x0123, + 0x0125, 0x0125, + 0x0127, 0x0127, + 0x0129, 0x0129, + 0x012b, 0x012b, + 0x012d, 0x012d, + 0x012f, 0x012f, + 0x0131, 0x0131, + 0x0133, 0x0133, + 0x0135, 0x0135, + 0x0137, 0x0138, + 0x013a, 0x013a, + 0x013c, 0x013c, + 0x013e, 0x013e, + 0x0140, 0x0140, + 0x0142, 0x0142, + 0x0144, 0x0144, + 0x0146, 0x0146, + 0x0148, 0x0149, + 0x014b, 0x014b, + 0x014d, 0x014d, + 0x014f, 0x014f, + 0x0151, 0x0151, + 0x0153, 0x0153, + 0x0155, 0x0155, + 0x0157, 0x0157, + 0x0159, 0x0159, + 0x015b, 0x015b, + 0x015d, 0x015d, + 0x015f, 0x015f, + 0x0161, 0x0161, + 0x0163, 0x0163, + 0x0165, 0x0165, + 0x0167, 0x0167, + 0x0169, 0x0169, + 0x016b, 0x016b, + 0x016d, 0x016d, + 0x016f, 0x016f, + 0x0171, 0x0171, + 0x0173, 0x0173, + 0x0175, 0x0175, + 0x0177, 0x0177, + 0x017a, 0x017a, + 0x017c, 0x017c, + 0x017e, 0x0180, + 0x0183, 0x0183, + 0x0185, 0x0185, + 0x0188, 0x0188, + 0x018c, 0x018d, + 0x0192, 0x0192, + 0x0195, 0x0195, + 0x0199, 0x019b, + 0x019e, 0x019e, + 0x01a1, 0x01a1, + 0x01a3, 0x01a3, + 0x01a5, 0x01a5, + 0x01a8, 0x01a8, + 0x01aa, 0x01ab, + 0x01ad, 0x01ad, + 0x01b0, 0x01b0, + 0x01b4, 0x01b4, + 0x01b6, 0x01b6, + 0x01b9, 0x01ba, + 0x01bd, 0x01bf, + 0x01c6, 0x01c6, + 0x01c9, 0x01c9, + 0x01cc, 0x01cc, + 0x01ce, 0x01ce, + 0x01d0, 0x01d0, + 0x01d2, 0x01d2, + 0x01d4, 0x01d4, + 0x01d6, 0x01d6, + 0x01d8, 0x01d8, + 0x01da, 0x01da, + 0x01dc, 0x01dd, + 0x01df, 0x01df, + 
0x01e1, 0x01e1, + 0x01e3, 0x01e3, + 0x01e5, 0x01e5, + 0x01e7, 0x01e7, + 0x01e9, 0x01e9, + 0x01eb, 0x01eb, + 0x01ed, 0x01ed, + 0x01ef, 0x01f0, + 0x01f3, 0x01f3, + 0x01f5, 0x01f5, + 0x01f9, 0x01f9, + 0x01fb, 0x01fb, + 0x01fd, 0x01fd, + 0x01ff, 0x01ff, + 0x0201, 0x0201, + 0x0203, 0x0203, + 0x0205, 0x0205, + 0x0207, 0x0207, + 0x0209, 0x0209, + 0x020b, 0x020b, + 0x020d, 0x020d, + 0x020f, 0x020f, + 0x0211, 0x0211, + 0x0213, 0x0213, + 0x0215, 0x0215, + 0x0217, 0x0217, + 0x0219, 0x0219, + 0x021b, 0x021b, + 0x021d, 0x021d, + 0x021f, 0x021f, + 0x0221, 0x0221, + 0x0223, 0x0223, + 0x0225, 0x0225, + 0x0227, 0x0227, + 0x0229, 0x0229, + 0x022b, 0x022b, + 0x022d, 0x022d, + 0x022f, 0x022f, + 0x0231, 0x0231, + 0x0233, 0x0236, + 0x0250, 0x02af, + 0x0390, 0x0390, + 0x03ac, 0x03ce, + 0x03d0, 0x03d1, + 0x03d5, 0x03d7, + 0x03d9, 0x03d9, + 0x03db, 0x03db, + 0x03dd, 0x03dd, + 0x03df, 0x03df, + 0x03e1, 0x03e1, + 0x03e3, 0x03e3, + 0x03e5, 0x03e5, + 0x03e7, 0x03e7, + 0x03e9, 0x03e9, + 0x03eb, 0x03eb, + 0x03ed, 0x03ed, + 0x03ef, 0x03f3, + 0x03f5, 0x03f5, + 0x03f8, 0x03f8, + 0x03fb, 0x03fb, + 0x0430, 0x045f, + 0x0461, 0x0461, + 0x0463, 0x0463, + 0x0465, 0x0465, + 0x0467, 0x0467, + 0x0469, 0x0469, + 0x046b, 0x046b, + 0x046d, 0x046d, + 0x046f, 0x046f, + 0x0471, 0x0471, + 0x0473, 0x0473, + 0x0475, 0x0475, + 0x0477, 0x0477, + 0x0479, 0x0479, + 0x047b, 0x047b, + 0x047d, 0x047d, + 0x047f, 0x047f, + 0x0481, 0x0481, + 0x048b, 0x048b, + 0x048d, 0x048d, + 0x048f, 0x048f, + 0x0491, 0x0491, + 0x0493, 0x0493, + 0x0495, 0x0495, + 0x0497, 0x0497, + 0x0499, 0x0499, + 0x049b, 0x049b, + 0x049d, 0x049d, + 0x049f, 0x049f, + 0x04a1, 0x04a1, + 0x04a3, 0x04a3, + 0x04a5, 0x04a5, + 0x04a7, 0x04a7, + 0x04a9, 0x04a9, + 0x04ab, 0x04ab, + 0x04ad, 0x04ad, + 0x04af, 0x04af, + 0x04b1, 0x04b1, + 0x04b3, 0x04b3, + 0x04b5, 0x04b5, + 0x04b7, 0x04b7, + 0x04b9, 0x04b9, + 0x04bb, 0x04bb, + 0x04bd, 0x04bd, + 0x04bf, 0x04bf, + 0x04c2, 0x04c2, + 0x04c4, 0x04c4, + 0x04c6, 0x04c6, + 0x04c8, 0x04c8, + 0x04ca, 0x04ca, + 0x04cc, 0x04cc, + 
0x04ce, 0x04ce, + 0x04d1, 0x04d1, + 0x04d3, 0x04d3, + 0x04d5, 0x04d5, + 0x04d7, 0x04d7, + 0x04d9, 0x04d9, + 0x04db, 0x04db, + 0x04dd, 0x04dd, + 0x04df, 0x04df, + 0x04e1, 0x04e1, + 0x04e3, 0x04e3, + 0x04e5, 0x04e5, + 0x04e7, 0x04e7, + 0x04e9, 0x04e9, + 0x04eb, 0x04eb, + 0x04ed, 0x04ed, + 0x04ef, 0x04ef, + 0x04f1, 0x04f1, + 0x04f3, 0x04f3, + 0x04f5, 0x04f5, + 0x04f9, 0x04f9, + 0x0501, 0x0501, + 0x0503, 0x0503, + 0x0505, 0x0505, + 0x0507, 0x0507, + 0x0509, 0x0509, + 0x050b, 0x050b, + 0x050d, 0x050d, + 0x050f, 0x050f, + 0x0561, 0x0587, + 0x1d00, 0x1d2b, + 0x1d62, 0x1d6b, + 0x1e01, 0x1e01, + 0x1e03, 0x1e03, + 0x1e05, 0x1e05, + 0x1e07, 0x1e07, + 0x1e09, 0x1e09, + 0x1e0b, 0x1e0b, + 0x1e0d, 0x1e0d, + 0x1e0f, 0x1e0f, + 0x1e11, 0x1e11, + 0x1e13, 0x1e13, + 0x1e15, 0x1e15, + 0x1e17, 0x1e17, + 0x1e19, 0x1e19, + 0x1e1b, 0x1e1b, + 0x1e1d, 0x1e1d, + 0x1e1f, 0x1e1f, + 0x1e21, 0x1e21, + 0x1e23, 0x1e23, + 0x1e25, 0x1e25, + 0x1e27, 0x1e27, + 0x1e29, 0x1e29, + 0x1e2b, 0x1e2b, + 0x1e2d, 0x1e2d, + 0x1e2f, 0x1e2f, + 0x1e31, 0x1e31, + 0x1e33, 0x1e33, + 0x1e35, 0x1e35, + 0x1e37, 0x1e37, + 0x1e39, 0x1e39, + 0x1e3b, 0x1e3b, + 0x1e3d, 0x1e3d, + 0x1e3f, 0x1e3f, + 0x1e41, 0x1e41, + 0x1e43, 0x1e43, + 0x1e45, 0x1e45, + 0x1e47, 0x1e47, + 0x1e49, 0x1e49, + 0x1e4b, 0x1e4b, + 0x1e4d, 0x1e4d, + 0x1e4f, 0x1e4f, + 0x1e51, 0x1e51, + 0x1e53, 0x1e53, + 0x1e55, 0x1e55, + 0x1e57, 0x1e57, + 0x1e59, 0x1e59, + 0x1e5b, 0x1e5b, + 0x1e5d, 0x1e5d, + 0x1e5f, 0x1e5f, + 0x1e61, 0x1e61, + 0x1e63, 0x1e63, + 0x1e65, 0x1e65, + 0x1e67, 0x1e67, + 0x1e69, 0x1e69, + 0x1e6b, 0x1e6b, + 0x1e6d, 0x1e6d, + 0x1e6f, 0x1e6f, + 0x1e71, 0x1e71, + 0x1e73, 0x1e73, + 0x1e75, 0x1e75, + 0x1e77, 0x1e77, + 0x1e79, 0x1e79, + 0x1e7b, 0x1e7b, + 0x1e7d, 0x1e7d, + 0x1e7f, 0x1e7f, + 0x1e81, 0x1e81, + 0x1e83, 0x1e83, + 0x1e85, 0x1e85, + 0x1e87, 0x1e87, + 0x1e89, 0x1e89, + 0x1e8b, 0x1e8b, + 0x1e8d, 0x1e8d, + 0x1e8f, 0x1e8f, + 0x1e91, 0x1e91, + 0x1e93, 0x1e93, + 0x1e95, 0x1e9b, + 0x1ea1, 0x1ea1, + 0x1ea3, 0x1ea3, + 0x1ea5, 0x1ea5, + 0x1ea7, 0x1ea7, + 
0x1ea9, 0x1ea9, + 0x1eab, 0x1eab, + 0x1ead, 0x1ead, + 0x1eaf, 0x1eaf, + 0x1eb1, 0x1eb1, + 0x1eb3, 0x1eb3, + 0x1eb5, 0x1eb5, + 0x1eb7, 0x1eb7, + 0x1eb9, 0x1eb9, + 0x1ebb, 0x1ebb, + 0x1ebd, 0x1ebd, + 0x1ebf, 0x1ebf, + 0x1ec1, 0x1ec1, + 0x1ec3, 0x1ec3, + 0x1ec5, 0x1ec5, + 0x1ec7, 0x1ec7, + 0x1ec9, 0x1ec9, + 0x1ecb, 0x1ecb, + 0x1ecd, 0x1ecd, + 0x1ecf, 0x1ecf, + 0x1ed1, 0x1ed1, + 0x1ed3, 0x1ed3, + 0x1ed5, 0x1ed5, + 0x1ed7, 0x1ed7, + 0x1ed9, 0x1ed9, + 0x1edb, 0x1edb, + 0x1edd, 0x1edd, + 0x1edf, 0x1edf, + 0x1ee1, 0x1ee1, + 0x1ee3, 0x1ee3, + 0x1ee5, 0x1ee5, + 0x1ee7, 0x1ee7, + 0x1ee9, 0x1ee9, + 0x1eeb, 0x1eeb, + 0x1eed, 0x1eed, + 0x1eef, 0x1eef, + 0x1ef1, 0x1ef1, + 0x1ef3, 0x1ef3, + 0x1ef5, 0x1ef5, + 0x1ef7, 0x1ef7, + 0x1ef9, 0x1ef9, + 0x1f00, 0x1f07, + 0x1f10, 0x1f15, + 0x1f20, 0x1f27, + 0x1f30, 0x1f37, + 0x1f40, 0x1f45, + 0x1f50, 0x1f57, + 0x1f60, 0x1f67, + 0x1f70, 0x1f7d, + 0x1f80, 0x1f87, + 0x1f90, 0x1f97, + 0x1fa0, 0x1fa7, + 0x1fb0, 0x1fb4, + 0x1fb6, 0x1fb7, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fc7, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fd7, + 0x1fe0, 0x1fe7, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ff7, + 0x2071, 0x2071, + 0x207f, 0x207f, + 0x210a, 0x210a, + 0x210e, 0x210f, + 0x2113, 0x2113, + 0x212f, 0x212f, + 0x2134, 0x2134, + 0x2139, 0x2139, + 0x213d, 0x213d, + 0x2146, 0x2149, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xff41, 0xff5a, + 0x10428, 0x1044f, + 0x1d41a, 0x1d433, + 0x1d44e, 0x1d454, + 0x1d456, 0x1d467, + 0x1d482, 0x1d49b, + 0x1d4b6, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d4cf, + 0x1d4ea, 0x1d503, + 0x1d51e, 0x1d537, + 0x1d552, 0x1d56b, + 0x1d586, 0x1d59f, + 0x1d5ba, 0x1d5d3, + 0x1d5ee, 0x1d607, + 0x1d622, 0x1d63b, + 0x1d656, 0x1d66f, + 0x1d68a, 0x1d6a3, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6e1, + 0x1d6fc, 0x1d714, + 0x1d716, 0x1d71b, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d755, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d78f, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBLower */ + +static OnigCodePoint 
SBPrint[] = { + 2, + 0x0009, 0x000d, + 0x0020, 0x007e +}; + +static OnigCodePoint MBPrint[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 403, +#else + 2, +#endif + 0x0085, 0x0085, + 0x00a0, 0x0236 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0250, 0x0357, + 0x035d, 0x036f, + 0x0374, 0x0375, + 0x037a, 0x037a, + 0x037e, 0x037e, + 0x0384, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03fb, + 0x0400, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x055f, + 0x0561, 0x0587, + 0x0589, 0x058a, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f4, + 0x0600, 0x0603, + 0x060c, 0x0615, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x070d, + 0x070f, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0970, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09fa, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0af1, 0x0af1, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, 
+ 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bfa, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df4, + 0x0e01, 0x0e3a, + 0x0e3f, 0x0e5b, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fbe, 0x0fcc, + 0x0fcf, 0x0fcf, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x10fb, 0x10fb, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 
0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1361, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x1676, + 0x1680, 0x169c, + 0x16a0, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1736, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17dd, + 0x17e0, 0x17e9, + 0x17f0, 0x17f9, + 0x1800, 0x180e, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1940, 0x1940, + 0x1944, 0x196d, + 0x1970, 0x1974, + 0x19e0, 0x19ff, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fc4, + 0x1fc6, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fdd, 0x1fef, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffe, + 0x2000, 0x2054, + 0x2057, 0x2057, + 0x205f, 0x2063, + 0x206a, 0x2071, + 0x2074, 0x208e, + 0x20a0, 0x20b1, + 0x20d0, 0x20ea, + 0x2100, 0x213b, + 0x213d, 0x214b, + 0x2153, 0x2183, + 0x2190, 0x23d0, + 0x2400, 0x2426, + 0x2440, 0x244a, + 0x2460, 0x2617, + 0x2619, 0x267d, + 0x2680, 0x2691, + 0x26a0, 0x26a1, + 0x2701, 0x2704, + 0x2706, 0x2709, + 0x270c, 0x2727, + 0x2729, 0x274b, + 0x274d, 0x274d, + 0x274f, 0x2752, + 0x2756, 0x2756, + 0x2758, 0x275e, + 0x2761, 0x2794, + 0x2798, 0x27af, + 0x27b1, 0x27be, + 0x27d0, 0x27eb, + 0x27f0, 0x2b0d, + 0x2e80, 0x2e99, + 0x2e9b, 0x2ef3, + 0x2f00, 0x2fd5, + 0x2ff0, 0x2ffb, + 0x3000, 0x303f, + 0x3041, 0x3096, + 0x3099, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3190, 0x31b7, + 0x31f0, 0x321e, + 0x3220, 0x3243, + 0x3250, 0x327d, + 0x327f, 0x32fe, + 0x3300, 0x4db5, + 0x4dc0, 0x9fa5, + 0xa000, 0xa48c, + 0xa490, 0xa4c6, + 0xac00, 0xd7a3, + 0xe000, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 
0xfb1d, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3f, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfd, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe30, 0xfe52, + 0xfe54, 0xfe66, + 0xfe68, 0xfe6b, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xfeff, 0xfeff, + 0xff01, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0xffe0, 0xffe6, + 0xffe8, 0xffee, + 0xfff9, 0xfffd, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10100, 0x10102, + 0x10107, 0x10133, + 0x10137, 0x1013f, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x1039f, 0x1039f, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d000, 0x1d0f5, + 0x1d100, 0x1d126, + 0x1d12a, 0x1d1dd, + 0x1d300, 0x1d356, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xe0100, 0xe01ef, + 0xf0000, 0xffffd, + 0x100000, 0x10fffd +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBPrint */ + +static OnigCodePoint SBPunct[] = { + 9, + 0x0021, 0x0023, + 0x0025, 0x002a, + 0x002c, 0x002f, + 0x003a, 0x003b, + 0x003f, 0x0040, + 0x005b, 0x005d, + 0x005f, 0x005f, + 0x007b, 0x007b, + 0x007d, 0x007d +}; /* end of SBPunct */ + +static OnigCodePoint MBPunct[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 77, +#else + 5, +#endif + 0x00a1, 0x00a1, + 0x00ab, 
0x00ab, + 0x00b7, 0x00b7, + 0x00bb, 0x00bb, + 0x00bf, 0x00bf +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x037e, 0x037e, + 0x0387, 0x0387, + 0x055a, 0x055f, + 0x0589, 0x058a, + 0x05be, 0x05be, + 0x05c0, 0x05c0, + 0x05c3, 0x05c3, + 0x05f3, 0x05f4, + 0x060c, 0x060d, + 0x061b, 0x061b, + 0x061f, 0x061f, + 0x066a, 0x066d, + 0x06d4, 0x06d4, + 0x0700, 0x070d, + 0x0964, 0x0965, + 0x0970, 0x0970, + 0x0df4, 0x0df4, + 0x0e4f, 0x0e4f, + 0x0e5a, 0x0e5b, + 0x0f04, 0x0f12, + 0x0f3a, 0x0f3d, + 0x0f85, 0x0f85, + 0x104a, 0x104f, + 0x10fb, 0x10fb, + 0x1361, 0x1368, + 0x166d, 0x166e, + 0x169b, 0x169c, + 0x16eb, 0x16ed, + 0x1735, 0x1736, + 0x17d4, 0x17d6, + 0x17d8, 0x17da, + 0x1800, 0x180a, + 0x1944, 0x1945, + 0x2010, 0x2027, + 0x2030, 0x2043, + 0x2045, 0x2051, + 0x2053, 0x2054, + 0x2057, 0x2057, + 0x207d, 0x207e, + 0x208d, 0x208e, + 0x2329, 0x232a, + 0x23b4, 0x23b6, + 0x2768, 0x2775, + 0x27e6, 0x27eb, + 0x2983, 0x2998, + 0x29d8, 0x29db, + 0x29fc, 0x29fd, + 0x3001, 0x3003, + 0x3008, 0x3011, + 0x3014, 0x301f, + 0x3030, 0x3030, + 0x303d, 0x303d, + 0x30a0, 0x30a0, + 0x30fb, 0x30fb, + 0xfd3e, 0xfd3f, + 0xfe30, 0xfe52, + 0xfe54, 0xfe61, + 0xfe63, 0xfe63, + 0xfe68, 0xfe68, + 0xfe6a, 0xfe6b, + 0xff01, 0xff03, + 0xff05, 0xff0a, + 0xff0c, 0xff0f, + 0xff1a, 0xff1b, + 0xff1f, 0xff20, + 0xff3b, 0xff3d, + 0xff3f, 0xff3f, + 0xff5b, 0xff5b, + 0xff5d, 0xff5d, + 0xff5f, 0xff65, + 0x10100, 0x10101, + 0x1039f, 0x1039f +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBPunct */ + +static OnigCodePoint SBSpace[] = { + 2, + 0x0009, 0x000d, + 0x0020, 0x0020 +}; + +static OnigCodePoint MBSpace[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 9, +#else + 2, +#endif + 0x0085, 0x0085, + 0x00a0, 0x00a0 +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x1680, 0x1680, + 0x180e, 0x180e, + 0x2000, 0x200a, + 0x2028, 0x2029, + 0x202f, 0x202f, + 0x205f, 0x205f, + 0x3000, 0x3000 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBSpace */ + +static OnigCodePoint SBUpper[] = { + 1, + 0x0041, 0x005a +}; + +static 
OnigCodePoint MBUpper[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 420, +#else + 2, +#endif + 0x00c0, 0x00d6, + 0x00d8, 0x00de +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + , + 0x0100, 0x0100, + 0x0102, 0x0102, + 0x0104, 0x0104, + 0x0106, 0x0106, + 0x0108, 0x0108, + 0x010a, 0x010a, + 0x010c, 0x010c, + 0x010e, 0x010e, + 0x0110, 0x0110, + 0x0112, 0x0112, + 0x0114, 0x0114, + 0x0116, 0x0116, + 0x0118, 0x0118, + 0x011a, 0x011a, + 0x011c, 0x011c, + 0x011e, 0x011e, + 0x0120, 0x0120, + 0x0122, 0x0122, + 0x0124, 0x0124, + 0x0126, 0x0126, + 0x0128, 0x0128, + 0x012a, 0x012a, + 0x012c, 0x012c, + 0x012e, 0x012e, + 0x0130, 0x0130, + 0x0132, 0x0132, + 0x0134, 0x0134, + 0x0136, 0x0136, + 0x0139, 0x0139, + 0x013b, 0x013b, + 0x013d, 0x013d, + 0x013f, 0x013f, + 0x0141, 0x0141, + 0x0143, 0x0143, + 0x0145, 0x0145, + 0x0147, 0x0147, + 0x014a, 0x014a, + 0x014c, 0x014c, + 0x014e, 0x014e, + 0x0150, 0x0150, + 0x0152, 0x0152, + 0x0154, 0x0154, + 0x0156, 0x0156, + 0x0158, 0x0158, + 0x015a, 0x015a, + 0x015c, 0x015c, + 0x015e, 0x015e, + 0x0160, 0x0160, + 0x0162, 0x0162, + 0x0164, 0x0164, + 0x0166, 0x0166, + 0x0168, 0x0168, + 0x016a, 0x016a, + 0x016c, 0x016c, + 0x016e, 0x016e, + 0x0170, 0x0170, + 0x0172, 0x0172, + 0x0174, 0x0174, + 0x0176, 0x0176, + 0x0178, 0x0179, + 0x017b, 0x017b, + 0x017d, 0x017d, + 0x0181, 0x0182, + 0x0184, 0x0184, + 0x0186, 0x0187, + 0x0189, 0x018b, + 0x018e, 0x0191, + 0x0193, 0x0194, + 0x0196, 0x0198, + 0x019c, 0x019d, + 0x019f, 0x01a0, + 0x01a2, 0x01a2, + 0x01a4, 0x01a4, + 0x01a6, 0x01a7, + 0x01a9, 0x01a9, + 0x01ac, 0x01ac, + 0x01ae, 0x01af, + 0x01b1, 0x01b3, + 0x01b5, 0x01b5, + 0x01b7, 0x01b8, + 0x01bc, 0x01bc, + 0x01c4, 0x01c4, + 0x01c7, 0x01c7, + 0x01ca, 0x01ca, + 0x01cd, 0x01cd, + 0x01cf, 0x01cf, + 0x01d1, 0x01d1, + 0x01d3, 0x01d3, + 0x01d5, 0x01d5, + 0x01d7, 0x01d7, + 0x01d9, 0x01d9, + 0x01db, 0x01db, + 0x01de, 0x01de, + 0x01e0, 0x01e0, + 0x01e2, 0x01e2, + 0x01e4, 0x01e4, + 0x01e6, 0x01e6, + 0x01e8, 0x01e8, + 0x01ea, 0x01ea, + 0x01ec, 0x01ec, + 0x01ee, 0x01ee, + 0x01f1, 
0x01f1, + 0x01f4, 0x01f4, + 0x01f6, 0x01f8, + 0x01fa, 0x01fa, + 0x01fc, 0x01fc, + 0x01fe, 0x01fe, + 0x0200, 0x0200, + 0x0202, 0x0202, + 0x0204, 0x0204, + 0x0206, 0x0206, + 0x0208, 0x0208, + 0x020a, 0x020a, + 0x020c, 0x020c, + 0x020e, 0x020e, + 0x0210, 0x0210, + 0x0212, 0x0212, + 0x0214, 0x0214, + 0x0216, 0x0216, + 0x0218, 0x0218, + 0x021a, 0x021a, + 0x021c, 0x021c, + 0x021e, 0x021e, + 0x0220, 0x0220, + 0x0222, 0x0222, + 0x0224, 0x0224, + 0x0226, 0x0226, + 0x0228, 0x0228, + 0x022a, 0x022a, + 0x022c, 0x022c, + 0x022e, 0x022e, + 0x0230, 0x0230, + 0x0232, 0x0232, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x038f, + 0x0391, 0x03a1, + 0x03a3, 0x03ab, + 0x03d2, 0x03d4, + 0x03d8, 0x03d8, + 0x03da, 0x03da, + 0x03dc, 0x03dc, + 0x03de, 0x03de, + 0x03e0, 0x03e0, + 0x03e2, 0x03e2, + 0x03e4, 0x03e4, + 0x03e6, 0x03e6, + 0x03e8, 0x03e8, + 0x03ea, 0x03ea, + 0x03ec, 0x03ec, + 0x03ee, 0x03ee, + 0x03f4, 0x03f4, + 0x03f7, 0x03f7, + 0x03f9, 0x03fa, + 0x0400, 0x042f, + 0x0460, 0x0460, + 0x0462, 0x0462, + 0x0464, 0x0464, + 0x0466, 0x0466, + 0x0468, 0x0468, + 0x046a, 0x046a, + 0x046c, 0x046c, + 0x046e, 0x046e, + 0x0470, 0x0470, + 0x0472, 0x0472, + 0x0474, 0x0474, + 0x0476, 0x0476, + 0x0478, 0x0478, + 0x047a, 0x047a, + 0x047c, 0x047c, + 0x047e, 0x047e, + 0x0480, 0x0480, + 0x048a, 0x048a, + 0x048c, 0x048c, + 0x048e, 0x048e, + 0x0490, 0x0490, + 0x0492, 0x0492, + 0x0494, 0x0494, + 0x0496, 0x0496, + 0x0498, 0x0498, + 0x049a, 0x049a, + 0x049c, 0x049c, + 0x049e, 0x049e, + 0x04a0, 0x04a0, + 0x04a2, 0x04a2, + 0x04a4, 0x04a4, + 0x04a6, 0x04a6, + 0x04a8, 0x04a8, + 0x04aa, 0x04aa, + 0x04ac, 0x04ac, + 0x04ae, 0x04ae, + 0x04b0, 0x04b0, + 0x04b2, 0x04b2, + 0x04b4, 0x04b4, + 0x04b6, 0x04b6, + 0x04b8, 0x04b8, + 0x04ba, 0x04ba, + 0x04bc, 0x04bc, + 0x04be, 0x04be, + 0x04c0, 0x04c1, + 0x04c3, 0x04c3, + 0x04c5, 0x04c5, + 0x04c7, 0x04c7, + 0x04c9, 0x04c9, + 0x04cb, 0x04cb, + 0x04cd, 0x04cd, + 0x04d0, 0x04d0, + 0x04d2, 0x04d2, + 0x04d4, 0x04d4, + 0x04d6, 0x04d6, + 0x04d8, 0x04d8, + 0x04da, 
0x04da, + 0x04dc, 0x04dc, + 0x04de, 0x04de, + 0x04e0, 0x04e0, + 0x04e2, 0x04e2, + 0x04e4, 0x04e4, + 0x04e6, 0x04e6, + 0x04e8, 0x04e8, + 0x04ea, 0x04ea, + 0x04ec, 0x04ec, + 0x04ee, 0x04ee, + 0x04f0, 0x04f0, + 0x04f2, 0x04f2, + 0x04f4, 0x04f4, + 0x04f8, 0x04f8, + 0x0500, 0x0500, + 0x0502, 0x0502, + 0x0504, 0x0504, + 0x0506, 0x0506, + 0x0508, 0x0508, + 0x050a, 0x050a, + 0x050c, 0x050c, + 0x050e, 0x050e, + 0x0531, 0x0556, + 0x10a0, 0x10c5, + 0x1e00, 0x1e00, + 0x1e02, 0x1e02, + 0x1e04, 0x1e04, + 0x1e06, 0x1e06, + 0x1e08, 0x1e08, + 0x1e0a, 0x1e0a, + 0x1e0c, 0x1e0c, + 0x1e0e, 0x1e0e, + 0x1e10, 0x1e10, + 0x1e12, 0x1e12, + 0x1e14, 0x1e14, + 0x1e16, 0x1e16, + 0x1e18, 0x1e18, + 0x1e1a, 0x1e1a, + 0x1e1c, 0x1e1c, + 0x1e1e, 0x1e1e, + 0x1e20, 0x1e20, + 0x1e22, 0x1e22, + 0x1e24, 0x1e24, + 0x1e26, 0x1e26, + 0x1e28, 0x1e28, + 0x1e2a, 0x1e2a, + 0x1e2c, 0x1e2c, + 0x1e2e, 0x1e2e, + 0x1e30, 0x1e30, + 0x1e32, 0x1e32, + 0x1e34, 0x1e34, + 0x1e36, 0x1e36, + 0x1e38, 0x1e38, + 0x1e3a, 0x1e3a, + 0x1e3c, 0x1e3c, + 0x1e3e, 0x1e3e, + 0x1e40, 0x1e40, + 0x1e42, 0x1e42, + 0x1e44, 0x1e44, + 0x1e46, 0x1e46, + 0x1e48, 0x1e48, + 0x1e4a, 0x1e4a, + 0x1e4c, 0x1e4c, + 0x1e4e, 0x1e4e, + 0x1e50, 0x1e50, + 0x1e52, 0x1e52, + 0x1e54, 0x1e54, + 0x1e56, 0x1e56, + 0x1e58, 0x1e58, + 0x1e5a, 0x1e5a, + 0x1e5c, 0x1e5c, + 0x1e5e, 0x1e5e, + 0x1e60, 0x1e60, + 0x1e62, 0x1e62, + 0x1e64, 0x1e64, + 0x1e66, 0x1e66, + 0x1e68, 0x1e68, + 0x1e6a, 0x1e6a, + 0x1e6c, 0x1e6c, + 0x1e6e, 0x1e6e, + 0x1e70, 0x1e70, + 0x1e72, 0x1e72, + 0x1e74, 0x1e74, + 0x1e76, 0x1e76, + 0x1e78, 0x1e78, + 0x1e7a, 0x1e7a, + 0x1e7c, 0x1e7c, + 0x1e7e, 0x1e7e, + 0x1e80, 0x1e80, + 0x1e82, 0x1e82, + 0x1e84, 0x1e84, + 0x1e86, 0x1e86, + 0x1e88, 0x1e88, + 0x1e8a, 0x1e8a, + 0x1e8c, 0x1e8c, + 0x1e8e, 0x1e8e, + 0x1e90, 0x1e90, + 0x1e92, 0x1e92, + 0x1e94, 0x1e94, + 0x1ea0, 0x1ea0, + 0x1ea2, 0x1ea2, + 0x1ea4, 0x1ea4, + 0x1ea6, 0x1ea6, + 0x1ea8, 0x1ea8, + 0x1eaa, 0x1eaa, + 0x1eac, 0x1eac, + 0x1eae, 0x1eae, + 0x1eb0, 0x1eb0, + 0x1eb2, 0x1eb2, + 0x1eb4, 0x1eb4, + 0x1eb6, 
0x1eb6, + 0x1eb8, 0x1eb8, + 0x1eba, 0x1eba, + 0x1ebc, 0x1ebc, + 0x1ebe, 0x1ebe, + 0x1ec0, 0x1ec0, + 0x1ec2, 0x1ec2, + 0x1ec4, 0x1ec4, + 0x1ec6, 0x1ec6, + 0x1ec8, 0x1ec8, + 0x1eca, 0x1eca, + 0x1ecc, 0x1ecc, + 0x1ece, 0x1ece, + 0x1ed0, 0x1ed0, + 0x1ed2, 0x1ed2, + 0x1ed4, 0x1ed4, + 0x1ed6, 0x1ed6, + 0x1ed8, 0x1ed8, + 0x1eda, 0x1eda, + 0x1edc, 0x1edc, + 0x1ede, 0x1ede, + 0x1ee0, 0x1ee0, + 0x1ee2, 0x1ee2, + 0x1ee4, 0x1ee4, + 0x1ee6, 0x1ee6, + 0x1ee8, 0x1ee8, + 0x1eea, 0x1eea, + 0x1eec, 0x1eec, + 0x1eee, 0x1eee, + 0x1ef0, 0x1ef0, + 0x1ef2, 0x1ef2, + 0x1ef4, 0x1ef4, + 0x1ef6, 0x1ef6, + 0x1ef8, 0x1ef8, + 0x1f08, 0x1f0f, + 0x1f18, 0x1f1d, + 0x1f28, 0x1f2f, + 0x1f38, 0x1f3f, + 0x1f48, 0x1f4d, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f5f, + 0x1f68, 0x1f6f, + 0x1fb8, 0x1fbb, + 0x1fc8, 0x1fcb, + 0x1fd8, 0x1fdb, + 0x1fe8, 0x1fec, + 0x1ff8, 0x1ffb, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210b, 0x210d, + 0x2110, 0x2112, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x2130, 0x2131, + 0x2133, 0x2133, + 0x213e, 0x213f, + 0x2145, 0x2145, + 0xff21, 0xff3a, + 0x10400, 0x10427, + 0x1d400, 0x1d419, + 0x1d434, 0x1d44d, + 0x1d468, 0x1d481, + 0x1d49c, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b5, + 0x1d4d0, 0x1d4e9, + 0x1d504, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d538, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d56c, 0x1d585, + 0x1d5a0, 0x1d5b9, + 0x1d5d4, 0x1d5ed, + 0x1d608, 0x1d621, + 0x1d63c, 0x1d655, + 0x1d670, 0x1d689, + 0x1d6a8, 0x1d6c0, + 0x1d6e2, 0x1d6fa, + 0x1d71c, 0x1d734, + 0x1d756, 0x1d76e, + 0x1d790, 0x1d7a8 +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBUpper */ + +static OnigCodePoint SBXDigit[] = { + 3, + 0x0030, 0x0039, + 0x0041, 0x0046, + 0x0061, 0x0066 +}; + +static OnigCodePoint SBASCII[] = { + 1, + 0x0000, 0x007f +}; + 
+static OnigCodePoint SBWord[] = { + 4, + 0x0030, 0x0039, + 0x0041, 0x005a, + 0x005f, 0x005f, + 0x0061, 0x007a +}; + +static OnigCodePoint MBWord[] = { +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + 432, +#else + 8, +#endif + 0x00aa, 0x00aa, + 0x00b2, 0x00b3, + 0x00b5, 0x00b5, + 0x00b9, 0x00ba, + 0x00bc, 0x00be, + 0x00c0, 0x00d6, + 0x00d8, 0x00f6, +#ifndef USE_UNICODE_FULL_RANGE_CTYPE + 0x00f8, 0x7fffffff +#else /* not USE_UNICODE_FULL_RANGE_CTYPE */ + 0x00f8, 0x0236, + 0x0250, 0x02c1, + 0x02c6, 0x02d1, + 0x02e0, 0x02e4, + 0x02ee, 0x02ee, + 0x0300, 0x0357, + 0x035d, 0x036f, + 0x037a, 0x037a, + 0x0386, 0x0386, + 0x0388, 0x038a, + 0x038c, 0x038c, + 0x038e, 0x03a1, + 0x03a3, 0x03ce, + 0x03d0, 0x03f5, + 0x03f7, 0x03fb, + 0x0400, 0x0481, + 0x0483, 0x0486, + 0x0488, 0x04ce, + 0x04d0, 0x04f5, + 0x04f8, 0x04f9, + 0x0500, 0x050f, + 0x0531, 0x0556, + 0x0559, 0x0559, + 0x0561, 0x0587, + 0x0591, 0x05a1, + 0x05a3, 0x05b9, + 0x05bb, 0x05bd, + 0x05bf, 0x05bf, + 0x05c1, 0x05c2, + 0x05c4, 0x05c4, + 0x05d0, 0x05ea, + 0x05f0, 0x05f2, + 0x0610, 0x0615, + 0x0621, 0x063a, + 0x0640, 0x0658, + 0x0660, 0x0669, + 0x066e, 0x06d3, + 0x06d5, 0x06dc, + 0x06de, 0x06e8, + 0x06ea, 0x06fc, + 0x06ff, 0x06ff, + 0x0710, 0x074a, + 0x074d, 0x074f, + 0x0780, 0x07b1, + 0x0901, 0x0939, + 0x093c, 0x094d, + 0x0950, 0x0954, + 0x0958, 0x0963, + 0x0966, 0x096f, + 0x0981, 0x0983, + 0x0985, 0x098c, + 0x098f, 0x0990, + 0x0993, 0x09a8, + 0x09aa, 0x09b0, + 0x09b2, 0x09b2, + 0x09b6, 0x09b9, + 0x09bc, 0x09c4, + 0x09c7, 0x09c8, + 0x09cb, 0x09cd, + 0x09d7, 0x09d7, + 0x09dc, 0x09dd, + 0x09df, 0x09e3, + 0x09e6, 0x09f1, + 0x09f4, 0x09f9, + 0x0a01, 0x0a03, + 0x0a05, 0x0a0a, + 0x0a0f, 0x0a10, + 0x0a13, 0x0a28, + 0x0a2a, 0x0a30, + 0x0a32, 0x0a33, + 0x0a35, 0x0a36, + 0x0a38, 0x0a39, + 0x0a3c, 0x0a3c, + 0x0a3e, 0x0a42, + 0x0a47, 0x0a48, + 0x0a4b, 0x0a4d, + 0x0a59, 0x0a5c, + 0x0a5e, 0x0a5e, + 0x0a66, 0x0a74, + 0x0a81, 0x0a83, + 0x0a85, 0x0a8d, + 0x0a8f, 0x0a91, + 0x0a93, 0x0aa8, + 0x0aaa, 0x0ab0, + 0x0ab2, 0x0ab3, + 0x0ab5, 0x0ab9, + 
0x0abc, 0x0ac5, + 0x0ac7, 0x0ac9, + 0x0acb, 0x0acd, + 0x0ad0, 0x0ad0, + 0x0ae0, 0x0ae3, + 0x0ae6, 0x0aef, + 0x0b01, 0x0b03, + 0x0b05, 0x0b0c, + 0x0b0f, 0x0b10, + 0x0b13, 0x0b28, + 0x0b2a, 0x0b30, + 0x0b32, 0x0b33, + 0x0b35, 0x0b39, + 0x0b3c, 0x0b43, + 0x0b47, 0x0b48, + 0x0b4b, 0x0b4d, + 0x0b56, 0x0b57, + 0x0b5c, 0x0b5d, + 0x0b5f, 0x0b61, + 0x0b66, 0x0b6f, + 0x0b71, 0x0b71, + 0x0b82, 0x0b83, + 0x0b85, 0x0b8a, + 0x0b8e, 0x0b90, + 0x0b92, 0x0b95, + 0x0b99, 0x0b9a, + 0x0b9c, 0x0b9c, + 0x0b9e, 0x0b9f, + 0x0ba3, 0x0ba4, + 0x0ba8, 0x0baa, + 0x0bae, 0x0bb5, + 0x0bb7, 0x0bb9, + 0x0bbe, 0x0bc2, + 0x0bc6, 0x0bc8, + 0x0bca, 0x0bcd, + 0x0bd7, 0x0bd7, + 0x0be7, 0x0bf2, + 0x0c01, 0x0c03, + 0x0c05, 0x0c0c, + 0x0c0e, 0x0c10, + 0x0c12, 0x0c28, + 0x0c2a, 0x0c33, + 0x0c35, 0x0c39, + 0x0c3e, 0x0c44, + 0x0c46, 0x0c48, + 0x0c4a, 0x0c4d, + 0x0c55, 0x0c56, + 0x0c60, 0x0c61, + 0x0c66, 0x0c6f, + 0x0c82, 0x0c83, + 0x0c85, 0x0c8c, + 0x0c8e, 0x0c90, + 0x0c92, 0x0ca8, + 0x0caa, 0x0cb3, + 0x0cb5, 0x0cb9, + 0x0cbc, 0x0cc4, + 0x0cc6, 0x0cc8, + 0x0cca, 0x0ccd, + 0x0cd5, 0x0cd6, + 0x0cde, 0x0cde, + 0x0ce0, 0x0ce1, + 0x0ce6, 0x0cef, + 0x0d02, 0x0d03, + 0x0d05, 0x0d0c, + 0x0d0e, 0x0d10, + 0x0d12, 0x0d28, + 0x0d2a, 0x0d39, + 0x0d3e, 0x0d43, + 0x0d46, 0x0d48, + 0x0d4a, 0x0d4d, + 0x0d57, 0x0d57, + 0x0d60, 0x0d61, + 0x0d66, 0x0d6f, + 0x0d82, 0x0d83, + 0x0d85, 0x0d96, + 0x0d9a, 0x0db1, + 0x0db3, 0x0dbb, + 0x0dbd, 0x0dbd, + 0x0dc0, 0x0dc6, + 0x0dca, 0x0dca, + 0x0dcf, 0x0dd4, + 0x0dd6, 0x0dd6, + 0x0dd8, 0x0ddf, + 0x0df2, 0x0df3, + 0x0e01, 0x0e3a, + 0x0e40, 0x0e4e, + 0x0e50, 0x0e59, + 0x0e81, 0x0e82, + 0x0e84, 0x0e84, + 0x0e87, 0x0e88, + 0x0e8a, 0x0e8a, + 0x0e8d, 0x0e8d, + 0x0e94, 0x0e97, + 0x0e99, 0x0e9f, + 0x0ea1, 0x0ea3, + 0x0ea5, 0x0ea5, + 0x0ea7, 0x0ea7, + 0x0eaa, 0x0eab, + 0x0ead, 0x0eb9, + 0x0ebb, 0x0ebd, + 0x0ec0, 0x0ec4, + 0x0ec6, 0x0ec6, + 0x0ec8, 0x0ecd, + 0x0ed0, 0x0ed9, + 0x0edc, 0x0edd, + 0x0f00, 0x0f00, + 0x0f18, 0x0f19, + 0x0f20, 0x0f33, + 0x0f35, 0x0f35, + 0x0f37, 0x0f37, + 0x0f39, 0x0f39, + 
0x0f3e, 0x0f47, + 0x0f49, 0x0f6a, + 0x0f71, 0x0f84, + 0x0f86, 0x0f8b, + 0x0f90, 0x0f97, + 0x0f99, 0x0fbc, + 0x0fc6, 0x0fc6, + 0x1000, 0x1021, + 0x1023, 0x1027, + 0x1029, 0x102a, + 0x102c, 0x1032, + 0x1036, 0x1039, + 0x1040, 0x1049, + 0x1050, 0x1059, + 0x10a0, 0x10c5, + 0x10d0, 0x10f8, + 0x1100, 0x1159, + 0x115f, 0x11a2, + 0x11a8, 0x11f9, + 0x1200, 0x1206, + 0x1208, 0x1246, + 0x1248, 0x1248, + 0x124a, 0x124d, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125a, 0x125d, + 0x1260, 0x1286, + 0x1288, 0x1288, + 0x128a, 0x128d, + 0x1290, 0x12ae, + 0x12b0, 0x12b0, + 0x12b2, 0x12b5, + 0x12b8, 0x12be, + 0x12c0, 0x12c0, + 0x12c2, 0x12c5, + 0x12c8, 0x12ce, + 0x12d0, 0x12d6, + 0x12d8, 0x12ee, + 0x12f0, 0x130e, + 0x1310, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x131e, + 0x1320, 0x1346, + 0x1348, 0x135a, + 0x1369, 0x137c, + 0x13a0, 0x13f4, + 0x1401, 0x166c, + 0x166f, 0x1676, + 0x1681, 0x169a, + 0x16a0, 0x16ea, + 0x16ee, 0x16f0, + 0x1700, 0x170c, + 0x170e, 0x1714, + 0x1720, 0x1734, + 0x1740, 0x1753, + 0x1760, 0x176c, + 0x176e, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17b3, + 0x17b6, 0x17d3, + 0x17d7, 0x17d7, + 0x17dc, 0x17dd, + 0x17e0, 0x17e9, + 0x17f0, 0x17f9, + 0x180b, 0x180d, + 0x1810, 0x1819, + 0x1820, 0x1877, + 0x1880, 0x18a9, + 0x1900, 0x191c, + 0x1920, 0x192b, + 0x1930, 0x193b, + 0x1946, 0x196d, + 0x1970, 0x1974, + 0x1d00, 0x1d6b, + 0x1e00, 0x1e9b, + 0x1ea0, 0x1ef9, + 0x1f00, 0x1f15, + 0x1f18, 0x1f1d, + 0x1f20, 0x1f45, + 0x1f48, 0x1f4d, + 0x1f50, 0x1f57, + 0x1f59, 0x1f59, + 0x1f5b, 0x1f5b, + 0x1f5d, 0x1f5d, + 0x1f5f, 0x1f7d, + 0x1f80, 0x1fb4, + 0x1fb6, 0x1fbc, + 0x1fbe, 0x1fbe, + 0x1fc2, 0x1fc4, + 0x1fc6, 0x1fcc, + 0x1fd0, 0x1fd3, + 0x1fd6, 0x1fdb, + 0x1fe0, 0x1fec, + 0x1ff2, 0x1ff4, + 0x1ff6, 0x1ffc, + 0x203f, 0x2040, + 0x2054, 0x2054, + 0x2070, 0x2071, + 0x2074, 0x2079, + 0x207f, 0x2089, + 0x20d0, 0x20ea, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210a, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211d, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212a, 0x212d, + 0x212f, 0x2131, + 
0x2133, 0x2139, + 0x213d, 0x213f, + 0x2145, 0x2149, + 0x2153, 0x2183, + 0x2460, 0x249b, + 0x24ea, 0x24ff, + 0x2776, 0x2793, + 0x3005, 0x3007, + 0x3021, 0x302f, + 0x3031, 0x3035, + 0x3038, 0x303c, + 0x3041, 0x3096, + 0x3099, 0x309a, + 0x309d, 0x309f, + 0x30a1, 0x30ff, + 0x3105, 0x312c, + 0x3131, 0x318e, + 0x3192, 0x3195, + 0x31a0, 0x31b7, + 0x31f0, 0x31ff, + 0x3220, 0x3229, + 0x3251, 0x325f, + 0x3280, 0x3289, + 0x32b1, 0x32bf, + 0x3400, 0x4db5, + 0x4e00, 0x9fa5, + 0xa000, 0xa48c, + 0xac00, 0xd7a3, + 0xf900, 0xfa2d, + 0xfa30, 0xfa6a, + 0xfb00, 0xfb06, + 0xfb13, 0xfb17, + 0xfb1d, 0xfb28, + 0xfb2a, 0xfb36, + 0xfb38, 0xfb3c, + 0xfb3e, 0xfb3e, + 0xfb40, 0xfb41, + 0xfb43, 0xfb44, + 0xfb46, 0xfbb1, + 0xfbd3, 0xfd3d, + 0xfd50, 0xfd8f, + 0xfd92, 0xfdc7, + 0xfdf0, 0xfdfb, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe23, + 0xfe33, 0xfe34, + 0xfe4d, 0xfe4f, + 0xfe70, 0xfe74, + 0xfe76, 0xfefc, + 0xff10, 0xff19, + 0xff21, 0xff3a, + 0xff3f, 0xff3f, + 0xff41, 0xff5a, + 0xff65, 0xffbe, + 0xffc2, 0xffc7, + 0xffca, 0xffcf, + 0xffd2, 0xffd7, + 0xffda, 0xffdc, + 0x10000, 0x1000b, + 0x1000d, 0x10026, + 0x10028, 0x1003a, + 0x1003c, 0x1003d, + 0x1003f, 0x1004d, + 0x10050, 0x1005d, + 0x10080, 0x100fa, + 0x10107, 0x10133, + 0x10300, 0x1031e, + 0x10320, 0x10323, + 0x10330, 0x1034a, + 0x10380, 0x1039d, + 0x10400, 0x1049d, + 0x104a0, 0x104a9, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080a, 0x10835, + 0x10837, 0x10838, + 0x1083c, 0x1083c, + 0x1083f, 0x1083f, + 0x1d165, 0x1d169, + 0x1d16d, 0x1d172, + 0x1d17b, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d400, 0x1d454, + 0x1d456, 0x1d49c, + 0x1d49e, 0x1d49f, + 0x1d4a2, 0x1d4a2, + 0x1d4a5, 0x1d4a6, + 0x1d4a9, 0x1d4ac, + 0x1d4ae, 0x1d4b9, + 0x1d4bb, 0x1d4bb, + 0x1d4bd, 0x1d4c3, + 0x1d4c5, 0x1d505, + 0x1d507, 0x1d50a, + 0x1d50d, 0x1d514, + 0x1d516, 0x1d51c, + 0x1d51e, 0x1d539, + 0x1d53b, 0x1d53e, + 0x1d540, 0x1d544, + 0x1d546, 0x1d546, + 0x1d54a, 0x1d550, + 0x1d552, 0x1d6a3, + 0x1d6a8, 0x1d6c0, + 0x1d6c2, 0x1d6da, + 0x1d6dc, 0x1d6fa, + 0x1d6fc, 
0x1d714, + 0x1d716, 0x1d734, + 0x1d736, 0x1d74e, + 0x1d750, 0x1d76e, + 0x1d770, 0x1d788, + 0x1d78a, 0x1d7a8, + 0x1d7aa, 0x1d7c2, + 0x1d7c4, 0x1d7c9, + 0x1d7ce, 0x1d7ff, + 0x20000, 0x2a6d6, + 0x2f800, 0x2fa1d, + 0xe0100, 0xe01ef +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ +}; /* end of MBWord */ + + static int -utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, - OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) +utf8_get_ctype_code_range(int ctype, + OnigCodePoint* sbr[], OnigCodePoint* mbr[]) { #define CR_SET(sbl,mbl) do { \ - *nsb = sizeof(sbl) / sizeof(OnigCodePointRange); \ - *nmb = sizeof(mbl) / sizeof(OnigCodePointRange); \ *sbr = sbl; \ *mbr = mbl; \ } while (0) #define CR_SB_SET(sbl) do { \ - *nsb = sizeof(sbl) / sizeof(OnigCodePointRange); \ - *nmb = 0; \ *sbr = sbl; \ + *mbr = EmptyRange; \ } while (0) - static OnigCodePointRange SBAlpha[] = { - { 0x41, 0x5a }, - { 0x61, 0x7a } - }; - - static OnigCodePointRange MBAlpha[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } - }; - - static OnigCodePointRange SBBlank[] = { - { 0x09, 0x09 }, - { 0x20, 0x20 } - }; - - static OnigCodePointRange MBBlank[] = { - { 0xa0, 0xa0 } - }; - - static OnigCodePointRange SBCntrl[] = { - { 0x00, 0x1f }, - { 0x7f, 0x7f } - }; - - static OnigCodePointRange MBCntrl[] = { - { 0x80, 0x9f } - }; - - static OnigCodePointRange SBDigit[] = { - { 0x30, 0x39 } - }; - - static OnigCodePointRange SBGraph[] = { - { 0x21, 0x7e } - }; - - static OnigCodePointRange MBGraph[] = { - { 0xa1, 0x220 } - }; - - static OnigCodePointRange SBLower[] = { - { 0x61, 0x7a } - }; - - static OnigCodePointRange MBLower[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xdf, 0xf6 }, - { 0xf8, 0xff } - }; - - static OnigCodePointRange SBPrint[] = { - { 0x20, 0x7e } - }; - - static OnigCodePointRange MBPrint[] = { - { 0xa0, 0x220 } - }; - - static OnigCodePointRange SBPunct[] = { - { 0x21, 0x23 }, - { 0x25, 0x2a }, - { 0x2c, 
0x2f }, - { 0x3a, 0x3b }, - { 0x3f, 0x40 }, - { 0x5b, 0x5d }, - { 0x5f, 0x5f }, - { 0x7b, 0x7b }, - { 0x7d, 0x7d } - }; - - static OnigCodePointRange MBPunct[] = { - { 0xa1, 0xa1 }, - { 0xab, 0xab }, - { 0xad, 0xad }, - { 0xb7, 0xb7 }, - { 0xbb, 0xbb }, - { 0xbf, 0xbf } - }; - - static OnigCodePointRange SBSpace[] = { - { 0x09, 0x0d }, - { 0x20, 0x20 } - }; - - static OnigCodePointRange MBSpace[] = { - { 0xa0, 0xa0 } - }; - - static OnigCodePointRange SBUpper[] = { - { 0x41, 0x5a } - }; - - static OnigCodePointRange MBUpper[] = { - { 0xc0, 0xd6 }, - { 0xd8, 0xde } - }; - - static OnigCodePointRange SBXDigit[] = { - { 0x30, 0x39 }, - { 0x41, 0x46 }, - { 0x61, 0x66 } - }; - - static OnigCodePointRange SBWord[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x5f, 0x5f }, - { 0x61, 0x7a } - }; - - static OnigCodePointRange MBWord[] = { - { 0xaa, 0xaa }, - { 0xb2, 0xb3 }, - { 0xb5, 0xb5 }, - { 0xb9, 0xba }, - { 0xbc, 0xbe }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, -#if 0 - { 0xf8, 0x220 } -#else - { 0xf8, 0x7fffffff } /* all multibyte code as word */ -#endif - }; - - static OnigCodePointRange SBAscii[] = { - { 0x00, 0x7f } - }; - - static OnigCodePointRange SBAlnum[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x61, 0x7a } - }; - - static OnigCodePointRange MBAlnum[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } - }; - switch (ctype) { case ONIGENC_CTYPE_ALPHA: CR_SET(SBAlpha, MBAlpha); @@ -383,7 +3577,7 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, CR_SET(SBCntrl, MBCntrl); break; case ONIGENC_CTYPE_DIGIT: - CR_SB_SET(SBDigit); + CR_SET(SBDigit, MBDigit); break; case ONIGENC_CTYPE_GRAPH: CR_SET(SBGraph, MBGraph); @@ -410,14 +3604,14 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, CR_SET(SBWord, MBWord); break; case ONIGENC_CTYPE_ASCII: - CR_SB_SET(SBAscii); + CR_SB_SET(SBASCII); break; case ONIGENC_CTYPE_ALNUM: CR_SET(SBAlnum, MBAlnum); break; default: - return ONIGERR_TYPE_BUG; + 
return ONIGENCERR_TYPE_BUG; break; } @@ -425,142 +3619,120 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, } static int -utf8_get_all_fold_match_code(OnigCodePoint** codes) +utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { - static OnigCodePoint list[] = { - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, - 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, - 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + OnigCodePoint *range; +#endif - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, - 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, - 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, - }; - - *codes = list; - return sizeof(list) / sizeof(OnigCodePoint); -} - -static int -utf8_get_fold_match_info(UChar* p, UChar* end, OnigEncFoldMatchInfo** info) -{ - - static OnigEncFoldMatchInfo xc[] = { - { 2, { 2, 2 }, { "\303\200", "\303\240" } }, /* CodePoint 0xc0 */ - { 2, { 2, 2 }, { "\303\201", "\303\241" } }, - { 2, { 2, 2 }, { "\303\202", "\303\242" } }, - { 2, { 2, 2 }, { "\303\203", "\303\243" } }, - { 2, { 2, 2 }, { "\303\204", "\303\244" } }, - { 2, { 2, 2 }, { "\303\205", "\303\245" } }, - { 2, { 2, 2 }, { "\303\206", "\303\246" } }, - { 2, { 2, 2 }, { "\303\207", "\303\247" } }, - { 2, { 2, 2 }, { "\303\210", "\303\250" } }, - { 2, { 2, 2 }, { "\303\211", "\303\251" } }, - { 2, { 2, 2 }, { "\303\212", "\303\252" } }, - { 2, { 2, 2 }, { "\303\213", "\303\253" } }, - { 2, { 2, 2 }, { "\303\214", "\303\254" } }, - { 2, { 2, 2 }, { "\303\215", "\303\255" } }, - { 2, { 2, 2 }, { "\303\216", "\303\256" } }, - { 2, { 2, 2 }, { "\303\217", "\303\257" } }, - { 2, { 2, 2 }, { "\303\220", "\303\260" } }, /* CodePoint 0xd0 */ - { 2, { 2, 2 }, { "\303\221", "\303\261" } }, - { 2, { 2, 2 }, { "\303\222", "\303\262" } }, - { 2, { 2, 2 }, { "\303\223", "\303\263" } }, - { 2, { 2, 2 }, { "\303\224", "\303\264" } }, - { 2, { 2, 2 }, { 
"\303\225", "\303\265" } }, - { 2, { 2, 2 }, { "\303\226", "\303\266" } }, - { 0, { 0 }, { "" } }, - { 2, { 2, 2 }, { "\303\230", "\303\270" } }, - { 2, { 2, 2 }, { "\303\231", "\303\271" } }, - { 2, { 2, 2 }, { "\303\232", "\303\272" } }, - { 2, { 2, 2 }, { "\303\233", "\303\273" } }, - { 2, { 2, 2 }, { "\303\234", "\303\274" } }, - { 2, { 2, 2 }, { "\303\235", "\303\275" } }, - { 2, { 2, 2 }, { "\303\236", "\303\276" } }, - { 3, { 2, 2, 2 }, { "\303\237", "ss", "SS" }} /* ess-tsett(U+00DF) */ - }; - - if (p + 1 >= end) return -1; - if (*p < 0x80) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &(xc[0xdf - 0xc0]); - return 2; - } - } - else if (*p == 195) { /* 195 == '\303' */ - int c = *(p+1); - if (c >= 128) { - if (c <= 159) { /* upper */ - if (c == 151) return -1; /* 0xd7 */ - *info = &(xc[c - 128]); - return 2; - } - else { /* lower */ - if (c == 183) return -1; /* 0xf7 */ - *info = &(xc[c - 160]); - return 2; - } - } + if (code < 256) { + return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); } - return -1; /* is not a fold string. 
*/ -} +#ifdef USE_UNICODE_FULL_RANGE_CTYPE + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + range = MBAlpha; + break; + case ONIGENC_CTYPE_BLANK: + range = MBBlank; + break; + case ONIGENC_CTYPE_CNTRL: + range = MBCntrl; + break; + case ONIGENC_CTYPE_DIGIT: + range = MBDigit; + break; + case ONIGENC_CTYPE_GRAPH: + range = MBGraph; + break; + case ONIGENC_CTYPE_LOWER: + range = MBLower; + break; + case ONIGENC_CTYPE_PRINT: + range = MBPrint; + break; + case ONIGENC_CTYPE_PUNCT: + range = MBPunct; + break; + case ONIGENC_CTYPE_SPACE: + range = MBSpace; + break; + case ONIGENC_CTYPE_UPPER: + range = MBUpper; + break; + case ONIGENC_CTYPE_XDIGIT: + return FALSE; + break; + case ONIGENC_CTYPE_WORD: + range = MBWord; + break; + case ONIGENC_CTYPE_ASCII: + return FALSE; + break; + case ONIGENC_CTYPE_ALNUM: + range = MBAlnum; + break; + + default: + return ONIGENCERR_TYPE_BUG; + break; + } + + return onig_is_in_code_range((UChar* )range, code); + +#else + + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { +#ifdef USE_INVALID_CODE_SCHEME + if (code <= VALID_CODE_LIMIT) +#endif + return TRUE; + } +#endif /* USE_UNICODE_FULL_RANGE_CTYPE */ + + return FALSE; +} static UChar* -utf8_left_adjust_char_head(UChar* start, UChar* s) +utf8_left_adjust_char_head(const UChar* start, const UChar* s) { - UChar *p; + const UChar *p; - if (s <= start) return s; + if (s <= start) return (UChar* )s; p = s; while (!utf8_islead(*p) && p > start) p--; - return p; -} - -static int -utf8_is_allowed_reverse_match(UChar* s, UChar* end) -{ - return TRUE; + return (UChar* )p; } OnigEncodingType OnigEncodingUTF8 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 - }, + utf8_mbc_enc_len, "UTF-8", /* name */ 6, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_FULL, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, utf8_mbc_to_code, utf8_code_to_mbclen, utf8_code_to_mbc, - utf8_mbc_to_lower, - utf8_mbc_is_case_ambig, - utf8_code_is_ctype, + utf8_mbc_to_normalize, + utf8_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + utf8_is_code_ctype, utf8_get_ctype_code_range, utf8_left_adjust_char_head, - utf8_is_allowed_reverse_match, - utf8_get_all_fold_match_code, - utf8_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; diff --git a/ext/mbstring/oniguruma/index.html b/ext/mbstring/oniguruma/index.html new file mode 100755 index 00000000000..293ea442a37 --- /dev/null +++ b/ext/mbstring/oniguruma/index.html @@ -0,0 +1,173 @@ + + + + Oniguruma + + + +

Oniguruma

+

+2005/02/19 (C) K.Kosako +

+

+ + + +

+ +

+Oniguruma is a regular expressions library.
+The characteristic of this library is that a different character encoding +
can be specified for every regular expression object. +

+ +
+
Supported character encodings:
+ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE,
+EUC-JP, EUC-TW, EUC-KR, EUC-CN,
+Shift_JIS, Big5, KOI8-R, KOI8,
+ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5,
+ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10,
+ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 +

+
+

+ +

+ +
What's new + +
    +
  • Character types (\w, \s, \d and POSIX brackets) are supported over the full code point range, based on Version 4.0.1 of the Unicode Standard. (since Version 3.5.0) +
+
+ +
+ +
+
There are two ways of using it in this program. +
    +
  • (1) C library (supported APIs: GNU regex, POSIX and Oniguruma native) +
  • (2) Built-in regular expressions engine of Ruby 1.6/1.8/1.9
    + In Ruby 1.9, Oniguruma is already incorporated by Kazuo Saito. +
+
+ +
+
Platform: +
    +
  • Unix (including Mac OS X) +
  • Cygwin +
  • Win32 +
+ +
+
License:
+When this software is partly used or distributed with Ruby, +it follows the license of Ruby.
+Otherwise, it follows the BSD license. +

+ +
Download: + + +
+ +* 3.X.X supports UTF-16/UTF-32, Ruby 1.9.X.
+* 2.X.X does not support UTF-16/UTF-32, supports Ruby 1.6/1.8. +
+ +
+
+
Documents: (version 3.7.0) + + +
+
Sample Programs: + + +
+
Links: + + +
+
References: + + +
+ +
+

+and I'm thankful to Akinori MUSHA. +

+ + +
+ + diff --git a/ext/mbstring/oniguruma/oniggnu.h b/ext/mbstring/oniguruma/oniggnu.h index d78dc18b11e..4a695154669 100644 --- a/ext/mbstring/oniguruma/oniggnu.h +++ b/ext/mbstring/oniguruma/oniggnu.h @@ -1,15 +1,40 @@ -/********************************************************************** - - oniggnu.h - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ #ifndef ONIGGNU_H #define ONIGGNU_H +/********************************************************************** + oniggnu.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include "oniguruma.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MBCTYPE_ASCII 0 #define MBCTYPE_EUC 1 #define MBCTYPE_SJIS 2 @@ -19,14 +44,31 @@ #ifndef RE_NREGS #define RE_NREGS ONIG_NREGION #endif -#define RE_OPTION_IGNORECASE ONIG_OPTION_IGNORECASE -#define RE_OPTION_EXTENDED ONIG_OPTION_EXTEND -#define RE_OPTION_MULTILINE ONIG_OPTION_MULTILINE -#define RE_OPTION_SINGLELINE ONIG_OPTION_SINGLELINE -#define RE_OPTION_LONGEST ONIG_OPTION_FIND_LONGEST -#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) + +#define RE_OPTION_IGNORECASE ONIG_OPTION_IGNORECASE +#define RE_OPTION_EXTENDED ONIG_OPTION_EXTEND +#define RE_OPTION_MULTILINE ONIG_OPTION_MULTILINE +#define RE_OPTION_SINGLELINE ONIG_OPTION_SINGLELINE +#define RE_OPTION_LONGEST ONIG_OPTION_FIND_LONGEST +#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) +#define RE_OPTION_FIND_NOT_EMPTY ONIG_OPTION_FIND_NOT_EMPTY +#define RE_OPTION_NEGATE_SINGLELINE ONIG_OPTION_NEGATE_SINGLELINE +#define RE_OPTION_DONT_CAPTURE_GROUP ONIG_OPTION_DONT_CAPTURE_GROUP +#define RE_OPTION_CAPTURE_GROUP ONIG_OPTION_CAPTURE_GROUP #ifdef RUBY_PLATFORM + +#ifndef ONIG_RUBY_M17N + +ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; + +#undef ismbchar +#define ismbchar(c) (mbclen((c)) != 1) +#define mbclen(c) \ + ONIGENC_MBC_ENC_LEN(OnigEncDefaultCharEncoding, (UChar* )(&c)) + +#endif /* ifndef ONIG_RUBY_M17N */ + #define re_mbcinit ruby_re_mbcinit #define 
re_compile_pattern ruby_re_compile_pattern #define re_recompile_pattern ruby_re_recompile_pattern @@ -74,4 +116,8 @@ void re_free_registers P_((struct re_registers*)); ONIG_EXTERN int re_alloc_pattern P_((struct re_pattern_buffer**)); /* added */ +#ifdef __cplusplus +} +#endif + #endif /* ONIGGNU_H */ diff --git a/ext/mbstring/oniguruma/onigposix.h b/ext/mbstring/oniguruma/onigposix.h index 3793ae6bd99..cfeb88a2928 100644 --- a/ext/mbstring/oniguruma/onigposix.h +++ b/ext/mbstring/oniguruma/onigposix.h @@ -1,12 +1,38 @@ -/********************************************************************** - - onigposix.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ #ifndef ONIGPOSIX_H #define ONIGPOSIX_H +/********************************************************************** + onigposix.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include + +#ifdef __cplusplus +extern "C" { +#endif /* options */ #define REG_ICASE (1<<0) @@ -38,12 +64,12 @@ #define REG_EONIG_THREAD 17 /* character encodings (for reg_set_encoding()) */ -#define REG_POSIX_ENCODING_ASCII 0 -#define REG_POSIX_ENCODING_EUC_JP 1 -#define REG_POSIX_ENCODING_SJIS 2 -#define REG_POSIX_ENCODING_UTF8 3 - -#include +#define REG_POSIX_ENCODING_ASCII 0 +#define REG_POSIX_ENCODING_EUC_JP 1 +#define REG_POSIX_ENCODING_SJIS 2 +#define REG_POSIX_ENCODING_UTF8 3 +#define REG_POSIX_ENCODING_UTF16_BE 4 +#define REG_POSIX_ENCODING_UTF16_LE 5 typedef int regoff_t; @@ -70,7 +96,7 @@ typedef struct { #endif #ifndef ONIG_EXTERN -#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(_WIN32) && !defined(__GNUC__) #if defined(EXPORT) || defined(RUBY_EXPORT) #define ONIG_EXTERN extern __declspec(dllexport) #else @@ -103,7 +129,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; -/* predefined syntaxes (see regparse.c) */ +/* predefined syntaxes (see regsyntax.c) */ #define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) #define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) #define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) @@ -119,6 +145,9 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax)); ONIG_EXTERN void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from)); 
+ONIG_EXTERN const char* onig_version P_((void)); +ONIG_EXTERN const char* onig_copyright P_((void)); + #endif /* ONIGURUMA_H */ @@ -129,8 +158,12 @@ ONIG_EXTERN size_t regerror P_((int code, const regex_t* reg, char* buf, size_t /* extended API */ ONIG_EXTERN void reg_set_encoding P_((int enc)); -ONIG_EXTERN int reg_name_to_group_numbers P_((regex_t* reg, unsigned char* name, unsigned char* name_end, int** nums)); -ONIG_EXTERN int reg_foreach_name P_((regex_t* reg, int (*func)(unsigned char*,unsigned char*,int,int*,regex_t*,void*), void* arg)); +ONIG_EXTERN int reg_name_to_group_numbers P_((regex_t* reg, const unsigned char* name, const unsigned char* name_end, int** nums)); +ONIG_EXTERN int reg_foreach_name P_((regex_t* reg, int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*), void* arg)); ONIG_EXTERN int reg_number_of_names P_((regex_t* reg)); +#ifdef __cplusplus +} +#endif + #endif /* ONIGPOSIX_H */ diff --git a/ext/mbstring/oniguruma/oniguruma.h b/ext/mbstring/oniguruma/oniguruma.h index 99f6bdab0d2..2106774f717 100644 --- a/ext/mbstring/oniguruma/oniguruma.h +++ b/ext/mbstring/oniguruma/oniguruma.h @@ -1,19 +1,53 @@ -/********************************************************************** - - oniguruma.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ #ifndef ONIGURUMA_H #define ONIGURUMA_H +/********************************************************************** + oniguruma.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #include "php_onig_compat.h" +#ifdef __cplusplus +extern "C" { +#endif + #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 2 -#define ONIGURUMA_VERSION_MINOR 2 -#define ONIGURUMA_VERSION_TEENY 4 +#define ONIGURUMA_VERSION_MAJOR 3 +#define ONIGURUMA_VERSION_MINOR 7 +#define ONIGURUMA_VERSION_TEENY 0 + +#ifdef __cplusplus +# ifndef HAVE_PROTOTYPES +# define HAVE_PROTOTYPES 1 +# endif +# ifndef HAVE_STDARG_PROTOTYPES +# define HAVE_STDARG_PROTOTYPES 1 +# endif +#endif #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -32,7 +66,7 @@ #endif #ifndef ONIG_EXTERN -#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(_WIN32) && !defined(__GNUC__) #if defined(EXPORT) || defined(RUBY_EXPORT) #define ONIG_EXTERN extern __declspec(dllexport) #else @@ -53,17 +87,60 @@ typedef unsigned int OnigDistance; #define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) +/* ambiguous match flag */ +typedef unsigned int OnigAmbigType; + +ONIG_EXTERN OnigAmbigType OnigDefaultAmbigFlag; + +#define ONIGENC_AMBIGUOUS_MATCH_NONE 0 +#define ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE (1<<0) +#define ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE (1<<1) +/* #define ONIGENC_AMBIGUOUS_MATCH_ACCENT (1<<2) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_HIRAGANA_KATAKANA (1<<3) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_KATAKANA_WIDTH (1<<4) */ + +#define ONIGENC_AMBIGUOUS_MATCH_LIMIT (1<<1) +#define ONIGENC_AMBIGUOUS_MATCH_COMPOUND (1<<30) + +#define ONIGENC_AMBIGUOUS_MATCH_FULL \ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_COMPOUND ) +#define ONIGENC_AMBIGUOUS_MATCH_DEFAULT OnigDefaultAmbigFlag + + +#define ONIGENC_MAX_COMP_AMBIG_CODE_LEN 3 +#define ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM 4 + +/* code range */ +#define ONIGENC_CODE_RANGE_NUM(range) ((int )range[0]) +#define ONIGENC_CODE_RANGE_FROM(range,i) range[((i)*2) + 1] +#define ONIGENC_CODE_RANGE_TO(range,i) range[((i)*2) + 2] + +typedef struct { + int len; + OnigCodePoint 
code[ONIGENC_MAX_COMP_AMBIG_CODE_LEN]; +} OnigCompAmbigCodeItem; + +typedef struct { + int n; + OnigCodePoint code; + OnigCompAmbigCodeItem items[ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM]; +} OnigCompAmbigCodes; + typedef struct { OnigCodePoint from; OnigCodePoint to; -} OnigCodePointRange; +} OnigPairAmbigCodes; -#define ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE 16 typedef struct { - int target_num; - int target_byte_len[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; - UChar* target_str[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; -} OnigEncFoldMatchInfo; + OnigCodePoint esc; + OnigCodePoint anychar; + OnigCodePoint anytime; + OnigCodePoint zero_or_one_time; + OnigCodePoint one_or_more_time; + OnigCodePoint anychar_anytime; +} OnigMetaCharTableType; #if defined(RUBY_PLATFORM) && defined(M17N_H) @@ -74,23 +151,24 @@ typedef m17n_encoding* OnigEncoding; #else typedef struct { - const char len_table[256]; - const char* name; - int max_enc_len; - int is_fold_match; - int ctype_support_level; /* sb-only/full */ - int is_continuous_sb_mb; /* code point is continuous from sb to mb */ - OnigCodePoint (*mbc_to_code)(UChar* p, UChar* end); + int (*mbc_enc_len)(const UChar* p); + const char* name; + int max_enc_len; + int min_enc_len; + OnigAmbigType support_ambig_flag; + OnigMetaCharTableType meta_char_table; + int (*is_mbc_newline)(const UChar* p, const UChar* end); + OnigCodePoint (*mbc_to_code)(const UChar* p, const UChar* end); int (*code_to_mbclen)(OnigCodePoint code); int (*code_to_mbc)(OnigCodePoint code, UChar *buf); - int (*mbc_to_lower)(UChar* p, UChar* lower); - int (*mbc_is_case_ambig)(UChar* p); - int (*code_is_ctype)(OnigCodePoint code, unsigned int ctype); - int (*get_ctype_code_range)(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]); - UChar* (*left_adjust_char_head)(UChar* start, UChar* s); - int (*is_allowed_reverse_match)(UChar* p, UChar* e); - int (*get_all_fold_match_code)(OnigCodePoint** codes); - int (*get_fold_match_info)(UChar* p, 
UChar* end, OnigEncFoldMatchInfo** info); + int (*mbc_to_normalize)(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* to); + int (*is_mbc_ambiguous)(OnigAmbigType flag, const UChar** pp, const UChar* end); + int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs); + int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs); + int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype); + int (*get_ctype_code_range)(int ctype, OnigCodePoint* sb_range[], OnigCodePoint* mb_range[]); + UChar* (*left_adjust_char_head)(const UChar* start, const UChar* p); + int (*is_allowed_reverse_match)(const UChar* p, const UChar* end); } OnigEncodingType; typedef OnigEncodingType* OnigEncoding; @@ -112,6 +190,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14; ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15; ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_16; ONIG_EXTERN OnigEncodingType OnigEncodingUTF8; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_LE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_LE; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR; @@ -138,6 +220,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15) #define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16) #define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8) +#define ONIG_ENCODING_UTF16_BE (&OnigEncodingUTF16_BE) +#define ONIG_ENCODING_UTF16_LE (&OnigEncodingUTF16_LE) +#define ONIG_ENCODING_UTF32_BE (&OnigEncodingUTF32_BE) +#define ONIG_ENCODING_UTF32_LE (&OnigEncodingUTF32_LE) #define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP) #define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW) #define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR) @@ -153,35 +239,31 @@ ONIG_EXTERN OnigEncodingType 
OnigEncodingBIG5; /* work size */ -#define ONIGENC_CODE_TO_MBC_MAXLEN 7 -#define ONIGENC_MBC_TO_LOWER_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN +#define ONIGENC_CODE_TO_MBC_MAXLEN 7 +#define ONIGENC_MBC_NORMALIZE_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN /* character types */ -#define ONIGENC_CTYPE_ALPHA (1<< 0) -#define ONIGENC_CTYPE_BLANK (1<< 1) -#define ONIGENC_CTYPE_CNTRL (1<< 2) -#define ONIGENC_CTYPE_DIGIT (1<< 3) -#define ONIGENC_CTYPE_GRAPH (1<< 4) -#define ONIGENC_CTYPE_LOWER (1<< 5) -#define ONIGENC_CTYPE_PRINT (1<< 6) -#define ONIGENC_CTYPE_PUNCT (1<< 7) -#define ONIGENC_CTYPE_SPACE (1<< 8) -#define ONIGENC_CTYPE_UPPER (1<< 9) -#define ONIGENC_CTYPE_XDIGIT (1<<10) -#define ONIGENC_CTYPE_WORD (1<<11) -#define ONIGENC_CTYPE_ASCII (1<<12) +#define ONIGENC_CTYPE_NEWLINE (1<< 0) +#define ONIGENC_CTYPE_ALPHA (1<< 1) +#define ONIGENC_CTYPE_BLANK (1<< 2) +#define ONIGENC_CTYPE_CNTRL (1<< 3) +#define ONIGENC_CTYPE_DIGIT (1<< 4) +#define ONIGENC_CTYPE_GRAPH (1<< 5) +#define ONIGENC_CTYPE_LOWER (1<< 6) +#define ONIGENC_CTYPE_PRINT (1<< 7) +#define ONIGENC_CTYPE_PUNCT (1<< 8) +#define ONIGENC_CTYPE_SPACE (1<< 9) +#define ONIGENC_CTYPE_UPPER (1<<10) +#define ONIGENC_CTYPE_XDIGIT (1<<11) +#define ONIGENC_CTYPE_WORD (1<<12) +#define ONIGENC_CTYPE_ASCII (1<<13) #define ONIGENC_CTYPE_ALNUM (ONIGENC_CTYPE_ALPHA | ONIGENC_CTYPE_DIGIT) -/* ctype support level */ -#define ONIGENC_CTYPE_SUPPORT_LEVEL_SB 0 -#define ONIGENC_CTYPE_SUPPORT_LEVEL_FULL 1 - - -#define enc_len(enc,byte) ONIGENC_MBC_LEN_BY_HEAD(enc,byte) +#define enc_len(enc,p) ONIGENC_MBC_ENC_LEN(enc, p) #define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) #define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) -#define ONIGENC_IS_MBC_HEAD(enc,byte) (ONIGENC_MBC_LEN_BY_HEAD(enc,byte) != 1) +#define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1) #define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) #define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) #define ONIGENC_IS_CODE_SB_WORD(enc,code) \ @@ -194,31 +276,33 
@@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #include /* for isblank(), isgraph() */ -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) onigenc_mbc_to_lower(enc,p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) onigenc_mbc_is_case_ambig(enc,p) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + onigenc_mbc_to_normalize(enc,flag,pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + onigenc_is_mbc_ambiguous(enc,flag,pp,end) -#define ONIGENC_IS_FOLD_MATCH(enc) FALSE -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) FALSE -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ONIGENC_CTYPE_SUPPORT_LEVEL_SB +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ onigenc_is_allowed_reverse_match(enc, s, end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ onigenc_get_left_adjust_char_head(enc, start, s) -#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) 0 -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) ONIG_NO_SUPPORT_CONFIG -#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ +#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, ambig_flag, acs) 0 +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, ambig_flag, acs) 0 +#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,sbr,mbr) \ ONIG_NO_SUPPORT_CONFIG -#define ONIGENC_MBC_LEN_BY_HEAD(enc,b) m17n_mbclen(enc,(int )b) +#define ONIGENC_MBC_ENC_LEN(enc,p) m17n_mbclen(enc,(int )(*p)) #define ONIGENC_MBC_MAXLEN(enc) m17n_mbmaxlen(enc) #define ONIGENC_MBC_MAXLEN_DIST(enc) \ (ONIGENC_MBC_MAXLEN(enc) > 0 ? ONIGENC_MBC_MAXLEN(enc) \ : ONIG_INFINITE_DISTANCE) +#define ONIGENC_MBC_MINLEN(enc) 1 #define ONIGENC_MBC_TO_CODE(enc,p,e) m17n_codepoint((enc),(p),(e)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) m17n_codelen((enc),(code)) #define ONIGENC_CODE_TO_MBC(enc,code,buf) onigenc_code_to_mbc(enc, code, buf) -#if 0 -#define ONIGENC_STEP_BACK(enc,start,s,n) /* !! not supported !! */ +#if 0 /* !! not supported !! 
*/ +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) +#define ONIGENC_STEP_BACK(enc,start,s,n) #endif #define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) \ @@ -253,42 +337,45 @@ int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype)); ONIG_EXTERN int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); ONIG_EXTERN -int onigenc_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* buf)); +int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* buf)); ONIG_EXTERN -int onigenc_mbc_is_case_ambig P_((OnigEncoding enc, UChar* p)); +int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, const UChar** pp, const UChar* end)); ONIG_EXTERN -int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end)); +int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, const UChar* s, const UChar* end)); #else /* ONIG_RUBY_M17N */ #define ONIGENC_NAME(enc) ((enc)->name) -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) (enc)->mbc_to_lower(p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) (enc)->mbc_is_case_ambig(p) - -#define ONIGENC_IS_FOLD_MATCH(enc) ((enc)->is_fold_match) -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) ((enc)->is_continuous_sb_mb) -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ((enc)->ctype_support_level) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + (enc)->mbc_to_normalize(flag,(const UChar** )pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + (enc)->is_mbc_ambiguous(flag,(const UChar** )pp,end) +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ((enc)->support_ambig_flag) #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ (enc)->is_allowed_reverse_match(s,end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ (enc)->left_adjust_char_head(start, s) -#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) \ - (enc)->get_all_fold_match_code(codes) -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) \ - (enc)->get_fold_match_info(p,end,info) 
+#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_pair_ambig_codes(ambig_flag,acs) +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_comp_ambig_codes(ambig_flag,acs) #define ONIGENC_STEP_BACK(enc,start,s,n) \ onigenc_step_back((enc),(start),(s),(n)) -#define ONIGENC_MBC_LEN_BY_HEAD(enc,byte) ((enc)->len_table[(int )(byte)]) +#define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) -#define ONIGENC_MBC_TO_CODE(enc,p,e) (enc)->mbc_to_code((p),(e)) +#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end)) +#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code) #define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf) -#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->code_is_ctype(code,ctype) +#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype) +#define ONIGENC_IS_CODE_NEWLINE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE) #define ONIGENC_IS_CODE_GRAPH(enc,code) \ ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) #define ONIGENC_IS_CODE_PRINT(enc,code) \ @@ -316,11 +403,11 @@ int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end) #define ONIGENC_IS_CODE_WORD(enc,code) \ ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_WORD) -#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ - (enc)->get_ctype_code_range(ctype,nsb,nmb,sbr,mbr) +#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,sbr,mbr) \ + (enc)->get_ctype_code_range(ctype,sbr,mbr) ONIG_EXTERN -UChar* onigenc_step_back P_((OnigEncoding enc, UChar* start, UChar* s, int n)); +UChar* onigenc_step_back P_((OnigEncoding enc, const UChar* start, const UChar* s, int n)); #endif /* is not ONIG_RUBY_M17N 
*/ @@ -333,15 +420,21 @@ int onigenc_set_default_encoding P_((OnigEncoding enc)); ONIG_EXTERN OnigEncoding onigenc_get_default_encoding P_(()); ONIG_EXTERN -void onigenc_set_default_caseconv_table P_((UChar* table)); +void onigenc_set_default_caseconv_table P_((const UChar* table)); ONIG_EXTERN -UChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, UChar* start, UChar* s, UChar** prev)); +UChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const UChar* start, const UChar* s, const UChar** prev)); ONIG_EXTERN -UChar* onigenc_get_prev_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +UChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); ONIG_EXTERN -UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); ONIG_EXTERN -UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); +ONIG_EXTERN +int onigenc_strlen P_((OnigEncoding enc, const UChar* p, const UChar* end)); +ONIG_EXTERN +int onigenc_strlen_null P_((OnigEncoding enc, const UChar* p)); +ONIG_EXTERN +int onigenc_str_bytelen_null P_((OnigEncoding enc, const UChar* p)); @@ -355,13 +448,6 @@ UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UC /* constants */ #define ONIG_MAX_ERROR_MESSAGE_LEN 90 -#if defined(RUBY_PLATFORM) && !defined(ONIG_RUBY_M17N) -ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#undef ismbchar -#define ismbchar(c) (mbclen((c)) != 1) -#define mbclen(c) (OnigEncDefaultCharEncoding->len_table[(unsigned char )(c)]) -#endif - typedef unsigned int OnigOptionType; #define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE @@ -403,7 +489,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; 
ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; -/* predefined syntaxes (see regparse.c) */ +/* predefined syntaxes (see regsyntax.c) */ #define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) #define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) #define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) @@ -466,7 +552,10 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_ESC_V_VTAB (1<<13) /* \v as VTAB */ #define ONIG_SYN_OP2_ESC_U_HEX4 (1<<14) /* \uHHHH */ #define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1<<15) /* \`, \' */ -#define ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */ +#define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */ +#define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ @@ -479,6 +568,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ #define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */ #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?)(?) */ +#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1<<9) /* a{n}?=(?:a{n})? */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] 
*/ @@ -505,7 +595,10 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_NORMAL 0 #define ONIG_MISMATCH -1 #define ONIG_NO_SUPPORT_CONFIG -2 + /* internal error */ +#define ONIGERR_MEMORY -5 +#define ONIGERR_TYPE_BUG -6 #define ONIGERR_PARSER_BUG -11 #define ONIGERR_STACK_BUG -12 #define ONIGERR_UNDEFINED_BYTECODE -13 @@ -520,7 +613,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_END_PATTERN_AT_LEFT_BRACKET -101 #define ONIGERR_EMPTY_CHAR_CLASS -102 #define ONIGERR_PREMATURE_END_OF_CHAR_CLASS -103 -#define ONIGERR_END_PATTERN_AT_BACKSLASH -104 +#define ONIGERR_END_PATTERN_AT_ESCAPE -104 #define ONIGERR_END_PATTERN_AT_META -105 #define ONIGERR_END_PATTERN_AT_CONTROL -106 #define ONIGERR_META_CODE_SYNTAX -108 @@ -560,8 +653,12 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_NEVER_ENDING_RECURSION -221 #define ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY -222 #define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 +#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 +#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 +#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 + /* errors related to thread */ -#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 +#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 /* must be smaller than BIT_STATUS_BITS_NUM (unsigned int * 8) */ @@ -569,6 +666,15 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) +typedef struct OnigCaptureTreeNodeStruct { + int group; /* group number */ + int beg; + int end; + int allocated; + int num_childs; + struct OnigCaptureTreeNodeStruct** childs; +} OnigCaptureTreeNode; + /* match result region type */ struct re_registers { int allocated; @@ -576,9 +682,16 @@ struct re_registers { int* beg; int* end; /* extended */ - struct re_registers** list; /* capture history. 
list[1]-list[31] */ + OnigCaptureTreeNode* history_root; /* capture history tree root */ }; +/* capture tree traverse */ +#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1 +#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2 +#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \ + ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST ) + + #define ONIG_REGION_NOTPOS -1 typedef struct re_registers OnigRegion; @@ -593,8 +706,8 @@ typedef struct { int upper; } OnigRepeatRange; -typedef void (*OnigWarnFunc) P_((char* s)); -extern void onig_null_warn P_((char* s)); +typedef void (*OnigWarnFunc) P_((const char* s)); +extern void onig_null_warn P_((const char* s)); #define ONIG_NULL_WARN onig_null_warn #define ONIG_CHAR_TABLE_SIZE 256 @@ -629,6 +742,7 @@ typedef struct re_pattern_buffer { OnigEncoding enc; OnigOptionType options; OnigSyntaxType* syntax; + OnigAmbigType ambig_flag; void* name_table; /* optimization info (string search, char-map and anchors) */ @@ -640,7 +754,7 @@ typedef struct re_pattern_buffer { int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; - unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ int *int_map; /* BM skip for exact_len > 255 */ int *int_map_backward; /* BM skip for backward search */ OnigDistance dmin; /* min-distance of exact or map */ @@ -651,6 +765,15 @@ typedef struct re_pattern_buffer { } regex_t; +typedef struct { + int num_of_elements; + OnigEncoding pattern_enc; + OnigEncoding target_enc; + OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; +} OnigCompileInfo; + /* Oniguruma Native API */ ONIG_EXTERN int onig_init P_((void)); @@ -661,18 +784,24 @@ void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN -int onig_new P_((regex_t**, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, 
OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_new P_((regex_t**, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +ONIG_EXTERN +int onig_new_deluxe P_((regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN void onig_free P_((regex_t*)); ONIG_EXTERN -int onig_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_recompile P_((regex_t*, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, OnigRegion* region, OnigOptionType option)); +int onig_recompile_deluxe P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_match P_((regex_t*, UChar* str, UChar* end, UChar* at, OnigRegion* region, OnigOptionType option)); +int onig_search P_((regex_t*, const UChar* str, const UChar* end, const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option)); +ONIG_EXTERN +int onig_match P_((regex_t*, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN +void onig_region_init P_((OnigRegion* region)); +ONIG_EXTERN void onig_region_free P_((OnigRegion* region, int free_self)); ONIG_EXTERN void onig_region_copy P_((OnigRegion* to, OnigRegion* from)); @@ -681,25 +810,44 @@ void onig_region_clear P_((OnigRegion* region)); ONIG_EXTERN int onig_region_resize P_((OnigRegion* region, int n)); ONIG_EXTERN -int onig_name_to_group_numbers P_((regex_t* reg, UChar* name, UChar* name_end, - int** nums)); +int onig_region_set P_((OnigRegion* region, int 
at, int beg, int end)); ONIG_EXTERN -int onig_name_to_backref_number P_((regex_t* reg, UChar* name, UChar* name_end, OnigRegion *region)); +int onig_name_to_group_numbers P_((regex_t* reg, const UChar* name, const UChar* name_end, int** nums)); ONIG_EXTERN -int onig_foreach_name P_((regex_t* reg, int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), void* arg)); +int onig_name_to_backref_number P_((regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region)); +ONIG_EXTERN +int onig_foreach_name P_((regex_t* reg, int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)); ONIG_EXTERN int onig_number_of_names P_((regex_t* reg)); ONIG_EXTERN +int onig_number_of_captures P_((regex_t* reg)); +ONIG_EXTERN +int onig_number_of_capture_histories P_((regex_t* reg)); +ONIG_EXTERN +OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region)); +ONIG_EXTERN +int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg)); +ONIG_EXTERN OnigEncoding onig_get_encoding P_((regex_t* reg)); ONIG_EXTERN OnigOptionType onig_get_options P_((regex_t* reg)); ONIG_EXTERN +OnigAmbigType onig_get_ambig_flag P_((regex_t* reg)); +ONIG_EXTERN OnigSyntaxType* onig_get_syntax P_((regex_t* reg)); ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax)); ONIG_EXTERN void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from)); ONIG_EXTERN +unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax)); +ONIG_EXTERN void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op)); ONIG_EXTERN void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2)); @@ -708,10 +856,26 @@ void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, 
unsigned int behavior) ONIG_EXTERN void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options)); ONIG_EXTERN -int onig_set_meta_char P_((unsigned int what, unsigned int c)); +int onig_set_meta_char P_((OnigEncoding enc, unsigned int what, OnigCodePoint code)); +ONIG_EXTERN +void onig_copy_encoding P_((OnigEncoding to, OnigEncoding from)); +ONIG_EXTERN +OnigAmbigType onig_get_default_ambig_flag P_(()); +ONIG_EXTERN +int onig_set_default_ambig_flag P_((OnigAmbigType ambig_flag)); +ONIG_EXTERN +unsigned int onig_get_match_stack_limit_size P_((void)); +ONIG_EXTERN +int onig_set_match_stack_limit_size P_((unsigned int size)); ONIG_EXTERN int onig_end P_((void)); ONIG_EXTERN const char* onig_version P_((void)); +ONIG_EXTERN +const char* onig_copyright P_((void)); + +#ifdef __cplusplus +} +#endif #endif /* ONIGURUMA_H */ diff --git a/ext/mbstring/oniguruma/regcomp.c b/ext/mbstring/oniguruma/regcomp.c index 24d44dd1b81..5171b15a36f 100644 --- a/ext/mbstring/oniguruma/regcomp.c +++ b/ext/mbstring/oniguruma/regcomp.c @@ -1,16 +1,75 @@ /********************************************************************** - regcomp.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" +OnigAmbigType OnigDefaultAmbigFlag = + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE); + +extern OnigAmbigType +onig_get_default_ambig_flag() +{ + return OnigDefaultAmbigFlag; +} + +extern int +onig_set_default_ambig_flag(OnigAmbigType ambig_flag) +{ + OnigDefaultAmbigFlag = ambig_flag; + return 0; +} + + #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif +static UChar* +k_strdup(UChar* s, UChar* end) +{ + int len = end - s; + + if (len > 0) { + UChar* r = (UChar* )xmalloc(len + 1); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, len); + r[len] = (UChar )0; + return r; + } + else return NULL; +} + +/* + Caution: node should not be a string node. + (s and end member address break) +*/ static void swap_node(Node* a, Node* b) { @@ -120,33 +179,6 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) #endif /* USE_SUBEXP_CALL */ -#if 0 -static int -bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) -{ - int i; - int len, maxlen = 0; - - if (negative) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! 
BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - return maxlen; -} -#endif - static int add_opcode(regex_t* reg, int opcode) { @@ -159,13 +191,7 @@ add_rel_addr(regex_t* reg, int addr) { RelAddrType ra = (RelAddrType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &ra, SIZE_RELADDR); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_RELADDR(ra, buf); - BBUF_ADD(reg, buf, SIZE_RELADDR); -#endif return 0; } @@ -174,13 +200,7 @@ add_abs_addr(regex_t* reg, int addr) { AbsAddrType ra = (AbsAddrType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &ra, SIZE_ABSADDR); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_ABSADDR(ra, buf); - BBUF_ADD(reg, buf, SIZE_ABSADDR); -#endif return 0; } @@ -189,13 +209,7 @@ add_length(regex_t* reg, int len) { LengthType l = (LengthType )len; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &l, SIZE_LENGTH); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_LENGTH(l, buf); - BBUF_ADD(reg, buf, SIZE_LENGTH); -#endif return 0; } @@ -204,43 +218,23 @@ add_mem_num(regex_t* reg, int num) { MemNumType n = (MemNumType )num; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &n, SIZE_MEMNUM); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_MEMNUM(n, buf); - BBUF_ADD(reg, buf, SIZE_MEMNUM); -#endif return 0; } -#if 0 static int -add_repeat_num(regex_t* reg, int num) +add_pointer(regex_t* reg, void* addr) { - RepeatNumType n = (RepeatNumType )num; + PointerType ptr = (PointerType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - BBUF_ADD(reg, &n, SIZE_REPEATNUM); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_REPEATNUM(n, buf); - BBUF_ADD(reg, buf, SIZE_REPEATNUM); -#endif + BBUF_ADD(reg, &ptr, SIZE_POINTER); return 0; } -#endif static int add_option(regex_t* reg, OnigOptionType option) { -#ifdef 
PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &option, SIZE_OPTION); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_OPTION(option, buf); - BBUF_ADD(reg, buf, SIZE_OPTION); -#endif return 0; } @@ -293,15 +287,15 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) { int op; - switch (mb_len) { - case 1: - if (ignore_case) { - switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; - } + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; } - else { + } + else { + switch (mb_len) { + case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; @@ -310,25 +304,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } + break; + + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; + + case 3: + op = OP_EXACTMB3N; + break; + + default: + op = OP_EXACTMBN; + break; } - break; - - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; - - case 3: - op = OP_EXACTMB3N; - break; - - default: - op = OP_EXACTMBN; - break; } return op; } @@ -373,7 +367,7 @@ compile_call(CallNode* node, regex_t* reg) r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), - node->target); + node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; @@ -394,15 +388,14 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) static int add_compile_string_length(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int len; int op = select_str_opcode(mb_len, str_len, ignore_case); len = 
SIZE_OPCODE; - if (op == OP_EXACTMBN) - len += SIZE_LENGTH; + if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; @@ -412,7 +405,7 @@ add_compile_string_length(UChar* s, int mb_len, int str_len, static int add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int op = select_str_opcode(mb_len, str_len, ignore_case); add_opcode(reg, op); @@ -420,8 +413,12 @@ add_compile_string(UChar* s, int mb_len, int str_len, if (op == OP_EXACTMBN) add_length(reg, mb_len); - if (IS_NEED_STR_LEN_OP_EXACT(op)) - add_length(reg, str_len); + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + if (op == OP_EXACTN_IC) + add_length(reg, mb_len * str_len); + else + add_length(reg, str_len); + } add_bytes(reg, s, mb_len * str_len); return 0; @@ -429,49 +426,37 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int -compile_length_string_node(StrNode* sn, regex_t* reg) +compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig, ic; + int rlen, r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; rlen = 0; for (; p < sn->end; ) { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && ambig == 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); } else { r = add_compile_string_length(prev, prev_len, slen, reg, ambig); rlen += r; - - if (ic != 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - prev = p; slen = 1; prev_len = len; } - p += len; } r = add_compile_string_length(prev, 
prev_len, slen, reg, ambig); @@ -489,49 +474,33 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) } static int -compile_string_node(StrNode* sn, regex_t* reg) +compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig, ic; + int r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; - UChar *p, *prev; + UChar *p, *prev, *end; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + end = sn->end; + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) - ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; - for (; p < sn->end; ) { - len = enc_len(enc, *p); + for (; p < end; ) { + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && len == 1) { - if (ambig == 0) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } } else { r = add_compile_string(prev, prev_len, slen, reg, ambig); if (r) return r; - if (ic != 0 && len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; prev = p; slen = 1; @@ -580,12 +549,16 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) { int len; + if (IS_CCLASS_SHARE(cc)) { + len = SIZE_OPCODE + SIZE_POINTER; + return len; + } + if (IS_NULL(cc->mbuf)) { len = SIZE_OPCODE + SIZE_BITSET; } else { - if (bitset_is_empty(cc->bs)) { - /* SIZE_BITSET is included in mbuf->used. 
*/ + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { @@ -606,22 +579,34 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) { int r; + if (IS_CCLASS_SHARE(cc)) { + add_opcode(reg, OP_CCLASS_NODE); + r = add_pointer(reg, cc); + return r; + } + if (IS_NULL(cc->mbuf)) { - if (cc->not) add_opcode(reg, OP_CCLASS_NOT); - else add_opcode(reg, OP_CCLASS); + if (IS_CCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_NOT); + else + add_opcode(reg, OP_CCLASS); r = add_bitset(reg, cc->bs); } else { - if (bitset_is_empty(cc->bs)) { - if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT); - else add_opcode(reg, OP_CCLASS_MB); + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { + if (IS_CCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_MB_NOT); + else + add_opcode(reg, OP_CCLASS_MB); r = add_multi_byte_cclass(cc->mbuf, reg); } else { - if (cc->not) add_opcode(reg, OP_CCLASS_MIX_NOT); - else add_opcode(reg, OP_CCLASS_MIX); + if (IS_CCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_MIX_NOT); + else + add_opcode(reg, OP_CCLASS_MIX); r = add_bitset(reg, cc->bs); if (r) return r; @@ -649,7 +634,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, - sizeof(OnigRepeatRange) * n); + sizeof(OnigRepeatRange) * n); CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -665,7 +650,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) static int compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, - regex_t* reg) + regex_t* reg) { int r; int num_repeat = reg->num_repeat; @@ -684,7 +669,16 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; - r = add_opcode(reg, qn->greedy ? 
OP_REPEAT_INC : OP_REPEAT_INC_NG); + if ( +#ifdef USE_SUBEXP_CALL + reg->num_call > 0 || +#endif + IS_QUALIFIER_IN_REPEAT(qn)) { + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); + } + else { + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); + } if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; @@ -706,9 +700,9 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) if (NTYPE(qn->target) == N_ANYCHAR) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -741,7 +735,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } @@ -865,7 +860,8 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); @@ -925,18 +921,16 @@ compile_option_node(EffectNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; + } - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, 
OP_SET_OPTION, prev); } - else { - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; - } return r; } @@ -983,7 +977,7 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; @@ -1073,7 +1067,7 @@ compile_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; @@ -1258,7 +1252,7 @@ compile_length_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_length_string_raw_node(&(NSTRING(node)), reg); else - r = compile_length_string_node(&(NSTRING(node)), reg); + r = compile_length_string_node(node, reg); break; case N_CCLASS: @@ -1356,7 +1350,7 @@ compile_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_string_raw_node(&(NSTRING(node)), reg); else - r = compile_string_node(&(NSTRING(node)), reg); + r = compile_string_node(node, reg); break; case N_CCLASS: @@ -1412,8 +1406,14 @@ compile_tree(Node* node, regex_t* reg) } else { int* p; - add_opcode(reg, (IS_IGNORECASE(reg->options) ? 
- OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); + + if (IS_IGNORECASE(reg->options)) { + add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + add_opcode(reg, OP_BACKREF_MULTI); + } + if (r) return r; add_length(reg, br->back_num); if (r) return r; @@ -1455,12 +1455,9 @@ compile_tree(Node* node, regex_t* reg) } #ifdef USE_NAMED_GROUP -typedef struct { - int new_val; -} NumMap; static int -noname_disable_map(Node** plink, NumMap* map, int* counter) +noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { int r = 0; Node* node = *plink; @@ -1514,7 +1511,7 @@ noname_disable_map(Node** plink, NumMap* map, int* counter) } static int -renumber_node_backref(Node* node, NumMap* map) +renumber_node_backref(Node* node, GroupNumRemap* map) { int i, pos, n, old_num; int *backs; @@ -1542,7 +1539,7 @@ renumber_node_backref(Node* node, NumMap* map) } static int -renumber_by_map(Node* node, NumMap* map) +renumber_by_map(Node* node, GroupNumRemap* map) { int r = 0; @@ -1607,9 +1604,9 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) { int r, i, pos, counter; BitStatusType loc; - NumMap* map; + GroupNumRemap* map; - map = (NumMap* )xalloca(sizeof(NumMap) * (env->num_mem + 1)); + map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1)); CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY); for (i = 1; i <= env->num_mem; i++) { map[i].new_val = 0; @@ -1638,7 +1635,8 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) env->num_mem = env->num_named; reg->num_mem = env->num_named; - return 0; + + return onig_renumber_name_table(reg, map); } #endif /* USE_NAMED_GROUP */ @@ -1649,9 +1647,6 @@ unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) int i, offset; EffectNode* en; AbsAddrType addr; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS - UChar buf[SERIALIZE_BUFSIZE]; -#endif for (i = 0; i < uslist->num; i++) { en = &(NEFFECT(uslist->us[i].target)); @@ -1659,12 +1654,7 @@ unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) 
addr = en->call_addr; offset = uslist->us[i].offset; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); -#else - SERIALIZE_ABSADDR(addr, buf); - BBUF_WRITE(reg, offset, buf, SIZE_ABSADDR); -#endif } return 0; } @@ -2044,7 +2034,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) StrNode* sn = &(NSTRING(node)); UChar *s = sn->s; while (s < sn->end) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); (*len)++; } } @@ -2135,7 +2125,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) { int found; - if (code >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { if (IS_NULL(cc->mbuf)) { found = 0; } @@ -2147,10 +2137,10 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); } - if (cc->not == 0) - return found; - else + if (IS_CCLASS_NOT(cc)) return !found; + else + return found; } /* x is not included y ==> 1 : 0 */ @@ -2213,7 +2203,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) case N_CTYPE: switch (NCTYPE(y).type) { case CTYPE_WORD: - if (IS_NULL(xc->mbuf) && xc->not == 0) { + if (IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (BITSET_AT(xc->bs, i)) { if (ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) return 0; @@ -2226,7 +2216,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) case CTYPE_NOT_WORD: for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (! 
ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) { - if (xc->not == 0) { + if (!IS_CCLASS_NOT(xc)) { if (BITSET_AT(xc->bs, i)) return 0; } @@ -2251,14 +2241,16 @@ is_not_included(Node* x, Node* y, regex_t* reg) for (i = 0; i < SINGLE_BYTE_SIZE; i++) { v = BITSET_AT(xc->bs, i); - if ((v != 0 && xc->not == 0) || (v == 0 && xc->not)) { + if ((v != 0 && !IS_CCLASS_NOT(xc)) || + (v == 0 && IS_CCLASS_NOT(xc))) { v = BITSET_AT(yc->bs, i); - if ((v != 0 && yc->not == 0) || (v == 0 && yc->not)) + if ((v != 0 && !IS_CCLASS_NOT(yc)) || + (v == 0 && IS_CCLASS_NOT(yc))) return 0; } } - if ((IS_NULL(xc->mbuf) && xc->not == 0) || - (IS_NULL(yc->mbuf) && yc->not == 0)) + if ((IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) || + (IS_NULL(yc->mbuf) && !IS_CCLASS_NOT(yc))) return 1; return 0; } @@ -2300,7 +2292,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) CClassNode* cc = &(NCCLASS(y)); code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + enc_len(reg->enc, c)); + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); } break; @@ -2311,18 +2303,9 @@ is_not_included(Node* x, Node* y, regex_t* reg) StrNode* ys = &(NSTRING(y)); len = NSTRING_LEN(x); if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { - UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int plen, qlen; - for (p = ys->s, q = xs->s; q < xs->end; ) { - plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); - qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); - if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) - return 1; - p += enc_len(reg->enc, *p); - q += enc_len(reg->enc, *q); - } + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; } else { for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { @@ -2379,8 +2362,12 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (exact != 0 && !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - if (! 
ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) +#if 0 + UChar* tmp = sn->s; + if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, + &tmp, sn->end)) n = node; +#endif } else { n = node; @@ -2937,7 +2924,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); swap_node(node, en); NEFFECT(node).target = en; } @@ -2956,9 +2943,114 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) +static int +divide_ambig_string_node(Node* node, regex_t* reg) +{ + StrNode* sn = &NSTRING(node); + int ambig, prev_ambig; + UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp; + Node *snode; + Node *root = NULL_NODE; + Node **tailp = (Node** )0; + + start = prev_start = p = sn->s; + end = sn->end; + if (p >= end) return 0; + + prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end); + + while (p < end) { + prev = p; + if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, + reg->ambig_flag, &p, end))) { + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(root).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = 
&(NCONS(*tailp).right); + } + + prev_ambig = ambig; + prev_start = prev; + } + } + + if (prev_start == start) { + if (prev_ambig != 0) { + NSTRING_SET_AMBIG(node); + tmp = start; + wp = start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node); + sn->end = wp; + } + } + else { + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(node).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + swap_node(node, root); + onig_node_str_clear(root); /* should be after swap! */ + onig_node_free(root); /* free original string node */ + } + + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) /* setup_tree does the following work. 1. check empty loop. 
(set qn->target_empty_info) @@ -2996,33 +3088,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case N_CCLASS: - if (IS_IGNORECASE(reg->options)) { - int i; - UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - BitSetRef bs = NCCLASS(node).bs; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - c = (UChar )i; - ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); - if (*lowbuf != c) { - if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); - if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); - } - } - } break; case N_STRING: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - StrNode* sn = &NSTRING(node); - UChar* p = sn->s; - - while (p < sn->end) { - if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { - NSTRING_SET_CASE_AMBIG(node); - break; - } - p++; - } + r = divide_ambig_string_node(node, reg); } break; @@ -3057,6 +3127,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) QualifierNode* qn = &(NQUALIFIER(node)); Node* target = qn->target; + if ((state & IN_REPEAT) != 0) { + qn->state |= NST_IN_REPEAT; + } + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { r = get_min_match_length(target, &d, env); if (r) break; @@ -3083,8 +3157,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } } + state |= IN_REPEAT; if (qn->lower != qn->upper) - state |= IN_REPEAT; + state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; @@ -3141,11 +3216,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case EFFECT_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ } - /* fall */ + r = setup_tree(en->target, reg, state, env); + break; + case EFFECT_STOP_BACKTRACK: { Node* target = en->target; @@ -3156,7 +3233,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) tqn->greedy != 0) { /* (?>a*), a*+ etc... 
*/ int qtype = NTYPE(tqn->target); if (IS_NODE_TYPE_SIMPLE(qtype)) - SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); } } } @@ -3228,26 +3305,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) /* set skip map for Boyer-Moor search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, +set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, UChar skip[], int** int_skip) { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - skip[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + skip[s[i]] = len - 1 - i; } else { if (IS_NULL(*int_skip)) { @@ -3256,16 +3324,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*int_skip)[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + (*int_skip)[s[i]] = len - 1 - i; } return 0; } @@ -3278,11 +3338,12 @@ typedef struct { } MinMaxLen; typedef struct { - MinMaxLen mmd; - BitStatusType backrefed_status; - OnigEncoding enc; - OnigOptionType options; - ScanEnv* scan_env; + MinMaxLen mmd; + BitStatusType backrefed_status; + OnigEncoding enc; + OnigOptionType options; + OnigAmbigType ambig_flag; + ScanEnv* scan_env; } OptEnv; typedef struct { @@ -3321,29 +3382,34 @@ typedef struct { static int -map_position_value(int i) +map_position_value(OnigEncoding enc, int i) { - static int vals[] = { - 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, - 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, + static short int ByteValTable[] = { + 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 }; - if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; - - return 7; /* Take it easy. */ + if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) { + if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1) + return 20; + else + return (int )ByteValTable[i]; + } + else + return 4; /* Take it easy. 
*/ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static int dist_vals[] = { + static short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3363,7 +3429,7 @@ distance_value(MinMaxLen* mm) d = mm->max - mm->min; if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) /* return dist_vals[d] * 16 / (mm->min + 12); */ - return dist_vals[d]; + return (int )dist_vals[d]; else return 1; } @@ -3419,12 +3485,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from) to->max = distance_add(to->max, from->max); } +#if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } +#endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) @@ -3571,7 +3639,7 @@ concat_opt_exact_info_str(OptExactInfo* to, to->s[i++] = *p++; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len; j++) to->s[i++] = *p++; @@ -3598,7 +3666,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; - len = enc_len(env->enc, to->s[i]); + len = enc_len(env->enc, to->s + i); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; @@ -3618,27 +3686,55 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) } static void -select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) +select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt) { - int vlen1, vlen2; + int v1, v2; - vlen1 = now->len * (now->ignore_case ? 1 : 2); - vlen2 = alt->len * (alt->ignore_case ? 
1 : 2); + v1 = now->len; + v2 = alt->len; - if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + if (v1 <= 2 && v2 <= 2) { + /* ByteValTable[x] is big value --> low price */ + v2 = map_position_value(enc, now->s[0]); + v1 = map_position_value(enc, alt->s[0]); + + if (now->len > 1) v1 += 5; + if (alt->len > 1) v2 += 5; + } + + if (now->ignore_case == 0) v1 *= 2; + if (alt->ignore_case == 0) v2 *= 2; + + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } static void clear_opt_map_info(OptMapInfo* map) { - int i; + static OptMapInfo clean_info = { + {0, 0}, {0, 0}, 0, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + } + }; - clear_mml(&map->mmd); - clear_opt_anc_info(&map->anc); - map->value = 0; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - map->map[i] = 0; + xmemcpy(map, &clean_info, sizeof(OptMapInfo)); } static void @@ -3648,34 +3744,56 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) } static void -add_char_opt_map_info(OptMapInfo* map, int c) +add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc) { if (map->map[c] == 0) { map->map[c] = 1; - map->value += map_position_value(c); + map->value += map_position_value(enc, c); } } -static void 
-add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +static int +add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, + OnigEncoding enc, OnigAmbigType ambig_flag) { - UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int i, j, n, len; + UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; + OnigCodePoint code, ccode; + OnigCompAmbigCodes* ccs; + OnigPairAmbigCodes* pccs; + OnigAmbigType amb; - add_char_opt_map_info(map, c); + add_char_opt_map_info(map, p[0], enc); + code = ONIGENC_MBC_TO_CODE(enc, p, end); - x = (UChar )c; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if (*low != x) { - add_char_opt_map_info(map, (int )(*low)); - } - else { - int i; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { - x = (UChar )i; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if ((int )(*low) == c) add_char_opt_map_info(map, i); + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs); + for (i = 0; i < n; i++) { + if (pccs[i].from == code) { + len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0], enc); + } + } + + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + n = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < n; i++) { + if (ccs[i].code == code) { + for (j = 0; j < ccs[i].n; j++) { + ccode = ccs[i].items[j].code[0]; + len = ONIGENC_CODE_TO_MBC(enc, ccode, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0], enc); + } + break; + } + } } } + return 0; } static void @@ -3711,7 +3829,7 @@ comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m) } static void -alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) +alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add) { int i, val; @@ -3730,7 +3848,7 @@ alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) to->map[i] = 1; if (to->map[i]) - val += map_position_value(i); + val += 
map_position_value(enc, i); } to->value = val; @@ -3763,7 +3881,7 @@ copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from) } static void -concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) +concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) { int exb_reach, exm_reach; OptAncInfo tanc; @@ -3798,8 +3916,8 @@ concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) clear_opt_exact_info(&add->exb); } } - select_opt_exact_info(&to->exm, &add->exb); - select_opt_exact_info(&to->exm, &add->exm); + select_opt_exact_info(enc, &to->exm, &add->exb); + select_opt_exact_info(enc, &to->exm, &add->exm); if (to->expr.len > 0) { if (add->len.max > 0) { @@ -3807,9 +3925,9 @@ concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) to->expr.len = add->len.max; if (to->expr.mmd.max == 0) - select_opt_exact_info(&to->exb, &to->expr); + select_opt_exact_info(enc, &to->exb, &to->expr); else - select_opt_exact_info(&to->exm, &to->expr); + select_opt_exact_info(enc, &to->exm, &to->expr); } } else if (add->expr.len > 0) { @@ -3828,7 +3946,7 @@ alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env) alt_merge_opt_exact_info(&to->exb, &add->exb, env); alt_merge_opt_exact_info(&to->exm, &add->exm, env); alt_merge_opt_exact_info(&to->expr, &add->expr, env); - alt_merge_opt_map_info (&to->map, &add->map); + alt_merge_opt_map_info(env->enc, &to->map, &add->map); alt_merge_mml(&to->len, &add->len); } @@ -3858,7 +3976,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) r = optimize_node_left(NCONS(nd).left, &nopt, &nenv); if (r == 0) { add_mml(&nenv.mmd, &nopt.len); - concat_left_node_opt_info(opt, &nopt); + concat_left_node_opt_info(env->enc, opt, &nopt); } } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right)); } @@ -3881,148 +3999,110 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case N_STRING: { - UChar *p; - int len, plen; StrNode* sn = &(NSTRING(node)); int slen = sn->end - sn->s; int is_raw = 
NSTRING_IS_RAW(node); - if ((! IS_IGNORECASE(env->options)) || is_raw) { + if (! NSTRING_IS_AMBIG(node)) { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, NSTRING_IS_RAW(node), env->enc); if (slen > 0) { - add_char_opt_map_info(&opt->map, *(sn->s)); + add_char_opt_map_info(&opt->map, *(sn->s), env->enc); } + set_mml(&opt->len, slen, slen); } else { - for (p = sn->s; p < sn->end; ) { - len = enc_len(env->enc, *p); - if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { - break; - } - p += len; - } + int n, max; - plen = p - sn->s; - if (plen > slen / 5) { - concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); - concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); - opt->exm.ignore_case = 1; - if (opt->exm.len == sn->end - p) - opt->exm.reach_end = 1; - - copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); - add_len_mml(&(opt->exm.mmd), plen); - } - else { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - is_raw, env->enc); - opt->exb.ignore_case = 1; - } + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; if (slen > 0) { - if (p == sn->s) - add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); - else - add_char_opt_map_info(&opt->map, *(sn->s)); + r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, + env->enc, env->ambig_flag); + if (r != 0) break; } + + if (NSTRING_IS_AMBIG_REDUCE(node)) { + n = onigenc_strlen(env->enc, sn->s, sn->end); + max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; + } + else { + max = slen; + } + set_mml(&opt->len, slen, max); } if (opt->exb.len == slen) opt->exb.reach_end = 1; - - set_mml(&opt->len, slen, slen); } break; case N_CCLASS: { - int i, z, len, found, mb_found; + int i, z; CClassNode* cc = &(NCCLASS(node)); /* no need to check ignore case. 
(setted in setup_tree()) */ - found = mb_found = 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !cc->not) || (!z && cc->not)) { - found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - if (IS_NULL(cc->mbuf)) { - if (cc->not) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - add_char_opt_map_info(&opt->map, i); - } - mb_found = 1; - } + if (IS_NOT_NULL(cc->mbuf) || IS_CCLASS_NOT(cc)) { + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + + set_mml(&opt->len, min, max); } else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = ONIGENC_IS_MBC_HEAD(env->enc, i); - if (z) { - mb_found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - } - - if (mb_found) { - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); - } - else if (found) { - len = 1; - set_mml(&opt->len, 1, len); + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !IS_CCLASS_NOT(cc)) || (!z && IS_CCLASS_NOT(cc))) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + set_mml(&opt->len, 1, 1); } } break; case N_CTYPE: { - int c; - int len, min, max; + int i, min, max; - min = ONIGENC_MBC_MAXLEN_DIST(env->enc); - max = 0; + max = ONIGENC_MBC_MAXLEN_DIST(env->enc); -#define IS_WORD_HEAD_BYTE(enc,b) \ - (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ - : ONIGENC_IS_MBC_HEAD(enc,b)) + if (max == 1) { + min = 1; - switch (NCTYPE(node).type) { - case CTYPE_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } + switch (NCTYPE(node).type) { + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! 
ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + break; + + case CTYPE_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + break; } - break; - - case CTYPE_NOT_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; } - + else { + min = ONIGENC_MBC_MINLEN(env->enc); + } set_mml(&opt->len, min, max); } break; case N_ANYCHAR: { - OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, min, max); } break; @@ -4223,36 +4303,20 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->len == 0) return 0; - reg->exact = onig_strdup(e->s, e->s + e->len); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - - reg->exact_end = reg->exact + e->len; - if (e->ignore_case) { - UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int len, low_len, i, j, alloc_size; - - alloc_size = e->len; - i = j = 0; - while (i < e->len) { - low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); - len = enc_len(reg->enc, e->s[i]); - if (low_len > alloc_size - i) { - reg->exact = xrealloc(reg->exact, alloc_size * 2); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - alloc_size *= 2; - } - - xmemcpy(&(reg->exact[j]), buf, low_len); - i += len; - j += low_len; - } - reg->exact_end = reg->exact + j; + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + xmemcpy(reg->exact, e->s, e->len); + reg->exact_end = reg->exact + e->len; reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } else { int allow_reverse; + reg->exact = k_strdup(e->s, e->s + e->len); + 
CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + reg->exact_end = reg->exact + e->len; + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) allow_reverse = 1; else @@ -4260,7 +4324,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, reg->map, &(reg->int_map)); if (r) return r; @@ -4320,6 +4384,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.enc = reg->enc; env.options = reg->options; + env.ambig_flag = reg->ambig_flag; env.scan_env = scan_env; clear_mml(&env.mmd); @@ -4337,7 +4402,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) } if (opt.exb.len > 0 || opt.exm.len > 0) { - select_opt_exact_info(&opt.exb, &opt.exm); + select_opt_exact_info(reg->enc, &opt.exb, &opt.exm); if (opt.map.value > 0 && comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { goto set_map; @@ -4474,17 +4539,26 @@ print_optimize_info(FILE* f, regex_t* reg) fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { - int i, n = 0; + int c, i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { + c = 0; fputc('[', f); - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - if (reg->map[i] && enc_len(reg->enc, i) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, i)) - fputc(i, f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (reg->map[i] != 0) { + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } + } fprintf(f, "]\n"); } } @@ -4500,7 +4574,7 @@ onig_free_body(regex_t* reg) if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map); if (IS_NOT_NULL(reg->int_map_backward)) 
xfree(reg->int_map_backward); if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); - if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain); + if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain); #ifdef USE_NAMED_GROUP onig_names_free(reg); @@ -4523,7 +4597,7 @@ onig_free(regex_t* reg) xfree(from);\ } while (0) -static void +extern void onig_transfer(regex_t* to, regex_t* from) { THREAD_ATOMIC_START; @@ -4537,7 +4611,7 @@ onig_transfer(regex_t* to, regex_t* from) }\ } while (0) -static void +extern void onig_chain_link_add(regex_t* to, regex_t* add) { THREAD_ATOMIC_START; @@ -4573,11 +4647,12 @@ onig_clone(regex_t** to, regex_t* from) int r, size; regex_t* reg; - if (ONIG_STATE(from) == ONIG_STATE_NORMAL) { - from->state++; /* increment as search counter */ - if (IS_NOT_NULL(from->chain)) { +#ifdef USE_MULTI_THREAD_SYSTEM + if (ONIG_STATE(from) >= ONIG_STATE_NORMAL) { + ONIG_STATE_INC(from); + if (IS_NOT_NULL(from->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { onig_chain_reduce(from); - from->state++; + ONIG_STATE_INC(from); } } else { @@ -4587,18 +4662,20 @@ onig_clone(regex_t** to, regex_t* from) return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; THREAD_PASS; } - from->state++; /* increment as search counter */ + ONIG_STATE_INC(from); } +#endif /* USE_MULTI_THREAD_SYSTEM */ - r = onig_alloc_init(&reg, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT); + r = onig_alloc_init(&reg, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + from->enc, ONIG_SYNTAX_DEFAULT); if (r != 0) { - from->state--; + ONIG_STATE_DEC(from); return r; } xmemcpy(reg, from, sizeof(onig_t)); - reg->state = ONIG_STATE_NORMAL; reg->chain = (regex_t* )NULL; + reg->state = ONIG_STATE_NORMAL; if (from->p) { reg->p = (UChar* )xmalloc(reg->alloc); @@ -4631,12 +4708,12 @@ onig_clone(regex_t** to, regex_t* from) reg->name_table = names_clone(from); /* names_clone is not implemented */ #endif - from->state--; + ONIG_STATE_DEC(from); *to = reg; return 0; mem_error: - from->state--; + ONIG_STATE_DEC(from);
return ONIGERR_MEMORY; } #endif @@ -4649,7 +4726,7 @@ static void print_tree P_((FILE* f, Node* node)); #endif extern int -onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo) { #define COMPILE_INIT_SIZE 20 @@ -4800,7 +4877,7 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, } extern int -onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, +onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo) { @@ -4821,8 +4898,8 @@ onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, static int onig_inited = 0; extern int -onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax) +onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, + OnigEncoding enc, OnigSyntaxType* syntax) { if (! 
onig_inited) onig_init(); @@ -4832,6 +4909,7 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, *reg = (regex_t* )xmalloc(sizeof(regex_t)); if (IS_NULL(*reg)) return ONIGERR_MEMORY; + (*reg)->state = ONIG_STATE_MODIFY; if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) { option |= syntax->options; @@ -4840,7 +4918,6 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, else option |= syntax->options; - (*reg)->state = ONIG_STATE_NORMAL; (*reg)->enc = enc; (*reg)->options = option; (*reg)->syntax = syntax; @@ -4855,11 +4932,14 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, (*reg)->used = 0; (*reg)->name_table = (void* )NULL; + (*reg)->ambig_flag = ambig_flag; + (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc); + return 0; } extern int -onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, +onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo) { @@ -4867,7 +4947,8 @@ onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; - r = onig_alloc_init(reg, option, enc, syntax); + r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + enc, syntax); if (r) return r; r = onig_compile(*reg, pattern, pattern_end, einfo); @@ -4899,9 +4980,14 @@ onig_init() return 0; } + extern int onig_end() { + extern int onig_free_shared_cclass_table(); + + THREAD_ATOMIC_START; + #ifdef ONIG_DEBUG_STATISTICS onig_print_statistics(stderr); #endif @@ -4910,7 +4996,13 @@ onig_end() onig_free_node_list(); #endif +#ifdef USE_SHARED_CCLASS_TABLE + onig_free_shared_cclass_table(); +#endif + onig_inited = 0; + + THREAD_ATOMIC_END; return 0; } @@ -4940,6 +5032,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, + { 
OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL }, { OP_ANYCHAR, "anychar", ARG_NON }, { OP_ANYCHAR_ML, "anychar-ml", ARG_NON }, { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, @@ -4964,7 +5057,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREF3, "backref3", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, @@ -4985,6 +5078,8 @@ OnigOpInfoType OnigOpInfo[] = { { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, @@ -5051,7 +5146,8 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) } extern void -onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, + OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; @@ -5068,8 +5164,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) case ARG_NON: break; case ARG_RELADDR: - addr = *((RelAddrType* )bp); - bp += SIZE_RELADDR; + GET_RELADDR_INC(addr, bp); fprintf(f, ":(%d)", addr); break; case ARG_ABSADDR: @@ -5143,7 +5238,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case OP_EXACT1_IC: - p_string(f, 1, bp++); + len = enc_len(enc, bp); + p_string(f, len, bp); + bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); @@ -5189,8 +5286,24 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) fprintf(f, ":%d:%d:%d", n, (int 
)code, len); break; - case OP_BACKREF_MULTI: + case OP_CCLASS_NODE: + { + CClassNode *cc; + + GET_POINTER_INC(cc, bp); + n = bitset_on_num(cc->bs); + fprintf(f, ":%u:%d", (unsigned int )cc, n); + } + break; + + case OP_BACKREFN_IC: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { @@ -5258,7 +5371,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) else fputs(" ", f); } - onig_print_compiled_byte_code(f, bp, &bp); + onig_print_compiled_byte_code(f, bp, &bp, reg->enc); } fprintf(f, "\n"); @@ -5310,7 +5423,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case N_CCLASS: fprintf(f, "", (int )node); - if (NCCLASS(node).not) fputs(" not", f); + if (IS_CCLASS_NOT(&NCCLASS(node))) fputs(" not", f); if (NCCLASS(node).mbuf) { BBuf* bbuf = NCCLASS(node).mbuf; for (i = 0; i < bbuf->used; i++) { @@ -5318,12 +5431,6 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "%0x", bbuf->p[i]); } } -#if 0 - fprintf(f, "\n"); - Indent(f, indent); - for (i = 0; i < SINGLE_BYTE_SIZE; i++) - fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); -#endif break; case N_CTYPE: diff --git a/ext/mbstring/oniguruma/regenc.c b/ext/mbstring/oniguruma/regenc.c index 21598ca7c7d..a767ca60b6a 100644 --- a/ext/mbstring/oniguruma/regenc.c +++ b/ext/mbstring/oniguruma/regenc.c @@ -1,11 +1,33 @@ /********************************************************************** - regenc.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#include "regenc.h" +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "regint.h" OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; @@ -29,33 +51,33 @@ onigenc_set_default_encoding(OnigEncoding enc) } extern UChar* -onigenc_get_right_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) +onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s) { UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); if (p < s) { - p += enc_len(enc, *p); + p += enc_len(enc, p); } return p; } extern UChar* onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, - UChar* start, UChar* s, UChar** prev) + const UChar* start, const UChar* s, const UChar** prev) { UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); if (p < s) { - if (prev) *prev = p; - p += enc_len(enc, *p); + if (prev) *prev = (const UChar* )p; + p += enc_len(enc, p); } else { - if (prev) *prev = (UChar* )NULL; /* Sorry */ + if (prev) *prev = (const UChar* )NULL; /* Sorry */ } return p; } extern UChar* -onigenc_get_prev_char_head(OnigEncoding enc, UChar* start, UChar* s) +onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s) { if (s <= start) return (UChar* )NULL; @@ -64,7 +86,7 @@ onigenc_get_prev_char_head(OnigEncoding enc, UChar* start, UChar* s) } extern UChar* -onigenc_step_back(OnigEncoding enc, UChar* start, UChar* s, int n) +onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) { while (ONIG_IS_NOT_NULL(s) && n-- > 0) { if (s <= start) @@ -72,20 +94,127 @@ onigenc_step_back(OnigEncoding enc, UChar* start, UChar* s, int n) s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1); } - return s; + return (UChar* )s; } +extern UChar* +onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) +{ + UChar* q = (UChar* )p; + while (n-- > 0) { + q += ONIGENC_MBC_ENC_LEN(enc, q); + } + return (q <= end ? 
q : NULL); +} + +extern int +onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end) +{ + int n = 0; + UChar* q = (UChar* )p; + + while (q < end) { + q += ONIGENC_MBC_ENC_LEN(enc, q); + n++; + } + return n; +} + +extern int +onigenc_strlen_null(OnigEncoding enc, const UChar* s) +{ + int n = 0; + UChar* p = (UChar* )s; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return n; + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return n; + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + n++; + } +} + +extern int +onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) +{ + UChar* start = (UChar* )s; + UChar* p = (UChar* )s; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return (int )(p - start); + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return (int )(p - start); + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + } +} #ifndef ONIG_RUBY_M17N #ifndef NOT_RUBY + #define USE_APPLICATION_TO_LOWER_CASE_TABLE + +unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 
0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 +}; #endif -UChar* OnigEncAsciiToLowerCaseTable = (UChar* )0; +const UChar* OnigEncAsciiToLowerCaseTable = (const UChar* )0; #ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE -static UChar BuiltInAsciiToLowerCaseTable[] = { +static const UChar BuiltInAsciiToLowerCaseTable[] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -121,23 +250,61 @@ static UChar BuiltInAsciiToLowerCaseTable[] = { }; #endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncAsciiToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', 
'\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', 
'\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', +}; +#endif + unsigned short OnigEncAsciiCtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 
0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -156,10 +323,82 @@ unsigned short OnigEncAsciiCtypeTable[256] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; +UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', + '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', 
'\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' +}; + +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', 
'\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377', +}; +#endif + extern void -onigenc_set_default_caseconv_table(UChar* table) +onigenc_set_default_caseconv_table(const UChar* table) { - if (table == (UChar* )0) { + if (table == (const UChar* )0) { #ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE table = BuiltInAsciiToLowerCaseTable; #else @@ -173,47 +412,240 @@ onigenc_set_default_caseconv_table(UChar* table) } extern UChar* -onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) +onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s) { return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } +OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { + { 0x41, 0x61 }, + { 0x42, 0x62 }, + { 0x43, 0x63 }, + { 0x44, 0x64 }, + { 0x45, 0x65 }, + { 0x46, 0x66 }, + { 0x47, 0x67 }, + { 0x48, 0x68 }, + { 0x49, 0x69 }, + { 0x4a, 0x6a }, + { 0x4b, 0x6b }, + { 0x4c, 0x6c }, + { 0x4d, 0x6d }, + { 0x4e, 0x6e }, + { 0x4f, 0x6f }, + { 0x50, 0x70 }, + { 0x51, 0x71 }, + { 0x52, 0x72 }, + { 0x53, 0x73 }, + { 0x54, 0x74 }, + { 0x55, 0x75 }, + { 0x56, 0x76 }, + { 
0x57, 0x77 }, + { 0x58, 0x78 }, + { 0x59, 0x79 }, + { 0x5a, 0x7a }, + + { 0x61, 0x41 }, + { 0x62, 0x42 }, + { 0x63, 0x43 }, + { 0x64, 0x44 }, + { 0x65, 0x45 }, + { 0x66, 0x46 }, + { 0x67, 0x47 }, + { 0x68, 0x48 }, + { 0x69, 0x49 }, + { 0x6a, 0x4a }, + { 0x6b, 0x4b }, + { 0x6c, 0x4c }, + { 0x6d, 0x4d }, + { 0x6e, 0x4e }, + { 0x6f, 0x4f }, + { 0x70, 0x50 }, + { 0x71, 0x51 }, + { 0x72, 0x52 }, + { 0x73, 0x53 }, + { 0x74, 0x54 }, + { 0x75, 0x55 }, + { 0x76, 0x56 }, + { 0x77, 0x57 }, + { 0x78, 0x58 }, + { 0x79, 0x59 }, + { 0x7a, 0x5a } +}; + extern int -onigenc_nothing_get_all_fold_match_code(OnigCodePoint** codes) +onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else { + return 0; + } +} + +extern int +onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { return 0; } extern int -onigenc_nothing_get_fold_match_info(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) +onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) { - return -1; + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 
0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; } extern int -onigenc_nothing_get_ctype_code_range(int ctype, int* nsb, int* nmb, - OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) +onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { - return -1; + static OnigCompAmbigCodes folds[] = { + { 2, 0xdf, {{ 2, { 0x53, 0x53 } }, { 2, { 0x73, 0x73} } } } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = folds; + return sizeof(folds) / sizeof(OnigCompAmbigCodes); + } + else + return 0; +} + +extern int +onigenc_not_support_get_ctype_code_range(int ctype, + OnigCodePoint* sbr[], OnigCodePoint* mbr[]) +{ + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end) +{ + if (p < end) { + if (*p == 0x0a) return 1; + } + return 0; } /* for single byte encodings */ extern int -onigenc_ascii_mbc_to_lower(UChar* p, UChar* lower) +onigenc_ascii_mbc_to_normalize(OnigAmbigType flag, const UChar** p, const UChar*end, + UChar* lower) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p); + } + else { + *lower = **p; + } + + (*p)++; return 1; /* return byte length of converted char to lower */ } extern int -onigenc_ascii_mbc_is_case_ambig(UChar* p) 
+onigenc_ascii_is_mbc_ambiguous(OnigAmbigType flag, + const UChar** pp, const UChar* end) { - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + const UChar* p = *pp; + + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } +} + +extern int +onigenc_single_byte_mbc_enc_len(const UChar* p) +{ + return 1; } extern OnigCodePoint -onigenc_single_byte_mbc_to_code(UChar* p, UChar* end) +onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end) { return (OnigCodePoint )(*p); } @@ -238,26 +670,31 @@ onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf) } extern UChar* -onigenc_single_byte_left_adjust_char_head(UChar* start, UChar* s) +onigenc_single_byte_left_adjust_char_head(const UChar* start, const UChar* s) { - return s; + return (UChar* )s; } extern int -onigenc_single_byte_is_allowed_reverse_match(UChar* s, UChar* end) +onigenc_always_true_is_allowed_reverse_match(const UChar* s, const UChar* end) { return TRUE; } +extern int +onigenc_always_false_is_allowed_reverse_match(const UChar* s, const UChar* end) +{ + return FALSE; +} + extern OnigCodePoint -onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) +onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(enc, c); - n = c; + len = enc_len(enc, p); + n = (OnigCodePoint )(*p++); if (len == 1) return n; for (i = 1; i < len; i++) { @@ -269,33 +706,52 @@ onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) } extern int -onigenc_mbn_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* lower) +onigenc_mbn_mbc_to_normalize(OnigEncoding enc, OnigAmbigType flag, + const UChar** pp, const UChar* end, UChar* lower) { int len; + const UChar *p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = 
ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted to lower char */ } } extern int -onigenc_mbn_mbc_is_case_ambig(UChar* p) +onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + const UChar** pp, const UChar* end) { - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + const UChar* p = *pp; + if (ONIGENC_IS_MBC_ASCII(p)) { + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } + } + + (*pp) += enc_len(enc, p); return FALSE; } @@ -360,8 +816,8 @@ onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; + if (enc_len(enc, buf) != (p - buf)) + return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } @@ -383,23 +839,21 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; + if (enc_len(enc, buf) != (p - buf)) + return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } extern int -onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, +onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? 
TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -412,16 +866,14 @@ onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, } extern int -onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, +onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb4_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -434,39 +886,22 @@ onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, } extern int -onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) +onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end, + const UChar* sascii /* ascii */, int n) { - static OnigCodePoint list[] = { 0xdf }; - *codes = list; - return 1; -} + int x, c; -extern int -onigenc_get_fold_match_info_ss_0xdf(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) -{ - /* German alphabet ess-tsett(U+00DF) */ - static OnigEncFoldMatchInfo ss = { - 3, - { 1, 2, 2 }, - { "\337", "ss", "SS" } /* 0337: 0xdf */ - }; + while (n-- > 0) { + if (p >= end) return (int )(*sascii); - if (p >= end) return -1; + c = (int )ONIGENC_MBC_TO_CODE(enc, p, end); + x = *sascii - c; + if (x) return x; - if (*p == 0xdf) { - *info = &ss; - return 1; + sascii++; + p += enc_len(enc, p); } - else if (p + 1 < end) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &ss; - return 2; - } - } - - return -1; /* is not a fold string. 
*/ + return 0; } #else /* ONIG_RUBY_M17N */ @@ -475,6 +910,10 @@ extern int onigenc_is_code_ctype(OnigEncoding enc, OnigCodePoint code, int ctype) { switch (ctype) { + case ONIGENC_CTYPE_NEWLINE: + if (code == 0x0a) return 1; + break; + case ONIGENC_CTYPE_ALPHA: return m17n_isalpha(enc, code); break; @@ -548,12 +987,22 @@ onigenc_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* buf) } extern int -onigenc_mbc_is_case_ambig(OnigEncoding enc, UChar* p) +onigenc_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end) { - unsigned int c = m17n_codepoint(enc, p, p + enc_len(enc, *p)); + int len; + unsigned int c; + UChar* p = *pp; + + len = enc_len(enc, *p); + (*pp) += len; + c = m17n_codepoint(enc, p, p + len); + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + if (m17n_isupper(enc, c) || m17n_islower(enc, c)) + return TRUE; + } - if (m17n_isupper(enc, c) || m17n_islower(enc, c)) - return TRUE; return FALSE; } @@ -575,7 +1024,8 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) } extern int -onigenc_is_allowed_reverse_match(OnigEncoding enc, UChar* s, UChar* end) +onigenc_is_allowed_reverse_match(OnigEncoding enc, + const UChar* s, const UChar* end) { return ONIGENC_IS_SINGLEBYTE(enc); } diff --git a/ext/mbstring/oniguruma/regenc.h b/ext/mbstring/oniguruma/regenc.h index e0c6211d32d..510455146ef 100644 --- a/ext/mbstring/oniguruma/regenc.h +++ b/ext/mbstring/oniguruma/regenc.h @@ -1,12 +1,33 @@ -/********************************************************************** - - regenc.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ #ifndef REGENC_H #define REGENC_H +/********************************************************************** + regenc.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 
2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #ifndef RUBY_PLATFORM #include "config.h" @@ -26,15 +47,11 @@ #endif /* error codes */ -/* internal error */ -#define ONIGERR_MEMORY -5 -#define ONIGERR_TYPE_BUG -6 -/* syntax error [-400, -999] */ -#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 -#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 +#define ONIGENCERR_MEMORY -5 +#define ONIGENCERR_TYPE_BUG -6 +#define ONIGENCERR_INVALID_WIDE_CHAR_VALUE -400 +#define ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE -401 -#define ONIG_NEWLINE '\n' -#define ONIG_IS_NEWLINE(c) ((c) == ONIG_NEWLINE) #define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0) #define ONIG_IS_NOT_NULL(p) (((void*)(p)) != (void*)0) #define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL @@ -47,47 +64,79 @@ #else /* ONIG_RUBY_M17N */ +#define USE_UNICODE_FULL_RANGE_CTYPE + #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int onigenc_nothing_get_all_fold_match_code P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_nothing_get_fold_match_info P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); -ONIG_EXTERN int onigenc_nothing_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); +ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], OnigCodePoint* mbr[])); +ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end)); /* methods for single byte encoding */ -ONIG_EXTERN int onigenc_ascii_mbc_to_lower P_((UChar* p, UChar* lower)); 
-ONIG_EXTERN int onigenc_ascii_mbc_is_case_ambig P_((UChar* p)); -ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((UChar* p, UChar* end)); +ONIG_EXTERN int onigenc_ascii_mbc_to_normalize P_((OnigAmbigType flag, const UChar** p, const UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_ascii_is_mbc_ambiguous P_((OnigAmbigType flag, const UChar** p, const UChar* end)); +ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((const UChar* p)); +ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end)); ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf)); -ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((UChar* start, UChar* s)); -ONIG_EXTERN int onigenc_single_byte_is_allowed_reverse_match P_((UChar* s, UChar* end)); +ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s)); +ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); +ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); /* methods for multi byte encoding */ -ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, UChar* p, UChar* end)); -ONIG_EXTERN int onigenc_mbn_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* lower)); -ONIG_EXTERN int onigenc_mbn_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); +ONIG_EXTERN int onigenc_mbn_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, const UChar** p, const UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_mbn_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, const UChar** p, const UChar* end)); ONIG_EXTERN int onigenc_mb2_code_to_mbclen 
P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb2_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb4_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_get_fold_match_info_ss_0xdf P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); + +/* in enc/unicode.c */ +ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], OnigCodePoint* mbr[])); + + +#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ + OnigEncISO_8859_1_ToLowerCaseTable[c] +#define ONIGENC_ISO_8859_1_TO_UPPER_CASE(c) \ + OnigEncISO_8859_1_ToUpperCaseTable[c] +#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ + ((OnigEnc_Unicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) + +ONIG_EXTERN UChar OnigEncISO_8859_1_ToLowerCaseTable[]; +ONIG_EXTERN UChar OnigEncISO_8859_1_ToUpperCaseTable[]; +ONIG_EXTERN unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; +ONIG_EXTERN OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; #endif /* is not ONIG_RUBY_M17N */ +ONIG_EXTERN int +onigenc_with_ascii_strncmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii 
*/, int n)); +ONIG_EXTERN UChar* +onigenc_step P_((OnigEncoding enc, const UChar* p, const UChar* end, int n)); + +/* defined in regexec.c, but used in enc/xxx.c */ +extern int onig_is_in_code_range P_((const UChar* p, OnigCodePoint code)); ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -ONIG_EXTERN UChar* OnigEncAsciiToLowerCaseTable; +ONIG_EXTERN const UChar* OnigEncAsciiToLowerCaseTable; +ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[]; ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] +#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] #define ONIGENC_IS_ASCII_CODE_CTYPE(code,ctype) \ ((OnigEncAsciiCtypeTable[code] & ctype) != 0) #define ONIGENC_IS_ASCII_CODE_CASE_AMBIG(code) \ diff --git a/ext/mbstring/oniguruma/regerror.c b/ext/mbstring/oniguruma/regerror.c index 5a6c31b82ec..560b5e12c56 100644 --- a/ext/mbstring/oniguruma/regerror.c +++ b/ext/mbstring/oniguruma/regerror.c @@ -1,10 +1,32 @@ /********************************************************************** - regerror.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #include /* for vsnprintf() */ @@ -56,8 +78,8 @@ onig_error_code_to_format(int code) p = "empty char-class"; break; case ONIGERR_PREMATURE_END_OF_CHAR_CLASS: p = "premature end of char-class"; break; - case ONIGERR_END_PATTERN_AT_BACKSLASH: - p = "end pattern at backslash"; break; + case ONIGERR_END_PATTERN_AT_ESCAPE: + p = "end pattern at escape"; break; case ONIGERR_END_PATTERN_AT_META: p = "end pattern at meta"; break; case ONIGERR_END_PATTERN_AT_CONTROL: @@ -145,7 +167,9 @@ onig_error_code_to_format(int code) case ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY: p = "group number is too big for capture history"; break; case ONIGERR_INVALID_CHAR_PROPERTY_NAME: - p = "invalid character property name"; break; + p = "invalid character property name {%n}"; break; + case ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION: + p = "not supported encoding combination"; break; case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: p = "over thread pass limit count"; break; @@ -184,6 +208,7 @@ onig_error_code_to_str(s, code, va_alist) case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: case ONIGERR_INVALID_GROUP_NAME: case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: 
einfo = va_arg(vargs, OnigErrorInfo*); len = einfo->par_end - einfo->par; q = onig_error_code_to_format(code); @@ -218,7 +243,7 @@ onig_error_code_to_str(s, code, va_alist) default: q = onig_error_code_to_format(code); - len = strlen(q); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, q); xmemcpy(s, q, len); s[len] = '\0'; break; @@ -245,7 +270,8 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) #endif { int n, need, len; - UChar *p, *s; + UChar *p, *s, *bp; + char bs[6]; va_list args; va_init_list(args, fmt); @@ -256,29 +282,41 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) if (n + need < bufsize) { strcat(buf, ": /"); - s = buf + strlen(buf); + s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; while (p < (UChar* )pat_end) { - if (*p == MC_ESC) { + if (*p == MC_ESC(enc)) { *s++ = *p++; - len = enc_len(enc, *p); + len = enc_len(enc, p); while (len-- > 0) *s++ = *p++; } else if (*p == '/') { - *s++ = MC_ESC; + *s++ = (unsigned char )MC_ESC(enc); *s++ = *p++; } - else if (ONIGENC_IS_MBC_HEAD(enc, *p)) { - len = enc_len(enc, *p); - while (len-- > 0) *s++ = *p++; + else if (ONIGENC_IS_MBC_HEAD(enc, p)) { + len = enc_len(enc, p); + if (ONIGENC_MBC_MINLEN(enc) == 1) { + while (len-- > 0) *s++ = *p++; + } + else { /* for UTF16 */ + int blen; + + while (len-- > 0) { + sprintf(bs, "\\%03o", *p++ & 0377); + blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (blen-- > 0) *s++ = *bp++; + } + } } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { - char b[5]; - sprintf(b, "\\%03o", *p & 0377); - len = strlen(b); - while (len-- > 0) *s++ = *p++; + sprintf(bs, "\\%03o", *p++ & 0377); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (len-- > 0) *s++ = *bp++; } else { *s++ = *p++; diff --git a/ext/mbstring/oniguruma/regex.c b/ext/mbstring/oniguruma/regex.c deleted file mode 100644 index 2d79d000a8b..00000000000 --- 
a/ext/mbstring/oniguruma/regex.c +++ /dev/null @@ -1,26 +0,0 @@ -/********************************************************************** - - regex.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ -/* - * Source wrapper for Ruby. - */ -#include "regint.h" -#include "regex.h" - -#include "regparse.c" -#include "regcomp.c" -#include "regexec.c" -#include "regenc.c" -#include "reggnu.c" -#include "regerror.c" - -#ifndef ONIG_RUBY_M17N -#include "enc/ascii.c" -#include "enc/utf8.c" -#include "enc/euc_jp.c" -#include "enc/sjis.c" -#endif diff --git a/ext/mbstring/oniguruma/regexec.c b/ext/mbstring/oniguruma/regexec.c index 2ded602e15a..2c082de423f 100644 --- a/ext/mbstring/oniguruma/regexec.c +++ b/ext/mbstring/oniguruma/regexec.c @@ -1,53 +1,152 @@ /********************************************************************** - regexec.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" +#ifdef USE_CAPTURE_HISTORY +static void history_tree_free(OnigCaptureTreeNode* node); + static void -region_list_clear(OnigRegion** list) +history_tree_clear(OnigCaptureTreeNode* node) { int i; - if (IS_NOT_NULL(list)) { - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(list[i])) { - xfree(list[i]); - list[i] = (OnigRegion* )0; + if (IS_NOT_NULL(node)) { + for (i = 0; i < node->num_childs; i++) { + if (IS_NOT_NULL(node->childs[i])) { + history_tree_free(node->childs[i]); } } + for (i = 0; i < node->allocated; i++) { + node->childs[i] = (OnigCaptureTreeNode* )0; + } + node->num_childs = 0; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + node->group = -1; } } static void -region_list_free(OnigRegion* r) +history_tree_free(OnigCaptureTreeNode* node) { - if (IS_NOT_NULL(r->list)) { - region_list_clear(r->list); - xfree(r->list); - r->list = (OnigRegion** )0; + history_tree_clear(node); + xfree(node); +} + +static void +history_root_free(OnigRegion* r) +{ + if (IS_NOT_NULL(r->history_root)) { + history_tree_free(r->history_root); + r->history_root = (OnigCaptureTreeNode* )0; } } -static OnigRegion** -region_list_new() +static OnigCaptureTreeNode* +history_node_new() +{ + OnigCaptureTreeNode* node; + + node = (OnigCaptureTreeNode* )xmalloc(sizeof(OnigCaptureTreeNode)); + CHECK_NULL_RETURN(node); + node->childs = (OnigCaptureTreeNode** )0; + node->allocated = 0; + 
node->num_childs = 0; + node->group = -1; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + + return node; +} + +static int +history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) +{ +#define HISTORY_TREE_INIT_ALLOC_SIZE 8 + + if (parent->num_childs >= parent->allocated) { + int n, i; + + if (IS_NULL(parent->childs)) { + n = HISTORY_TREE_INIT_ALLOC_SIZE; + parent->childs = + (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); + } + else { + n = parent->allocated * 2; + parent->childs = + (OnigCaptureTreeNode** )xrealloc(parent->childs, + sizeof(OnigCaptureTreeNode*) * n); + } + CHECK_NULL_RETURN_VAL(parent->childs, ONIGERR_MEMORY); + for (i = parent->allocated; i < n; i++) { + parent->childs[i] = (OnigCaptureTreeNode* )0; + } + parent->allocated = n; + } + + parent->childs[parent->num_childs] = child; + parent->num_childs++; + return 0; +} + +static OnigCaptureTreeNode* +history_tree_clone(OnigCaptureTreeNode* node) { int i; - OnigRegion** list; + OnigCaptureTreeNode *clone, *child; - list = (OnigRegion** )xmalloc(sizeof(OnigRegion*) - * (ONIG_MAX_CAPTURE_HISTORY_GROUP + 1)); - CHECK_NULL_RETURN(list); - for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - list[i] = (OnigRegion* )0; + clone = history_node_new(); + CHECK_NULL_RETURN(clone); + + clone->beg = node->beg; + clone->end = node->end; + for (i = 0; i < node->num_childs; i++) { + child = history_tree_clone(node->childs[i]); + if (IS_NULL(child)) { + history_tree_free(clone); + return (OnigCaptureTreeNode* )0; + } + history_tree_add_child(clone, child); } - return list; + return clone; } +extern OnigCaptureTreeNode* +onig_get_capture_tree(OnigRegion* region) +{ + return region->history_root; +} +#endif /* USE_CAPTURE_HISTORY */ + extern void onig_region_clear(OnigRegion* region) { @@ -56,14 +155,14 @@ onig_region_clear(OnigRegion* region) for (i = 0; i < region->num_regs; i++) { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } - 
region_list_clear(region->list); +#ifdef USE_CAPTURE_HISTORY + history_root_free(region); +#endif } extern int onig_region_resize(OnigRegion* region, int n) { - int i; - region->num_regs = n; if (n < ONIG_NREGION) @@ -88,92 +187,43 @@ onig_region_resize(OnigRegion* region, int n) region->allocated = n; } - for (i = 0; i < region->num_regs; i++) { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } - - if (IS_NOT_NULL(region->list)) - region_list_clear(region->list); - return 0; } -static int -region_ensure_size(OnigRegion* region, int n) +extern int +onig_region_resize_clear(OnigRegion* region, int n) { - int i, new_size; - - if (region->allocated >= n) - return 0; - - new_size = region->allocated; - if (new_size == 0) - new_size = ONIG_NREGION; - while (new_size < n) - new_size *= 2; - - if (region->allocated == 0) { - region->beg = (int* )xmalloc(new_size * sizeof(int)); - region->end = (int* )xmalloc(new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - else if (region->allocated < new_size) { - region->beg = (int* )xrealloc(region->beg, new_size * sizeof(int)); - region->end = (int* )xrealloc(region->end, new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - - for (i = region->num_regs; i < n; i++) { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } - return 0; -} - -static int -region_list_add_entry(OnigRegion* region, int group, int start, int end) -{ - int r, pos; - OnigRegion** list; - - if (group > ONIG_MAX_CAPTURE_HISTORY_GROUP) - return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; - - if (IS_NULL(region->list)) { - region->list = region_list_new(); - CHECK_NULL_RETURN_VAL(region->list, ONIGERR_MEMORY); - } - - list = region->list; - if (IS_NULL(list[group])) { - list[group] = onig_region_new(); - CHECK_NULL_RETURN_VAL(list[group], ONIGERR_MEMORY); - } - - r = 
region_ensure_size(list[group], list[group]->num_regs + 1); + int r; + + r = onig_region_resize(region, n); if (r != 0) return r; + onig_region_clear(region); + return 0; +} + +extern int +onig_region_set(OnigRegion* region, int at, int beg, int end) +{ + if (at < 0) return ONIGERR_INVALID_ARGUMENT; - pos = list[group]->num_regs; - list[group]->beg[pos] = start; - list[group]->end[pos] = end; - list[group]->num_regs++; - + if (at >= region->allocated) { + int r = onig_region_resize(region, at + 1); + if (r < 0) return r; + } + + region->beg[at] = beg; + region->end[at] = end; return 0; } -static void +extern void onig_region_init(OnigRegion* region) { - region->num_regs = 0; - region->allocated = 0; - region->beg = (int* )0; - region->end = (int* )0; - region->list = (OnigRegion** )0; + region->num_regs = 0; + region->allocated = 0; + region->beg = (int* )0; + region->end = (int* )0; + region->history_root = (OnigCaptureTreeNode* )0; } extern OnigRegion* @@ -195,7 +245,9 @@ onig_region_free(OnigRegion* r, int free_self) if (r->end) xfree(r->end); r->allocated = 0; } - region_list_free(r); +#ifdef USE_CAPTURE_HISTORY + history_root_free(r); +#endif if (free_self) xfree(r); } } @@ -227,34 +279,19 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) } to->num_regs = from->num_regs; - if (IS_NOT_NULL(from->list)) { - if (IS_NULL(to->list)) { - to->list = region_list_new(); - } +#ifdef USE_CAPTURE_HISTORY + history_root_free(to); - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(from->list[i])) { - if (IS_NULL(to->list[i])) - to->list[i] = onig_region_new(); - - onig_region_copy(to->list[i], from->list[i]); - } - else { - if (IS_NOT_NULL(to->list[i])) { - xfree(to->list[i]); - to->list[i] = (OnigRegion* )0; - } - } - } + if (IS_NOT_NULL(from->history_root)) { + to->history_root = history_tree_clone(from->history_root); } - else - region_list_free(to); +#endif } /** stack **/ #define INVALID_STACK_INDEX -1 -typedef int StackIndex; +typedef long 
StackIndex; typedef struct _StackType { unsigned int type; @@ -324,7 +361,7 @@ typedef struct { int stack_n; OnigOptionType options; OnigRegion* region; - UChar* start; /* search start position (for \G: BEGIN_POSITION) */ + const UChar* start; /* search start position (for \G: BEGIN_POSITION) */ } MatchArg; #define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ @@ -362,11 +399,26 @@ typedef struct { };\ } while(0) +static unsigned int MatchStackLimitSize = DEFAULT_MATCH_STACK_LIMIT_SIZE; + +extern unsigned int +onig_get_match_stack_limit_size(void) +{ + return MatchStackLimitSize; +} + +extern int +onig_set_match_stack_limit_size(unsigned int size) +{ + MatchStackLimitSize = size; + return 0; +} + static int stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType** arg_stk, StackType* stk_alloc, MatchArg* msa) { - int n; + unsigned int n; StackType *x, *stk_base, *stk_end, *stk; stk_base = *arg_stk_base; @@ -385,7 +437,12 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, } else { n *= 2; - if (n > MATCH_STACK_LIMIT_SIZE) return ONIGERR_MATCH_STACK_LIMIT_OVER; + if (MatchStackLimitSize != 0 && n > MatchStackLimitSize) { + if ((unsigned int )(stk_end - stk_base) == MatchStackLimitSize) + return ONIGERR_MATCH_STACK_LIMIT_OVER; + else + n = MatchStackLimitSize; + } x = (StackType* )xrealloc(stk_base, sizeof(StackType) * n); if (IS_NULL(x)) { STACK_SAVE; @@ -831,24 +888,25 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(s1,ps2,len) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_IC(ambig_flag,s1,ps2,len) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ goto fail; \ } while(0) -static int string_cmp_ic(OnigEncoding enc, +static int string_cmp_ic(OnigEncoding enc, int ambig_flag, UChar* s1, UChar** ps2, int mblen) { - UChar buf1[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar buf2[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar *p1, *p2, 
*end, *s2; + UChar buf1[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar buf2[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar *p1, *p2, *end, *s2, *end2; int len1, len2; - s2 = *ps2; - end = s1 + mblen; + s2 = *ps2; + end = s1 + mblen; + end2 = s2 + mblen; while (s1 < end) { - len1 = ONIGENC_MBC_TO_LOWER(enc, s1, buf1); - len2 = ONIGENC_MBC_TO_LOWER(enc, s2, buf2); + len1 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s1, end, buf1); + len2 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s2, end2, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -857,9 +915,6 @@ static int string_cmp_ic(OnigEncoding enc, p1++; p2++; } - - s1 += enc_len(enc, *s1); - s2 += enc_len(enc, *s2); } *ps2 = s2; @@ -875,8 +930,8 @@ static int string_cmp_ic(OnigEncoding enc, }\ } while(0) -#define STRING_CMP_VALUE_IC(s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC(ambig_flag,s1,ps2,len,is_fail) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -891,6 +946,110 @@ static int string_cmp_ic(OnigEncoding enc, #define DATA_ENSURE_CHECK(n) (s + (n) <= end) +#ifdef USE_CAPTURE_HISTORY +static int +make_capture_history_tree(OnigCaptureTreeNode* node, StackType** kp, + StackType* stk_top, UChar* str, regex_t* reg) +{ + int n, r; + OnigCaptureTreeNode* child; + StackType* k = *kp; + + while (k < stk_top) { + if (k->type == STK_MEM_START) { + n = k->u.mem.num; + if (n <= ONIG_MAX_CAPTURE_HISTORY_GROUP && + BIT_STATUS_AT(reg->capture_history, n) != 0) { + child = history_node_new(); + CHECK_NULL_RETURN_VAL(child, ONIGERR_MEMORY); + child->group = n; + child->beg = (int )(k->u.mem.pstr - str); + r = history_tree_add_child(node, child); + if (r != 0) return r; + *kp = (k + 1); + r = make_capture_history_tree(child, kp, stk_top, str, reg); + if (r != 0) return r; + + k = *kp; + child->end = (int )(k->u.mem.pstr - str); + } + } + else if (k->type == STK_MEM_END) { + if (k->u.mem.num == node->group) { + 
node->end = (int )(k->u.mem.pstr - str); + *kp = k; + return 0; + } + } + k++; + } + + return 1; /* 1: root node ending. */ +} +#endif + +#ifdef RUBY_PLATFORM + +typedef struct { + int state; + regex_t* reg; + MatchArg* msa; + StackType* stk_base; +} TrapEnsureArg; + +static VALUE +trap_ensure(VALUE arg) +{ + TrapEnsureArg* ta = (TrapEnsureArg* )arg; + + if (ta->state == 0) { /* trap_exec() is not normal return */ + ONIG_STATE_DEC(ta->reg); + if (! IS_NULL(ta->msa->stack_p) && ta->stk_base != ta->msa->stack_p) + xfree(ta->stk_base); + + MATCH_ARG_FREE(*(ta->msa)); + } + + return Qnil; +} + +static VALUE +trap_exec(VALUE arg) +{ + TrapEnsureArg* ta; + + rb_trap_exec(); + + ta = (TrapEnsureArg* )arg; + ta->state = 1; /* normal return */ + return Qnil; +} + +extern void +onig_exec_trap(regex_t* reg, MatchArg* msa, StackType* stk_base) +{ + VALUE arg; + TrapEnsureArg ta; + + ta.state = 0; + ta.reg = reg; + ta.msa = msa; + ta.stk_base = stk_base; + arg = (VALUE )(&ta); + rb_ensure(trap_exec, arg, trap_ensure, arg); +} + +#define CHECK_INTERRUPT_IN_MATCH_AT do {\ + if (rb_trap_pending) {\ + if (! rb_prohibit_interrupt) {\ + onig_exec_trap(reg, msa, stk_base);\ + }\ + }\ +} while (0) +#else +#define CHECK_INTERRUPT_IN_MATCH_AT +#endif /* RUBY_PLATFORM */ + #ifdef ONIG_DEBUG_STATISTICS #define USE_TIMEOFDAY @@ -935,6 +1094,7 @@ static int MaxStackDepth = 0; } while (0) #ifdef RUBY_PLATFORM + /* * :nodoc: */ @@ -984,7 +1144,7 @@ onig_print_statistics(FILE* f) #endif extern int -onig_is_in_code_range(UChar* p, OnigCodePoint code) +onig_is_in_code_range(const UChar* p, OnigCodePoint code) { OnigCodePoint n, *data; OnigCodePoint low, high, x; @@ -1004,6 +1164,27 @@ onig_is_in_code_range(UChar* p, OnigCodePoint code) return ((low < n && code >= data[low * 2]) ? 
1 : 0); } +static int +code_is_in_cclass_node(void* node, OnigCodePoint code, int enclen) +{ + unsigned int in_cc; + CClassNode* cc = (CClassNode* )node; + + if (enclen == 1) { + in_cc = BITSET_AT(cc->bs, code); + } + else { + UChar* p = ((BBuf* )(cc->mbuf))->p; + in_cc = onig_is_in_code_range(p, code); + } + + if (IS_CCLASS_NOT(cc)) { + return (in_cc ? 0 : 1); + } + else { + return (in_cc ? 1 : 0); + } +} /* matching region of POSIX API */ typedef int regoff_t; @@ -1016,7 +1197,7 @@ typedef struct { /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. */ static int -match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, +match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, UChar* sprev, MatchArg* msa) { static UChar FinishCode[] = { OP_FINISH }; @@ -1027,18 +1208,18 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, RelAddrType addr; OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; - int ignore_case; + OnigAmbigType ambig_flag = reg->ambig_flag; UChar *s, *q, *sbegin; UChar *p = reg->p; char *alloca_base; StackType *stk_alloc, *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. 
*/ + StackIndex si; StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); - ignore_case = IS_IGNORECASE(option); pop_level = reg->stack_pop_level; num_mem = reg->num_mem; repeat_stk = (StackIndex* )alloca_base; @@ -1062,7 +1243,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STACK_PUSH_ENSURED(STK_ALT, FinishCode); /* bottom stack */ best_len = ONIG_MISMATCH; - s = sstart; + s = (UChar* )sstart; while (1) { #ifdef ONIG_DEBUG_MATCH { @@ -1071,7 +1252,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, fprintf(stderr, "%4d> \"", (int )(s - str)); bp = buf; for (i = 0, q = s; i < 7 && q < end; i++) { - len = enc_len(encode, *q); + len = enc_len(encode, q); while (len-- > 0) *bp++ = *q++; } if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } @@ -1079,7 +1260,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, *bp = 0; fputs(buf, stderr); for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); - onig_print_compiled_byte_code(stderr, p, NULL); + onig_print_compiled_byte_code(stderr, p, NULL, encode); fprintf(stderr, "\n"); } #endif @@ -1134,27 +1315,33 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } } +#ifdef USE_CAPTURE_HISTORY if (reg->capture_history != 0) { - UChar *pstart, *pend; - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (BIT_STATUS_AT(reg->capture_history, i) != 0) { - stkp = stk_base; - do { - STACK_GET_MEM_RANGE(stkp, i, pstart, pend); - if (stkp < stk) { - int r; - r = region_list_add_entry(region, i, - pstart - str, pend - str); - if (r) { - STACK_SAVE; - return r; - } - } - stkp++; - } while (stkp < stk); - } - } - } /* list of captures */ + int r; + OnigCaptureTreeNode* node; + + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY); + } + else { + node = region->history_root; + 
history_tree_clear(node); + } + + node->group = 0; + node->beg = sstart - str; + node->end = s - str; + + stkp = stk_base; + r = make_capture_history_tree(region->history_root, &stkp, + stk, (UChar* )str, reg); + if (r < 0) { + best_len = r; /* error code */ + goto finish; + } + } +#endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_REGION_OPTION } /* else IS_POSIX_REGION() */ #endif @@ -1171,10 +1358,9 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, goto fail; /* for retry */ } } - else { - /* default behavior: return first-matching result. */ - goto finish; - } + + /* default behavior: return first-matching result. */ + goto finish; break; case OP_EXACT1: STAT_OP_IN(OP_EXACT1); @@ -1192,14 +1378,31 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC); { int len; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *q, *ss, *sp, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); + DATA_ENSURE(1); + ss = s; + sp = p; + + exact1_ic_retry: + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; - s += enc_len(encode, *s); while (len-- > 0) { - if (*p != *q) goto fail; + if (*p != *q) { +#if 1 + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + ambig_flag &= ~ONIGENC_AMBIGUOUS_MATCH_COMPOUND; + s = ss; + p = sp; + goto exact1_ic_retry; + } + else + goto fail; +#else + goto fail; +#endif + } p++; q++; } } @@ -1276,19 +1479,36 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC); { int len; - UChar *q, *endp, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *ss, *sp, *q, *endp, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; GET_LENGTH_INC(tlen, p); endp = p + tlen; while (p < endp) { - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); sprev = s; - s += enc_len(encode, *s); + DATA_ENSURE(1); + ss = s; + sp = p; + + 
exactn_ic_retry: + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { - if (*p != *q) goto fail; + if (*p != *q) { +#if 1 + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + ambig_flag &= ~ONIGENC_AMBIGUOUS_MATCH_COMPOUND; + s = ss; + p = sp; + goto exactn_ic_retry; + } + else + goto fail; +#else + goto fail; +#endif + } p++; q++; } } @@ -1389,20 +1609,22 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); /* OP_CCLASS can match mb-code. \D, \S */ + s += enc_len(encode, s); /* OP_CCLASS can match mb-code. \D, \S */ STAT_OP_OUT; break; case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) goto fail; + if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: GET_LENGTH_INC(tlen, p); { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len; + DATA_ENSURE(1); + mb_len = enc_len(encode, s); DATA_ENSURE(mb_len); ss = s; s += mb_len; @@ -1422,7 +1644,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb; } @@ -1442,13 +1664,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) { - DATA_ENSURE(1); + DATA_ENSURE(1); + if (! 
ONIGENC_IS_MBC_HEAD(encode, s)) { s++; GET_LENGTH_INC(tlen, p); p += tlen; @@ -1460,10 +1682,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len = enc_len(encode, s); if (s + mb_len > end) { - s = end; + DATA_ENSURE(1); + s = (UChar* )end; p += tlen; goto cc_mb_not_success; } @@ -1488,7 +1711,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb_not; } @@ -1504,22 +1727,36 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STAT_OP_OUT; break; - case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - s += n; - } - else { - DATA_ENSURE(1); - if (ONIG_IS_NEWLINE(*s)) goto fail; - s++; + case OP_CCLASS_NODE: STAT_OP_IN(OP_CCLASS_NODE); + { + OnigCodePoint code; + void *node; + int mb_len; + UChar *ss; + + DATA_ENSURE(1); + GET_POINTER_INC(node, p); + mb_len = enc_len(encode, s); + ss = s; + s += mb_len; + code = ONIGENC_MBC_TO_CODE(encode, ss, s); + if (code_is_in_cclass_node(node, code, mb_len) == 0) goto fail; } STAT_OP_OUT; break; + case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); + DATA_ENSURE(1); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + s += n; + STAT_OP_OUT; + break; + case OP_ANYCHAR_ML: STAT_OP_IN(OP_ANYCHAR_ML); - n = enc_len(encode, *s); + DATA_ENSURE(1); + n = enc_len(encode, s); DATA_ENSURE(n); s += n; STAT_OP_OUT; @@ -1528,17 +1765,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n 
= enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } STAT_OP_OUT; break; @@ -1546,7 +1777,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_ML_STAR: STAT_OP_IN(OP_ANYCHAR_ML_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1565,17 +1796,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } p++; STAT_OP_OUT; @@ -1586,7 +1811,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n >1) { DATA_ENSURE(n); sprev = s; @@ -1606,7 +1831,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (! 
ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1615,7 +1840,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1698,7 +1923,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STAT_OP_OUT; continue; } - else if (ONIG_IS_NEWLINE(*sprev) && !ON_STR_END(s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { STAT_OP_OUT; continue; } @@ -1708,7 +1933,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_END_LINE: STAT_OP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; STAT_OP_OUT; @@ -1717,7 +1942,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - else if (ONIG_IS_NEWLINE(*s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) { STAT_OP_OUT; continue; } @@ -1727,7 +1952,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? 
*/ STAT_OP_OUT; @@ -1736,7 +1961,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - if (ONIG_IS_NEWLINE(*s) && ON_STR_END(s+1)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && + ON_STR_END(s + enc_len(encode, s))) { STAT_OP_OUT; continue; } @@ -1845,7 +2071,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; STRING_CMP(pstart, s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1876,8 +2102,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(pstart, &s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + STRING_CMP_IC(ambig_flag, pstart, &s, n); + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1912,7 +2138,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STRING_CMP_VALUE(pstart, swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1948,10 +2174,10 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC(ambig_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1965,7 +2191,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STACK_PUSH_ALT(p, s, sprev); p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; STAT_OP_OUT; @@ -1974,7 +2199,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, 
UChar* sstart, case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STAT_OP_OUT; continue; break; @@ -2006,6 +2230,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_REPEAT_INC: case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: p += SIZE_MEMNUM; break; default: @@ -2072,6 +2298,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, GET_RELADDR_INC(addr, p); p += addr; STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; @@ -2150,79 +2377,70 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_REPEAT_INC: STAT_OP_IN(OP_REPEAT_INC); - { - StackIndex si; + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + si = repeat_stk[mem]; + stkp = STACK_AT(si); - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ -#ifdef USE_SUBEXP_CALL - if (reg->num_call > 0) { - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - } - else { - si = repeat_stk[mem]; - stkp = STACK_AT(si); - } -#else - si = repeat_stk[mem]; - stkp = STACK_AT(si); -#endif - stkp->u.repeat.count++; - if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - /* end of repeat. Nothing to do. */ - } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - STACK_PUSH_ALT(p, s, sprev); - p = stkp->u.repeat.pcode; - } - else { - p = stkp->u.repeat.pcode; - } - STACK_PUSH_REPEAT_INC(si); + repeat_inc: + stkp->u.repeat.count++; + if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + /* end of repeat. Nothing to do. */ } + else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(p, s, sprev); + p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. 
*/ + } + else { + p = stkp->u.repeat.pcode; + } + STACK_PUSH_REPEAT_INC(si); STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; + case OP_REPEAT_INC_SG: STAT_OP_IN(OP_REPEAT_INC_SG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + goto repeat_inc; + break; + case OP_REPEAT_INC_NG: STAT_OP_IN(OP_REPEAT_INC_NG); - { - StackIndex si; + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + si = repeat_stk[mem]; + stkp = STACK_AT(si); - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ -#ifdef USE_SUBEXP_CALL - if (reg->num_call > 0) { - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - } - else { - si = repeat_stk[mem]; - stkp = STACK_AT(si); - } -#else - si = repeat_stk[mem]; - stkp = STACK_AT(si); -#endif - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - UChar* pcode = stkp->u.repeat.pcode; + repeat_inc_ng: + stkp->u.repeat.count++; + if (stkp->u.repeat.count < reg->repeat_range[mem].upper || + IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + UChar* pcode = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); - } - else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); - } - } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - } + STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_ALT(pcode, s, sprev); + } + else { + p = stkp->u.repeat.pcode; + STACK_PUSH_REPEAT_INC(si); + } + } + else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + STACK_PUSH_REPEAT_INC(si); } STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; + case OP_REPEAT_INC_NG_SG: STAT_OP_IN(OP_REPEAT_INC_NG_SG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + goto repeat_inc_ng; + break; + case 
OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS); STACK_PUSH_POS(s, sprev); STAT_OP_OUT; @@ -2265,9 +2483,9 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_LOOK_BEHIND: STAT_OP_IN(OP_LOOK_BEHIND); GET_LENGTH_INC(tlen, p); - s = ONIGENC_STEP_BACK(encode, str, s, (int )tlen); + s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); if (IS_NULL(s)) goto fail; - sprev = onigenc_get_prev_char_head(encode, str, s); + sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); STAT_OP_OUT; continue; break; @@ -2275,7 +2493,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_PUSH_LOOK_BEHIND_NOT: STAT_OP_IN(OP_PUSH_LOOK_BEHIND_NOT); GET_RELADDR_INC(addr, p); GET_LENGTH_INC(tlen, p); - q = ONIGENC_STEP_BACK(encode, str, s, (int )tlen); + q = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); if (IS_NULL(q)) { /* too short case -> success. ex. /(? text_range) end = text_range; - s = text; + s = (UChar* )text; while (s < end) { if (*s == *target) { @@ -2380,97 +2599,66 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (t == target_end) return s; } - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } -#if 0 static int -str_trans_match_after_head_byte(OnigEncoding enc, - int len, UChar* t, UChar* tend, UChar* p) +str_lower_case_match(OnigEncoding enc, int ambig_flag, + const UChar* t, const UChar* tend, + const UChar* p, const UChar* end) { - while (--len > 0) { - if (*t != *p) break; - t++; p++; - } + int lowlen; + UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; + const UChar* tsave; + const UChar* psave; - if (len == 0) { - int lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - - while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); - q = lowbuf; - while (lowlen > 0) { - if (*t++ != *q++) break; - lowlen--; - } - if (lowlen > 0) break; - p += len; - } - if (t == tend) - return 1; - } - - return 0; -} -#endif - -static int 
-str_lower_case_match(OnigEncoding enc, UChar* t, UChar* tend, UChar* p) -{ - int len, lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + tsave = t; + psave = p; + retry: while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { - if (*t++ != *q++) return 0; + if (*t++ != *q++) { + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + ambig_flag &= ~ONIGENC_AMBIGUOUS_MATCH_COMPOUND; + t = tsave; + p = psave; + goto retry; + } + else + return 0; + } lowlen--; } - p += len; } return 1; } static UChar* -slow_search_ic(OnigEncoding enc, +slow_search_ic(OnigEncoding enc, int ambig_flag, UChar* target, UChar* target_end, - UChar* text, UChar* text_end, UChar* text_range) + const UChar* text, const UChar* text_end, UChar* text_range) { - int len, lowlen; - UChar *t, *p, *s, *end; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *s, *end; - end = text_end - (target_end - target) + 1; + end = (UChar* )text_end; + end -= target_end - target - 1; if (end > text_range) end = text_range; - s = text; + s = (UChar* )text; while (s < end) { - len = enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); - if (*target == *lowbuf) { - p = lowbuf + 1; - t = target + 1; - while (--lowlen > 0) { - if (*p != *t) break; - p++; *t++; - } - if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; - } - } + if (str_lower_case_match(enc, ambig_flag, target, target_end, s, text_end)) + return s; - s += len; + s += enc_len(enc, s); } return (UChar* )NULL; @@ -2478,13 +2666,15 @@ slow_search_ic(OnigEncoding enc, static UChar* slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, - UChar* text, UChar* adjust_text, UChar* text_end, UChar* text_start) + const UChar* text, const UChar* adjust_text, + const UChar* text_end, const UChar* text_start) { UChar *t, *p, *s; - s = text_end 
- (target_end - target); + s = (UChar* )text_end; + s -= (target_end - target); if (s > text_start) - s = text_start; + s = (UChar* )text_start; else s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); @@ -2500,58 +2690,52 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, if (t == target_end) return s; } - s = onigenc_get_prev_char_head(enc, adjust_text, s); + s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s); } return (UChar* )NULL; } static UChar* -slow_search_backward_ic(OnigEncoding enc, - UChar* target,UChar* target_end, - UChar* text, UChar* adjust_text, - UChar* text_end, UChar* text_start) +slow_search_backward_ic(OnigEncoding enc, int ambig_flag, + UChar* target, UChar* target_end, + const UChar* text, const UChar* adjust_text, + const UChar* text_end, const UChar* text_start) { - int len, lowlen; - UChar *t, *p, *s; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *s; - s = text_end - (target_end - target); + s = (UChar* )text_end; + s -= (target_end - target); if (s > text_start) - s = text_start; + s = (UChar* )text_start; else s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); while (s >= text) { - len = enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); - if (*target == *lowbuf) { - p = lowbuf + 1; - t = target + 1; - while (--lowlen > 0) { - if (*p != *t) break; - p++; *t++; - } - if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; - } - } + if (str_lower_case_match(enc, ambig_flag, + target, target_end, s, text_end)) + return s; - s = onigenc_get_prev_char_head(enc, adjust_text, s); + s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s); } return (UChar* )NULL; } static UChar* -bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, - UChar* text, UChar* text_end, UChar* text_range) +bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, + const UChar* text_range) { - 
UChar *s, *t, *p, *end; - UChar *tail; + const UChar *s, *t, *p, *end; + const UChar *tail; int skip; +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); +#endif + end = text_range + (target_end - target) - 1; if (end > text_end) end = text_end; @@ -2559,7 +2743,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, tail = target_end - 1; s = text; while ((s - text) < target_end - target) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } s--; /* set to text check tail position. */ @@ -2570,14 +2754,16 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, while (t >= target && *p == *t) { p--; t--; } - if (t < target) return p + 1; + if (t < target) return (UChar* )(p + 1); skip = reg->map[*s]; - p++; + p = s + 1; + if (p >= text_end) return (UChar* )NULL; t = p; - while ((p - t) < skip) { - p += enc_len(reg->enc, *p); - } + do { + p += enc_len(reg->enc, p); + } while ((p - t) < skip && p < text_end); + s += (p - t); } } @@ -2588,14 +2774,16 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, while (t >= target && *p == *t) { p--; t--; } - if (t < target) return p + 1; + if (t < target) return (UChar* )(p + 1); skip = reg->int_map[*s]; - p++; + p = s + 1; + if (p >= text_end) return (UChar* )NULL; t = p; - while ((p - t) < skip) { - p += enc_len(reg->enc, *p); - } + do { + p += enc_len(reg->enc, p); + } while ((p - t) < skip && p < text_end); + s += (p - t); } } @@ -2603,11 +2791,11 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, } static UChar* -bm_search(regex_t* reg, UChar* target, UChar* target_end, - UChar* text, UChar* text_end, UChar* text_range) +bm_search(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, const UChar* text_range) { - UChar *s, *t, *p, *end; - UChar *tail; + const UChar *s, *t, *p, *end; + const UChar *tail; end 
= text_range + (target_end - target) - 1; if (end > text_end) @@ -2622,7 +2810,7 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end, while (t >= target && *p == *t) { p--; t--; } - if (t < target) return p + 1; + if (t < target) return (UChar* )(p + 1); s += reg->map[*s]; } } @@ -2633,7 +2821,7 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end, while (t >= target && *p == *t) { p--; t--; } - if (t < target) return p + 1; + if (t < target) return (UChar* )(p + 1); s += reg->int_map[*s]; } } @@ -2641,11 +2829,10 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end, } static int -set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip) +set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, int** skip) + { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; if (IS_NULL(*skip)) { *skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); @@ -2656,24 +2843,18 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*skip)[i] = len; - if (ignore_case) { - for (i = len - 1; i > 0; i--) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*skip)[*lowbuf] = i; - } - } - else { - for (i = len - 1; i > 0; i--) - (*skip)[s[i]] = i; - } + for (i = len - 1; i > 0; i--) + (*skip)[s[i]] = i; + return 0; } static UChar* -bm_search_backward(regex_t* reg, UChar* target, UChar* target_end, UChar* text, - UChar* adjust_text, UChar* text_end, UChar* text_start) +bm_search_backward(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* adjust_text, + const UChar* text_end, const UChar* text_start) { - UChar *s, *t, *p; + const UChar *s, *t, *p; s = text_end - (target_end - target); if (text_start < s) @@ -2688,7 +2869,7 @@ bm_search_backward(regex_t* reg, UChar* target, UChar* target_end, UChar* text, p++; t++; } if (t == target_end) - return s; + return (UChar* )s; s -= reg->int_map_backward[*s]; s = 
ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, adjust_text, s); @@ -2698,26 +2879,28 @@ bm_search_backward(regex_t* reg, UChar* target, UChar* target_end, UChar* text, } static UChar* -map_search(OnigEncoding enc, UChar map[], UChar* text, UChar* text_range) +map_search(OnigEncoding enc, UChar map[], + const UChar* text, const UChar* text_range) { - UChar *s = text; + const UChar *s = text; while (s < text_range) { - if (map[*s]) return s; + if (map[*s]) return (UChar* )s; - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } static UChar* map_search_backward(OnigEncoding enc, UChar map[], - UChar* text, UChar* adjust_text, UChar* text_start) + const UChar* text, const UChar* adjust_text, + const UChar* text_start) { - UChar *s = text_start; + const UChar *s = text_start; while (s >= text) { - if (map[*s]) return s; + if (map[*s]) return (UChar* )s; s = onigenc_get_prev_char_head(enc, adjust_text, s); } @@ -2725,13 +2908,32 @@ map_search_backward(OnigEncoding enc, UChar map[], } extern int -onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, +onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option) { int r; UChar *prev; MatchArg msa; +#ifdef USE_MULTI_THREAD_SYSTEM + if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { + ONIG_STATE_INC(reg); + if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + onig_chain_reduce(reg); + ONIG_STATE_INC(reg); + } + } + else { + int n = 0; + while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + ONIG_STATE_INC(reg); + } +#endif /* USE_MULTI_THREAD_SYSTEM */ + MATCH_ARG_INIT(msa, option, region, at); if (region @@ -2739,21 +2941,23 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, && !IS_POSIX_REGION(option) #endif ) { - r = onig_region_resize(region, reg->num_mem + 1); + r = 
onig_region_resize_clear(region, reg->num_mem + 1); } else r = 0; if (r == 0) { - prev = onigenc_get_prev_char_head(reg->enc, str, at); + prev = (UChar* )onigenc_get_prev_char_head(reg->enc, str, at); r = match_at(reg, str, end, at, prev, &msa); } + MATCH_ARG_FREE(msa); + ONIG_STATE_DEC(reg); return r; } static int -forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, +forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, UChar* range, UChar** low, UChar** high, UChar** low_prev) { UChar *p, *pprev = (UChar* )NULL; @@ -2770,7 +2974,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } else { UChar *q = p + reg->dmin; - while (p < q) p += enc_len(reg->enc, *p); + while (p < q) p += enc_len(reg->enc, p); } } @@ -2780,7 +2984,8 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range); + p = slow_search_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2800,7 +3005,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (p - reg->dmin < s) { retry_gate: pprev = p; - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); goto retry; } @@ -2812,19 +3017,19 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); - if (!ONIG_IS_NEWLINE(*prev)) + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } break; case ANCHOR_END_LINE: if (ON_STR_END(p)) { - prev = onigenc_get_prev_char_head(reg->enc, + prev = (UChar* )onigenc_get_prev_char_head(reg->enc, (pprev ? 
pprev : str), p); - if (prev && ONIG_IS_NEWLINE(*prev)) + if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } - else if (!ONIG_IS_NEWLINE(*p)) + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) goto retry_gate; break; } @@ -2845,7 +3050,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, *low = p - reg->dmax; if (*low > s) { *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, - *low, low_prev); + *low, (const UChar** )low_prev); if (low_prev && IS_NULL(*low_prev)) *low_prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : s), *low); @@ -2872,13 +3077,14 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip)); + int** skip)); #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 static int -backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, - UChar* range, UChar* adjrange, UChar** low, UChar** high) +backward_search_range(regex_t* reg, const UChar* str, const UChar* end, + UChar* s, const UChar* range, UChar* adjrange, + UChar** low, UChar** high) { int r; UChar *p; @@ -2895,8 +3101,9 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_backward_ic(reg->enc, reg->exact, - reg->exact_end, range, adjrange, end, p); + p = slow_search_backward_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, + range, adjrange, end, p); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2905,7 +3112,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) goto exact_method; - r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, &(reg->int_map_backward)); if (r) return r; } @@ -2926,7 +3133,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, 
case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (!ONIG_IS_NEWLINE(*prev)) { + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } @@ -2937,12 +3144,12 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (ON_STR_END(p)) { prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(prev)) goto fail; - if (ONIG_IS_NEWLINE(*prev)) { + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } } - else if (!ONIG_IS_NEWLINE(*p)) { + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(p)) goto fail; goto retry; @@ -2974,18 +3181,19 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, extern int -onig_search(regex_t* reg, UChar* str, UChar* end, - UChar* start, UChar* range, OnigRegion* region, OnigOptionType option) +onig_search(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option) { int r; UChar *s, *prev; MatchArg msa; - if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { - reg->state++; /* increment as search counter */ - if (IS_NOT_NULL(reg->chain)) { +#ifdef USE_MULTI_THREAD_SYSTEM + if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { + ONIG_STATE_INC(reg); + if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { onig_chain_reduce(reg); - reg->state++; + ONIG_STATE_INC(reg); } } else { @@ -2995,12 +3203,14 @@ onig_search(regex_t* reg, UChar* str, UChar* end, return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; THREAD_PASS; } - reg->state++; /* increment as search counter */ + ONIG_STATE_INC(reg); } +#endif /* USE_MULTI_THREAD_SYSTEM */ #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "onig_search (entry point): str: %d, end: %d, start: %d, range: %d\n", - (int )str, (int )(end - str), (int )(start - str), (int )(range - str)); + fprintf(stderr, + "onig_search (entry point): str: 
%d, end: %d, start: %d, range: %d\n", + (int )str, (int )(end - str), (int )(start - str), (int )(range - str)); #endif if (region @@ -3008,7 +3218,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, && !IS_POSIX_REGION(option) #endif ) { - r = onig_region_resize(region, reg->num_mem + 1); + r = onig_region_resize_clear(region, reg->num_mem + 1); if (r) goto finish_no_msa; } @@ -3049,7 +3259,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, } } else if (reg->anchor & ANCHOR_END_BUF) { - semi_end = end; + semi_end = (UChar* )end; end_buf: if ((OnigDistance )(semi_end - str) < reg->anchor_dmin) @@ -3082,14 +3292,16 @@ onig_search(regex_t* reg, UChar* str, UChar* end, } } else if (reg->anchor & ANCHOR_SEMI_END_BUF) { - if (ONIG_IS_NEWLINE(end[-1])) { - semi_end = end - 1; + UChar* pre_end = ONIGENC_STEP_BACK(reg->enc, str, end, 1); + + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) { + semi_end = pre_end; if (semi_end > str && start <= semi_end) { goto end_buf; } } else { - semi_end = end; + semi_end = (UChar* )end; goto end_buf; } } @@ -3098,14 +3310,15 @@ onig_search(regex_t* reg, UChar* str, UChar* end, } } else if (str == end) { /* empty string */ - static UChar* address_for_empty_string = ""; + static const UChar* address_for_empty_string = ""; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search: empty string.\n"); #endif if (reg->threshold_len == 0) { - s = start = end = str = address_for_empty_string; + start = end = str = address_for_empty_string; + s = (UChar* )start; prev = (UChar* )NULL; MATCH_ARG_INIT(msa, option, region, start); @@ -3122,7 +3335,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, MATCH_ARG_INIT(msa, option, region, start); - s = start; + s = (UChar* )start; if (range > start) { /* forward search */ if (s > str) prev = onigenc_get_prev_char_head(reg->enc, str, s); @@ -3132,13 +3345,13 @@ onig_search(regex_t* reg, UChar* str, UChar* end, if (reg->optimize != ONIG_OPTIMIZE_NONE) { UChar *sch_range, *low, *high, *low_prev; - 
sch_range = range; + sch_range = (UChar* )range; if (reg->dmax != 0) { if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_range = end; + sch_range = (UChar* )end; else { sch_range += reg->dmax; - if (sch_range > end) sch_range = end; + if (sch_range > end) sch_range = (UChar* )end; } } if (reg->dmax != ONIG_INFINITE_DISTANCE && @@ -3153,13 +3366,14 @@ onig_search(regex_t* reg, UChar* str, UChar* end, while (s <= high) { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { if (IS_NOT_NULL(prev)) { - while (!ONIG_IS_NEWLINE(*prev) && s < range) { + while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) && + s < range) { prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } } } @@ -3176,19 +3390,23 @@ onig_search(regex_t* reg, UChar* str, UChar* end, do { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } while (s <= range); /* exec s == range, because empty match with /$/. 
*/ } else { /* backward search */ if (reg->optimize != ONIG_OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; - adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + if (range < end) + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + else + adjrange = (UChar* )end; + if (reg->dmax != ONIG_INFINITE_DISTANCE && (end - range) >= reg->threshold_len) { do { sch_start = s + reg->dmax; - if (sch_start > end) sch_start = end; + if (sch_start > end) sch_start = (UChar* )end; if (backward_search_range(reg, str, end, sch_start, range, adjrange, &low, &high) <= 0) goto mismatch; @@ -3210,10 +3428,10 @@ onig_search(regex_t* reg, UChar* str, UChar* end, sch_start = s; if (reg->dmax != 0) { if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_start = end; + sch_start = (UChar* )end; else { sch_start += reg->dmax; - if (sch_start > end) sch_start = end; + if (sch_start > end) sch_start = (UChar* )end; else sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, start, sch_start); @@ -3236,7 +3454,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, finish: MATCH_ARG_FREE(msa); - reg->state--; /* decrement as search counter */ + ONIG_STATE_DEC(reg); /* If result is mismatch and no FIND_NOT_EMPTY option, then the region is not setted in match_at(). 
*/ @@ -3257,7 +3475,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, mismatch_no_msa: r = ONIG_MISMATCH; finish_no_msa: - reg->state--; /* decrement as search counter */ + ONIG_STATE_DEC(reg); #ifdef ONIG_DEBUG if (r != ONIG_MISMATCH) fprintf(stderr, "onig_search: error %d\n", r); @@ -3265,7 +3483,7 @@ onig_search(regex_t* reg, UChar* str, UChar* end, return r; match: - reg->state--; /* decrement as search counter */ + ONIG_STATE_DEC(reg); MATCH_ARG_FREE(msa); return s - str; } @@ -3282,18 +3500,44 @@ onig_get_options(regex_t* reg) return reg->options; } +extern OnigAmbigType +onig_get_ambig_flag(regex_t* reg) +{ + return reg->ambig_flag; +} + extern OnigSyntaxType* onig_get_syntax(regex_t* reg) { return reg->syntax; } -extern const char* -onig_version(void) +extern int +onig_number_of_captures(regex_t* reg) { -#define MSTR(a) # a - - return (MSTR(ONIGURUMA_VERSION_MAJOR) "." - MSTR(ONIGURUMA_VERSION_MINOR) "." - MSTR(ONIGURUMA_VERSION_TEENY)); + return reg->num_mem; } + +extern int +onig_number_of_capture_histories(regex_t* reg) +{ +#ifdef USE_CAPTURE_HISTORY + int i, n; + + n = 0; + for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (BIT_STATUS_AT(reg->capture_history, i) != 0) + n++; + } + return n; +#else + return 0; +#endif +} + +extern void +onig_copy_encoding(OnigEncoding to, OnigEncoding from) +{ + *to = *from; +} + diff --git a/ext/mbstring/oniguruma/regext.c b/ext/mbstring/oniguruma/regext.c new file mode 100755 index 00000000000..6839708be7a --- /dev/null +++ b/ext/mbstring/oniguruma/regext.c @@ -0,0 +1,213 @@ +/********************************************************************** + regext.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "regint.h" + +static void +conv_ext0be32(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = '\0'; + *conv++ = '\0'; + *conv++ = '\0'; + *conv++ = *s++; + } +} + +static void +conv_ext0le32(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = *s++; + *conv++ = '\0'; + *conv++ = '\0'; + *conv++ = '\0'; + } +} + +static void +conv_ext0be(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = '\0'; + *conv++ = *s++; + } +} + +static void +conv_ext0le(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = *s++; + *conv++ = '\0'; + } +} + +static void +conv_swap4bytes(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = s[3]; + *conv++ = s[2]; + *conv++ = s[1]; + *conv++ = s[0]; + s += 4; + } +} + +static void +conv_swap2bytes(const UChar* s, const UChar* end, UChar* conv) +{ + while (s < end) { + *conv++ = s[1]; + *conv++ = s[0]; + s += 2; + } +} + +static int +conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* end, + UChar** conv, UChar** conv_end) +{ + int len = end - s; + + if (to == ONIG_ENCODING_UTF16_BE) { + if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) { + *conv = (UChar* )xmalloc(len * 2); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + (len * 2); + conv_ext0be(s, end, *conv); + return 0; + } + else if (from == ONIG_ENCODING_UTF16_LE) { + swap16: + *conv = (UChar* )xmalloc(len); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + len; + conv_swap2bytes(s, end, *conv); + return 0; + } + } + else if (to == ONIG_ENCODING_UTF16_LE) { + if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) { + *conv = (UChar* )xmalloc(len * 2); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + (len * 2); + conv_ext0le(s, end, *conv); + return 0; + } + else if (from == ONIG_ENCODING_UTF16_BE) { + goto 
swap16; + } + } + if (to == ONIG_ENCODING_UTF32_BE) { + if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) { + *conv = (UChar* )xmalloc(len * 4); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + (len * 4); + conv_ext0be32(s, end, *conv); + return 0; + } + else if (from == ONIG_ENCODING_UTF32_LE) { + swap32: + *conv = (UChar* )xmalloc(len); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + len; + conv_swap4bytes(s, end, *conv); + return 0; + } + } + else if (to == ONIG_ENCODING_UTF32_LE) { + if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) { + *conv = (UChar* )xmalloc(len * 4); + CHECK_NULL_RETURN_VAL(*conv, ONIGERR_MEMORY); + *conv_end = *conv + (len * 4); + conv_ext0le32(s, end, *conv); + return 0; + } + else if (from == ONIG_ENCODING_UTF32_BE) { + goto swap32; + } + } + + return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; +} + +extern int +onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigCompileInfo* ci, OnigErrorInfo* einfo) +{ + int r; + UChar *cpat, *cpat_end; + + if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; + + if (ci->pattern_enc != ci->target_enc) { + r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, + &cpat, &cpat_end); + if (r) return r; + } + else { + cpat = (UChar* )pattern; + cpat_end = (UChar* )pattern_end; + } + + r = onig_alloc_init(reg, ci->option, ci->ambig_flag, ci->target_enc, + ci->syntax); + if (r) goto err; + + r = onig_compile(*reg, cpat, cpat_end, einfo); + if (r) { + onig_free(*reg); + *reg = NULL; + } + + err: + if (cpat != pattern) xfree(cpat); + + return r; +} + +extern int +onig_recompile_deluxe(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigCompileInfo* ci, OnigErrorInfo* einfo) +{ + int r; + regex_t *new_reg; + + r = onig_new_deluxe(&new_reg, pattern, pattern_end, ci, einfo); + if (r) return r; + if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + onig_transfer(reg, new_reg); 
+ } + else { + onig_chain_link_add(reg, new_reg); + } + return 0; +} diff --git a/ext/mbstring/oniguruma/reggnu.c b/ext/mbstring/oniguruma/reggnu.c index 9c6a2161c2c..2c8169c481f 100644 --- a/ext/mbstring/oniguruma/reggnu.c +++ b/ext/mbstring/oniguruma/reggnu.c @@ -1,26 +1,38 @@ /********************************************************************** - reggnu.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #ifndef ONIGGNU_H /* name changes from oniggnu.h to regex.h in ruby. 
*/ #include "oniggnu.h" #endif -#if defined(RUBY_PLATFORM) || defined(RUBY) -#ifndef ONIG_RUBY_M17N -#define USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY -#endif -#endif - -#ifndef NULL -#define NULL ((void* )0) -#endif - extern void re_free_registers(OnigRegion* r) { @@ -111,7 +123,9 @@ re_free_pattern(regex_t* reg) extern int re_alloc_pattern(regex_t** reg) { - return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, OnigEncDefaultCharEncoding, + return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, + ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + OnigEncDefaultCharEncoding, OnigDefaultSyntax); } @@ -121,86 +135,6 @@ re_set_casetable(const char* table) onigenc_set_default_caseconv_table((UChar* )table); } -#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY -static const unsigned char mbctab_ascii[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -static const unsigned char mbctab_euc[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -}; - -static const unsigned char mbctab_sjis[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 -}; - -static const unsigned char mbctab_utf8[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0, -}; - -const unsigned char *re_mbctab = mbctab_ascii; -#endif - extern void #ifdef ONIG_RUBY_M17N re_mbcinit(OnigEncoding enc) @@ -236,21 +170,4 @@ re_mbcinit(int mb_code) onigenc_set_default_encoding(enc); #endif - -#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY - switch (mb_code) { - case MBCTYPE_ASCII: - re_mbctab = mbctab_ascii; - break; - case MBCTYPE_EUC: - re_mbctab = mbctab_euc; - break; - case MBCTYPE_SJIS: - re_mbctab = mbctab_sjis; - break; - case MBCTYPE_UTF8: - re_mbctab = mbctab_utf8; - break; - } -#endif } diff --git a/ext/mbstring/oniguruma/regint.h b/ext/mbstring/oniguruma/regint.h index 35736b6dcbe..a704b0e2635 100644 --- a/ext/mbstring/oniguruma/regint.h +++ b/ext/mbstring/oniguruma/regint.h @@ -1,12 +1,33 @@ -/********************************************************************** - - regint.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - -**********************************************************************/ #ifndef REGINT_H #define REGINT_H +/********************************************************************** + regint.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ /* for debug */ /* #define ONIG_DEBUG_PARSE_TREE */ @@ -19,7 +40,8 @@ /* #define ONIG_DEBUG_STATISTICS */ #if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \ - defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ + defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #endif @@ -34,9 +56,9 @@ /* config */ /* spec. config */ +/* #define USE_UNICODE_FULL_RANGE_CTYPE */ /* --> move to regenc.h */ #define USE_NAMED_GROUP #define USE_SUBEXP_CALL -#define USE_FOLD_MATCH /* ess-tsett etc... 
*/ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR @@ -44,48 +66,55 @@ #define USE_RECYCLE_NODE #define USE_OP_PUSH_OR_JUMP_EXACT #define USE_QUALIFIER_PEEK_NEXT +#define USE_ST_HASH_TABLE +#define USE_SHARED_CCLASS_TABLE #define INIT_MATCH_STACK_SIZE 160 -#define MATCH_STACK_LIMIT_SIZE 500000 +#define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ /* interface to external system */ #ifdef NOT_RUBY /* gived from Makefile */ #include "config.h" +#define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS -#define USE_VARIABLE_SYNTAX #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ #define USE_POSIX_REGION_OPTION /* needed for POSIX API support */ +/* #define USE_MULTI_THREAD_SYSTEM */ #define THREAD_ATOMIC_START /* depend on thread system */ #define THREAD_ATOMIC_END /* depend on thread system */ #define THREAD_PASS /* depend on thread system */ +#define CHECK_INTERRUPT /* depend on application */ #define xmalloc malloc #define xrealloc realloc +#define xcalloc calloc #define xfree free #else #include "ruby.h" #include "version.h" #include "rubysig.h" /* for DEFER_INTS, ENABLE_INTS */ + +#define USE_MULTI_THREAD_SYSTEM #define THREAD_ATOMIC_START DEFER_INTS #define THREAD_ATOMIC_END ENABLE_INTS #define THREAD_PASS rb_thread_schedule() +#define CHECK_INTERRUPT do {\ + if (rb_trap_pending) {\ + if (! 
rb_prohibit_interrupt) {\ + rb_trap_exec();\ + }\ + }\ +} while (0) + #define DEFAULT_WARN_FUNCTION rb_warn #define DEFAULT_VERB_WARN_FUNCTION rb_warning -#if defined(RUBY_VERSION_MAJOR) -#if RUBY_VERSION_MAJOR > 1 || \ -(RUBY_VERSION_MAJOR == 1 && \ - defined(RUBY_VERSION_MINOR) && RUBY_VERSION_MINOR >= 8) -#define USE_ST_HASH_TABLE -#endif -#endif - #endif /* else NOT_RUBY */ -#define THREAD_PASS_LIMIT_COUNT 10 +#define THREAD_PASS_LIMIT_COUNT 8 #define xmemset memset #define xmemcpy memcpy #define xmemmove memmove -#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(_WIN32) && !defined(__GNUC__) #define xalloca _alloca #ifdef NOT_RUBY #define vsnprintf _vsnprintf @@ -94,6 +123,69 @@ #define xalloca alloca #endif + +#ifdef USE_MULTI_THREAD_SYSTEM +#define ONIG_STATE_INC(reg) (reg)->state++ +#define ONIG_STATE_DEC(reg) (reg)->state-- +#else +#define ONIG_STATE_INC(reg) /* Nothing */ +#define ONIG_STATE_DEC(reg) /* Nothing */ +#endif /* USE_MULTI_THREAD_SYSTEM */ + + +#define onig_st_is_member st_is_member + +#ifdef NOT_RUBY + +#define st_init_table onig_st_init_table +#define st_init_table_with_size onig_st_init_table_with_size +#define st_init_numtable onig_st_init_numtable +#define st_init_numtable_with_size onig_st_init_numtable_with_size +#define st_init_strtable onig_st_init_strtable +#define st_init_strtable_with_size onig_st_init_strtable_with_size +#define st_init_strend_table_with_size onig_st_init_strend_table_with_size +#define st_delete onig_st_delete +#define st_delete_safe onig_st_delete_safe +#define st_insert onig_st_insert +#define st_insert_strend onig_st_insert_strend +#define st_lookup onig_st_lookup +#define st_lookup_strend onig_st_lookup_strend +#define st_foreach onig_st_foreach +#define st_add_direct onig_st_add_direct +#define st_add_direct_strend onig_st_add_direct_strend +#define st_free_table onig_st_free_table +#define st_cleanup_safe onig_st_cleanup_safe +#define st_copy onig_st_copy +#define st_nothing_key_clone 
onig_st_nothing_key_clone +#define st_nothing_key_free onig_st_nothing_key_free + +#else /* NOT_RUBY */ + +#define onig_st_init_table st_init_table +#define onig_st_init_table_with_size st_init_table_with_size +#define onig_st_init_numtable st_init_numtable +#define onig_st_init_numtable_with_size st_init_numtable_with_size +#define onig_st_init_strtable st_init_strtable +#define onig_st_init_strtable_with_size st_init_strtable_with_size +#define onig_st_init_strend_table_with_size st_init_strend_table_with_size +#define onig_st_delete st_delete +#define onig_st_delete_safe st_delete_safe +#define onig_st_insert st_insert +#define onig_st_insert_strend st_insert_strend +#define onig_st_lookup st_lookup +#define onig_st_lookup_strend st_lookup_strend +#define onig_st_foreach st_foreach +#define onig_st_add_direct st_add_direct +#define onig_st_add_direct_strend st_add_direct_strend +#define onig_st_free_table st_free_table +#define onig_st_cleanup_safe st_cleanup_safe +#define onig_st_copy st_copy +#define onig_st_nothing_key_clone st_nothing_key_clone +#define onig_st_nothing_key_free st_nothing_key_free + +#endif /* NOT_RUBY */ + + #ifdef HAVE_STDLIB_H #include #endif @@ -109,7 +201,11 @@ #endif #include +#ifdef HAVE_SYS_TYPES_H +#ifndef __BORLANDC__ #include +#endif +#endif #ifdef ONIG_DEBUG # include @@ -292,6 +388,8 @@ typedef unsigned int BitStatusType; /* ignore-case and multibyte status are included in compiled code. */ #define IS_DYNAMIC_OPTION(option) 0 +#define REPEAT_INFINITE -1 +#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) /* bitset */ #define BITS_PER_BYTE 8 @@ -449,6 +547,7 @@ enum OpCode { OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, + OP_CCLASS_NODE, /* pointer to CClassNode node */ OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." multi-line */ @@ -501,6 +600,8 @@ enum OpCode { OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ OP_REPEAT_INC, OP_REPEAT_INC_NG, /* non greedy */ + OP_REPEAT_INC_SG, /* search and get in stack */ + OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ OP_NULL_CHECK_START, /* null loop checker start */ OP_NULL_CHECK_END, /* null loop checker end */ OP_NULL_CHECK_END_MEMST, /* null loop checker end (with capture status) */ @@ -529,11 +630,12 @@ enum OpCode { #define ARG_MEMNUM 4 #define ARG_OPTION 5 -typedef short int RelAddrType; -typedef short int AbsAddrType; -typedef short int LengthType; -typedef short int MemNumType; -typedef int RepeatNumType; +typedef int RelAddrType; +typedef int AbsAddrType; +typedef int LengthType; +typedef int RepeatNumType; +typedef short int MemNumType; +typedef void* PointerType; #define SIZE_OPCODE 1 #define SIZE_RELADDR sizeof(RelAddrType) @@ -543,57 +645,33 @@ typedef int RepeatNumType; #define SIZE_REPEATNUM sizeof(RepeatNumType) #define SIZE_OPTION sizeof(OnigOptionType) #define SIZE_CODE_POINT sizeof(OnigCodePoint) +#define SIZE_POINTER sizeof(PointerType) + #ifdef PLATFORM_UNALIGNED_WORD_ACCESS -#define GET_RELADDR_INC(addr,p) do{\ - addr = *((RelAddrType* )(p));\ - (p) += SIZE_RELADDR;\ + +#define PLATFORM_GET_INC(val,p,type) do{\ + val = *(type* )p;\ + (p) += sizeof(type);\ } while(0) -#define GET_ABSADDR_INC(addr,p) do{\ - addr = *((AbsAddrType* )(p));\ - (p) += SIZE_ABSADDR;\ -} while(0) - -#define GET_LENGTH_INC(len,p) do{\ - len = *((LengthType* )(p));\ - (p) += SIZE_LENGTH;\ -} while(0) - -#define GET_MEMNUM_INC(num,p) do{\ - num = *((MemNumType* )(p));\ - (p) += SIZE_MEMNUM;\ -} while(0) - -#define GET_REPEATNUM_INC(num,p) do{\ - num = *((RepeatNumType* )(p));\ - (p) += SIZE_REPEATNUM;\ -} while(0) - -#define GET_OPTION_INC(option,p) do{\ - option = *((OnigOptionType* )(p));\ - (p) += SIZE_OPTION;\ -} while(0) #else -#define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p) -#define GET_ABSADDR_INC(addr,p) GET_SHORT_INC(addr,p) -#define GET_LENGTH_INC(len,p) GET_SHORT_INC(len,p) -#define 
GET_MEMNUM_INC(num,p) GET_SHORT_INC(num,p) -#define GET_REPEATNUM_INC(num,p) GET_INT_INC(num,p) -#define GET_OPTION_INC(option,p) GET_UINT_INC(option,p) - -#define SERIALIZE_RELADDR(addr,p) SERIALIZE_SHORT(addr,p) -#define SERIALIZE_ABSADDR(addr,p) SERIALIZE_SHORT(addr,p) -#define SERIALIZE_LENGTH(len,p) SERIALIZE_SHORT(len,p) -#define SERIALIZE_MEMNUM(num,p) SERIALIZE_SHORT(num,p) -#define SERIALIZE_REPEATNUM(num,p) SERIALIZE_INT(num,p) -#define SERIALIZE_OPTION(option,p) SERIALIZE_UINT(option,p) - -#define SERIALIZE_BUFSIZE SIZEOF_INT +#define PLATFORM_GET_INC(val,p,type) do{\ + xmemcpy(&val, (p), sizeof(type));\ + (p) += sizeof(type);\ +} while(0) #endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ +#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType) +#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType) +#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType) +#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType) +#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) +#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) +#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) + /* code point's address must be aligned address. 
*/ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) #define GET_BYTE_INC(byte,p) do{\ @@ -636,23 +714,53 @@ typedef int RepeatNumType; #define SIZE_OP_RETURN SIZE_OPCODE +#define MC_ESC(enc) (enc)->meta_char_table.esc +#define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar +#define MC_ANYTIME(enc) (enc)->meta_char_table.anytime +#define MC_ZERO_OR_ONE_TIME(enc) (enc)->meta_char_table.zero_or_one_time +#define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time +#define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime + +#define SYN_POSIX_COMMON_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ + ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \ + ONIG_SYN_OP_LINE_ANCHOR | \ + ONIG_SYN_OP_ESC_CONTROL_CHARS ) + +#define SYN_GNU_REGEX_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \ + ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \ + ONIG_SYN_OP_VBAR_ALT | \ + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \ + ONIG_SYN_OP_QMARK_ZERO_ONE | \ + ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \ + ONIG_SYN_OP_ESC_W_WORD | \ + ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \ + ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \ + ONIG_SYN_OP_LINE_ANCHOR ) + +#define SYN_GNU_REGEX_BV \ + ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \ + ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \ + ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) + +/* cclass node */ +#define FLAG_CCLASS_NOT 1 +#define FLAG_CCLASS_SHARE (1<<1) + +#define CCLASS_SET_NOT(cc) (cc)->flags |= FLAG_CCLASS_NOT +#define CCLASS_CLEAR_NOT(cc) (cc)->flags &= ~FLAG_CCLASS_NOT +#define CCLASS_SET_SHARE(cc) (cc)->flags |= FLAG_CCLASS_SHARE +#define IS_CCLASS_NOT(cc) (((cc)->flags & 
FLAG_CCLASS_NOT) != 0) +#define IS_CCLASS_SHARE(cc) (((cc)->flags & FLAG_CCLASS_SHARE) != 0) + typedef struct { - UChar esc; - UChar anychar; - UChar anytime; - UChar zero_or_one_time; - UChar one_or_more_time; - UChar anychar_anytime; -} OnigMetaCharTableType; - -extern OnigMetaCharTableType OnigMetaCharTable; - -#define MC_ESC OnigMetaCharTable.esc -#define MC_ANYCHAR OnigMetaCharTable.anychar -#define MC_ANYTIME OnigMetaCharTable.anytime -#define MC_ZERO_OR_ONE_TIME OnigMetaCharTable.zero_or_one_time -#define MC_ONE_OR_MORE_TIME OnigMetaCharTable.one_or_more_time -#define MC_ANYCHAR_ANYTIME OnigMetaCharTable.anychar_anytime + int flags; + BitSet bs; + BBuf* mbuf; /* multi-byte info or NULL */ +} CClassNode; #ifdef ONIG_DEBUG @@ -665,7 +773,7 @@ typedef struct { extern OnigOpInfoType OnigOpInfo[]; -extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp)); +extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp, OnigEncoding enc)); #ifdef ONIG_DEBUG_STATISTICS extern void onig_statistics_init P_((void)); @@ -675,11 +783,11 @@ extern void onig_print_statistics P_((FILE* f)); extern char* onig_error_code_to_format P_((int code)); extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); -extern UChar* onig_strdup P_((UChar* s, UChar* end)); extern int onig_bbuf_init P_((BBuf* buf, int size)); -extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax)); -extern int onig_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigErrorInfo* einfo)); +extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax)); +extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo)); extern void onig_chain_reduce P_((regex_t* reg)); -extern int onig_is_in_code_range 
P_((UChar* p, OnigCodePoint code)); +extern void onig_chain_link_add P_((regex_t* to, regex_t* add)); +extern void onig_transfer P_((regex_t* to, regex_t* from)); #endif /* REGINT_H */ diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c index 2260df41555..58e122f4869 100644 --- a/ext/mbstring/oniguruma/regparse.c +++ b/ext/mbstring/oniguruma/regparse.c @@ -1,120 +1,36 @@ /********************************************************************** - regparse.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #define WARN_BUFSIZE 256 -#define SYN_POSIX_COMMON_OP \ - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ - ONIG_SYN_OP_DECIMAL_BACKREF | \ - ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \ - ONIG_SYN_OP_LINE_ANCHOR | \ - ONIG_SYN_OP_ESC_CONTROL_CHARS ) - -#define SYN_GNU_REGEX_OP \ - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \ - ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \ - ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \ - ONIG_SYN_OP_VBAR_ALT | \ - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \ - ONIG_SYN_OP_QMARK_ZERO_ONE | \ - ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \ - ONIG_SYN_OP_ESC_W_WORD | \ - ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \ - ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \ - ONIG_SYN_OP_LINE_ANCHOR ) - -#define SYN_GNU_REGEX_BV \ - ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \ - ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \ - ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) - -#ifdef USE_VARIABLE_SYNTAX -OnigSyntaxType OnigSyntaxPosixBasic = { - ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | - ONIG_SYN_OP_ESC_BRACE_INTERVAL ) - , 0 - , 0 - , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) -}; - -OnigSyntaxType OnigSyntaxPosixExtended = { - ( SYN_POSIX_COMMON_OP 
| ONIG_SYN_OP_LPAREN_SUBEXP | - ONIG_SYN_OP_BRACE_INTERVAL | - ONIG_SYN_OP_PLUS_ONE_INF | ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_VBAR_ALT ) - , 0 - , ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | - ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | - ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP | - ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) - , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) -}; - -OnigSyntaxType OnigSyntaxEmacs = { - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | - ONIG_SYN_OP_ESC_BRACE_INTERVAL | - ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT | - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | - ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_DECIMAL_BACKREF | - ONIG_SYN_OP_LINE_ANCHOR | ONIG_SYN_OP_ESC_CONTROL_CHARS ) - , ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR - , ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxGrep = { - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET | - ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | - ONIG_SYN_OP_ESC_VBAR_ALT | - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF | - ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR | - ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND | - ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF ) - , 0 - , ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC ) - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxGnuRegex = { - SYN_GNU_REGEX_OP - , 0 - , SYN_GNU_REGEX_BV - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxJava = { - (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | - ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL | - ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 ) - & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) - , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT | - ONIG_SYN_OP2_OPTION_PERL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | - ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | 
ONIG_SYN_OP2_CCLASS_SET_OP | - ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | - ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) - , ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) - , ONIG_OPTION_SINGLELINE -}; - -OnigSyntaxType OnigSyntaxPerl = { - (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | - ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | - ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | - ONIG_SYN_OP_ESC_C_CONTROL ) - & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) - , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | - ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | - ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) - , SYN_GNU_REGEX_BV - , ONIG_OPTION_SINGLELINE -}; -#endif /* USE_VARIABLE_SYNTAX */ - OnigSyntaxType OnigSyntaxRuby = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | @@ -127,12 +43,14 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | - ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB ) + ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | + ONIG_SYN_OP2_ESC_H_XDIGIT ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | + ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -140,89 +58,7 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -#ifdef USE_VARIABLE_SYNTAX -extern int -onig_set_default_syntax(OnigSyntaxType* syntax) -{ - if (IS_NULL(syntax)) - syntax = ONIG_SYNTAX_RUBY; - - OnigDefaultSyntax = syntax; - return 0; -} - -extern void -onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) -{ - *to = *from; -} - -extern void -onig_set_syntax_op(OnigSyntaxType* syntax, 
unsigned int op) -{ - syntax->op = op; -} - -extern void -onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) -{ - syntax->op2 = op2; -} - -extern void -onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) -{ - syntax->behavior = behavior; -} - -extern void -onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) -{ - syntax->options = options; -} -#endif - -OnigMetaCharTableType OnigMetaCharTable = { - (OnigCodePoint )'\\' /* esc */ - , (OnigCodePoint )0 /* anychar '.' */ - , (OnigCodePoint )0 /* anytime '*' */ - , (OnigCodePoint )0 /* zero or one time '?' */ - , (OnigCodePoint )0 /* one or more time '+' */ - , (OnigCodePoint )0 /* anychar anytime */ -}; - -#ifdef USE_VARIABLE_META_CHARS -extern int onig_set_meta_char(unsigned int what, unsigned int c) -{ - switch (what) { - case ONIG_META_CHAR_ESCAPE: - OnigMetaCharTable.esc = c; - break; - case ONIG_META_CHAR_ANYCHAR: - OnigMetaCharTable.anychar = c; - break; - case ONIG_META_CHAR_ANYTIME: - OnigMetaCharTable.anytime = c; - break; - case ONIG_META_CHAR_ZERO_OR_ONE_TIME: - OnigMetaCharTable.zero_or_one_time = c; - break; - case ONIG_META_CHAR_ONE_OR_MORE_TIME: - OnigMetaCharTable.one_or_more_time = c; - break; - case ONIG_META_CHAR_ANYCHAR_ANYTIME: - OnigMetaCharTable.anychar_anytime = c; - break; - default: - return ONIGERR_INVALID_ARGUMENT; - break; - } - return 0; -} -#endif /* USE_VARIABLE_META_CHARS */ - - -extern void onig_null_warn(char* s) { } +extern void onig_null_warn(const char* s) { } #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; @@ -272,12 +108,15 @@ bbuf_clone(BBuf** rto, BBuf* from) #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) -#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ - add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0)) +#define MBCODE_START_POS(enc) \ + (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 
0 : 0x80) -#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ - if (! ONIGENC_IS_SINGLEBYTE(code)) {\ - r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ +#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ + add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) + +#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ + if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ if (r) return r;\ }\ } while (0) @@ -359,7 +198,7 @@ bitset_copy(BitSetRef dest, BitSetRef bs) } extern int -onig_strncmp(UChar* s1, UChar* s2, int n) +onig_strncmp(const UChar* s1, const UChar* s2, int n) { int x; @@ -371,7 +210,7 @@ onig_strncmp(UChar* s1, UChar* s2, int n) } static void -k_strcpy(UChar* dest, UChar* src, UChar* end) +k_strcpy(UChar* dest, const UChar* src, const UChar* end) { int len = end - src; if (len > 0) { @@ -380,33 +219,47 @@ k_strcpy(UChar* dest, UChar* src, UChar* end) } } -extern UChar* -onig_strdup(UChar* s, UChar* end) +static UChar* +strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) { - int len = end - s; + int slen, term_len, i; + UChar *r; - if (len > 0) { - UChar* r = (UChar* )xmalloc(len + 1); - CHECK_NULL_RETURN(r); - xmemcpy(r, s, len); - r[len] = (UChar )0; - return r; - } - else return NULL; + slen = end - s; + term_len = ONIGENC_MBC_MINLEN(enc); + + r = (UChar* )xmalloc(slen + term_len); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, slen); + + for (i = 0; i < term_len; i++) + r[slen + i] = (UChar )0; + + return r; } + /* scan pattern methods */ -#define PEND_VALUE -1 +#define PEND_VALUE 0 -#define PFETCH(c) do { (c) = *p++; } while (0) -#define PUNFETCH p-- -#define PINC p++ -#define PPEEK (p < end ? *p : PEND_VALUE) -#define PEND (p < end ? 0 : 1) +#define PFETCH_READY UChar* pfetch_prev +#define PEND (p < end ? 
0 : 1) +#define PUNFETCH p = pfetch_prev +#define PINC do { \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PFETCH(c) do { \ + c = ONIGENC_MBC_TO_CODE(enc, p, end); \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) +#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* -k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, +k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, int capa) { UChar* r; @@ -424,7 +277,7 @@ k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, /* dest on static area */ static UChar* strcat_capa_from_static(UChar* dest, UChar* dest_end, - UChar* src, UChar* src_end, int capa) + const UChar* src, const UChar* src_end, int capa) { UChar* r; @@ -450,7 +303,7 @@ typedef struct { #ifdef USE_ST_HASH_TABLE -#include +#include "st.h" typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ @@ -487,7 +340,7 @@ onig_print_names(FILE* fp, regex_t* reg) if (IS_NOT_NULL(t)) { fprintf(fp, "name table\n"); - st_foreach(t, i_print_name_entry, (HashDataType )fp); + onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); fputs("\n", fp); } return 0; @@ -508,7 +361,7 @@ names_clear(regex_t* reg) NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { - st_foreach(t, i_free_name_entry, 0); + onig_st_foreach(t, i_free_name_entry, 0); } return 0; } @@ -523,56 +376,39 @@ onig_names_free(regex_t* reg) if (r) return r; t = (NameTable* )reg->name_table; - if (IS_NOT_NULL(t)) st_free_table(t); + if (IS_NOT_NULL(t)) onig_st_free_table(t); reg->name_table = (void* )NULL; return 0; } static NameEntry* -name_find(regex_t* reg, UChar* name, UChar* name_end) +name_find(regex_t* reg, const UChar* name, const UChar* name_end) { - int len; - UChar namebuf[NAMEBUF_SIZE_1]; - UChar *key; NameEntry* e; NameTable* 
t = (NameTable* )reg->name_table; e = (NameEntry* )NULL; if (IS_NOT_NULL(t)) { - if (*name_end == '\0') { - key = name; - } - else { - /* dirty, but st.c API claims NULL terminated key. */ - len = name_end - name; - if (len <= NAMEBUF_SIZE) { - xmemcpy(namebuf, name, len); - namebuf[len] = '\0'; - key = namebuf; - } - else { - key = onig_strdup(name, name_end); - if (IS_NULL(key)) return (NameEntry* )NULL; - } - } - - st_lookup(t, (HashDataType )key, (HashDataType * )&e); - if (key != name && key != namebuf) xfree(key); + onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); } return e; } typedef struct { - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*); + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); regex_t* reg; void* arg; int ret; + OnigEncoding enc; } INamesArg; static int i_names(UChar* key, NameEntry* e, INamesArg* arg) { - int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num, + int r = (*(arg->func))(e->name, + /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */ + e->name + e->name_len, + e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { @@ -584,8 +420,8 @@ i_names(UChar* key, NameEntry* e, INamesArg* arg) extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { INamesArg narg; NameTable* t = (NameTable* )reg->name_table; @@ -595,11 +431,41 @@ onig_foreach_name(regex_t* reg, narg.func = func; narg.reg = reg; narg.arg = arg; - st_foreach(t, i_names, (HashDataType )&narg); + narg.enc = reg->enc; /* should be pattern encoding. 
*/ + onig_st_foreach(t, i_names, (HashDataType )&narg); } return narg.ret; } +static int +i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map) +{ + int i; + + if (e->back_num > 1) { + for (i = 0; i < e->back_num; i++) { + e->back_refs[i] = map[e->back_refs[i]].new_val; + } + } + else if (e->back_num == 1) { + e->back_ref1 = map[e->back_ref1].new_val; + } + + return ST_CONTINUE; +} + +extern int +onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + onig_st_foreach(t, i_renumber_name, (HashDataType )map); + } + return 0; +} + + extern int onig_number_of_names(regex_t* reg) { @@ -719,8 +585,8 @@ name_find(regex_t* reg, UChar* name, UChar* name_end) extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { int i, r; NameEntry* e; @@ -765,14 +631,16 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) if (IS_NULL(e)) { #ifdef USE_ST_HASH_TABLE if (IS_NULL(t)) { - reg->name_table = t = st_init_strtable(); + t = onig_st_init_strend_table_with_size(5); + reg->name_table = (void* )t; } e = (NameEntry* )xmalloc(sizeof(NameEntry)); CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY); - e->name = onig_strdup(name, name_end); + e->name = strdup_with_null(reg->enc, name, name_end); if (IS_NULL(e->name)) return ONIGERR_MEMORY; - st_insert(t, (HashDataType )e->name, (HashDataType )e); + onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), + (HashDataType )e); e->name_len = name_end - name; e->back_num = 0; @@ -817,7 +685,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) } e = &(t->e[t->num]); t->num++; - e->name = onig_strdup(name, name_end); + e->name = strdup_with_null(reg->enc, name, name_end); e->name_len = name_end - name; #endif } @@ -857,8 +725,8 @@ name_add(regex_t* reg, UChar* name, 
UChar* name_end, int backref, ScanEnv* env) } extern int -onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, - int** nums) +onig_name_to_group_numbers(regex_t* reg, const UChar* name, + const UChar* name_end, int** nums) { NameEntry* e; @@ -879,8 +747,8 @@ onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, } extern int -onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, - OnigRegion *region) +onig_name_to_backref_number(regex_t* reg, const UChar* name, + const UChar* name_end, OnigRegion *region) { int i, n, *nums; @@ -905,23 +773,23 @@ onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, #else /* USE_NAMED_GROUP */ extern int -onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, - int** nums) +onig_name_to_group_numbers(regex_t* reg, const UChar* name, + const UChar* name_end, int** nums) { return ONIG_NO_SUPPORT_CONFIG; } extern int -onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, - OnigRegion* region) +onig_name_to_backref_number(regex_t* reg, const UChar* name, + const UChar* name_end, OnigRegion* region) { return ONIG_NO_SUPPORT_CONFIG; } extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { return ONIG_NO_SUPPORT_CONFIG; } @@ -1014,6 +882,7 @@ static FreeNode* FreeNodeList = (FreeNode* )NULL; extern void onig_node_free(Node* node) { + start: if (IS_NULL(node)) return ; switch (NTYPE(node)) { @@ -1026,12 +895,38 @@ onig_node_free(Node* node) case N_LIST: case N_ALT: onig_node_free(NCONS(node).left); - onig_node_free(NCONS(node).right); + /* onig_node_free(NCONS(node).right); */ + { + Node* next_node = NCONS(node).right; + +#ifdef USE_RECYCLE_NODE + { + FreeNode* n = (FreeNode* )node; + + THREAD_ATOMIC_START; + n->next = FreeNodeList; + FreeNodeList = n; + THREAD_ATOMIC_END; + } +#else + xfree(node); 
+#endif + + node = next_node; + goto start; + } break; case N_CCLASS: - if (NCCLASS(node).mbuf) - bbuf_free(NCCLASS(node).mbuf); + { + CClassNode* cc = &(NCCLASS(node)); + + if (IS_CCLASS_SHARE(cc)) + return ; + + if (cc->mbuf) + bbuf_free(cc->mbuf); + } break; case N_QUALIFIER: @@ -1057,11 +952,12 @@ onig_node_free(Node* node) #ifdef USE_RECYCLE_NODE { - FreeNode* n; + FreeNode* n = (FreeNode* )node; - n = (FreeNode* )node; + THREAD_ATOMIC_START; n->next = FreeNodeList; FreeNodeList = n; + THREAD_ATOMIC_END; } #else xfree(node); @@ -1092,8 +988,10 @@ node_new() #ifdef USE_RECYCLE_NODE if (IS_NOT_NULL(FreeNodeList)) { + THREAD_ATOMIC_START; node = (Node* )FreeNodeList; FreeNodeList = FreeNodeList->next; + THREAD_ATOMIC_END; return node; } #endif @@ -1107,8 +1005,8 @@ static void initialize_cclass(CClassNode* cc) { BITSET_CLEAR(cc->bs); - cc->not = 0; - cc->mbuf = NULL; + cc->flags = 0; + cc->mbuf = NULL; } static Node* @@ -1122,6 +1020,54 @@ node_new_cclass() return node; } +extern Node* +node_new_cclass_by_codepoint_range(int not, + OnigCodePoint sbr[], OnigCodePoint mbr[]) +{ + CClassNode* cc; + int n, i, j; + + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CCLASS; + + cc = &(NCCLASS(node)); + cc->flags = 0; + if (not != 0) CCLASS_SET_NOT(cc); + + BITSET_CLEAR(cc->bs); + if (IS_NOT_NULL(sbr)) { + n = ONIGENC_CODE_RANGE_NUM(sbr); + for (i = 0; i < n; i++) { + for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); + j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); + } + } + } + + if (IS_NULL(mbr)) { + is_null: + cc->mbuf = NULL; + } + else { + BBuf* bbuf; + + n = ONIGENC_CODE_RANGE_NUM(mbr); + if (n == 0) goto is_null; + + bbuf = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(bbuf, NULL); + bbuf->alloc = n + 1; + bbuf->used = n + 1; + bbuf->p = (UChar* )((void* )mbr); + + cc->mbuf = bbuf; + } + + return node; +} + static Node* node_new_ctype(int type) { @@ -1152,6 +1098,12 @@ node_new_list(Node* left, Node* right) 
return node; } +extern Node* +onig_node_new_list(Node* left, Node* right) +{ + return node_new_list(left, right); +} + static Node* node_new_alt(Node* left, Node* right) { @@ -1237,6 +1189,7 @@ node_new_qualifier(int lower, int upper, int by_number) Node* node = node_new(); CHECK_NULL_RETURN(node); node->type = N_QUALIFIER; + NQUALIFIER(node).state = 0; NQUALIFIER(node).target = NULL; NQUALIFIER(node).lower = lower; NQUALIFIER(node).upper = upper; @@ -1295,7 +1248,7 @@ node_new_option(OnigOptionType option) } extern int -onig_node_str_cat(Node* node, UChar* s, UChar* end) +onig_node_str_cat(Node* node, const UChar* s, const UChar* end) { int addlen = end - s; @@ -1350,8 +1303,22 @@ onig_node_conv_to_str_node(Node* node, int flag) NSTRING(node).end = NSTRING(node).buf; } +extern void +onig_node_str_clear(Node* node) +{ + if (NSTRING(node).capa != 0 && + IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + static Node* -node_new_str(UChar* s, UChar* end) +node_new_str(const UChar* s, const UChar* end) { Node* node = node_new(); CHECK_NULL_RETURN(node); @@ -1368,6 +1335,12 @@ node_new_str(UChar* s, UChar* end) return node; } +extern Node* +onig_node_new_str(const UChar* s, const UChar* end) +{ + return node_new_str(s, end); +} + static Node* node_new_str_raw(UChar* s, UChar* end) { @@ -1382,15 +1355,6 @@ node_new_empty() return node_new_str(NULL, NULL); } -static Node* -node_new_str_char(UChar c) -{ - UChar p[1]; - - p[0] = c; - return node_new_str(p, p + 1); -} - static Node* node_new_str_raw_char(UChar c) { @@ -1403,7 +1367,7 @@ node_new_str_raw_char(UChar c) static Node* str_node_split_last_char(StrNode* sn, OnigEncoding enc) { - UChar *p; + const UChar *p; Node* n = NULL_NODE; if (sn->end > sn->s) { @@ -1412,7 +1376,7 @@ str_node_split_last_char(StrNode* sn, OnigEncoding enc) n = 
node_new_str(p, sn->end); if ((sn->flag & NSTR_RAW) != 0) NSTRING_SET_RAW(n); - sn->end = p; + sn->end = (UChar* )p; } } return n; @@ -1422,17 +1386,18 @@ static int str_node_can_be_split(StrNode* sn, OnigEncoding enc) { if (sn->end > sn->s) { - return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0); } return 0; } extern int -onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc) +onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) { unsigned int num, val; - int c; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND) { @@ -1457,9 +1422,10 @@ static int scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1484,9 +1450,10 @@ static int scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1622,15 +1589,15 @@ add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) } static int -not_code_range_buf(BBuf* bbuf, BBuf** pbuf) +not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) { int r, i, n; - OnigCodePoint pre, from, to, *data; + OnigCodePoint pre, from, *data, to = 0; *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf)) { set_all: - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } data = (OnigCodePoint* )(bbuf->p); @@ -1639,7 +1606,7 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) if (n <= 0) goto set_all; r = 0; - pre = 0x80; + pre = MBCODE_START_POS(enc); for (i = 0; i < n; i++) { from = data[i*2]; to = data[i*2+1]; @@ -1664,7 +1631,8 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) } while (0) static int -or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, 
BBuf** pbuf) +or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, + BBuf* bbuf2, int not2, BBuf** pbuf) { int r; OnigCodePoint i, n1, *data1; @@ -1673,7 +1641,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { if (not1 != 0 || not2 != 0) - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); return 0; } @@ -1683,14 +1651,14 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) if (IS_NULL(bbuf1)) { if (not1 != 0) { - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } else { if (not2 == 0) { return bbuf_clone(pbuf, bbuf2); } else { - return not_code_range_buf(bbuf2, pbuf); + return not_code_range_buf(enc, bbuf2, pbuf); } } } @@ -1706,7 +1674,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) r = bbuf_clone(pbuf, bbuf2); } else if (not1 == 0) { /* 1 OR (not 2) */ - r = not_code_range_buf(bbuf2, pbuf); + r = not_code_range_buf(enc, bbuf2, pbuf); } if (r != 0) return r; @@ -1816,6 +1784,29 @@ and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) return 0; } +static int +clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) +{ + BBuf *tbuf; + int r; + + if (IS_CCLASS_NOT(cc)) { + bitset_invert(cc->bs); + + if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { + r = not_code_range_buf(enc, cc->mbuf, &tbuf); + if (r != 0) return r; + + bbuf_free(cc->mbuf); + cc->mbuf = tbuf; + } + + CCLASS_CLEAR_NOT(cc); + } + + return 0; +} + static int and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) { @@ -1824,10 +1815,10 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = dest->not; + not1 = IS_CCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = cc->not; + not2 = IS_CCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -1850,13 +1841,13 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) if (! ONIGENC_IS_SINGLEBYTE(enc)) { if (not1 != 0 && not2 != 0) { - r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf); + r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); } else { r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1882,10 +1873,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = dest->not; + not1 = IS_CCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = cc->not; + not2 = IS_CCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -1911,10 +1902,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); } else { - r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf); + r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -2014,26 +2005,29 @@ popular_qualifier_num(QualifierNode* qf) return -1; } + +enum ReduceType { + RQ_ASIS = 0, /* as is */ + RQ_DEL = 1, /* delete parent */ + RQ_A, /* to '*' */ + RQ_AQ, /* to '*?' */ + RQ_QQ, /* to '??' 
*/ + RQ_P_QQ, /* to '+)??' */ + RQ_PQ_Q, /* to '+?)?' */ +}; + +static enum ReduceType ReduceTypeTable[6][6] = { + {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ + {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ + {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ + {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ + {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ + {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ +}; + extern void onig_reduce_nested_qualifier(Node* pnode, Node* cnode) { -#define NQ_ASIS 0 /* as is */ -#define NQ_DEL 1 /* delete parent */ -#define NQ_A 2 /* to '*' */ -#define NQ_AQ 3 /* to '*?' */ -#define NQ_QQ 4 /* to '??' */ -#define NQ_P_QQ 5 /* to '+)??' */ -#define NQ_PQ_Q 6 /* to '+?)?' */ - - static char reduces[][6] = { - {NQ_DEL, NQ_A, NQ_A, NQ_QQ, NQ_AQ, NQ_ASIS}, /* '?' */ - {NQ_DEL, NQ_DEL, NQ_DEL, NQ_P_QQ, NQ_P_QQ, NQ_DEL}, /* '*' */ - {NQ_A, NQ_A, NQ_DEL, NQ_ASIS, NQ_P_QQ, NQ_DEL}, /* '+' */ - {NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL, NQ_AQ, NQ_AQ}, /* '??' */ - {NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL}, /* '*?' */ - {NQ_ASIS, NQ_PQ_Q, NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL} /* '+?' 
*/ - }; - int pnum, cnum; QualifierNode *p, *c; @@ -2042,35 +2036,35 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) pnum = popular_qualifier_num(p); cnum = popular_qualifier_num(c); - switch(reduces[cnum][pnum]) { - case NQ_DEL: + switch(ReduceTypeTable[cnum][pnum]) { + case RQ_DEL: *p = *c; break; - case NQ_A: + case RQ_A: p->target = c->target; p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; break; - case NQ_AQ: + case RQ_AQ: p->target = c->target; p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; break; - case NQ_QQ: + case RQ_QQ: p->target = c->target; p->lower = 0; p->upper = 1; p->greedy = 0; break; - case NQ_P_QQ: + case RQ_P_QQ: p->target = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; return ; break; - case NQ_PQ_Q: + case RQ_PQ_Q: p->target = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; return ; break; - case NQ_ASIS: + case RQ_ASIS: p->target = cnode; return ; break; @@ -2083,8 +2077,9 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_BYTE = 1, - TK_RAW_BYTE = 2, + TK_RAW_BYTE = 1, + TK_CHAR, + TK_STRING, TK_CODE_POINT, TK_ANYCHAR, TK_CHAR_TYPE, @@ -2114,6 +2109,7 @@ typedef struct { int base; /* is number: 8, 16 (used in [....]) */ UChar* backp; union { + UChar* s; int c; OnigCodePoint code; int anchor; @@ -2145,8 +2141,11 @@ static int fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; - int c; + int r = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); @@ -2200,12 +2199,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) PUNFETCH; up = low; /* {n} : exact n times */ + r = 2; /* fixed */ } if (PEND) goto invalid; PFETCH(c); if 
(IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC) goto invalid; + if (c != MC_ESC(enc)) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2218,7 +2218,7 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) tok->u.repeat.lower = low; tok->u.repeat.upper = up; *src = p; - return 0; + return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: if (syn_allow) @@ -2231,10 +2231,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) static int fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) { - int c; + int v; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; PFETCH(c); switch (c) { @@ -2245,9 +2248,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (c != '-') return ONIGERR_META_CODE_SYNTAX; if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2270,9 +2274,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } else if (c == '?') c = 0177; @@ -2304,10 +2309,13 @@ static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { - int r, len, is_num; - int c = 0; + int r, is_num; + OnigCodePoint c = 0; + OnigCodePoint first_code; + OnigEncoding enc = env->enc; UChar *name_end; UChar *p = *src; + PFETCH_READY; 
name_end = end; r = 0; @@ -2317,19 +2325,20 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) } else { PFETCH(c); + first_code = c; if (c == '>') return ONIGERR_EMPTY_GROUP_NAME; - if (ONIGENC_IS_CODE_DIGIT(env->enc, c)) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { if (ref == 1) is_num = 1; else { r = ONIGERR_INVALID_GROUP_NAME; } } - len = enc_len(env->enc, c); - while (!PEND && len-- > 1) - PFETCH(c); + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } } while (!PEND) { @@ -2337,35 +2346,28 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) PFETCH(c); if (c == '>' || c == ')') break; - len = enc_len(env->enc, c); if (is_num == 1) { - if (! ONIGENC_IS_CODE_DIGIT(env->enc, c)) { - if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && c != '_') + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; else r = ONIGERR_INVALID_GROUP_NAME; } } else { - if (len == 1) { - if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && - !ONIGENC_IS_CODE_DIGIT(env->enc, c) && - c != '_') { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - } + if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } - - while (!PEND && len-- > 1) - PFETCH(c); } + if (c != '>') { r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } else { - c = **src; - if (ONIGENC_IS_CODE_UPPER(env->enc, c)) + if (ONIGENC_IS_CODE_ASCII(first_code) && + ONIGENC_IS_CODE_UPPER(enc, first_code)) r = ONIGERR_INVALID_GROUP_NAME; } @@ -2384,19 +2386,21 @@ static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { int r, len; - int c = 0; + OnigCodePoint c = 0; UChar *name_end; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; r = 0; while (!PEND) { name_end = p; - PFETCH(c); - if (enc_len(env->enc, c) > 1) + if (enc_len(enc, p) > 1) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + PFETCH(c); if (c == '>' || c == ')') break; - if (! 
ONIGENC_IS_CODE_DIGIT(env->enc, c)) + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } if (c != '>') { @@ -2457,12 +2461,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) { if (IS_NOT_NULL(next)) @@ -2488,24 +2492,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { if (in_esc) { in_esc = 0; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) return 1; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); if (x == bad) return 0; - else if (x == MC_ESC) in_esc = 1; + else if (x == MC_ESC(enc)) in_esc = 1; p = q; } } @@ -2516,10 +2520,13 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int c, num; + int num; + OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; + OnigEncoding enc = env->enc; UChar* prev; UChar* p = *src; + PFETCH_READY; if (PEND) { tok->type = TK_EOT; @@ -2527,7 +2534,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } PFETCH(c); - tok->type = TK_BYTE; + tok->type = TK_CHAR; tok->base = 0; tok->u.c = c; if (c == ']') { @@ -2536,11 +2543,11 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else if (c == '-') { tok->type = TK_CC_RANGE; } - else if (c == MC_ESC) { + else if (c == MC_ESC(enc)) { if 
(! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) goto end; - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; PFETCH(c); tok->escaped = 1; @@ -2570,14 +2577,34 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_TYPE; tok->u.subtype = CTYPE_NOT_WHITE_SPACE; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; case 'p': case 'P': - if (PPEEK == '{' && - IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + c2 = PPEEK; + if (c2 == '{' && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); + + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + PFETCH(c2); + if (c2 == '^') { + tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); + } + else + PUNFETCH; + } } break; @@ -2585,14 +2612,17 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + c2 = PPEEK; + if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) { PINC; tok->type = TK_CODE_POINT; tok->base = 16; @@ -2604,7 +2634,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2620,14 +2650,14 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; - tok->base = 16; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; } break; @@ -2636,7 +2666,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, env->enc); + num = scan_unsigned_octal_number(&p, end, 3, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2652,19 +2682,19 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) num = fetch_escaped_value(&p, end, env); if (num < 0) return num; if (tok->u.c != num) { - tok->u.c = num; - tok->type = TK_RAW_BYTE; + tok->u.code = (OnigCodePoint )num; + tok->type = TK_CODE_POINT; } break; } } else if (c == '[') { - if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; tok->backp = p; /* point at '[' is readed */ PINC; - if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', - env->enc)) { + if (str_exist_check_with_esc(send, 2, p, end, + (OnigCodePoint )']', enc)) { tok->type = TK_POSIX_BRACKET_OPEN; } else { @@ -2684,7 +2714,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } else if (c == '&') { if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && - !PEND && PPEEK == '&') { + !PEND && (PPEEK_IS('&'))) { PINC; tok->type = TK_CC_AND; } @@ -2698,10 +2728,13 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, c, num; + int r, num; + OnigCodePoint c; + OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; UChar* prev; UChar* p = *src; + PFETCH_READY; start: 
if (PEND) { @@ -2709,13 +2742,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return tok->type; } - tok->type = TK_BYTE; - tok->base = 0; - PFETCH(c); - if (c == MC_ESC) { - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + tok->type = TK_STRING; + tok->base = 0; + tok->backp = p; + PFETCH(c); + if (c == MC_ESC(enc)) { + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; + + tok->backp = p; PFETCH(c); + tok->u.c = c; tok->escaped = 1; switch (c) { @@ -2741,37 +2778,42 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; greedy_check: - if (!PEND && PPEEK == '?' && + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { PFETCH(c); tok->u.repeat.greedy = 0; tok->u.repeat.possessive = 0; } - else if (!PEND && PPEEK == '+' && - ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && - tok->type != TK_INTERVAL) || - (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && - tok->type == TK_INTERVAL))) { - PFETCH(c); - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 1; - } else { - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 0; + possessive_check: + if (!PEND && PPEEK_IS('+') && + ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } } break; case '{': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -2851,6 +2893,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.subtype = CTYPE_NOT_DIGIT; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; + case 'A': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: @@ -2891,14 +2945,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )num; @@ -2909,7 +2965,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num 
= scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2925,14 +2981,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; - tok->base = 16; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; } break; @@ -2940,9 +2996,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, env->enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; - if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER; + num = onig_scan_unsigned_number(&p, end, enc); + if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + goto skip_backref; + } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ @@ -2957,7 +3014,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.by_name = 0; break; } - else if (c == '8' || c == '9') { + + skip_backref: + if (c == '8' || c == '9') { /* normal char */ p = prev; PINC; break; @@ -2968,7 +3027,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ num = 0; /* but, it's not error */ @@ -3054,11 +3113,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK == '{' && - IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + if (PPEEK_IS('{') && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); + + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + PFETCH(c); + if (c == '^') { + tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); + } + else + PUNFETCH; + } } break; @@ -3068,8 +3136,11 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; /* set_raw: */ if (tok->u.c != num) { - tok->type = TK_RAW_BYTE; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->u.code = (OnigCodePoint )num; + } + else { /* string */ + p = tok->backp + enc_len(enc, tok->backp); } break; } @@ -3081,15 +3152,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { - if (c == MC_ANYCHAR) + if (c == MC_ANYCHAR(enc)) goto any_char; - else if (c == MC_ANYTIME) + else if (c == MC_ANYTIME(enc)) goto anytime; - else if (c == MC_ZERO_OR_ONE_TIME) + else if (c == MC_ZERO_OR_ONE_TIME(enc)) goto zero_or_one_time; - else if (c == MC_ONE_OR_MORE_TIME) + else if (c == MC_ONE_OR_MORE_TIME(enc)) goto one_or_more_time; - else if (c == MC_ANYCHAR_ANYTIME) { + else if (c == MC_ANYCHAR_ANYTIME(enc)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } @@ -3132,14 +3203,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -3148,6 +3221,26 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': + if (PPEEK_IS('?') && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + PINC; + if (PPEEK_IS('#')) { + PFETCH(c); + while (1) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == MC_ESC(enc)) { + if (!PEND) PFETCH(c); + } + else { + if (c == ')') break; + } + } + goto start; + } + PUNFETCH; + } + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; @@ -3185,7 +3278,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_EXTEND(env->option)) { while (!PEND) { PFETCH(c); - if (ONIG_IS_NEWLINE(c)) + if (ONIGENC_IS_CODE_NEWLINE(enc, c)) break; } goto start; @@ -3199,6 +3292,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: + /* string */ break; } } @@ -3209,48 +3303,57 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int -add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, - OnigEncoding enc) +add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, + OnigCodePoint sbr[], OnigCodePoint mbr[]) { - int i, r, nsb, nmb; - OnigCodePointRange *sbr, *mbr; + int i, r; OnigCodePoint j; - r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); - if (r != 0) return r; + int nsb = ONIGENC_CODE_RANGE_NUM(sbr); + int nmb = ONIGENC_CODE_RANGE_NUM(mbr); if (not == 0) { for (i = 0; i < nsb; i++) { - for (j = sbr[i].from; j <= sbr[i].to; j++) { - 
BITSET_SET_BIT(cc->bs, j); + for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); + j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); } } + for (i = 0; i < nmb; i++) { - r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to); + r = add_code_range_to_buf(&(cc->mbuf), + ONIGENC_CODE_RANGE_FROM(mbr, i), + ONIGENC_CODE_RANGE_TO(mbr, i)); if (r != 0) return r; } } else { OnigCodePoint prev = 0; - for (i = 0; i < nsb; i++) { - for (j = prev; j < sbr[i].from; j++) { - BITSET_SET_BIT(cc->bs, j); + + if (ONIGENC_MBC_MINLEN(enc) == 1) { + for (i = 0; i < nsb; i++) { + for (j = prev; + j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); + } + prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1; } - prev = sbr[i].to + 1; - } - if (prev < 0x7f) { - for (j = prev; j < 0x7f; j++) { - BITSET_SET_BIT(cc->bs, j); + if (prev < 0x7f) { + for (j = prev; j < 0x7f; j++) { + BITSET_SET_BIT(cc->bs, j); + } } + + prev = 0x80; } - prev = 0x80; for (i = 0; i < nmb; i++) { - if (prev < mbr[i].from) { - r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1); + if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { + r = add_code_range_to_buf(&(cc->mbuf), prev, + ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); if (r != 0) return r; } - prev = mbr[i].to + 1; + prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; } if (prev < 0x7fffffff) { r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff); @@ -3258,17 +3361,21 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } } - return r; + return 0; } static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; + OnigCodePoint *sbr, *mbr; OnigEncoding enc = env->enc; - if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) { - r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc); + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr); + if (r == 0) { + return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr); + } + else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; } @@ 
-3326,7 +3433,8 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! ONIGENC_IS_MBC_HEAD(enc, c)) + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */ + && ! ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } } @@ -3370,6 +3478,14 @@ parse_ctype_to_enc_ctype(int pctype, int* not) ctype = ONIGENC_CTYPE_DIGIT; *not = 1; break; + case CTYPE_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 0; + break; + case CTYPE_NOT_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 1; + break; default: return ONIGERR_PARSER_BUG; break; @@ -3407,23 +3523,26 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) }; PosixBracketEntryType *pb; - int not, i, c, r; + int not, i, r; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; - if (PPEEK == '^') { + if (PPEEK_IS('^')) { PINC; not = 1; } else not = 0; - if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2) goto not_posix_bracket; for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (onig_strncmp(p, pb->name, pb->len) == 0) { - p += pb->len; - if (end - p < 2 || *p != ':' || *(p+1) != ']') + if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { + p = (UChar* )onigenc_step(enc, p, end, pb->len); + if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3442,9 +3561,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) PINC; if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } - if (c == ':' && !PEND) { + if (c == ':' && ! PEND) { PINC; - if (!PEND) { + if (! 
PEND) { PFETCH(c); if (c == ']') return ONIGERR_INVALID_POSIX_BRACKET_TYPE; @@ -3455,7 +3574,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -property_name_to_ctype(UChar* p, UChar* end) +property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, @@ -3477,28 +3596,49 @@ property_name_to_ctype(UChar* p, UChar* end) PosixBracketEntryType *pb; int len; - len = end - p; + len = onigenc_strlen(enc, p, end); for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0) + if (len == pb->len && + onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) return pb->ctype; } - return ONIGERR_INVALID_CHAR_PROPERTY_NAME; + return -1; } static int fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int ctype; - UChar *prev, *p = *src; - int c = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; + UChar *prev, *start, *p = *src; + PFETCH_READY; + + /* 'IsXXXX' => 'XXXX' */ + if (!PEND && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) { + c = PPEEK; + if (c == 'I') { + PINC; + if (! 
PEND) { + c = PPEEK; + if (c == 's') + PINC; + else + PUNFETCH; + } + } + } + + start = prev = p; while (!PEND) { prev = p; PFETCH(c); if (c == '}') { - ctype = property_name_to_ctype(*src, prev); - if (ctype < 0) return ctype; + ctype = property_name_to_ctype(start, prev, enc); + if (ctype < 0) break; *src = p; return ctype; @@ -3507,6 +3647,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) break; } + onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME, + *src, prev); return ONIGERR_INVALID_CHAR_PROPERTY_NAME; } @@ -3588,6 +3730,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, case CCS_RANGE: if (intype == *type) { if (intype == CCV_SB) { + if (*vs > 0xff || v > 0xff) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + if (*vs > v) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) goto ccs_range_end; @@ -3602,14 +3747,23 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } } else { - if (intype == CCV_CODE_POINT && *type == CCV_SB && - ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) { - bitset_set_range(cc->bs, (int )*vs, 0x7f); - r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v); +#if 0 + if (intype == CCV_CODE_POINT && *type == CCV_SB) { +#endif + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? 
v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); if (r < 0) return r; +#if 0 } else return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; +#endif } ccs_range_end: *state = CCS_COMPLETE; @@ -3631,22 +3785,24 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } static int -char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, +code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, OnigEncoding enc) { int in_esc; + OnigCodePoint code; UChar* p = from; + PFETCH_READY; in_esc = 0; - while (p < to) { + while (! PEND) { if (ignore_escaped && in_esc) { in_esc = 0; } else { - if (*p == c) return 1; - if (*p == MC_ESC) in_esc = 1; + PFETCH(code); + if (code == c) return 1; + if (code == MC_ESC(enc)) in_esc = 1; } - p += enc_len(enc, *p); } return 0; } @@ -3669,7 +3825,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, prev_cc = (CClassNode* )NULL; *np = NULL_NODE; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -3679,11 +3835,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (r < 0) return r; if (r == TK_CC_CLOSE) { - if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + if (! code_exist_check((OnigCodePoint )']', + *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, "]"); - r = tok->type = TK_BYTE; /* allow []...] */ + r = tok->type = TK_CHAR; /* allow []...] 
*/ } *np = node = node_new_cclass(); @@ -3696,58 +3853,69 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { - case TK_BYTE: - len = enc_len(env->enc, tok->u.c); + case TK_CHAR: + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { - PUNFETCH; - v = ONIGENC_MBC_TO_CODE(env->enc, p, end); - p += len; in_type = CCV_CODE_POINT; } else { sb_char: - v = (OnigCodePoint )tok->u.c; in_type = CCV_SB; } + v = (OnigCodePoint )tok->u.c; in_israw = 0; goto val_entry2; break; case TK_RAW_BYTE: - len = enc_len(env->enc, tok->u.c); - if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + /* tok->base != 0 : octal or hexadec. */ + if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - UChar* bufp = buf; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; + UChar* psave = p; int i, base = tok->base; - if (len > ONIGENC_CODE_TO_MBC_MAXLEN) { - bufp = (UChar* )xmalloc(len); - if (IS_NULL(bufp)) { - r = ONIGERR_MEMORY; - goto err; - } - bufe = bufp + len; - } - bufp[0] = tok->u.c; - for (i = 1; i < len; i++) { + buf[0] = tok->u.c; + for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); - if (r < 0) goto raw_byte_err; - if (r != TK_RAW_BYTE || tok->base != base) break; - bufp[i] = tok->u.c; + if (r < 0) goto err; + if (r != TK_RAW_BYTE || tok->base != base) { + fetched = 1; + break; + } + buf[i] = tok->u.c; } - if (i < len) { + + if (i < ONIGENC_MBC_MINLEN(env->enc)) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - raw_byte_err: - if (bufp != buf) xfree(bufp); goto err; } - v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe); - if (bufp != buf) xfree(bufp); - in_type = CCV_CODE_POINT; + + len = enc_len(env->enc, buf); + if (i < len) { + r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; + goto err; + } + else if (i > len) { /* fetch back */ + p = psave; + for (i = 1; i < len; i++) { + r = fetch_token_in_cc(tok, &p, end, 
env); + } + fetched = 0; + } + + if (i == 1) { + v = (OnigCodePoint )buf[0]; + goto raw_single; + } + else { + v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CCV_CODE_POINT; + } } else { v = (OnigCodePoint )tok->u.c; + raw_single: in_type = CCV_SB; } in_israw = 1; @@ -3881,7 +4049,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, case TK_CC_AND: /* && */ { if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, &val_type, &state, env); if (r != 0) goto err; } @@ -3921,7 +4089,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, &val_type, &state, env); if (r != 0) goto err; } @@ -3933,16 +4101,28 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, cc = prev_cc; } - cc->not = neg; - if (cc->not != 0 && + if (neg != 0) + CCLASS_SET_NOT(cc); + else + CCLASS_CLEAR_NOT(cc); + if (IS_CCLASS_NOT(cc) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { int is_empty; is_empty = (IS_NULL(cc->mbuf) ? 
1 : 0); if (is_empty != 0) BITSET_IS_EMPTY(cc->bs, is_empty); - if (is_empty == 0) - BITSET_SET_BIT(cc->bs, ONIG_NEWLINE); + + if (is_empty == 0) { +#define NEWLINE_CODE 0x0a + + if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { + if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) + BITSET_SET_BIT(cc->bs, NEWLINE_CODE); + else + add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + } + } } *src = p; return 0; @@ -3961,33 +4141,26 @@ static int parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) { + int r, num; + int list_capture; Node *target; OnigOptionType option; - int r, c, num; - int list_capture; + OnigEncoding enc = env->enc; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; *np = NULL; if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; option = env->option; - if (PPEEK == '?' && + if (PPEEK_IS('?') && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); switch (c) { - case '#': /* (?#...) comment */ - while (1) { - if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; - PFETCH(c); - if (c == ')') break; - } - *src = p; - return 3; /* 3: comment */ - break; - case ':': /* (?:...) grouping only */ group: r = fetch_token(tok, &p, end, env); @@ -4129,7 +4302,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == ':') { OnigOptionType prev = env->option; - env->option = option; + env->option = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, &p, end, env); @@ -4185,6 +4358,14 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return 0; } +static char* PopularQStr[] = { + "?", "*", "+", "??", "*?", "+?" +}; + +static char* ReduceQStr[] = { + "", "", "*", "*?", "??", "+ and ??", "+? and ?" 
+}; + static int set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) { @@ -4217,38 +4398,38 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR if (qn->by_number == 0 && qnt->by_number == 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { - if (IS_REPEAT_INFINITE(qn->upper)) { - if (qn->lower == 0) { /* '*' */ - redundant: - { - char buf[WARN_BUFSIZE]; - if (onig_verb_warn != onig_null_warn) { - onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, - env->pattern, env->pattern_end, - "redundant nested repeat operator"); - (*onig_verb_warn)(buf); - } - goto warn_exit; - } - } - else if (qn->lower == 1) { /* '+' */ - /* (?:a?)+? only allowed. */ - if (qn->greedy || !(qnt->upper == 1 && qnt->greedy)) - goto redundant; - } - } - else if (qn->upper == 1 && qn->lower == 0) { - if (qn->greedy) { /* '?' */ - if (!(qnt->lower == 1 && qnt->greedy == 0)) /* not '+?' */ - goto redundant; - } - else { /* '??' */ - /* '(?:a+)?? only allowd. (?:a*)?? can be replaced to (?:a+)?? 
*/ - if (!(qnt->greedy && qnt->lower == 1 && - IS_REPEAT_INFINITE(qnt->upper))) - goto redundant; - } - } + int nestq_num, targetq_num; + char buf[WARN_BUFSIZE]; + + nestq_num = popular_qualifier_num(qn); + targetq_num = popular_qualifier_num(qnt); + + switch(ReduceTypeTable[targetq_num][nestq_num]) { + case RQ_ASIS: + break; + + case RQ_DEL: + if (onig_verb_warn != onig_null_warn) { + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "redundant nested repeat operator"); + (*onig_verb_warn)(buf); + } + goto warn_exit; + break; + + default: + if (onig_verb_warn != onig_null_warn) { + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "nested repeat operator %s and %s was replaced with '%s'", + PopularQStr[targetq_num], PopularQStr[nestq_num], + ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); + (*onig_verb_warn)(buf); + } + goto warn_exit; + break; + } } warn_exit: @@ -4269,74 +4450,151 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) return 0; } -#ifdef USE_FOLD_MATCH static int -make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node) +make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, + CClassNode* cc, Node** root) { - int i; - UChar *s, *end; - Node *root, **ptail, *snode; - - ptail = &root; - for (i = 0; i < info->target_num; i++) { - s = info->target_str[i]; - end = s + info->target_byte_len[i]; - /* ex. - U+00DF match "ss" and "SS, but not match "Ss". - So, string nodes must be raw. 
- */ - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - } - *ptail = NULL_NODE; - *node = root; - return 0; -} - -static int -make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root) -{ - int i, j, flen, len, ncode, n; - UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - OnigCodePoint* codes; - Node **ptail, *snode; - OnigEncFoldMatchInfo* info; + int r, i, j, k, clen, len, ncode, n; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node **ptail, *snode = NULL_NODE; + OnigCompAmbigCodes* ccs; + OnigCompAmbigCodeItem* ci; + OnigAmbigType amb; + n = 0; *root = NULL_NODE; ptail = root; - ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes); - n = 0; - for (i = 0; i < ncode; i++) { - if (onig_is_code_in_cc(enc, codes[i], cc)) { - len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); - flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info); - if (flen > 0) { /* fold */ - for (j = 0; j < info->target_num; j++) { - s = info->target_str[j]; - end = s + info->target_byte_len[j]; - if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0) - continue; /* ignore single char. 
*/ - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - n++; - } + ncode = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < ncode; i++) { + if (onig_is_code_in_cc(enc, ccs[i].code, cc)) { + for (j = 0; j < ccs[i].n; j++) { + ci = &(ccs[i].items[j]); + if (ci->len > 1) { /* compound only */ + if (IS_CCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc); + + clen = ci->len; + for (k = 0; k < clen; k++) { + len = ONIGENC_CODE_TO_MBC(enc, ci->code[k], buf); + + if (k == 0) { + snode = node_new_str_raw(buf, buf + len); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + else { + r = onig_node_str_cat(snode, buf, buf + len); + if (r < 0) return r; + } + } + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + n++; + } + } } } } return n; } -#endif + + +#ifdef USE_SHARED_CCLASS_TABLE + +#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 + +/* for ctype node hash table */ + +typedef struct { + OnigEncoding enc; + int not; + int type; +} type_cclass_key; + +static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) +{ + if (x->type != y->type) return 1; + if (x->enc != y->enc) return 1; + if (x->not != y->not) return 1; + return 0; +} + +static int type_cclass_hash(type_cclass_key* key) +{ + int i, val; + unsigned char *p; + + val = 0; + + p = (unsigned char* )&(key->enc); + for (i = 0; i < sizeof(key->enc); i++) { + val = val * 997 + (int )*p++; + } + + p = (unsigned char* )(&key->type); + for (i = 0; i < sizeof(key->type); i++) { + val = val * 997 + (int )*p++; + } + + val += key->not; + return val + (val >> 5); +} + +static int type_cclass_key_free(st_data_t x) +{ + xfree((void* )x); + return 0; +} + +static 
st_data_t type_cclass_key_clone(st_data_t x) +{ + type_cclass_key* new_key; + type_cclass_key* key = (type_cclass_key* )x; + + new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); + *new_key = *key; + return (st_data_t )new_key; +} + +static struct st_hash_type type_type_cclass_hash = { + type_cclass_cmp, + type_cclass_hash, + type_cclass_key_free, + type_cclass_key_clone +}; + +static st_table* OnigTypeCClassTable; + + +static int +i_free_shared_class(type_cclass_key* key, Node* node, void* arg) +{ + if (IS_NOT_NULL(node)) { + CClassNode* cc = &(NCCLASS(node)); + if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); + xfree(node); + } + return ST_DELETE; +} + +extern int +onig_free_shared_cclass_table() +{ + if (IS_NOT_NULL(OnigTypeCClassTable)) { + onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); + } + + return 0; +} + +#endif /* USE_SHARED_CCLASS_TABLE */ + static int parse_exp(Node** np, OnigToken* tok, int term, @@ -4346,7 +4604,6 @@ parse_exp(Node** np, OnigToken* tok, int term, Node* qn; Node** targetp; - start: *np = NULL; if (tok->type == term) goto end_of_token; @@ -4376,11 +4633,6 @@ parse_exp(Node** np, OnigToken* tok, int term, NEFFECT(*np).target = target; return tok->type; } - else if (r == 3) { /* comment */ - r = fetch_token(tok, src, end, env); - if (r < 0) return r; - goto start; - } break; case TK_SUBEXP_CLOSE: @@ -4391,76 +4643,22 @@ parse_exp(Node** np, OnigToken* tok, int term, else goto tk_byte; break; - case TK_BYTE: + case TK_STRING: tk_byte: { - *np = node_new_str_char((UChar )tok->u.c); + *np = node_new_str(tok->backp, *src); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); while (1) { - len = enc_len(env->enc, tok->u.c); - if (len > 1) { - r = onig_node_str_cat(*np, *src, *src + len - 1); - if (r < 0) return r; - *src += (len - 1); - } - r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_BYTE) break; + if (r != TK_STRING) break; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = onig_node_str_cat(*np, 
tok->backp, *src); if (r < 0) return r; } - fold_entry: -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int flen, ret; - Node *root, **ptail, *work, *snode, *anode; - UChar *p, *pprev; - OnigEncFoldMatchInfo* fold_info; - StrNode* sn = &(NSTRING(*np)); - - ptail = &root; - pprev = sn->s; - for (p = sn->s; p < sn->end; ) { - flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info); - if (flen > 0) { /* fold */ - ret = make_alt_node_from_fold_info(fold_info, &anode); - if (ret != 0) return ret; - work = node_new_list(anode, NULL); - CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY); - - if (pprev < p) { - snode = node_new_str(pprev, p); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, work); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - else { - *ptail = work; - } - ptail = &(NCONS(work).right); - p += flen; - pprev = p; - } - else - p += enc_len(env->enc, *p); - } - *ptail = NULL_NODE; - if (IS_NOT_NULL(root)) { - if (pprev < sn->end) { - snode = node_new_str(pprev, sn->end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - onig_node_free(*np); - *np = root; - } - } -#endif + string_end: targetp = np; goto repeat; } @@ -4469,22 +4667,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_RAW_BYTE: tk_raw_byte: { - int expect_len; - *np = node_new_str_raw_char((UChar )tok->u.c); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - expect_len = enc_len(env->enc, tok->u.c); len = 1; while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_RAW_BYTE) { #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - if (len >= expect_len) { + if (len >= enc_len(env->enc, NSTRING(*np).s)) { NSTRING_CLEAR_RAW(*np); } #endif - goto fold_entry; + goto string_end; } r = node_str_cat_char(*np, (UChar )tok->u.c); @@ -4510,9 +4705,11 @@ parse_exp(Node** np, OnigToken* tok, int term, case 
TK_QUOTE_OPEN: { - OnigCodePoint end_op[] = { (OnigCodePoint )MC_ESC, (OnigCodePoint )'E' }; + OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; + end_op[0] = (OnigCodePoint )MC_ESC(env->enc); + end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); if (IS_NULL(qend)) { @@ -4537,17 +4734,69 @@ parse_exp(Node** np, OnigToken* tok, int term, case CTYPE_NOT_WHITE_SPACE: case CTYPE_DIGIT: case CTYPE_NOT_DIGIT: + case CTYPE_XDIGIT: + case CTYPE_NOT_XDIGIT: { CClassNode* cc; int ctype, not; - ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); +#ifdef USE_SHARED_CCLASS_TABLE + OnigCodePoint *sbr, *mbr; - *np = node_new_cclass(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - cc = &(NCCLASS(*np)); - add_ctype_to_cc(cc, ctype, 0, env); - if (not != 0) CCLASS_SET_NOT(cc); + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr); + if (r == 0 && + ONIGENC_CODE_RANGE_NUM(mbr) + >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { + type_cclass_key key; + type_cclass_key* new_key; + + key.enc = env->enc; + key.not = not; + key.type = ctype; + + THREAD_ATOMIC_START; + + if (IS_NULL(OnigTypeCClassTable)) { + OnigTypeCClassTable + = onig_st_init_table_with_size(&type_type_cclass_hash, 10); + if (IS_NULL(OnigTypeCClassTable)) { + THREAD_ATOMIC_END; + return ONIGERR_MEMORY; + } + } + else { + if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key, + (st_data_t* )np)) { + THREAD_ATOMIC_END; + break; + } + } + + *np = node_new_cclass_by_codepoint_range(not, sbr, mbr); + if (IS_NULL(*np)) { + THREAD_ATOMIC_END; + return ONIGERR_MEMORY; + } + + CCLASS_SET_SHARE(&(NCCLASS(*np))); + new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); + onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, + (st_data_t )*np); + + THREAD_ATOMIC_END; + } + else { +#endif + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + *np = node_new_cclass(); + 
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + cc = &(NCCLASS(*np)); + add_ctype_to_cc(cc, ctype, 0, env); + if (not != 0) CCLASS_SET_NOT(cc); +#ifdef USE_SHARED_CCLASS_TABLE + } +#endif } break; @@ -4564,27 +4813,66 @@ parse_exp(Node** np, OnigToken* tok, int term, break; case TK_CC_OPEN: - r = parse_char_class(np, tok, src, end, env); - if (r != 0) return r; + { + CClassNode* cc; -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int res; - Node *alt_root, *work; - CClassNode* cc = &(NCCLASS(*np)); + r = parse_char_class(np, tok, src, end, env); + if (r != 0) return r; - res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root); - if (res < 0) return res; - if (res > 0) { - work = node_new_alt(*np, alt_root); - if (IS_NULL(work)) { - onig_node_free(alt_root); - return ONIGERR_MEMORY; - } - *np = work; + cc = &(NCCLASS(*np)); + + if (IS_IGNORECASE(env->option)) { + int i, n, in_cc; + OnigPairAmbigCodes* ccs; + BitSetRef bs = cc->bs; + OnigAmbigType amb; + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & env->ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs); + for (i = 0; i < n; i++) { + in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc); + + if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) || + (in_cc == 0 && IS_CCLASS_NOT(cc))) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ccs[i].from >= SINGLE_BYTE_SIZE) { + /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */ + add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to); + } + else { + if (BITSET_AT(bs, ccs[i].from)) { + /* /(?i:[^A-C])/.match("a") ==> fail. 
*/ + BITSET_SET_BIT(bs, ccs[i].to); + } + if (BITSET_AT(bs, ccs[i].to)) { + BITSET_SET_BIT(bs, ccs[i].from); + } + } + } + } + } + } + + if (IS_IGNORECASE(env->option) && + (env->ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + int res; + Node *alt_root, *work; + + res = make_compound_alt_node_from_cc(env->ambig_flag, env->enc, + cc, &alt_root); + if (res < 0) return res; + if (res > 0) { + work = node_new_alt(*np, alt_root); + if (IS_NULL(work)) { + onig_node_free(alt_root); + return ONIGERR_MEMORY; + } + *np = work; + } } } -#endif break; case TK_ANYCHAR: @@ -4630,7 +4918,6 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_empty(); } else { - *src = tok->backp; goto tk_byte; } break; @@ -4781,7 +5068,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) } extern int -onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, +onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env) { int r; @@ -4793,15 +5080,16 @@ onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, scan_env_clear(env); env->option = reg->options; + env->ambig_flag = reg->ambig_flag; env->enc = reg->enc; env->syntax = reg->syntax; - env->pattern = pattern; - env->pattern_end = end; + env->pattern = (UChar* )pattern; + env->pattern_end = (UChar* )end; env->reg = reg; *root = NULL; - p = pattern; - r = parse_regexp(root, &p, end, env); + p = (UChar* )pattern; + r = parse_regexp(root, &p, (UChar* )end, env); reg->num_mem = env->num_mem; return r; } diff --git a/ext/mbstring/oniguruma/regparse.h b/ext/mbstring/oniguruma/regparse.h index b2726becbde..1a4ac7dea24 100644 --- a/ext/mbstring/oniguruma/regparse.h +++ b/ext/mbstring/oniguruma/regparse.h @@ -1,12 +1,33 @@ -/********************************************************************** - - regparse.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - 
-**********************************************************************/ #ifndef REGPARSE_H #define REGPARSE_H +/********************************************************************** + regparse.h - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #include "regint.h" @@ -43,7 +64,8 @@ #define CTYPE_NOT_WHITE_SPACE (1<<3) #define CTYPE_DIGIT (1<<4) #define CTYPE_NOT_DIGIT (1<<5) - +#define CTYPE_XDIGIT (1<<6) +#define CTYPE_NOT_XDIGIT (1<<7) #define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) #define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) @@ -52,29 +74,27 @@ #define EFFECT_OPTION (1<<1) #define EFFECT_STOP_BACKTRACK (1<<2) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) - #define NODE_STR_MARGIN 16 #define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 7 #define NSTR_RAW (1<<0) /* by backslashed number */ -#define NSTR_CASE_AMBIG (1<<1) +#define NSTR_AMBIG (1<<1) +#define NSTR_AMBIG_REDUCE (1<<2) -#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) -#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW -#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW -#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG -#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) -#define NSTRING_IS_CASE_AMBIG(node) \ - (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0) +#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) +#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW +#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW +#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG +#define NSTRING_SET_AMBIG_REDUCE(node) (node)->u.str.flag |= NSTR_AMBIG_REDUCE +#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) +#define NSTRING_IS_AMBIG(node) (((node)->u.str.flag & NSTR_AMBIG) != 0) +#define NSTRING_IS_AMBIG_REDUCE(node) \ + (((node)->u.str.flag & NSTR_AMBIG_REDUCE) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? 
(br)->back_dynamic : (br)->back_static); -#define CCLASS_SET_NOT(cc) (cc)->not = 1 - #define NQ_TARGET_ISNOT_EMPTY 0 #define NQ_TARGET_IS_EMPTY 1 #define NQ_TARGET_IS_EMPTY_MEM 2 @@ -89,13 +109,17 @@ typedef struct { UChar buf[NODE_STR_BUF_SIZE]; } StrNode; +/* move to regint.h */ +#if 0 typedef struct { - int not; + int flags; BitSet bs; BBuf* mbuf; /* multi-byte info or NULL */ } CClassNode; +#endif typedef struct { + int state; struct _Node* target; int lower; int upper; @@ -108,19 +132,19 @@ typedef struct { } QualifierNode; /* status bits */ -#define NST_MIN_FIXED (1<<0) -#define NST_MAX_FIXED (1<<1) -#define NST_CLEN_FIXED (1<<2) -#define NST_MARK1 (1<<3) -#define NST_MARK2 (1<<4) -#define NST_MEM_BACKREFED (1<<5) -#define NST_SIMPLE_REPEAT (1<<6) /* for stop backtrack optimization */ - -#define NST_RECURSION (1<<7) -#define NST_CALLED (1<<8) -#define NST_ADDR_FIXED (1<<9) -#define NST_NAMED_GROUP (1<<10) -#define NST_NAME_REF (1<<11) +#define NST_MIN_FIXED (1<<0) +#define NST_MAX_FIXED (1<<1) +#define NST_CLEN_FIXED (1<<2) +#define NST_MARK1 (1<<3) +#define NST_MARK2 (1<<4) +#define NST_MEM_BACKREFED (1<<5) +#define NST_STOP_BT_SIMPLE_REPEAT (1<<6) +#define NST_RECURSION (1<<7) +#define NST_CALLED (1<<8) +#define NST_ADDR_FIXED (1<<9) +#define NST_NAMED_GROUP (1<<10) +#define NST_NAME_REF (1<<11) +#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. 
*/ #define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) #define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) @@ -133,13 +157,15 @@ typedef struct { #define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0) #define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0) #define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0) -#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0) +#define IS_EFFECT_STOP_BT_SIMPLE_REPEAT(en) \ + (((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0) #define IS_EFFECT_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0) #define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION #define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0) #define IS_CALL_NAME_REF(cn) (((cn)->state & NST_NAME_REF) != 0) #define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0) +#define IS_QUALIFIER_IN_REPEAT(qn) (((qn)->state & NST_IN_REPEAT) != 0) typedef struct { int state; @@ -224,9 +250,10 @@ typedef struct _Node { (senv)->mem_nodes_dynamic : (senv)->mem_nodes_static) typedef struct { - OnigOptionType option; - OnigEncoding enc; - OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; + OnigEncoding enc; + OnigSyntaxType* syntax; BitStatusType capture_history; BitStatusType bt_mem_start; BitStatusType bt_mem_end; @@ -254,19 +281,31 @@ typedef struct { #define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) #define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) + +#ifdef USE_NAMED_GROUP +typedef struct { + int new_val; +} GroupNumRemap; + +extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); +#endif + extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); -extern int onig_strncmp P_((UChar* s1, UChar* s2, int n)); +extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, 
UChar* arg, UChar* arg_end)); -extern int onig_scan_unsigned_number P_((UChar** src, UChar* end, OnigEncoding enc)); +extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); extern void onig_reduce_nested_qualifier P_((Node* pnode, Node* cnode)); extern void onig_node_conv_to_str_node P_((Node* node, int raw)); -extern int onig_node_str_cat P_((Node* node, UChar* s, UChar* end)); +extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); extern Node* onig_node_new_effect P_((int type)); extern Node* onig_node_new_anchor P_((int type)); +extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); +extern Node* onig_node_new_list P_((Node* left, Node* right)); +extern void onig_node_str_clear P_((Node* node)); extern int onig_free_node_list(); extern int onig_names_free P_((regex_t* reg)); -extern int onig_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env)); +extern int onig_parse_make_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); #ifdef ONIG_DEBUG #ifdef USE_NAMED_GROUP diff --git a/ext/mbstring/oniguruma/regposerr.c b/ext/mbstring/oniguruma/regposerr.c index 533f813c0c4..e54b5c4089e 100644 --- a/ext/mbstring/oniguruma/regposerr.c +++ b/ext/mbstring/oniguruma/regposerr.c @@ -1,10 +1,32 @@ /********************************************************************** - regposerr.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "config.h" #include "onigposix.h" @@ -58,7 +80,7 @@ regerror(int posix_ecode, const regex_t* reg, char* buf, size_t size) s = tbuf; } - len = strlen(s) + 1; + len = strlen(s) + 1; /* use strlen() because s is ascii encoding. */ if (buf != NULL && size > 0) { strncpy(buf, s, size - 1); diff --git a/ext/mbstring/oniguruma/regposix.c b/ext/mbstring/oniguruma/regposix.c index 4cb30cc5653..34cbeb9a46f 100644 --- a/ext/mbstring/oniguruma/regposix.c +++ b/ext/mbstring/oniguruma/regposix.c @@ -1,10 +1,31 @@ /********************************************************************** - regposix.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #define regex_t onig_regex_t #include "regint.h" @@ -14,16 +35,17 @@ #define ONIG_C(reg) ((onig_regex_t* )((reg)->onig)) #define PONIG_C(reg) ((onig_regex_t** )(&(reg)->onig)) -#if 1 +/* #define ENC_STRING_LEN(enc,s,len) len = strlen(s) */ #define ENC_STRING_LEN(enc,s,len) do { \ - UChar* tmps = (UChar* )(s); \ - /* while (*tmps != 0) tmps += enc_len(enc,*tmps); */ \ - while (*tmps != 0) tmps++; /* OK for UTF-8, EUC-JP, Shift_JIS */ \ - len = tmps - (UChar* )(s); \ + if (ONIGENC_MBC_MINLEN(enc) == 1) { \ + UChar* tmps = (UChar* )(s); \ + while (*tmps != 0) tmps++; \ + len = tmps - (UChar* )(s); \ + } \ + else { \ + len = onigenc_str_bytelen_null(enc, (UChar* )s); \ + } \ } while(0) -#else -#define ENC_STRING_LEN(enc,s,len) len = strlen(s) -#endif typedef struct { int onig_err; @@ -50,7 +72,7 @@ onig2posix_error_code(int code) { ONIGERR_END_PATTERN_AT_LEFT_BRACKET, REG_EBRACK }, { ONIGERR_EMPTY_CHAR_CLASS, REG_ECTYPE }, { ONIGERR_PREMATURE_END_OF_CHAR_CLASS, REG_ECTYPE }, - { ONIGERR_END_PATTERN_AT_BACKSLASH, REG_EESCAPE }, + { ONIGERR_END_PATTERN_AT_ESCAPE, REG_EESCAPE }, { ONIGERR_END_PATTERN_AT_META, REG_EESCAPE }, { ONIGERR_END_PATTERN_AT_CONTROL, REG_EESCAPE }, { ONIGERR_META_CODE_SYNTAX, REG_BADPAT }, @@ -91,6 +113,7 @@ onig2posix_error_code(int code) { ONIGERR_NEVER_ENDING_RECURSION, REG_BADPAT }, { ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY, REG_BADPAT }, { ONIGERR_INVALID_CHAR_PROPERTY_NAME, REG_BADPAT }, + { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG }, { ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT, REG_EONIG_THREAD } }; @@ -145,24 +168,37 @@ regexec(regex_t* reg, const char* str, size_t nmatch, { int r, i, len; UChar* end; + regmatch_t* pm; OnigOptionType options; options = ONIG_OPTION_POSIX_REGION; if ((posix_options & REG_NOTBOL) != 0) options |= ONIG_OPTION_NOTBOL; if ((posix_options & REG_NOTEOL) != 0) options |= ONIG_OPTION_NOTEOL; - if ((reg->comp_options & REG_NOSUB) != 0) { - pmatch = (regmatch_t* )NULL; + if (nmatch == 0 || 
(reg->comp_options & REG_NOSUB) != 0) { + pm = (regmatch_t* )NULL; nmatch = 0; } + else if ((int )nmatch < ONIG_C(reg)->num_mem + 1) { + pm = (regmatch_t* )xmalloc(sizeof(regmatch_t) + * (ONIG_C(reg)->num_mem + 1)); + if (pm == NULL) + return REG_ESPACE; + } + else { + pm = pmatch; + } - ENC_STRING_LEN(ONIG_C(reg)->code,str,len); + ENC_STRING_LEN(ONIG_C(reg)->enc, str, len); end = (UChar* )(str + len); r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end, (OnigRegion* )pmatch, options); if (r >= 0) { r = 0; /* Match */ + if (pm != pmatch && pm != NULL) { + xmemcpy(pmatch, pm, sizeof(regmatch_t) * nmatch); + } } else if (r == ONIG_MISMATCH) { r = REG_NOMATCH; @@ -173,6 +209,9 @@ regexec(regex_t* reg, const char* str, size_t nmatch, r = onig2posix_error_code(r); } + if (pm != pmatch && pm != NULL) + xfree(pm); + return r; } @@ -201,6 +240,13 @@ reg_set_encoding(int mb_code) case REG_POSIX_ENCODING_UTF8: enc = ONIG_ENCODING_UTF8; break; + case REG_POSIX_ENCODING_UTF16_BE: + enc = ONIG_ENCODING_UTF16_BE; + break; + case REG_POSIX_ENCODING_UTF16_LE: + enc = ONIG_ENCODING_UTF16_LE; + break; + default: return ; break; @@ -211,18 +257,18 @@ reg_set_encoding(int mb_code) extern int reg_name_to_group_numbers(regex_t* reg, - unsigned char* name, unsigned char* name_end, int** nums) + const unsigned char* name, const unsigned char* name_end, int** nums) { return onig_name_to_group_numbers(ONIG_C(reg), name, name_end, nums); } typedef struct { - int (*func)(unsigned char*,unsigned char*,int,int*,regex_t*,void*); + int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*); regex_t* reg; void* arg; } i_wrap; -static int i_wrapper(unsigned char* name, unsigned char* name_end, +static int i_wrapper(const unsigned char* name, const unsigned char* name_end, int ng, int* gs, onig_regex_t* reg, void* arg) { @@ -233,8 +279,8 @@ static int i_wrapper(unsigned char* name, unsigned char* name_end, extern int reg_foreach_name(regex_t* reg, - int 
(*func)(unsigned char*,unsigned char*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*), + void* arg) { i_wrap warg; diff --git a/ext/mbstring/oniguruma/regsyntax.c b/ext/mbstring/oniguruma/regsyntax.c new file mode 100644 index 00000000000..a0f36b8c33d --- /dev/null +++ b/ext/mbstring/oniguruma/regsyntax.c @@ -0,0 +1,207 @@ +/********************************************************************** + regsyntax.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "regint.h" + +OnigSyntaxType OnigSyntaxPosixBasic = { + ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | + ONIG_SYN_OP_ESC_BRACE_INTERVAL ) + , 0 + , 0 + , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) +}; + +OnigSyntaxType OnigSyntaxPosixExtended = { + ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_LPAREN_SUBEXP | + ONIG_SYN_OP_BRACE_INTERVAL | + ONIG_SYN_OP_PLUS_ONE_INF | ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_VBAR_ALT ) + , 0 + , ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | + ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | + ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP | + ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) + , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) +}; + +OnigSyntaxType OnigSyntaxEmacs = { + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | + ONIG_SYN_OP_ESC_BRACE_INTERVAL | + ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT | + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | + ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_DECIMAL_BACKREF | + ONIG_SYN_OP_LINE_ANCHOR | ONIG_SYN_OP_ESC_CONTROL_CHARS ) + , ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR + , ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxGrep = { + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET | + ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | + ONIG_SYN_OP_ESC_VBAR_ALT | + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF | + ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR | + ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND | + ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF ) + , 0 + , ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC ) + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxGnuRegex = { + SYN_GNU_REGEX_OP + , 0 + , SYN_GNU_REGEX_BV + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxJava = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_CONTROL_CHARS | 
ONIG_SYN_OP_ESC_C_CONTROL | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT | + ONIG_SYN_OP2_OPTION_PERL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | + ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | ONIG_SYN_OP2_CCLASS_SET_OP | + ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + , ONIG_OPTION_SINGLELINE +}; + +OnigSyntaxType OnigSyntaxPerl = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | + ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS ) + , SYN_GNU_REGEX_BV + , ONIG_OPTION_SINGLELINE +}; + + +extern int +onig_set_default_syntax(OnigSyntaxType* syntax) +{ + if (IS_NULL(syntax)) + syntax = ONIG_SYNTAX_RUBY; + + OnigDefaultSyntax = syntax; + return 0; +} + +extern void +onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) +{ + *to = *from; +} + +extern void +onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op) +{ + syntax->op = op; +} + +extern void +onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) +{ + syntax->op2 = op2; +} + +extern void +onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) +{ + syntax->behavior = behavior; +} + +extern void +onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) +{ + syntax->options = options; +} + +extern unsigned int +onig_get_syntax_op(OnigSyntaxType* syntax) +{ + return syntax->op; +} + +extern unsigned int +onig_get_syntax_op2(OnigSyntaxType* syntax) +{ + return syntax->op2; 
+} + +extern unsigned int +onig_get_syntax_behavior(OnigSyntaxType* syntax) +{ + return syntax->behavior; +} + +extern OnigOptionType +onig_get_syntax_options(OnigSyntaxType* syntax) +{ + return syntax->options; +} + +#ifdef USE_VARIABLE_META_CHARS +extern int onig_set_meta_char(OnigEncoding enc, + unsigned int what, OnigCodePoint code) +{ + switch (what) { + case ONIG_META_CHAR_ESCAPE: + enc->meta_char_table.esc = code; + break; + case ONIG_META_CHAR_ANYCHAR: + enc->meta_char_table.anychar = code; + break; + case ONIG_META_CHAR_ANYTIME: + enc->meta_char_table.anytime = code; + break; + case ONIG_META_CHAR_ZERO_OR_ONE_TIME: + enc->meta_char_table.zero_or_one_time = code; + break; + case ONIG_META_CHAR_ONE_OR_MORE_TIME: + enc->meta_char_table.one_or_more_time = code; + break; + case ONIG_META_CHAR_ANYCHAR_ANYTIME: + enc->meta_char_table.anychar_anytime = code; + break; + default: + return ONIGERR_INVALID_ARGUMENT; + break; + } + return 0; +} +#endif /* USE_VARIABLE_META_CHARS */ diff --git a/ext/mbstring/oniguruma/regtrav.c b/ext/mbstring/oniguruma/regtrav.c new file mode 100644 index 00000000000..58a17f58b34 --- /dev/null +++ b/ext/mbstring/oniguruma/regtrav.c @@ -0,0 +1,76 @@ +/********************************************************************** + regtrav.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regint.h" + +#ifdef USE_CAPTURE_HISTORY + +static int +capture_tree_traverse(OnigCaptureTreeNode* node, int at, + int(*callback_func)(int,int,int,int,int,void*), + int level, void* arg) +{ + int r, i; + + if (node == (OnigCaptureTreeNode* )0) + return 0; + + if ((at & ONIG_TRAVERSE_CALLBACK_AT_FIRST) != 0) { + r = (*callback_func)(node->group, node->beg, node->end, + level, ONIG_TRAVERSE_CALLBACK_AT_FIRST, arg); + if (r != 0) return r; + } + + for (i = 0; i < node->num_childs; i++) { + r = capture_tree_traverse(node->childs[i], at, + callback_func, level + 1, arg); + if (r != 0) return r; + } + + if ((at & ONIG_TRAVERSE_CALLBACK_AT_LAST) != 0) { + r = (*callback_func)(node->group, node->beg, node->end, + level, ONIG_TRAVERSE_CALLBACK_AT_LAST, arg); + if (r != 0) return r; + } + + return 0; +} +#endif /* USE_CAPTURE_HISTORY */ + +extern int +onig_capture_tree_traverse(OnigRegion* region, int at, + int(*callback_func)(int,int,int,int,int,void*), void* arg) +{ +#ifdef USE_CAPTURE_HISTORY + return capture_tree_traverse(region->history_root, at, + callback_func, 0, arg); +#else + return ONIG_NO_SUPPORT_CONFIG; +#endif +} diff --git a/ext/mbstring/oniguruma/regversion.c 
b/ext/mbstring/oniguruma/regversion.c new file mode 100644 index 00000000000..5f15c10e652 --- /dev/null +++ b/ext/mbstring/oniguruma/regversion.c @@ -0,0 +1,55 @@ +/********************************************************************** + regversion.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "oniguruma.h" +#include + +extern const char* +onig_version(void) +{ + static char s[12]; + + sprintf(s, "%d.%d.%d", + ONIGURUMA_VERSION_MAJOR, + ONIGURUMA_VERSION_MINOR, + ONIGURUMA_VERSION_TEENY); + return s; +} + +extern const char* +onig_copyright(void) +{ + static char s[58]; + + sprintf(s, "Oniguruma %d.%d.%d : Copyright (C) 2002-2005 K.Kosako", + ONIGURUMA_VERSION_MAJOR, + ONIGURUMA_VERSION_MINOR, + ONIGURUMA_VERSION_TEENY); + return s; +} diff --git a/ext/mbstring/oniguruma/st.c b/ext/mbstring/oniguruma/st.c new file mode 100644 index 00000000000..65c2cc58bd7 --- /dev/null +++ b/ext/mbstring/oniguruma/st.c @@ -0,0 +1,717 @@ +/* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */ + +/* static char sccsid[] = "@(#) st.c 5.1 89/12/14 Crucible"; */ + +#include "config.h" +#include +#include +#include + +#ifdef _WIN32 +#include +#endif + +#ifdef NOT_RUBY +#include "regint.h" +#else +#ifdef RUBY_PLATFORM +#define xmalloc ruby_xmalloc +#define xcalloc ruby_xcalloc +#define xrealloc ruby_xrealloc +#define xfree ruby_xfree + +void *xmalloc(long); +void *xcalloc(long, long); +void *xrealloc(void *, long); +void xfree(void *); +#endif +#endif + +#include "st.h" + +typedef struct st_table_entry st_table_entry; + +struct st_table_entry { + unsigned int hash; + st_data_t key; + st_data_t record; + st_table_entry *next; +}; + +#define ST_DEFAULT_MAX_DENSITY 5 +#define ST_DEFAULT_INIT_TABLE_SIZE 11 + + /* + * DEFAULT_MAX_DENSITY is the default for the largest we allow the + * average number of items per bin before increasing the number of + * bins + * + * DEFAULT_INIT_TABLE_SIZE is the default for the number of bins + * allocated initially + * + */ + +static int numcmp(long, long); +static int numhash(long); +static struct st_hash_type type_numhash = { + numcmp, + numhash, + st_nothing_key_free, + st_nothing_key_clone +}; + +/* extern int strcmp(const char *, const char *); */ +static int strhash(const char *); 
+static struct st_hash_type type_strhash = { + strcmp, + strhash, + st_nothing_key_free, + st_nothing_key_clone +}; + +static int strend_cmp(st_strend_key*, st_strend_key*); +static int strend_hash(st_strend_key*); +static int strend_key_free(st_data_t key); +static st_data_t strend_key_clone(st_data_t x); + +static struct st_hash_type type_strend_hash = { + strend_cmp, + strend_hash, + strend_key_free, + strend_key_clone +}; + +static void rehash(st_table *); + +#define alloc(type) (type*)xmalloc((unsigned)sizeof(type)) +#define Calloc(n,s) (char*)xcalloc((n),(s)) + +#define EQUAL(table,x,y) ((x)==(y) || (*table->type->compare)((x),(y)) == 0) + +#define do_hash(key,table) (unsigned int)(*(table)->type->hash)((key)) +#define do_hash_bin(key,table) (do_hash(key, table)%(table)->num_bins) + +/* + * MINSIZE is the minimum size of a dictionary. + */ + +#define MINSIZE 8 + +/* +Table of prime numbers 2^n+a, 2<=n<=30. +*/ +static long primes[] = { + 8 + 3, + 16 + 3, + 32 + 5, + 64 + 3, + 128 + 3, + 256 + 27, + 512 + 9, + 1024 + 9, + 2048 + 5, + 4096 + 3, + 8192 + 27, + 16384 + 43, + 32768 + 3, + 65536 + 45, + 131072 + 29, + 262144 + 3, + 524288 + 21, + 1048576 + 7, + 2097152 + 17, + 4194304 + 15, + 8388608 + 9, + 16777216 + 43, + 33554432 + 35, + 67108864 + 15, + 134217728 + 29, + 268435456 + 3, + 536870912 + 11, + 1073741824 + 85, + 0 +}; + +static int +new_size(size) + int size; +{ + int i; + +#if 0 + for (i=3; i<31; i++) { + if ((1< size) return 1< size) return primes[i]; + } + /* Ran out of polynomials */ + return -1; /* should raise exception */ +#endif +} + +#ifdef HASH_LOG +static int collision = 0; +static int init_st = 0; + +static void +stat_col() +{ + FILE *f = fopen("/tmp/col", "w"); + fprintf(f, "collision: %d\n", collision); + fclose(f); +} +#endif + +st_table* +st_init_table_with_size(type, size) + struct st_hash_type *type; + int size; +{ + st_table *tbl; + +#ifdef HASH_LOG + if (init_st == 0) { + init_st = 1; + atexit(stat_col); + } +#endif + + size = 
new_size(size); /* round up to prime number */ + + tbl = alloc(st_table); + tbl->type = type; + tbl->num_entries = 0; + tbl->num_bins = size; + tbl->bins = (st_table_entry **)Calloc(size, sizeof(st_table_entry*)); + + return tbl; +} + +st_table* +st_init_table(type) + struct st_hash_type *type; +{ + return st_init_table_with_size(type, 0); +} + +st_table* +st_init_numtable(void) +{ + return st_init_table(&type_numhash); +} + +st_table* +st_init_numtable_with_size(size) + int size; +{ + return st_init_table_with_size(&type_numhash, size); +} + +st_table* +st_init_strtable(void) +{ + return st_init_table(&type_strhash); +} + +st_table* +st_init_strtable_with_size(size) + int size; +{ + return st_init_table_with_size(&type_strhash, size); +} + +st_table* +st_init_strend_table_with_size(size) + int size; +{ + return st_init_table_with_size(&type_strend_hash, size); +} + +void +st_free_table(table) + st_table *table; +{ + register st_table_entry *ptr, *next; + int i; + + for(i = 0; i < table->num_bins; i++) { + ptr = table->bins[i]; + while (ptr != 0) { + next = ptr->next; + table->type->key_free(ptr->key); + free(ptr); + ptr = next; + } + } + free(table->bins); + free(table); +} + +#define PTR_NOT_EQUAL(table, ptr, hash_val, key) \ +((ptr) != 0 && (ptr->hash != (hash_val) || !EQUAL((table), (key), (ptr)->key))) + +#ifdef HASH_LOG +#define COLLISION collision++ +#else +#define COLLISION +#endif + +#define FIND_ENTRY(table, ptr, hash_val, bin_pos) do {\ + bin_pos = hash_val%(table)->num_bins;\ + ptr = (table)->bins[bin_pos];\ + if (PTR_NOT_EQUAL(table, ptr, hash_val, key)) {\ + COLLISION;\ + while (PTR_NOT_EQUAL(table, ptr->next, hash_val, key)) {\ + ptr = ptr->next;\ + }\ + ptr = ptr->next;\ + }\ +} while (0) + +int +st_lookup(table, key, value) + st_table *table; + register st_data_t key; + st_data_t *value; +{ + unsigned int hash_val, bin_pos; + register st_table_entry *ptr; + + hash_val = do_hash(key, table); + FIND_ENTRY(table, ptr, hash_val, bin_pos); + + if (ptr 
== 0) { + return 0; + } + else { + if (value != 0) *value = ptr->record; + return 1; + } +} + +int +st_lookup_strend(table, str_key, end_key, value) + st_table *table; + const unsigned char* str_key; + const unsigned char* end_key; + st_data_t *value; +{ + st_strend_key key; + + key.s = (unsigned char* )str_key; + key.end = (unsigned char* )end_key; + + return st_lookup(table, (st_data_t )(&key), value); +} + +#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\ +do {\ + st_table_entry *entry;\ + if (table->num_entries/(table->num_bins) > ST_DEFAULT_MAX_DENSITY) {\ + rehash(table);\ + bin_pos = hash_val % table->num_bins;\ + }\ + \ + entry = alloc(st_table_entry);\ + \ + entry->hash = hash_val;\ + entry->key = key;\ + entry->record = value;\ + entry->next = table->bins[bin_pos];\ + table->bins[bin_pos] = entry;\ + table->num_entries++;\ +} while (0) + +int +st_insert(table, key, value) + register st_table *table; + register st_data_t key; + st_data_t value; +{ + unsigned int hash_val, bin_pos; + register st_table_entry *ptr; + + hash_val = do_hash(key, table); + FIND_ENTRY(table, ptr, hash_val, bin_pos); + + if (ptr == 0) { + ADD_DIRECT(table, key, value, hash_val, bin_pos); + return 0; + } + else { + ptr->record = value; + return 1; + } +} + +int +st_insert_strend(table, str_key, end_key, value) + st_table *table; + const unsigned char* str_key; + const unsigned char* end_key; + st_data_t value; +{ + st_strend_key* key; + + key = alloc(st_strend_key); + key->s = (unsigned char* )str_key; + key->end = (unsigned char* )end_key; + + return st_insert(table, (st_data_t )key, value); +} + +void +st_add_direct(table, key, value) + st_table *table; + st_data_t key; + st_data_t value; +{ + unsigned int hash_val, bin_pos; + + hash_val = do_hash(key, table); + bin_pos = hash_val % table->num_bins; + ADD_DIRECT(table, key, value, hash_val, bin_pos); +} + +void +st_add_direct_strend(table, str_key, end_key, value) + st_table *table; + const unsigned char* str_key; + const 
unsigned char* end_key; + st_data_t value; +{ + st_strend_key* key; + + key = alloc(st_strend_key); + key->s = (unsigned char* )str_key; + key->end = (unsigned char* )end_key; + st_add_direct(table, (st_data_t )key, value); +} + +static void +rehash(table) + register st_table *table; +{ + register st_table_entry *ptr, *next, **new_bins; + int i, old_num_bins = table->num_bins, new_num_bins; + unsigned int hash_val; + + new_num_bins = new_size(old_num_bins+1); + new_bins = (st_table_entry**)Calloc(new_num_bins, sizeof(st_table_entry*)); + + for(i = 0; i < old_num_bins; i++) { + ptr = table->bins[i]; + while (ptr != 0) { + next = ptr->next; + hash_val = ptr->hash % new_num_bins; + ptr->next = new_bins[hash_val]; + new_bins[hash_val] = ptr; + ptr = next; + } + } + free(table->bins); + table->num_bins = new_num_bins; + table->bins = new_bins; +} + +st_table* +st_copy(old_table) + st_table *old_table; +{ + st_table *new_table; + st_table_entry *ptr, *entry; + int i, num_bins = old_table->num_bins; + + new_table = alloc(st_table); + if (new_table == 0) { + return 0; + } + + *new_table = *old_table; + new_table->bins = (st_table_entry**) + Calloc((unsigned)num_bins, sizeof(st_table_entry*)); + + if (new_table->bins == 0) { + free(new_table); + return 0; + } + + for(i = 0; i < num_bins; i++) { + new_table->bins[i] = 0; + ptr = old_table->bins[i]; + while (ptr != 0) { + entry = alloc(st_table_entry); + if (entry == 0) { + free(new_table->bins); + free(new_table); + return 0; + } + *entry = *ptr; + entry->key = old_table->type->key_clone(ptr->key); + entry->next = new_table->bins[i]; + new_table->bins[i] = entry; + ptr = ptr->next; + } + } + return new_table; +} + +int +st_delete(table, key, value) + register st_table *table; + register st_data_t *key; + st_data_t *value; +{ + unsigned int hash_val; + st_table_entry *tmp; + register st_table_entry *ptr; + + hash_val = do_hash_bin(*key, table); + ptr = table->bins[hash_val]; + + if (ptr == 0) { + if (value != 0) *value = 0; + 
return 0; + } + + if (EQUAL(table, *key, ptr->key)) { + table->bins[hash_val] = ptr->next; + table->num_entries--; + if (value != 0) *value = ptr->record; + *key = ptr->key; + free(ptr); + return 1; + } + + for(; ptr->next != 0; ptr = ptr->next) { + if (EQUAL(table, ptr->next->key, *key)) { + tmp = ptr->next; + ptr->next = ptr->next->next; + table->num_entries--; + if (value != 0) *value = tmp->record; + *key = tmp->key; + free(tmp); + return 1; + } + } + + return 0; +} + +int +st_delete_safe(table, key, value, never) + register st_table *table; + register st_data_t *key; + st_data_t *value; + st_data_t never; +{ + unsigned int hash_val; + register st_table_entry *ptr; + + hash_val = do_hash_bin(*key, table); + ptr = table->bins[hash_val]; + + if (ptr == 0) { + if (value != 0) *value = 0; + return 0; + } + + for(; ptr != 0; ptr = ptr->next) { + if ((ptr->key != never) && EQUAL(table, ptr->key, *key)) { + table->num_entries--; + *key = ptr->key; + if (value != 0) *value = ptr->record; + ptr->key = ptr->record = never; + return 1; + } + } + + return 0; +} + +static int +delete_never(key, value, never) + st_data_t key, value, never; +{ + if (value == never) return ST_DELETE; + return ST_CONTINUE; +} + +void +st_cleanup_safe(table, never) + st_table *table; + st_data_t never; +{ + int num_entries = table->num_entries; + + st_foreach(table, delete_never, never); + table->num_entries = num_entries; +} + +void +st_foreach(table, func, arg) + st_table *table; + int (*func)(); + st_data_t arg; +{ + st_table_entry *ptr, *last, *tmp; + enum st_retval retval; + int i; + + for(i = 0; i < table->num_bins; i++) { + last = 0; + for(ptr = table->bins[i]; ptr != 0;) { + retval = (*func)(ptr->key, ptr->record, arg, 0); + switch (retval) { + case ST_CHECK: /* check if hash is modified during iteration */ + tmp = 0; + if (i < table->num_bins) { + for (tmp = table->bins[i]; tmp; tmp=tmp->next) { + if (tmp == ptr) break; + } + } + if (!tmp) { + /* call func with error notice */ + retval 
= (*func)(0, 0, arg, 1); + return; + } + /* fall through */ + case ST_CONTINUE: + last = ptr; + ptr = ptr->next; + break; + case ST_STOP: + return; + case ST_DELETE: + tmp = ptr; + if (last == 0) { + table->bins[i] = ptr->next; + } + else { + last->next = ptr->next; + } + ptr = ptr->next; + table->type->key_free(tmp->key); + free(tmp); + table->num_entries--; + } + } + } +} + +static int +strhash(string) + register const char *string; +{ + register int c; + +#ifdef HASH_ELFHASH + register unsigned int h = 0, g; + + while ((c = *string++) != '\0') { + h = ( h << 4 ) + c; + if ( g = h & 0xF0000000 ) + h ^= g >> 24; + h &= ~g; + } + return h; +#elif HASH_PERL + register int val = 0; + + while ((c = *string++) != '\0') { + val += c; + val += (val << 10); + val ^= (val >> 6); + } + val += (val << 3); + val ^= (val >> 11); + + return val + (val << 15); +#else + register int val = 0; + + while ((c = *string++) != '\0') { + val = val*997 + c; + } + + return val + (val>>5); +#endif +} + +static int +numcmp(x, y) + long x, y; +{ + return x != y; +} + +static int +numhash(n) + long n; +{ + return n; +} + +extern int +st_nothing_key_free(st_data_t key) { return 0; } + +extern st_data_t +st_nothing_key_clone(st_data_t x) { return x; } + +static int strend_cmp(st_strend_key* x, st_strend_key* y) +{ + unsigned char *p, *q; + int c; + + if ((x->end - x->s) != (y->end - y->s)) + return 1; + + p = x->s; + q = y->s; + while (p < x->end) { + c = (int )*p - (int )*q; + if (c != 0) return c; + + p++; q++; + } + + return 0; +} + +static int strend_hash(st_strend_key* x) +{ + int val; + unsigned char *p; + + val = 0; + p = x->s; + while (p < x->end) { + val = val * 997 + (int )*p++; + } + + return val + (val >> 5); +} + +static int strend_key_free(st_data_t x) +{ + xfree((void* )x); + return 0; +} + +static st_data_t strend_key_clone(st_data_t x) +{ + st_strend_key* new_key; + st_strend_key* key = (st_strend_key* )x; + + new_key = alloc(st_strend_key); + *new_key = *key; + return 
(st_data_t )new_key; +} diff --git a/ext/mbstring/oniguruma/st.h b/ext/mbstring/oniguruma/st.h new file mode 100644 index 00000000000..c5cc4e625e0 --- /dev/null +++ b/ext/mbstring/oniguruma/st.h @@ -0,0 +1,77 @@ +/* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */ + +/* @(#) st.h 5.1 89/12/14 */ + +#ifndef ST_INCLUDED + +#define ST_INCLUDED + +typedef unsigned long st_data_t; +#define ST_DATA_T_DEFINED + +typedef struct st_table st_table; + +struct st_hash_type { + int (*compare)(); + int (*hash)(); + int (*key_free)(); + st_data_t (*key_clone)(); +}; + +struct st_table { + struct st_hash_type *type; + int num_bins; + int num_entries; + struct st_table_entry **bins; +}; + +typedef struct { + unsigned char* s; + unsigned char* end; +} st_strend_key; + +#define st_is_member(table,key) st_lookup(table,key,(st_data_t *)0) + +enum st_retval {ST_CONTINUE, ST_STOP, ST_DELETE, ST_CHECK}; + +#ifndef _ +# define _(args) args +#endif +#ifndef ANYARGS +# ifdef __cplusplus +# define ANYARGS ... 
+# else +# define ANYARGS +# endif +#endif + +st_table *st_init_table _((struct st_hash_type *)); +st_table *st_init_table_with_size _((struct st_hash_type *, int)); +st_table *st_init_numtable _((void)); +st_table *st_init_numtable_with_size _((int)); +st_table *st_init_strtable _((void)); +st_table *st_init_strtable_with_size _((int)); +st_table *st_init_strend_table_with_size _((int)); +int st_delete _((st_table *, st_data_t *, st_data_t *)); +int st_delete_safe _((st_table *, st_data_t *, st_data_t *, st_data_t)); +int st_insert _((st_table *, st_data_t, st_data_t)); +int st_insert_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t)); +int st_lookup _((st_table *, st_data_t, st_data_t *)); +int st_lookup_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t*)); +void st_foreach _((st_table *, int (*)(ANYARGS), st_data_t)); +void st_add_direct _((st_table *, st_data_t, st_data_t)); +void st_add_direct_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t)); +void st_free_table _((st_table *)); +void st_cleanup_safe _((st_table *, st_data_t)); +st_table *st_copy _((st_table *)); + +extern st_data_t st_nothing_key_clone _((st_data_t key)); +extern int st_nothing_key_free _((st_data_t key)); + +#define ST_NUMCMP ((int (*)()) 0) +#define ST_NUMHASH ((int (*)()) -2) + +#define st_numcmp ST_NUMCMP +#define st_numhash ST_NUMHASH + +#endif /* ST_INCLUDED */ diff --git a/ext/mbstring/oniguruma/testc.c b/ext/mbstring/oniguruma/testc.c deleted file mode 100644 index e4d197e21db..00000000000 --- a/ext/mbstring/oniguruma/testc.c +++ /dev/null @@ -1,833 +0,0 @@ -/* - * This program was generated by testconv.rb. 
- */ -#include - -#ifdef POSIX_TEST -#include "onigposix.h" -#else -#include "oniguruma.h" -#endif - -static int nsucc = 0; -static int nfail = 0; -static int nerror = 0; - -static FILE* err_file; - -#ifndef POSIX_TEST -static OnigRegion* region; -#endif - -static void xx(char* pattern, char* str, int from, int to, int mem, int not) -{ - int r; - -#ifdef POSIX_TEST - regex_t reg; - char buf[200]; - regmatch_t pmatch[20]; - - r = regcomp(®, pattern, REG_EXTENDED | REG_NEWLINE); - if (r) { - regerror(r, ®, buf, sizeof(buf)); - fprintf(err_file, "ERROR: %s\n", buf); - nerror++; - return ; - } - - r = regexec(®, str, reg.re_nsub + 1, pmatch, 0); - if (r != 0 && r != REG_NOMATCH) { - regerror(r, ®, buf, sizeof(buf)); - fprintf(err_file, "ERROR: %s\n", buf); - nerror++; - return ; - } - - if (r == REG_NOMATCH) { - if (not) { - fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); - nsucc++; - } - else { - fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); - nfail++; - } - } - else { - if (not) { - fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); - nfail++; - } - else { - if (pmatch[mem].rm_so == from && pmatch[mem].rm_eo == to) { - fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); - nsucc++; - } - else { - fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, - from, to, pmatch[mem].rm_so, pmatch[mem].rm_eo); - nfail++; - } - } - } - regfree(®); - -#else - regex_t* reg; - OnigErrorInfo einfo; - - r = onig_new(®, (UChar* )pattern, (UChar* )(pattern + strlen(pattern)), - ONIG_OPTION_DEFAULT, ONIG_ENCODING_EUC_JP, ONIG_SYNTAX_DEFAULT, &einfo); - if (r) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(s, r, &einfo); - fprintf(err_file, "ERROR: %s\n", s); - nerror++; - return ; - } - - r = onig_search(reg, (UChar* )str, (UChar* )(str + strlen(str)), - (UChar* )str, (UChar* )(str + strlen(str)), - region, ONIG_OPTION_NONE); - if (r < ONIG_MISMATCH) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(s, r); - fprintf(err_file, 
"ERROR: %s\n", s); - nerror++; - return ; - } - - if (r == ONIG_MISMATCH) { - if (not) { - fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); - nsucc++; - } - else { - fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); - nfail++; - } - } - else { - if (not) { - fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); - nfail++; - } - else { - if (region->beg[mem] == from && region->end[mem] == to) { - fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); - nsucc++; - } - else { - fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, - from, to, region->beg[mem], region->end[mem]); - nfail++; - } - } - } - onig_free(reg); -#endif -} - -static void x2(char* pattern, char* str, int from, int to) -{ - xx(pattern, str, from, to, 0, 0); -} - -static void x3(char* pattern, char* str, int from, int to, int mem) -{ - xx(pattern, str, from, to, mem, 0); -} - -static void n(char* pattern, char* str) -{ - xx(pattern, str, 0, 0, 0, 1); -} - -extern int main(int argc, char* argv[]) -{ - err_file = stdout; - -#ifdef POSIX_TEST - reg_set_encoding(REG_POSIX_ENCODING_EUC_JP); -#else - region = onig_region_new(); -#endif - - x2("", "", 0, 0); - x2("^", "", 0, 0); - x2("$", "", 0, 0); - x2("\\G", "", 0, 0); - x2("\\A", "", 0, 0); - x2("\\Z", "", 0, 0); - x2("\\z", "", 0, 0); - x2("^$", "", 0, 0); - x2("\\ca", "\001", 0, 1); - x2("\\C-b", "\002", 0, 1); - x2("\\M-Z", "\xDA", 0, 1); - x2("", "a", 0, 0); - x2("a", "a", 0, 1); - x2("aa", "aa", 0, 2); - x2("aaa", "aaa", 0, 3); - x2("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, 35); - x2("ab", "ab", 0, 2); - x2("b", "ab", 1, 2); - x2("bc", "abc", 1, 3); - x2("\\17", "\017", 0, 1); - x2("\\x1f", "\x1f", 0, 1); - x2("\\xFE", "\xfe", 0, 1); - x2("a(?#....\\\\JJJJ)b", "ab", 0, 2); - x2("(?x) G (o O(?-x)oO) g L", "GoOoOgLe", 0, 7); - x2(".", "a", 0, 1); - n(".", ""); - x2("..", "ab", 0, 2); - x2("\\w", "e", 0, 1); - n("\\W", "e"); - x2("\\s", " ", 0, 1); - x2("\\S", "b", 0, 1); - x2("\\d", "4", 0, 1); - 
n("\\D", "4"); - x2("\\b", "z ", 0, 0); - x2("\\b", " z", 1, 1); - x2("\\B", "zz ", 1, 1); - x2("\\B", "z ", 2, 2); - x2("\\B", " z", 0, 0); - x2("[ab]", "b", 0, 1); - n("[ab]", "c"); - x2("[a-z]", "t", 0, 1); - n("[^a]", "a"); - x2("[^a]", "\n", 0, 1); - x2("[]]", "]", 0, 1); - n("[^]]", "]"); - x2("[\\^]+", "0^^1", 1, 3); - x2("[b-]", "b", 0, 1); - x2("[b-]", "-", 0, 1); - x2("[\\w]", "z", 0, 1); - n("[\\w]", " "); - x2("[\\W]", "b$", 1, 2); - x2("[\\d]", "5", 0, 1); - n("[\\d]", "e"); - x2("[\\D]", "t", 0, 1); - n("[\\D]", "3"); - x2("[\\s]", " ", 0, 1); - n("[\\s]", "a"); - x2("[\\S]", "b", 0, 1); - n("[\\S]", " "); - x2("[\\w\\d]", "2", 0, 1); - n("[\\w\\d]", " "); - x2("[[:upper:]]", "B", 0, 1); - x2("[*[:xdigit:]+]", "+", 0, 1); - x2("[*[:xdigit:]+]", "GHIKK-9+*", 6, 7); - x2("[*[:xdigit:]+]", "-@^+", 3, 4); - n("[[:upper]]", "A"); - x2("[[:upper]]", ":", 0, 1); - x2("[\\044-\\047]", "\046", 0, 1); - x2("[\\x5a-\\x5c]", "\x5b", 0, 1); - x2("[\\x6A-\\x6D]", "\x6c", 0, 1); - n("[\\x6A-\\x6D]", "\x6E"); - n("^[0-9A-F]+ 0+ UNDEF ", "75F 00000000 SECT14A notype () External | _rb_apply"); - x2("[\\[]", "[", 0, 1); - x2("[\\]]", "]", 0, 1); - x2("[&]", "&", 0, 1); - x2("[[ab]]", "b", 0, 1); - x2("[[ab]c]", "c", 0, 1); - n("[[^a]]", "a"); - n("[^[a]]", "a"); - x2("[[ab]&&bc]", "b", 0, 1); - n("[[ab]&&bc]", "a"); - n("[[ab]&&bc]", "c"); - x2("[a-z&&b-y&&c-x]", "w", 0, 1); - n("[^a-z&&b-y&&c-x]", "w"); - x2("[[^a&&a]&&a-z]", "b", 0, 1); - n("[[^a&&a]&&a-z]", "a"); - x2("[[^a-z&&bcdef]&&[^c-g]]", "h", 0, 1); - n("[[^a-z&&bcdef]&&[^c-g]]", "c"); - x2("[^[^abc]&&[^cde]]", "c", 0, 1); - x2("[^[^abc]&&[^cde]]", "e", 0, 1); - n("[^[^abc]&&[^cde]]", "f"); - x2("[a-&&-a]", "-", 0, 1); - n("[a-&&-a]", "&"); - n("\\wabc", " abc"); - x2("a\\Wbc", "a bc", 0, 4); - x2("a.b.c", "aabbc", 0, 5); - x2(".\\wb\\W..c", "abb bcc", 0, 7); - x2("\\s\\wzzz", " zzzz", 0, 5); - x2("aa.b", "aabb", 0, 4); - n(".a", "ab"); - x2(".a", "aa", 0, 2); - x2("^a", "a", 0, 1); - x2("^a$", "a", 0, 1); - 
x2("^\\w$", "a", 0, 1); - n("^\\w$", " "); - x2("^\\wab$", "zab", 0, 3); - x2("^\\wabcdef$", "zabcdef", 0, 7); - x2("^\\w...def$", "zabcdef", 0, 7); - x2("\\w\\w\\s\\Waaa\\d", "aa aaa4", 0, 8); - x2("\\A\\Z", "", 0, 0); - x2("\\Axyz", "xyz", 0, 3); - x2("xyz\\Z", "xyz", 0, 3); - x2("xyz\\z", "xyz", 0, 3); - x2("\\Gaz", "az", 0, 2); - n("\\Gz", "bza"); - n("az\\G", "az"); - n("az\\A", "az"); - n("a\\Az", "az"); - x2("\\^\\$", "^$", 0, 2); - x2("^x?y", "xy", 0, 2); - x2("^(x?y)", "xy", 0, 2); - x2("\\w", "_", 0, 1); - n("\\W", "_"); - x2("(?=z)z", "z", 0, 1); - n("(?=z).", "a"); - x2("(?!z)a", "a", 0, 1); - n("(?!z)a", "z"); - x2("(?i:a)", "a", 0, 1); - x2("(?i:a)", "A", 0, 1); - x2("(?i:A)", "a", 0, 1); - n("(?i:A)", "b"); - x2("(?i:[A-Z])", "a", 0, 1); - x2("(?i:[f-m])", "H", 0, 1); - x2("(?i:[f-m])", "h", 0, 1); - n("(?i:[f-m])", "e"); - x2("(?i:[A-c])", "D", 0, 1); - x2("(?i:[!-k])", "Z", 0, 1); - x2("(?i:[!-k])", "7", 0, 1); - x2("(?i:[T-}])", "b", 0, 1); - x2("(?i:[T-}])", "{", 0, 1); - x2("(?i:\\?a)", "?A", 0, 2); - x2("(?i:\\*A)", "*a", 0, 2); - n(".", "\n"); - x2("(?m:.)", "\n", 0, 1); - x2("(?m:a.)", "a\n", 0, 2); - x2("(?m:.b)", "a\nb", 1, 3); - n("(?i)(?-i)a", "A"); - n("(?i)(?-i:a)", "A"); - x2("a?", "", 0, 0); - x2("a?", "b", 0, 0); - x2("a?", "a", 0, 1); - x2("a*", "", 0, 0); - x2("a*", "a", 0, 1); - x2("a*", "aaa", 0, 3); - x2("a*", "baaaa", 0, 0); - n("a+", ""); - x2("a+", "a", 0, 1); - x2("a+", "aaaa", 0, 4); - x2("a+", "aabbb", 0, 2); - x2("a+", "baaaa", 1, 5); - x2(".?", "", 0, 0); - x2(".?", "f", 0, 1); - x2(".?", "\n", 0, 0); - x2(".*", "", 0, 0); - x2(".*", "abcde", 0, 5); - x2(".+", "z", 0, 1); - x2(".+", "zdswer\n", 0, 6); - x2("a|b", "a", 0, 1); - x2("a|b", "b", 0, 1); - x2("|a", "a", 0, 0); - x2("(|a)", "a", 0, 0); - x2("ab|bc", "ab", 0, 2); - x2("ab|bc", "bc", 0, 2); - x2("z(?:ab|bc)", "zbc", 0, 3); - x2("a(?:ab|bc)c", "aabc", 0, 4); - x2("ab|(?:ac|az)", "az", 0, 2); - x2("a|b|c", "dc", 1, 2); - x2("a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz", 
"pqr", 0, 2); - n("a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz", "mn"); - x2("a|^z", "ba", 1, 2); - x2("a|^z", "za", 0, 1); - x2("a|\\Gz", "bza", 2, 3); - x2("a|\\Gz", "za", 0, 1); - x2("a|\\Az", "bza", 2, 3); - x2("a|\\Az", "za", 0, 1); - x2("a|b\\Z", "ba", 1, 2); - x2("a|b\\Z", "b", 0, 1); - x2("a|b\\z", "ba", 1, 2); - x2("a|b\\z", "b", 0, 1); - x2("\\w|\\s", " ", 0, 1); - n("\\w|\\w", " "); - x2("\\w|%", "%", 0, 1); - x2("\\w|[&$]", "&", 0, 1); - x2("[b-d]|[^e-z]", "a", 0, 1); - x2("(?:a|[c-f])|bz", "dz", 0, 1); - x2("(?:a|[c-f])|bz", "bz", 0, 2); - x2("abc|(?=zz)..f", "zzf", 0, 3); - x2("abc|(?!zz)..f", "abf", 0, 3); - x2("(?=za)..a|(?=zz)..a", "zza", 0, 3); - n("(?>a|abd)c", "abdc"); - x2("(?>abd|a)c", "abdc", 0, 4); - x2("a?|b", "a", 0, 1); - x2("a?|b", "b", 0, 0); - x2("a?|b", "", 0, 0); - x2("a*|b", "aa", 0, 2); - x2("a*|b*", "ba", 0, 0); - x2("a*|b*", "ab", 0, 1); - x2("a+|b*", "", 0, 0); - x2("a+|b*", "bbb", 0, 3); - x2("a+|b*", "abbb", 0, 1); - n("a+|b+", ""); - x2("(a|b)?", "b", 0, 1); - x2("(a|b)*", "ba", 0, 2); - x2("(a|b)+", "bab", 0, 3); - x2("(ab|ca)+", "caabbc", 0, 4); - x2("(ab|ca)+", "aabca", 1, 5); - x2("(ab|ca)+", "abzca", 0, 2); - x2("(a|bab)+", "ababa", 0, 5); - x2("(a|bab)+", "ba", 1, 2); - x2("(a|bab)+", "baaaba", 1, 4); - x2("(?:a|b)(?:a|b)", "ab", 0, 2); - x2("(?:a*|b*)(?:a*|b*)", "aaabbb", 0, 3); - x2("(?:a*|b*)(?:a+|b+)", "aaabbb", 0, 6); - x2("(?:a+|b+){2}", "aaabbb", 0, 6); - x2("h{0,}", "hhhh", 0, 4); - x2("(?:a+|b+){1,2}", "aaabbb", 0, 6); - n("ax{2}*a", "0axxxa1"); - n("a.{0,2}a", "0aXXXa0"); - n("a.{0,2}?a", "0aXXXa0"); - n("a.{0,2}?a", "0aXXXXa0"); - x2("(?:a+|\\Ab*)cc", "cc", 0, 2); - n("(?:a+|\\Ab*)cc", "abcc"); - x2("(?:^a+|b+)*c", "aabbbabc", 6, 8); - x2("(?:^a+|b+)*c", "aabbbbc", 0, 7); - x2("a|(?i)c", "C", 0, 1); - x2("(?i)c|a", "C", 0, 1); - x2("(?i)c|a", "A", 0, 1); - x2("(?i:c)|a", "C", 0, 1); - n("(?i:c)|a", "A"); - x2("[abc]?", "abc", 0, 1); - x2("[abc]*", "abc", 0, 3); - x2("[^abc]*", "abc", 0, 0); - n("[^abc]+", "abc"); - 
x2("a??", "aaa", 0, 0); - x2("ba??b", "bab", 0, 3); - x2("a*?", "aaa", 0, 0); - x2("ba*?", "baa", 0, 1); - x2("ba*?b", "baab", 0, 4); - x2("a+?", "aaa", 0, 1); - x2("ba+?", "baa", 0, 2); - x2("ba+?b", "baab", 0, 4); - x2("(?:a?)??", "a", 0, 0); - x2("(?:a??)?", "a", 0, 0); - x2("(?:a?)+?", "aaa", 0, 1); - x2("(?:a+)??", "aaa", 0, 0); - x2("(?:a+)??b", "aaab", 0, 4); - x2("(?:ab)?{2}", "", 0, 0); - x2("(?:ab)?{2}", "ababa", 0, 4); - x2("(?:ab)*{0}", "ababa", 0, 0); - x2("(?:ab){3,}", "abababab", 0, 8); - n("(?:ab){3,}", "abab"); - x2("(?:ab){2,4}", "ababab", 0, 6); - x2("(?:ab){2,4}", "ababababab", 0, 8); - x2("(?:ab){2,4}?", "ababababab", 0, 4); - x2("(?:ab){,}", "ab{,}", 0, 5); - x2("(?:abc)+?{2}", "abcabcabc", 0, 6); - x2("(?:X*)(?i:xa)", "XXXa", 0, 4); - x2("(d+)([^abc]z)", "dddz", 0, 4); - x2("([^abc]*)([^abc]z)", "dddz", 0, 4); - x2("(\\w+)(\\wz)", "dddz", 0, 4); - x3("(a)", "a", 0, 1, 1); - x3("(ab)", "ab", 0, 2, 1); - x2("((ab))", "ab", 0, 2); - x3("((ab))", "ab", 0, 2, 1); - x3("((ab))", "ab", 0, 2, 2); - x3("((((((((((((((((((((ab))))))))))))))))))))", "ab", 0, 2, 20); - x3("(ab)(cd)", "abcd", 0, 2, 1); - x3("(ab)(cd)", "abcd", 2, 4, 2); - x3("()(a)bc(def)ghijk", "abcdefghijk", 3, 6, 3); - x3("(()(a)bc(def)ghijk)", "abcdefghijk", 3, 6, 4); - x2("(^a)", "a", 0, 1); - x3("(a)|(a)", "ba", 1, 2, 1); - x3("(^a)|(a)", "ba", 1, 2, 2); - x3("(a?)", "aaa", 0, 1, 1); - x3("(a*)", "aaa", 0, 3, 1); - x3("(a*)", "", 0, 0, 1); - x3("(a+)", "aaaaaaa", 0, 7, 1); - x3("(a+|b*)", "bbbaa", 0, 3, 1); - x3("(a+|b?)", "bbbaa", 0, 1, 1); - x3("(abc)?", "abc", 0, 3, 1); - x3("(abc)*", "abc", 0, 3, 1); - x3("(abc)+", "abc", 0, 3, 1); - x3("(xyz|abc)+", "abc", 0, 3, 1); - x3("([xyz][abc]|abc)+", "abc", 0, 3, 1); - x3("((?i:abc))", "AbC", 0, 3, 1); - x2("(abc)(?i:\\1)", "abcABC", 0, 6); - x3("((?m:a.c))", "a\nc", 0, 3, 1); - x3("((?=az)a)", "azb", 0, 1, 1); - x3("abc|(.abd)", "zabd", 0, 4, 1); - x2("(?:abc)|(ABC)", "abc", 0, 3); - x3("(?i:(abc))|(zzz)", "ABC", 0, 3, 1); - 
x3("a*(.)", "aaaaz", 4, 5, 1); - x3("a*?(.)", "aaaaz", 0, 1, 1); - x3("a*?(c)", "aaaac", 4, 5, 1); - x3("[bcd]a*(.)", "caaaaz", 5, 6, 1); - x3("(\\Abb)cc", "bbcc", 0, 2, 1); - n("(\\Abb)cc", "zbbcc"); - x3("(^bb)cc", "bbcc", 0, 2, 1); - n("(^bb)cc", "zbbcc"); - x3("cc(bb$)", "ccbb", 2, 4, 1); - n("cc(bb$)", "ccbbb"); - n("(\\1)", ""); - n("\\1(a)", "aa"); - n("(a(b)\\1)\\2+", "ababb"); - n("(?:(?:\\1|z)(a))+$", "zaa"); - x2("(?:(?:\\1|z)(a))+$", "zaaa", 0, 4); - x2("(a)(?=\\1)", "aa", 0, 1); - n("(a)$|\\1", "az"); - x2("(a)\\1", "aa", 0, 2); - n("(a)\\1", "ab"); - x2("(a?)\\1", "aa", 0, 2); - x2("(a??)\\1", "aa", 0, 0); - x2("(a*)\\1", "aaaaa", 0, 4); - x3("(a*)\\1", "aaaaa", 0, 2, 1); - x2("a(b*)\\1", "abbbb", 0, 5); - x2("a(b*)\\1", "ab", 0, 1); - x2("(a*)(b*)\\1\\2", "aaabbaaabb", 0, 10); - x2("(a*)(b*)\\2", "aaabbbb", 0, 7); - x2("(((((((a*)b))))))c\\7", "aaabcaaa", 0, 8); - x3("(((((((a*)b))))))c\\7", "aaabcaaa", 0, 3, 7); - x2("(a)(b)(c)\\2\\1\\3", "abcbac", 0, 6); - x2("([a-d])\\1", "cc", 0, 2); - x2("(\\w\\d\\s)\\1", "f5 f5 ", 0, 6); - n("(\\w\\d\\s)\\1", "f5 f5"); - x2("(who|[a-c]{3})\\1", "whowho", 0, 6); - x2("...(who|[a-c]{3})\\1", "abcwhowho", 0, 9); - x2("(who|[a-c]{3})\\1", "cbccbc", 0, 6); - x2("(^a)\\1", "aa", 0, 2); - n("(^a)\\1", "baa"); - n("(a$)\\1", "aa"); - n("(ab\\Z)\\1", "ab"); - x2("(a*\\Z)\\1", "a", 1, 1); - x2(".(a*\\Z)\\1", "ba", 1, 2); - x3("(.(abc)\\2)", "zabcabc", 0, 7, 1); - x3("(.(..\\d.)\\2)", "z12341234", 0, 9, 1); - x2("((?i:az))\\1", "AzAz", 0, 4); - n("((?i:az))\\1", "Azaz"); - x2("(?<=a)b", "ab", 1, 2); - n("(?<=a)b", "bb"); - x2("(?<=a|b)b", "bb", 1, 2); - x2("(?<=a|bc)b", "bcb", 2, 3); - x2("(?<=a|bc)b", "ab", 1, 2); - x2("(?<=a|bc||defghij|klmnopq|r)z", "rz", 1, 2); - x2("(a)\\g<1>", "aa", 0, 2); - x2("(?a)", "a", 0, 1); - x2("(?ab)\\g", "abab", 0, 4); - x2("(?.zv.)\\k", "azvbazvb", 0, 8); - x2("(?<=\\g)|-\\zEND (?XyZ)", "XyZ", 3, 3); - x2("(?|a\\g)+", "", 0, 0); - x2("(?|\\(\\g\\))+$", "()(())", 0, 6); - x3("\\g(?.){0}", 
"X", 0, 1, 1); - x2("\\g(abc|df(?.YZ){2,8}){0}", "XYZ", 0, 3); - x2("\\A(?(a\\g)|)\\z", "aaaa", 0, 4); - x2("(?|\\g\\g)\\z|\\zEND (?a|(b)\\g)", "bbbbabba", 0, 8); - x2("(?\\w+\\sx)a+\\k", " fg xaaaaaaaafg x", 2, 18); - x3("(z)()()(?<_9>a)\\g<_9>", "zaa", 2, 3, 1); - x2("(.)(((?<_>a)))\\k<_>", "zaa", 0, 3); - x2("((?\\d)|(?\\w))(\\k|\\k)", "ff", 0, 2); - x2("(?:(?)|(?efg))\\k", "", 0, 0); - x2("(?:(?abc)|(?efg))\\k", "abcefgefg", 3, 9); - n("(?:(?abc)|(?efg))\\k", "abcefg"); - x2("(?:(?.)|(?..)|(?...)|(?....)|(?.....)|(?......)|(?.......)|(?........)|(?.........)|(?..........)|(?...........)|(?............)|(?.............)|(?..............))\\k$", "a-pyumpyum", 2, 10); - x3("(?:(?.)|(?..)|(?...)|(?....)|(?.....)|(?......)|(?.......)|(?........)|(?.........)|(?..........)|(?...........)|(?............)|(?.............)|(?..............))\\k$", "xxxxabcdefghijklmnabcdefghijklmn", 4, 18, 14); - x3("(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?)(?aaa)(?)$", "aaa", 0, 3, 16); - x2("(?a|\\(\\g\\))", "a", 0, 1); - x2("(?a|\\(\\g\\))", "((((((a))))))", 0, 13); - x3("(?a|\\(\\g\\))", "((((((((a))))))))", 0, 17, 1); - x2("\\g|\\zEND(?.*abc$)", "abcxxxabc", 0, 9); - x2("\\g<1>|\\zEND(.a.)", "bac", 0, 3); - x3("\\g<_A>\\g<_A>|\\zEND(.a.)(?<_A>.b.)", "xbxyby", 3, 6, 1); - x2("\\A(?:\\g|\\g|\\zEND (?a|c\\gc)(?b|d\\gd))$", "cdcbcdc", 0, 7); - x2("\\A(?|a\\g)\\z|\\zEND (?\\g)", "aaaa", 0, 4); - x2("(?(a|b\\gc){3,5})", "baaaaca", 1, 5); - x2("(?(a|b\\gc){3,5})", "baaaacaaaaa", 0, 10); - x2("()*\\1", "", 0, 0); - x2("(?:()|())*\\1\\2", "", 0, 0); - x3("(?:\\1a|())*", "a", 0, 0, 1); - x2("x((.)*)*x", "0x1x2x3", 1, 6); - x2("x((.)*)*x(?i:\\1)\\Z", "0x1x2x1X2", 1, 9); - x2("(?:()|()|()|()|()|())*\\2\\5", "", 0, 0); - x2("(?:()|()|()|(x)|()|())*\\2b\\5", "b", 0, 1); - x2("", "¤¢", 0, 0); - x2("¤¢", "¤¢", 0, 2); - n("¤¤", "¤¢"); - x2("¤¦¤¦", "¤¦¤¦", 0, 4); - x2("¤¢¤¤¤¦", "¤¢¤¤¤¦", 0, 6); - x2("¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³", 
"¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³¤³", 0, 70); - x2("¤¢", "¤¤¤¢", 2, 4); - x2("¤¤¤¦", "¤¢¤¤¤¦", 2, 6); - x2("\\xca\\xb8", "\xca\xb8", 0, 2); - x2(".", "¤¢", 0, 2); - x2("..", "¤«¤­", 0, 4); - x2("\\w", "¤ª", 0, 2); - n("\\W", "¤¢"); - x2("[\\W]", "¤¦$", 2, 3); - x2("\\S", "¤½", 0, 2); - x2("\\S", "´Á", 0, 2); - x2("\\b", "µ¤ ", 0, 0); - x2("\\b", " ¤Û", 1, 1); - x2("\\B", "¤»¤½ ", 2, 2); - x2("\\B", "¤¦ ", 3, 3); - x2("\\B", " ¤¤", 0, 0); - x2("[¤¿¤Á]", "¤Á", 0, 2); - n("[¤Ê¤Ë]", "¤Ì"); - x2("[¤¦-¤ª]", "¤¨", 0, 2); - n("[^¤±]", "¤±"); - x2("[\\w]", "¤Í", 0, 2); - n("[\\d]", "¤Õ"); - x2("[\\D]", "¤Ï", 0, 2); - n("[\\s]", "¤¯"); - x2("[\\S]", "¤Ø", 0, 2); - x2("[\\w\\d]", "¤è", 0, 2); - x2("[\\w\\d]", " ¤è", 3, 5); - n("\\wµ´¼Ö", " µ´¼Ö"); - x2("µ´\\W¼Ö", "µ´ ¼Ö", 0, 5); - x2("¤¢.¤¤.¤¦", "¤¢¤¢¤¤¤¤¤¦", 0, 10); - x2(".\\w¤¦\\W..¤¾", "¤¨¤¦¤¦ ¤¦¤¾¤¾", 0, 13); - x2("\\s\\w¤³¤³¤³", " ¤³¤³¤³¤³", 0, 9); - x2("¤¢¤¢.¤±", "¤¢¤¢¤±¤±", 0, 8); - n(".¤¤", "¤¤¤¨"); - x2(".¤ª", "¤ª¤ª", 0, 4); - x2("^¤¢", "¤¢", 0, 2); - x2("^¤à$", "¤à", 0, 2); - x2("^\\w$", "¤Ë", 0, 2); - x2("^\\w¤«¤­¤¯¤±¤³$", "z¤«¤­¤¯¤±¤³", 0, 11); - x2("^\\w...¤¦¤¨¤ª$", "z¤¢¤¤¤¦¤¦¤¨¤ª", 0, 13); - x2("\\w\\w\\s\\W¤ª¤ª¤ª\\d", "a¤ª ¤ª¤ª¤ª4", 0, 12); - x2("\\A¤¿¤Á¤Ä", "¤¿¤Á¤Ä", 0, 6); - x2("¤à¤á¤â\\Z", "¤à¤á¤â", 0, 6); - x2("¤«¤­¤¯\\z", "¤«¤­¤¯", 0, 6); - x2("¤«¤­¤¯\\Z", "¤«¤­¤¯\n", 0, 6); - x2("\\G¤Ý¤Ô", "¤Ý¤Ô", 0, 4); - n("\\G¤¨", "¤¦¤¨¤ª"); - n("¤È¤Æ\\G", "¤È¤Æ"); - n("¤Þ¤ß\\A", "¤Þ¤ß"); - n("¤Þ\\A¤ß", "¤Þ¤ß"); - x2("(?=¤»)¤»", "¤»", 0, 2); - n("(?=¤¦).", "¤¤"); - x2("(?!¤¦)¤«", "¤«", 0, 2); - n("(?!¤È)¤¢", "¤È"); - x2("(?i:¤¢)", "¤¢", 0, 2); - x2("(?i:¤Ö¤Ù)", "¤Ö¤Ù", 0, 4); - n("(?i:¤¤)", "¤¦"); - x2("(?m:¤è.)", "¤è\n", 0, 3); - x2("(?m:.¤á)", "¤Þ\n¤á", 2, 5); - x2("¤¢?", "", 0, 0); - x2("ÊÑ?", "²½", 0, 0); - x2("ÊÑ?", "ÊÑ", 0, 2); - x2("ÎÌ*", "", 0, 0); - x2("ÎÌ*", "ÎÌ", 0, 2); - x2("»Ò*", "»Ò»Ò»Ò", 0, 6); - x2("ÇÏ*", "¼¯ÇÏÇÏÇÏÇÏ", 0, 0); - n("»³+", ""); - x2("²Ï+", "²Ï", 0, 2); - 
x2("»þ+", "»þ»þ»þ»þ", 0, 8); - x2("¤¨+", "¤¨¤¨¤¦¤¦¤¦", 0, 4); - x2("¤¦+", "¤ª¤¦¤¦¤¦¤¦", 2, 10); - x2(".?", "¤¿", 0, 2); - x2(".*", "¤Ñ¤Ô¤×¤Ú", 0, 8); - x2(".+", "¤í", 0, 2); - x2(".+", "¤¤¤¦¤¨¤«\n", 0, 8); - x2("¤¢|¤¤", "¤¢", 0, 2); - x2("¤¢|¤¤", "¤¤", 0, 2); - x2("¤¢¤¤|¤¤¤¦", "¤¢¤¤", 0, 4); - x2("¤¢¤¤|¤¤¤¦", "¤¤¤¦", 0, 4); - x2("¤ò(?:¤«¤­|¤­¤¯)", "¤ò¤«¤­", 0, 6); - x2("¤ò(?:¤«¤­|¤­¤¯)¤±", "¤ò¤­¤¯¤±", 0, 8); - x2("¤¢¤¤|(?:¤¢¤¦|¤¢¤ò)", "¤¢¤ò", 0, 4); - x2("¤¢|¤¤|¤¦", "¤¨¤¦", 2, 4); - x2("¤¢|¤¤|¤¦¤¨|¤ª¤«¤­|¤¯|¤±¤³¤µ|¤·¤¹¤»|¤½|¤¿¤Á|¤Ä¤Æ¤È¤Ê¤Ë|¤Ì¤Í", "¤·¤¹¤»", 0, 6); - n("¤¢|¤¤|¤¦¤¨|¤ª¤«¤­|¤¯|¤±¤³¤µ|¤·¤¹¤»|¤½|¤¿¤Á|¤Ä¤Æ¤È¤Ê¤Ë|¤Ì¤Í", "¤¹¤»"); - x2("¤¢|^¤ï", "¤Ö¤¢", 2, 4); - x2("¤¢|^¤ò", "¤ò¤¢", 0, 2); - x2("µ´|\\G¼Ö", "¤±¼Öµ´", 4, 6); - x2("µ´|\\G¼Ö", "¼Öµ´", 0, 2); - x2("µ´|\\A¼Ö", "b¼Öµ´", 3, 5); - x2("µ´|\\A¼Ö", "¼Ö", 0, 2); - x2("µ´|¼Ö\\Z", "¼Öµ´", 2, 4); - x2("µ´|¼Ö\\Z", "¼Ö", 0, 2); - x2("µ´|¼Ö\\Z", "¼Ö\n", 0, 2); - x2("µ´|¼Ö\\z", "¼Öµ´", 2, 4); - x2("µ´|¼Ö\\z", "¼Ö", 0, 2); - x2("\\w|\\s", "¤ª", 0, 2); - x2("\\w|%", "%¤ª", 0, 1); - x2("\\w|[&$]", "¤¦&", 0, 2); - x2("[¤¤-¤±]", "¤¦", 0, 2); - x2("[¤¤-¤±]|[^¤«-¤³]", "¤¢", 0, 2); - x2("[¤¤-¤±]|[^¤«-¤³]", "¤«", 0, 2); - x2("[^¤¢]", "\n", 0, 1); - x2("(?:¤¢|[¤¦-¤­])|¤¤¤ò", "¤¦¤ò", 0, 2); - x2("(?:¤¢|[¤¦-¤­])|¤¤¤ò", "¤¤¤ò", 0, 4); - x2("¤¢¤¤¤¦|(?=¤±¤±)..¤Û", "¤±¤±¤Û", 0, 6); - x2("¤¢¤¤¤¦|(?!¤±¤±)..¤Û", "¤¢¤¤¤Û", 0, 6); - x2("(?=¤ò¤¢)..¤¢|(?=¤ò¤ò)..¤¢", "¤ò¤ò¤¢", 0, 6); - x2("(?<=¤¢|¤¤¤¦)¤¤", "¤¤¤¦¤¤", 4, 6); - n("(?>¤¢|¤¢¤¤¤¨)¤¦", "¤¢¤¤¤¨¤¦"); - x2("(?>¤¢¤¤¤¨|¤¢)¤¦", "¤¢¤¤¤¨¤¦", 0, 8); - x2("¤¢?|¤¤", "¤¢", 0, 2); - x2("¤¢?|¤¤", "¤¤", 0, 0); - x2("¤¢?|¤¤", "", 0, 0); - x2("¤¢*|¤¤", "¤¢¤¢", 0, 4); - x2("¤¢*|¤¤*", "¤¤¤¢", 0, 0); - x2("¤¢*|¤¤*", "¤¢¤¤", 0, 2); - x2("[a¤¢]*|¤¤*", "a¤¢¤¤¤¤¤¤", 0, 3); - x2("¤¢+|¤¤*", "", 0, 0); - x2("¤¢+|¤¤*", "¤¤¤¤¤¤", 0, 6); - x2("¤¢+|¤¤*", "¤¢¤¤¤¤¤¤", 0, 2); - x2("¤¢+|¤¤*", "a¤¢¤¤¤¤¤¤", 0, 0); - n("¤¢+|¤¤+", ""); - x2("(¤¢|¤¤)?", "¤¤", 0, 2); - x2("(¤¢|¤¤)*", "¤¤¤¢", 0, 4); - x2("(¤¢|¤¤)+", 
"¤¤¤¢¤¤", 0, 6); - x2("(¤¢¤¤|¤¦¤¢)+", "¤¦¤¢¤¢¤¤¤¦¤¨", 0, 8); - x2("(¤¢¤¤|¤¦¤¨)+", "¤¦¤¢¤¢¤¤¤¦¤¨", 4, 12); - x2("(¤¢¤¤|¤¦¤¢)+", "¤¢¤¢¤¤¤¦¤¢", 2, 10); - x2("(¤¢¤¤|¤¦¤¢)+", "¤¢¤¤¤ò¤¦¤¢", 0, 4); - x2("(¤¢¤¤|¤¦¤¢)+", "$$zzzz¤¢¤¤¤ò¤¦¤¢", 6, 10); - x2("(¤¢|¤¤¤¢¤¤)+", "¤¢¤¤¤¢¤¤¤¢", 0, 10); - x2("(¤¢|¤¤¤¢¤¤)+", "¤¤¤¢", 2, 4); - x2("(¤¢|¤¤¤¢¤¤)+", "¤¤¤¢¤¢¤¢¤¤¤¢", 2, 8); - x2("(?:¤¢|¤¤)(?:¤¢|¤¤)", "¤¢¤¤", 0, 4); - x2("(?:¤¢*|¤¤*)(?:¤¢*|¤¤*)", "¤¢¤¢¤¢¤¤¤¤¤¤", 0, 6); - x2("(?:¤¢*|¤¤*)(?:¤¢+|¤¤+)", "¤¢¤¢¤¢¤¤¤¤¤¤", 0, 12); - x2("(?:¤¢+|¤¤+){2}", "¤¢¤¢¤¢¤¤¤¤¤¤", 0, 12); - x2("(?:¤¢+|¤¤+){1,2}", "¤¢¤¢¤¢¤¤¤¤¤¤", 0, 12); - x2("(?:¤¢+|\\A¤¤*)¤¦¤¦", "¤¦¤¦", 0, 4); - n("(?:¤¢+|\\A¤¤*)¤¦¤¦", "¤¢¤¤¤¦¤¦"); - x2("(?:^¤¢+|¤¤+)*¤¦", "¤¢¤¢¤¤¤¤¤¤¤¢¤¤¤¦", 12, 16); - x2("(?:^¤¢+|¤¤+)*¤¦", "¤¢¤¢¤¤¤¤¤¤¤¤¤¦", 0, 14); - x2("¤¦{0,}", "¤¦¤¦¤¦¤¦", 0, 8); - x2("¤¢|(?i)c", "C", 0, 1); - x2("(?i)c|¤¢", "C", 0, 1); - x2("(?i:¤¢)|a", "a", 0, 1); - n("(?i:¤¢)|a", "A"); - x2("[¤¢¤¤¤¦]?", "¤¢¤¤¤¦", 0, 2); - x2("[¤¢¤¤¤¦]*", "¤¢¤¤¤¦", 0, 6); - x2("[^¤¢¤¤¤¦]*", "¤¢¤¤¤¦", 0, 0); - n("[^¤¢¤¤¤¦]+", "¤¢¤¤¤¦"); - x2("¤¢??", "¤¢¤¢¤¢", 0, 0); - x2("¤¤¤¢??¤¤", "¤¤¤¢¤¤", 0, 6); - x2("¤¢*?", "¤¢¤¢¤¢", 0, 0); - x2("¤¤¤¢*?", "¤¤¤¢¤¢", 0, 2); - x2("¤¤¤¢*?¤¤", "¤¤¤¢¤¢¤¤", 0, 8); - x2("¤¢+?", "¤¢¤¢¤¢", 0, 2); - x2("¤¤¤¢+?", "¤¤¤¢¤¢", 0, 4); - x2("¤¤¤¢+?¤¤", "¤¤¤¢¤¢¤¤", 0, 8); - x2("(?:Å·?)??", "Å·", 0, 0); - x2("(?:Å·??)?", "Å·", 0, 0); - x2("(?:Ì´?)+?", "Ì´Ì´Ì´", 0, 2); - x2("(?:É÷+)??", "É÷É÷É÷", 0, 0); - x2("(?:Àã+)??Áú", "ÀãÀãÀãÁú", 0, 8); - x2("(?:¤¢¤¤)?{2}", "", 0, 0); - x2("(?:µ´¼Ö)?{2}", "µ´¼Öµ´¼Öµ´", 0, 8); - x2("(?:µ´¼Ö)*{0}", "µ´¼Öµ´¼Öµ´", 0, 0); - x2("(?:µ´¼Ö){3,}", "µ´¼Öµ´¼Öµ´¼Öµ´¼Ö", 0, 16); - n("(?:µ´¼Ö){3,}", "µ´¼Öµ´¼Ö"); - x2("(?:µ´¼Ö){2,4}", "µ´¼Öµ´¼Öµ´¼Ö", 0, 12); - x2("(?:µ´¼Ö){2,4}", "µ´¼Öµ´¼Öµ´¼Öµ´¼Öµ´¼Ö", 0, 16); - x2("(?:µ´¼Ö){2,4}?", "µ´¼Öµ´¼Öµ´¼Öµ´¼Öµ´¼Ö", 0, 8); - x2("(?:µ´¼Ö){,}", "µ´¼Ö{,}", 0, 7); - x2("(?:¤«¤­¤¯)+?{2}", "¤«¤­¤¯¤«¤­¤¯¤«¤­¤¯", 0, 12); - x3("(²Ð)", "²Ð", 0, 2, 1); - x3("(²Ð¿å)", 
"²Ð¿å", 0, 4, 1); - x2("((»þ´Ö))", "»þ´Ö", 0, 4); - x3("((É÷¿å))", "É÷¿å", 0, 4, 1); - x3("((ºòÆü))", "ºòÆü", 0, 4, 2); - x3("((((((((((((((((((((ÎÌ»Ò))))))))))))))))))))", "ÎÌ»Ò", 0, 4, 20); - x3("(¤¢¤¤)(¤¦¤¨)", "¤¢¤¤¤¦¤¨", 0, 4, 1); - x3("(¤¢¤¤)(¤¦¤¨)", "¤¢¤¤¤¦¤¨", 4, 8, 2); - x3("()(¤¢)¤¤¤¦(¤¨¤ª¤«)¤­¤¯¤±¤³", "¤¢¤¤¤¦¤¨¤ª¤«¤­¤¯¤±¤³", 6, 12, 3); - x3("(()(¤¢)¤¤¤¦(¤¨¤ª¤«)¤­¤¯¤±¤³)", "¤¢¤¤¤¦¤¨¤ª¤«¤­¤¯¤±¤³", 6, 12, 4); - x3(".*(¥Õ¥©)¥ó¡¦¥Þ(¥ó()¥·¥å¥¿)¥¤¥ó", "¥Õ¥©¥ó¡¦¥Þ¥ó¥·¥å¥¿¥¤¥ó", 10, 18, 2); - x2("(^¤¢)", "¤¢", 0, 2); - x3("(¤¢)|(¤¢)", "¤¤¤¢", 2, 4, 1); - x3("(^¤¢)|(¤¢)", "¤¤¤¢", 2, 4, 2); - x3("(¤¢?)", "¤¢¤¢¤¢", 0, 2, 1); - x3("(¤Þ*)", "¤Þ¤Þ¤Þ", 0, 6, 1); - x3("(¤È*)", "", 0, 0, 1); - x3("(¤ë+)", "¤ë¤ë¤ë¤ë¤ë¤ë¤ë", 0, 14, 1); - x3("(¤Õ+|¤Ø*)", "¤Õ¤Õ¤Õ¤Ø¤Ø", 0, 6, 1); - x3("(¤¢+|¤¤?)", "¤¤¤¤¤¤¤¢¤¢", 0, 2, 1); - x3("(¤¢¤¤¤¦)?", "¤¢¤¤¤¦", 0, 6, 1); - x3("(¤¢¤¤¤¦)*", "¤¢¤¤¤¦", 0, 6, 1); - x3("(¤¢¤¤¤¦)+", "¤¢¤¤¤¦", 0, 6, 1); - x3("(¤µ¤·¤¹|¤¢¤¤¤¦)+", "¤¢¤¤¤¦", 0, 6, 1); - x3("([¤Ê¤Ë¤Ì][¤«¤­¤¯]|¤«¤­¤¯)+", "¤«¤­¤¯", 0, 6, 1); - x3("((?i:¤¢¤¤¤¦))", "¤¢¤¤¤¦", 0, 6, 1); - x3("((?m:¤¢.¤¦))", "¤¢\n¤¦", 0, 5, 1); - x3("((?=¤¢¤ó)¤¢)", "¤¢¤ó¤¤", 0, 2, 1); - x3("¤¢¤¤¤¦|(.¤¢¤¤¤¨)", "¤ó¤¢¤¤¤¨", 0, 8, 1); - x3("¤¢*(.)", "¤¢¤¢¤¢¤¢¤ó", 8, 10, 1); - x3("¤¢*?(.)", "¤¢¤¢¤¢¤¢¤ó", 0, 2, 1); - x3("¤¢*?(¤ó)", "¤¢¤¢¤¢¤¢¤ó", 8, 10, 1); - x3("[¤¤¤¦¤¨]¤¢*(.)", "¤¨¤¢¤¢¤¢¤¢¤ó", 10, 12, 1); - x3("(\\A¤¤¤¤)¤¦¤¦", "¤¤¤¤¤¦¤¦", 0, 4, 1); - n("(\\A¤¤¤¤)¤¦¤¦", "¤ó¤¤¤¤¤¦¤¦"); - x3("(^¤¤¤¤)¤¦¤¦", "¤¤¤¤¤¦¤¦", 0, 4, 1); - n("(^¤¤¤¤)¤¦¤¦", "¤ó¤¤¤¤¤¦¤¦"); - x3("¤í¤í(¤ë¤ë$)", "¤í¤í¤ë¤ë", 4, 8, 1); - n("¤í¤í(¤ë¤ë$)", "¤í¤í¤ë¤ë¤ë"); - x2("(̵)\\1", "̵̵", 0, 4); - n("(̵)\\1", "̵Éð"); - x2("(¶õ?)\\1", "¶õ¶õ", 0, 4); - x2("(¶õ??)\\1", "¶õ¶õ", 0, 0); - x2("(¶õ*)\\1", "¶õ¶õ¶õ¶õ¶õ", 0, 8); - x3("(¶õ*)\\1", "¶õ¶õ¶õ¶õ¶õ", 0, 4, 1); - x2("¤¢(¤¤*)\\1", "¤¢¤¤¤¤¤¤¤¤", 0, 10); - x2("¤¢(¤¤*)\\1", "¤¢¤¤", 0, 2); - x2("(¤¢*)(¤¤*)\\1\\2", "¤¢¤¢¤¢¤¤¤¤¤¢¤¢¤¢¤¤¤¤", 0, 20); - x2("(¤¢*)(¤¤*)\\2", "¤¢¤¢¤¢¤¤¤¤¤¤¤¤", 0, 14); - 
x3("(¤¢*)(¤¤*)\\2", "¤¢¤¢¤¢¤¤¤¤¤¤¤¤", 6, 10, 2); - x2("(((((((¤Ý*)¤Ú))))))¤Ô\\7", "¤Ý¤Ý¤Ý¤Ú¤Ô¤Ý¤Ý¤Ý", 0, 16); - x3("(((((((¤Ý*)¤Ú))))))¤Ô\\7", "¤Ý¤Ý¤Ý¤Ú¤Ô¤Ý¤Ý¤Ý", 0, 6, 7); - x2("(¤Ï)(¤Ò)(¤Õ)\\2\\1\\3", "¤Ï¤Ò¤Õ¤Ò¤Ï¤Õ", 0, 12); - x2("([¤­-¤±])\\1", "¤¯¤¯", 0, 4); - x2("(\\w\\d\\s)\\1", "¤¢5 ¤¢5 ", 0, 8); - n("(\\w\\d\\s)\\1", "¤¢5 ¤¢5"); - x2("(ï¡©|[¤¢-¤¦]{3})\\1", "ï¡©", 0, 8); - x2("...(ï¡©|[¤¢-¤¦]{3})\\1", "¤¢a¤¢Ã¯¡©Ã¯¡©", 0, 13); - x2("(ï¡©|[¤¢-¤¦]{3})\\1", "¤¦¤¤¤¦¤¦¤¤¤¦", 0, 12); - x2("(^¤³)\\1", "¤³¤³", 0, 4); - n("(^¤à)\\1", "¤á¤à¤à"); - n("(¤¢$)\\1", "¤¢¤¢"); - n("(¤¢¤¤\\Z)\\1", "¤¢¤¤"); - x2("(¤¢*\\Z)\\1", "¤¢", 2, 2); - x2(".(¤¢*\\Z)\\1", "¤¤¤¢", 2, 4); - x3("(.(¤ä¤¤¤æ)\\2)", "z¤ä¤¤¤æ¤ä¤¤¤æ", 0, 13, 1); - x3("(.(..\\d.)\\2)", "¤¢12341234", 0, 10, 1); - x2("((?i:¤¢v¤º))\\1", "¤¢v¤º¤¢v¤º", 0, 10); - x2("(?<¶ò¤«>ÊÑ|\\(\\g<¶ò¤«>\\))", "((((((ÊÑ))))))", 0, 14); - x2("\\A(?:\\g<°¤_1>|\\g<±¾_2>|\\z½ªÎ» (?<°¤_1>´Ñ|¼«\\g<±¾_2>¼«)(?<±¾_2>ºß|Ê\\g<°¤_1>Ê))$", "Ê¼«Ê¼«ºß¼«Ê¼«Ê", 0, 26); - x2("[[¤Ò¤Õ]]", "¤Õ", 0, 2); - x2("[[¤¤¤ª¤¦]¤«]", "¤«", 0, 2); - n("[[^¤¢]]", "¤¢"); - n("[^[¤¢]]", "¤¢"); - x2("[^[^¤¢]]", "¤¢", 0, 2); - x2("[[¤«¤­¤¯]&&¤­¤¯]", "¤¯", 0, 2); - n("[[¤«¤­¤¯]&&¤­¤¯]", "¤«"); - n("[[¤«¤­¤¯]&&¤­¤¯]", "¤±"); - x2("[¤¢-¤ó&&¤¤-¤ò&&¤¦-¤ñ]", "¤ñ", 0, 2); - n("[^¤¢-¤ó&&¤¤-¤ò&&¤¦-¤ñ]", "¤ñ"); - x2("[[^¤¢&&¤¢]&&¤¢-¤ó]", "¤¤", 0, 2); - n("[[^¤¢&&¤¢]&&¤¢-¤ó]", "¤¢"); - x2("[[^¤¢-¤ó&&¤¤¤¦¤¨¤ª]&&[^¤¦-¤«]]", "¤­", 0, 2); - n("[[^¤¢-¤ó&&¤¤¤¦¤¨¤ª]&&[^¤¦-¤«]]", "¤¤"); - x2("[^[^¤¢¤¤¤¦]&&[^¤¦¤¨¤ª]]", "¤¦", 0, 2); - x2("[^[^¤¢¤¤¤¦]&&[^¤¦¤¨¤ª]]", "¤¨", 0, 2); - n("[^[^¤¢¤¤¤¦]&&[^¤¦¤¨¤ª]]", "¤«"); - x2("[¤¢-&&-¤¢]", "-", 0, 1); - x2("[^[^a-z¤¢¤¤¤¦]&&[^bcdefg¤¦¤¨¤ª]q-w]", "¤¨", 0, 2); - x2("[^[^a-z¤¢¤¤¤¦]&&[^bcdefg¤¦¤¨¤ª]g-w]", "f", 0, 1); - x2("[^[^a-z¤¢¤¤¤¦]&&[^bcdefg¤¦¤¨¤ª]g-w]", "g", 0, 1); - n("[^[^a-z¤¢¤¤¤¦]&&[^bcdefg¤¦¤¨¤ª]g-w]", "2"); - fprintf(stdout, "\nRESULT SUCC: %d, FAIL: %d, ERROR: %d\n", - nsucc, nfail, nerror); - -#ifndef POSIX_TEST - 
onig_region_free(region, 1); - onig_end(); -#endif - - return 0; -} diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c index fdf9195b5fa..b3f812fb48d 100644 --- a/ext/mbstring/php_mbregex.c +++ b/ext/mbstring/php_mbregex.c @@ -124,6 +124,22 @@ php_mb_regex_enc_name_map_t enc_name_map[] ={ "UTF-8\0UTF8\0", ONIG_ENCODING_UTF8 }, + { + "UTF-16\0UTF-16BE\0", + ONIG_ENCODING_UTF16_BE + }, + { + "UTF-16LE\0", + ONIG_ENCODING_UTF16_LE + }, + { + "UCS-4\0UTF-32\0UTF-32BE\0", + ONIG_ENCODING_UTF32_BE + }, + { + "UCS-4LE\0UTF-32LE\0", + ONIG_ENCODING_UTF32_LE + }, { "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0", ONIG_ENCODING_SJIS diff --git a/ext/mbstring/tests/bug31911.phpt b/ext/mbstring/tests/bug31911.phpt new file mode 100644 index 00000000000..eb6438d4e7a --- /dev/null +++ b/ext/mbstring/tests/bug31911.phpt @@ -0,0 +1,11 @@ +--TEST-- +Bug #31911 (mb_decode_mimeheader() is case-sensitive to hex escapes) +--FILE-- + +--EXPECT-- +Works: ??? +Fails: ???