From f41faa558a509459a6de0e19f19996152cff9733 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 Jan 2014 18:59:24 +0530 Subject: [PATCH] Update regex module --- src/regex/__init__.py | 2 +- src/regex/_regex.c | 186 ++++++++++++++++++++++++------------------ 2 files changed, 108 insertions(+), 80 deletions(-) diff --git a/src/regex/__init__.py b/src/regex/__init__.py index e54b5797a3..45d1920d6a 100644 --- a/src/regex/__init__.py +++ b/src/regex/__init__.py @@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex"] -__version__ = "2.4.37" +__version__ = "2.4.39" # -------------------------------------------------------------------- # Public interface. diff --git a/src/regex/_regex.c b/src/regex/_regex.c index 2a5698cba9..23ec638006 100644 --- a/src/regex/_regex.c +++ b/src/regex/_regex.c @@ -222,9 +222,6 @@ typedef struct RE_State* RE_StatePtr; /* Handlers for ASCII, locale and Unicode. */ typedef struct RE_EncodingTable { BOOL (*has_property)(RE_CODE property, Py_UCS4 ch); - Py_UCS4 (*lower)(Py_UCS4 ch); - Py_UCS4 (*upper)(Py_UCS4 ch); - Py_UCS4 (*title)(Py_UCS4 ch); BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos); BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos); BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos); @@ -774,22 +771,6 @@ static BOOL ascii_has_property(RE_CODE property, Py_UCS4 ch) { return unicode_has_property(property, ch); } -/* Converts a character to lowercase. */ -static Py_UCS4 ascii_lower(Py_UCS4 ch) { - if (ch > RE_ASCII_MAX || !re_get_uppercase(ch)) - return ch; - - return ch ^ 0x20; -} - -/* Converts a character to uppercase. */ -static Py_UCS4 ascii_upper(Py_UCS4 ch) { - if (ch > RE_ASCII_MAX || !re_get_lowercase(ch)) - return ch; - - return ch ^ 0x20; -} - /* Checks whether the current text position is on a word boundary. */ static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) { BOOL before; @@ -932,9 +913,6 @@ static int ascii_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) { /* The handlers for ASCII characters. */ static RE_EncodingTable ascii_encoding = { ascii_has_property, - ascii_lower, - ascii_upper, - ascii_upper, /* For ASCII, titlecase is the same as uppercase. */ ascii_at_boundary, ascii_at_word_start, ascii_at_word_end, @@ -1035,22 +1013,6 @@ static BOOL locale_has_property(RE_CODE property, Py_UCS4 ch) { return v == value; } -/* Converts a character to lowercase. */ -static Py_UCS4 locale_lower(Py_UCS4 ch) { - if (ch > RE_LOCALE_MAX) - return ch; - - return tolower(ch); -} - -/* Converts a character to uppercase. */ -static Py_UCS4 locale_upper(Py_UCS4 ch) { - if (ch > RE_LOCALE_MAX) - return ch; - - return toupper(ch); -} - /* Checks whether the current text position is on a word boundary. */ static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) { BOOL before; @@ -1166,9 +1128,6 @@ static int locale_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) { /* The handlers for locale characters. */ static RE_EncodingTable locale_encoding = { locale_has_property, - locale_lower, - locale_upper, - locale_upper, /* For locale, titlecase is the same as uppercase. */ locale_at_boundary, locale_at_word_start, locale_at_word_end, @@ -1226,21 +1185,6 @@ static BOOL unicode_has_property(RE_CODE property, Py_UCS4 ch) { return FALSE; } -/* Converts a Unicode character to lowercase. */ -static Py_UCS4 unicode_lower(Py_UCS4 ch) { - return Py_UNICODE_TOLOWER((Py_UCS4)ch); -} - -/* Converts a Unicode character to uppercase. */ -static Py_UCS4 unicode_upper(Py_UCS4 ch) { - return Py_UNICODE_TOUPPER((Py_UCS4)ch); -} - -/* Converts a Unicode character to titlecase. */ -static Py_UCS4 unicode_title(Py_UCS4 ch) { - return Py_UNICODE_TOTITLE((Py_UCS4)ch); -} - /* Checks whether the current text position is on a word boundary. */ static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) { BOOL before; @@ -1285,7 +1229,7 @@ static BOOL unicode_at_word_end(RE_State* state, Py_ssize_t text_pos) { * Only a limited number are treated as vowels. */ Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) { - switch (unicode_lower(ch)) { + switch (Py_UNICODE_TOLOWER(ch)) { case 'a': case 0xE0: case 0xE1: case 0xE2: case 'e': case 0xE8: case 0xE9: case 0xEA: case 'i': case 0xEC: case 0xED: case 0xEE: @@ -1773,9 +1717,6 @@ static int unicode_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) { /* The handlers for Unicode characters. */ static RE_EncodingTable unicode_encoding = { unicode_has_property, - unicode_lower, - unicode_upper, - unicode_title, unicode_at_boundary, unicode_at_word_start, unicode_at_word_end, @@ -8521,6 +8462,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld(RE_SafeState* safe_state, new_node = bt_data->fuzzy_string.position.node; data.new_string_pos = bt_data->fuzzy_string.string_pos; data.new_folded_pos = bt_data->fuzzy_string.folded_pos; + data.folded_len = bt_data->fuzzy_string.folded_len; data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; data.step = bt_data->fuzzy_string.step; @@ -8701,6 +8643,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld2(RE_SafeState* safe_state, new_node = bt_data->fuzzy_string.position.node; new_group_pos = bt_data->fuzzy_string.string_pos; data.new_folded_pos = bt_data->fuzzy_string.folded_pos; + data.folded_len = bt_data->fuzzy_string.folded_len; data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos; data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; data.step = bt_data->fuzzy_string.step; @@ -10953,10 +10896,10 @@ advance: goto backtrack; } } - - string_pos = -1; } + string_pos = -1; + /* Successful match. */ node = node->next_1.node; break; @@ -11057,12 +11000,10 @@ advance: } } - if (folded_pos < folded_len) { - string_pos = -1; - goto backtrack; - } - string_pos = -1; + + if (folded_pos < folded_len) + goto backtrack; } /* Successful match. */ @@ -11165,12 +11106,10 @@ advance: } } - if (folded_pos > 0) { - string_pos = -1; - goto backtrack; - } - string_pos = -1; + + if (folded_pos > 0) + goto backtrack; } /* Successful match. */ @@ -11220,10 +11159,10 @@ advance: goto backtrack; } } - - string_pos = -1; } + string_pos = -1; + /* Successful match. */ node = node->next_1.node; break; @@ -11271,10 +11210,10 @@ advance: goto backtrack; } } - - string_pos = -1; } + string_pos = -1; + /* Successful match. */ node = node->next_1.node; break; @@ -11322,10 +11261,10 @@ advance: goto backtrack; } } - - string_pos = -1; } + string_pos = -1; + /* Successful match. */ node = node->next_1.node; break; @@ -14494,7 +14433,7 @@ Py_LOCAL_INLINE(PyObject*) join_list_info(JoinInfo* join_info) { * Returns its length if it is a literal, otherwise -1. */ Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement, - char special_char) { + unsigned char special_char) { RE_StringInfo str_info; Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); Py_ssize_t pos; @@ -16951,6 +16890,94 @@ Py_LOCAL_INLINE(BOOL) append_string(PyObject* list, char* string) { return TRUE; } +/* Appends a (decimal) integer to a list. */ +Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) { + PyObject* int_obj; + PyObject* repr_obj; + int status; + + int_obj = Py_BuildValue("n", value); + if (!int_obj) + return FALSE; + + repr_obj = PyObject_Repr(int_obj); + Py_DECREF(int_obj); + if (!repr_obj) + return FALSE; + + status = PyList_Append(list, repr_obj); + Py_DECREF(repr_obj); + if (status < 0) + return FALSE; + + return TRUE; +} + +/* MatchObject's '__repr__' method. */ +static PyObject* match_repr(PyObject* self_) { + MatchObject* self; + PyObject* list; + PyObject* matched_substring; + PyObject* matched_repr; + int status; + PyObject* separator; + PyObject* result; + + self = (MatchObject*)self_; + + list = PyList_New(0); + if (!list) + return NULL; + + if (!append_string(list, "match_start)) + goto error; + + if (! append_string(list, ", ")) + goto error; + + if (!append_integer(list, self->match_end)) + goto error; + + if (!append_string(list, "), match=")) + goto error; + + matched_substring = get_slice(self->substring, self->match_start - + self->substring_offset, self->match_end - self->substring_offset); + if (!matched_substring) + goto error; + + matched_repr = PyObject_Repr(matched_substring); + Py_DECREF(matched_substring); + if (!matched_repr) + goto error; + + status = PyList_Append(list, matched_repr); + Py_DECREF(matched_repr); + if (status < 0) + goto error; + + if (! append_string(list, ">")) + goto error; + + separator = Py_BuildValue("s", ""); + if (!separator) + goto error; + + result = PyUnicode_Join(separator, list); + Py_DECREF(separator); + + Py_DECREF(list); + + return result; + +error: + Py_DECREF(list); + return NULL; +} + /* PatternObject's '__repr__' method. */ static PyObject* pattern_repr(PyObject* self_) { PatternObject* self; @@ -19873,6 +19900,7 @@ PyMODINIT_FUNC init_regex(void) { /* Initialise Match_Type. */ Match_Type.tp_dealloc = match_dealloc; + Match_Type.tp_repr = match_repr; Match_Type.tp_as_mapping = &match_as_mapping; Match_Type.tp_flags = Py_TPFLAGS_DEFAULT; Match_Type.tp_doc = match_doc;