Update regex module

This commit is contained in:
Kovid Goyal 2014-01-30 18:59:24 +05:30
parent eb4a6f89fe
commit f41faa558a
2 changed files with 108 additions and 80 deletions

View File

@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
"Regex"] "Regex"]
__version__ = "2.4.37" __version__ = "2.4.39"
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# Public interface. # Public interface.

View File

@ -222,9 +222,6 @@ typedef struct RE_State* RE_StatePtr;
/* Handlers for ASCII, locale and Unicode. */ /* Handlers for ASCII, locale and Unicode. */
typedef struct RE_EncodingTable { typedef struct RE_EncodingTable {
BOOL (*has_property)(RE_CODE property, Py_UCS4 ch); BOOL (*has_property)(RE_CODE property, Py_UCS4 ch);
Py_UCS4 (*lower)(Py_UCS4 ch);
Py_UCS4 (*upper)(Py_UCS4 ch);
Py_UCS4 (*title)(Py_UCS4 ch);
BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos); BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos); BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos); BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
@ -774,22 +771,6 @@ static BOOL ascii_has_property(RE_CODE property, Py_UCS4 ch) {
return unicode_has_property(property, ch); return unicode_has_property(property, ch);
} }
/* Converts a character to lowercase. */
static Py_UCS4 ascii_lower(Py_UCS4 ch) {
if (ch > RE_ASCII_MAX || !re_get_uppercase(ch))
return ch;
return ch ^ 0x20;
}
/* Converts a character to uppercase. */
static Py_UCS4 ascii_upper(Py_UCS4 ch) {
if (ch > RE_ASCII_MAX || !re_get_lowercase(ch))
return ch;
return ch ^ 0x20;
}
/* Checks whether the current text position is on a word boundary. */ /* Checks whether the current text position is on a word boundary. */
static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) { static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) {
BOOL before; BOOL before;
@ -932,9 +913,6 @@ static int ascii_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
/* The handlers for ASCII characters. */ /* The handlers for ASCII characters. */
static RE_EncodingTable ascii_encoding = { static RE_EncodingTable ascii_encoding = {
ascii_has_property, ascii_has_property,
ascii_lower,
ascii_upper,
ascii_upper, /* For ASCII, titlecase is the same as uppercase. */
ascii_at_boundary, ascii_at_boundary,
ascii_at_word_start, ascii_at_word_start,
ascii_at_word_end, ascii_at_word_end,
@ -1035,22 +1013,6 @@ static BOOL locale_has_property(RE_CODE property, Py_UCS4 ch) {
return v == value; return v == value;
} }
/* Converts a character to lowercase. */
static Py_UCS4 locale_lower(Py_UCS4 ch) {
if (ch > RE_LOCALE_MAX)
return ch;
return tolower(ch);
}
/* Converts a character to uppercase. */
static Py_UCS4 locale_upper(Py_UCS4 ch) {
if (ch > RE_LOCALE_MAX)
return ch;
return toupper(ch);
}
/* Checks whether the current text position is on a word boundary. */ /* Checks whether the current text position is on a word boundary. */
static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) { static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) {
BOOL before; BOOL before;
@ -1166,9 +1128,6 @@ static int locale_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
/* The handlers for locale characters. */ /* The handlers for locale characters. */
static RE_EncodingTable locale_encoding = { static RE_EncodingTable locale_encoding = {
locale_has_property, locale_has_property,
locale_lower,
locale_upper,
locale_upper, /* For locale, titlecase is the same as uppercase. */
locale_at_boundary, locale_at_boundary,
locale_at_word_start, locale_at_word_start,
locale_at_word_end, locale_at_word_end,
@ -1226,21 +1185,6 @@ static BOOL unicode_has_property(RE_CODE property, Py_UCS4 ch) {
return FALSE; return FALSE;
} }
/* Converts a Unicode character to lowercase. */
static Py_UCS4 unicode_lower(Py_UCS4 ch) {
return Py_UNICODE_TOLOWER((Py_UCS4)ch);
}
/* Converts a Unicode character to uppercase. */
static Py_UCS4 unicode_upper(Py_UCS4 ch) {
return Py_UNICODE_TOUPPER((Py_UCS4)ch);
}
/* Converts a Unicode character to titlecase. */
static Py_UCS4 unicode_title(Py_UCS4 ch) {
return Py_UNICODE_TOTITLE((Py_UCS4)ch);
}
/* Checks whether the current text position is on a word boundary. */ /* Checks whether the current text position is on a word boundary. */
static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) { static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) {
BOOL before; BOOL before;
@ -1285,7 +1229,7 @@ static BOOL unicode_at_word_end(RE_State* state, Py_ssize_t text_pos) {
* Only a limited number are treated as vowels. * Only a limited number are treated as vowels.
*/ */
Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) { Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) {
switch (unicode_lower(ch)) { switch (Py_UNICODE_TOLOWER(ch)) {
case 'a': case 0xE0: case 0xE1: case 0xE2: case 'a': case 0xE0: case 0xE1: case 0xE2:
case 'e': case 0xE8: case 0xE9: case 0xEA: case 'e': case 0xE8: case 0xE9: case 0xEA:
case 'i': case 0xEC: case 0xED: case 0xEE: case 'i': case 0xEC: case 0xED: case 0xEE:
@ -1773,9 +1717,6 @@ static int unicode_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
/* The handlers for Unicode characters. */ /* The handlers for Unicode characters. */
static RE_EncodingTable unicode_encoding = { static RE_EncodingTable unicode_encoding = {
unicode_has_property, unicode_has_property,
unicode_lower,
unicode_upper,
unicode_title,
unicode_at_boundary, unicode_at_boundary,
unicode_at_word_start, unicode_at_word_start,
unicode_at_word_end, unicode_at_word_end,
@ -8521,6 +8462,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld(RE_SafeState* safe_state,
new_node = bt_data->fuzzy_string.position.node; new_node = bt_data->fuzzy_string.position.node;
data.new_string_pos = bt_data->fuzzy_string.string_pos; data.new_string_pos = bt_data->fuzzy_string.string_pos;
data.new_folded_pos = bt_data->fuzzy_string.folded_pos; data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
data.folded_len = bt_data->fuzzy_string.folded_len;
data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
data.step = bt_data->fuzzy_string.step; data.step = bt_data->fuzzy_string.step;
@ -8701,6 +8643,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld2(RE_SafeState* safe_state,
new_node = bt_data->fuzzy_string.position.node; new_node = bt_data->fuzzy_string.position.node;
new_group_pos = bt_data->fuzzy_string.string_pos; new_group_pos = bt_data->fuzzy_string.string_pos;
data.new_folded_pos = bt_data->fuzzy_string.folded_pos; data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
data.folded_len = bt_data->fuzzy_string.folded_len;
data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos; data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos;
data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
data.step = bt_data->fuzzy_string.step; data.step = bt_data->fuzzy_string.step;
@ -10953,10 +10896,10 @@ advance:
goto backtrack; goto backtrack;
} }
} }
string_pos = -1;
} }
string_pos = -1;
/* Successful match. */ /* Successful match. */
node = node->next_1.node; node = node->next_1.node;
break; break;
@ -11057,12 +11000,10 @@ advance:
} }
} }
if (folded_pos < folded_len) {
string_pos = -1;
goto backtrack;
}
string_pos = -1; string_pos = -1;
if (folded_pos < folded_len)
goto backtrack;
} }
/* Successful match. */ /* Successful match. */
@ -11165,12 +11106,10 @@ advance:
} }
} }
if (folded_pos > 0) {
string_pos = -1;
goto backtrack;
}
string_pos = -1; string_pos = -1;
if (folded_pos > 0)
goto backtrack;
} }
/* Successful match. */ /* Successful match. */
@ -11220,10 +11159,10 @@ advance:
goto backtrack; goto backtrack;
} }
} }
string_pos = -1;
} }
string_pos = -1;
/* Successful match. */ /* Successful match. */
node = node->next_1.node; node = node->next_1.node;
break; break;
@ -11271,10 +11210,10 @@ advance:
goto backtrack; goto backtrack;
} }
} }
string_pos = -1;
} }
string_pos = -1;
/* Successful match. */ /* Successful match. */
node = node->next_1.node; node = node->next_1.node;
break; break;
@ -11322,10 +11261,10 @@ advance:
goto backtrack; goto backtrack;
} }
} }
string_pos = -1;
} }
string_pos = -1;
/* Successful match. */ /* Successful match. */
node = node->next_1.node; node = node->next_1.node;
break; break;
@ -14494,7 +14433,7 @@ Py_LOCAL_INLINE(PyObject*) join_list_info(JoinInfo* join_info) {
* Returns its length if it is a literal, otherwise -1. * Returns its length if it is a literal, otherwise -1.
*/ */
Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement, Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement,
char special_char) { unsigned char special_char) {
RE_StringInfo str_info; RE_StringInfo str_info;
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
Py_ssize_t pos; Py_ssize_t pos;
@ -16951,6 +16890,94 @@ Py_LOCAL_INLINE(BOOL) append_string(PyObject* list, char* string) {
return TRUE; return TRUE;
} }
/* Appends a (decimal) integer to a list. */
Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) {
PyObject* int_obj;
PyObject* repr_obj;
int status;
int_obj = Py_BuildValue("n", value);
if (!int_obj)
return FALSE;
repr_obj = PyObject_Repr(int_obj);
Py_DECREF(int_obj);
if (!repr_obj)
return FALSE;
status = PyList_Append(list, repr_obj);
Py_DECREF(repr_obj);
if (status < 0)
return FALSE;
return TRUE;
}
/* MatchObject's '__repr__' method. */
static PyObject* match_repr(PyObject* self_) {
MatchObject* self;
PyObject* list;
PyObject* matched_substring;
PyObject* matched_repr;
int status;
PyObject* separator;
PyObject* result;
self = (MatchObject*)self_;
list = PyList_New(0);
if (!list)
return NULL;
if (!append_string(list, "<regex.Match object; span=("))
goto error;
if (!append_integer(list, self->match_start))
goto error;
if (! append_string(list, ", "))
goto error;
if (!append_integer(list, self->match_end))
goto error;
if (!append_string(list, "), match="))
goto error;
matched_substring = get_slice(self->substring, self->match_start -
self->substring_offset, self->match_end - self->substring_offset);
if (!matched_substring)
goto error;
matched_repr = PyObject_Repr(matched_substring);
Py_DECREF(matched_substring);
if (!matched_repr)
goto error;
status = PyList_Append(list, matched_repr);
Py_DECREF(matched_repr);
if (status < 0)
goto error;
if (! append_string(list, ">"))
goto error;
separator = Py_BuildValue("s", "");
if (!separator)
goto error;
result = PyUnicode_Join(separator, list);
Py_DECREF(separator);
Py_DECREF(list);
return result;
error:
Py_DECREF(list);
return NULL;
}
/* PatternObject's '__repr__' method. */ /* PatternObject's '__repr__' method. */
static PyObject* pattern_repr(PyObject* self_) { static PyObject* pattern_repr(PyObject* self_) {
PatternObject* self; PatternObject* self;
@ -19873,6 +19900,7 @@ PyMODINIT_FUNC init_regex(void) {
/* Initialise Match_Type. */ /* Initialise Match_Type. */
Match_Type.tp_dealloc = match_dealloc; Match_Type.tp_dealloc = match_dealloc;
Match_Type.tp_repr = match_repr;
Match_Type.tp_as_mapping = &match_as_mapping; Match_Type.tp_as_mapping = &match_as_mapping;
Match_Type.tp_flags = Py_TPFLAGS_DEFAULT; Match_Type.tp_flags = Py_TPFLAGS_DEFAULT;
Match_Type.tp_doc = match_doc; Match_Type.tp_doc = match_doc;