From 641de015702e2cc93d89b75dd2cd0cab0890f07e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 27 Dec 2016 14:11:32 +0530 Subject: [PATCH] Update regex module --- src/regex/__init__.py | 2 +- src/regex/_regex.c | 75 +++++++++++++++++++++++++++++++++++----- src/regex/_regex_core.py | 7 ++-- 3 files changed, 71 insertions(+), 13 deletions(-) diff --git a/src/regex/__init__.py b/src/regex/__init__.py index aba61ad388..454bbebcb4 100644 --- a/src/regex/__init__.py +++ b/src/regex/__init__.py @@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex"] -__version__ = "2.4.105" +__version__ = "2.4.112" # -------------------------------------------------------------------- # Public interface. diff --git a/src/regex/_regex.c b/src/regex/_regex.c index 84cec8413e..17c08e2f85 100644 --- a/src/regex/_regex.c +++ b/src/regex/_regex.c @@ -1547,6 +1547,15 @@ Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) { } } +/* Checks whether a character is a Unicode apostrophe. + * + * This could be U+0027 (APOSTROPHE) or U+2019 (RIGHT SINGLE QUOTATION MARK / + * curly apostrophe). + */ +static BOOL is_unicode_apostrophe(Py_UCS4 ch) { + return ch == 0x27 || ch == 0x2019; +} + /* Checks whether a position is on a default word boundary. * * The rules are defined here: @@ -1667,7 +1676,7 @@ static BOOL unicode_at_default_boundary(RE_State* state, Py_ssize_t text_pos) { /* Break between apostrophe and vowels (French, Italian). */ /* WB5a */ - if (pos_m1 >= 0 && char_at(state->text, pos_m1) == '\'' && + if (pos_m1 >= 0 && is_unicode_apostrophe(char_at(state->text, pos_m1)) && is_unicode_vowel(char_at(state->text, text_pos))) return TRUE; @@ -1849,7 +1858,8 @@ Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state, if (prop_m1 == RE_BREAK_ALETTER && prop == RE_BREAK_ALETTER) return FALSE; - if (pos_m1 >= 0 && char_m1 == '\'' && is_unicode_vowel(char_0)) + if (pos_m1 >= 0 && is_unicode_apostrophe(char_m1) && + is_unicode_vowel(char_0)) return TRUE; pos_p1 = text_pos + 1; @@ -10019,7 +10029,7 @@ Py_LOCAL_INLINE(BOOL) any_error_permitted(RE_State* state) { return fuzzy_info->total_cost <= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MAX_ERR] && - state->total_errors <= state->max_errors; + state->total_errors < state->max_errors; } /* Checks whether this additional fuzzy error is permitted. */ @@ -10032,7 +10042,7 @@ Py_LOCAL_INLINE(BOOL) this_error_permitted(RE_State* state, int fuzzy_type) { return fuzzy_info->total_cost + values[RE_FUZZY_VAL_COST_BASE + fuzzy_type] <= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[fuzzy_type] < - values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors + 1 <= + values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors < state->max_errors; } @@ -10064,6 +10074,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data, switch (data->fuzzy_type) { case RE_FUZZY_DEL: /* Could a character at text_pos have been deleted? */ + if (step == 0) + return RE_ERROR_FAILURE; + if (is_string) data->new_string_pos += step; else @@ -10074,7 +10087,10 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data, if (!data->permit_insertion) return RE_ERROR_FAILURE; - new_pos = data->new_text_pos + step; + if (step == 0) + new_pos = data->new_text_pos + data->step; + else + new_pos = data->new_text_pos + step; if (state->slice_start <= new_pos && new_pos <= state->slice_end) { data->new_text_pos = new_pos; return RE_ERROR_SUCCESS; @@ -10083,6 +10099,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data, return check_fuzzy_partial(state, new_pos); case RE_FUZZY_SUB: /* Could the character at text_pos have been substituted? */ + if (step == 0) + return RE_ERROR_FAILURE; + new_pos = data->new_text_pos + step; if (state->slice_start <= new_pos && new_pos <= state->slice_end) { data->new_text_pos = new_pos; @@ -14808,7 +14827,10 @@ backtrack: * backtracked inside and already restored the groups. We also * need to restore certain flags. */ - if (bt_data->lookaround.node->match) + RE_Node* node; + + node = bt_data->lookaround.node; + if (node->match && (node->status & RE_STATUS_HAS_GROUPS)) pop_groups(state); state->too_few_errors = bt_data->lookaround.too_few_errors; @@ -16602,13 +16624,14 @@ Py_LOCAL_INLINE(int) do_best_fuzzy_match(RE_SafeState* safe_state, BOOL search) */ add_to_best_list(safe_state, &best_list, state->match_pos, state->text_pos); - } + } else + start_pos = state->match_pos + step; /* Should we keep searching? */ if (!search) break; - start_pos = state->match_pos + step; + state->max_errors = fewest_errors - 1; } if (found_match) { @@ -17103,7 +17126,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) { * instead. */ if (PyUnicode_Check(string)) { - /* Unicode strings doesn't always support the buffer interface. */ + /* Unicode strings don't always support the buffer interface. */ str_info->characters = (void*)PyUnicode_AS_DATA(string); str_info->length = PyUnicode_GET_SIZE(string); str_info->charsize = sizeof(Py_UNICODE); @@ -17112,6 +17135,39 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) { return TRUE; } +#if defined(PYPY_VERSION) + if (PyString_Check(string)) { + /* Bytestrings don't always support the buffer interface. */ + str_info->characters = (void*)PyString_AS_STRING(string); + str_info->length = PyString_GET_SIZE(string); + str_info->charsize = 1; + str_info->is_unicode = FALSE; + str_info->should_release = FALSE; + return TRUE; + } + +#endif +#if defined(PYPY_VERSION) + /* Get pointer to string buffer. */ + if (PyObject_GetBuffer(string, &str_info->view, PyBUF_SIMPLE) != 0) { + printf("PyObject_GetBuffer failed!\n"); + PyErr_SetString(PyExc_TypeError, "expected string or buffer"); + return FALSE; + } + + if (!str_info->view.buf) { + PyBuffer_Release(&str_info->view); + PyErr_SetString(PyExc_ValueError, "buffer is NULL"); + return FALSE; + } + + str_info->should_release = TRUE; + + str_info->characters = str_info->view.buf; + str_info->length = str_info->view.len; + str_info->charsize = 1; + str_info->is_unicode = FALSE; +#else /* Get pointer to string buffer. */ #if PY_VERSION_HEX >= 0x02060000 buffer = Py_TYPE(string)->tp_as_buffer; @@ -17183,6 +17239,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) { str_info->length = size; str_info->is_unicode = FALSE; +#endif return TRUE; } diff --git a/src/regex/_regex_core.py b/src/regex/_regex_core.py index fba42b7f57..4b65078eda 100644 --- a/src/regex/_regex_core.py +++ b/src/regex/_regex_core.py @@ -14,7 +14,6 @@ # 2010-01-16 mrab Python front-end re-written and extended import string -import sys import unicodedata from collections import defaultdict @@ -23,7 +22,6 @@ _regex = plugins['_regex'][0] if _regex is None: raise RuntimeError('Failed to load regex module with error: ' + plugins['_regex'][1]) - __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", @@ -2842,6 +2840,9 @@ class GreedyRepeat(RegexBase): def is_atomic(self): return self.min_count == self.max_count and self.subpattern.is_atomic() + def can_be_affix(self): + return False + def contains_group(self): return self.subpattern.contains_group() @@ -3114,7 +3115,7 @@ class LookAroundConditional(RegexBase): print("%sEITHER" % (INDENT * indent)) self.yes_item.dump(indent + 1, reverse) if not self.no_item.is_empty(): - print("%sOR".format(INDENT * indent)) + print("%sOR" % (INDENT * indent)) self.no_item.dump(indent + 1, reverse) def is_empty(self):