From 2068e52b82021b8d57f1c6de5808bfa798aea7e6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 May 2015 09:55:10 +0530 Subject: [PATCH] Update regex engine to latest (2.4.61) --- src/regex/README | 2 +- src/regex/__init__.py | 79 +++-- src/regex/_regex.c | 719 +++++++++++++++++++++++++++++---------- src/regex/_regex_core.py | 257 ++++++++------ 4 files changed, 743 insertions(+), 314 deletions(-) diff --git a/src/regex/README b/src/regex/README index 87069c0182..2ff93d4dc6 100644 --- a/src/regex/README +++ b/src/regex/README @@ -1,4 +1,4 @@ -This regex engine is taken, with thanks, from: https://code.google.com/p/mrab-regex-hg/ +This regex engine is taken, with thanks, from: https://bitbucket.org/mrabarnett/mrab-regex It is licensed under the Python Software Foundation License diff --git a/src/regex/__init__.py b/src/regex/__init__.py index e8384c5511..e620cc1f97 100644 --- a/src/regex/__init__.py +++ b/src/regex/__init__.py @@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex"] -__version__ = "2.4.48" +__version__ = "2.4.61" # -------------------------------------------------------------------- # Public interface. @@ -333,6 +333,7 @@ def compile(pattern, flags=0, **kwargs): def purge(): "Clear the regular expression cache" _cache.clear() + _locale_sensitive.clear() def template(pattern, flags=0): "Compile a template pattern, returning a pattern object." @@ -423,38 +424,43 @@ _MAXREPCACHE = 500 def _compile(pattern, flags=0, kwargs={}): "Compiles a regular expression to a PatternObject." + + # We won't bother to cache the pattern if we're debugging. + debugging = (flags & DEBUG) != 0 + # What locale is this pattern using? locale_key = (type(pattern), pattern) if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0: # This pattern is, or might be, locale-sensitive. - pattern_locale = _getlocale() + pattern_locale = _getlocale()[1] else: # This pattern is definitely not locale-sensitive. pattern_locale = None - try: - # Do we know what keyword arguments are needed? - args_key = pattern, type(pattern), flags - args_needed = _named_args[args_key] + if not debugging: + try: + # Do we know what keyword arguments are needed? + args_key = pattern, type(pattern), flags + args_needed = _named_args[args_key] - # Are we being provided with its required keyword arguments? - args_supplied = set() - if args_needed: - for k, v in args_needed: - try: - args_supplied.add((k, frozenset(kwargs[k]))) - except KeyError: - raise error("missing named list: {!r}".format(k)) + # Are we being provided with its required keyword arguments? + args_supplied = set() + if args_needed: + for k, v in args_needed: + try: + args_supplied.add((k, frozenset(kwargs[k]))) + except KeyError: + raise error("missing named list: {!r}".format(k)) - args_supplied = frozenset(args_supplied) + args_supplied = frozenset(args_supplied) - # Have we already seen this regular expression and named list? - pattern_key = (pattern, type(pattern), flags, args_supplied, - DEFAULT_VERSION, pattern_locale) - return _cache[pattern_key] - except KeyError: - # It's a new pattern, or new named list for a known pattern. - pass + # Have we already seen this regular expression and named list? + pattern_key = (pattern, type(pattern), flags, args_supplied, + DEFAULT_VERSION, pattern_locale) + return _cache[pattern_key] + except KeyError: + # It's a new pattern, or new named list for a known pattern. + pass # Guess the encoding from the class of the pattern string. if isinstance(pattern, unicode): @@ -463,7 +469,7 @@ def _compile(pattern, flags=0, kwargs={}): guess_encoding = ASCII elif isinstance(pattern, _pattern_type): if flags: - raise ValueError("can't process flags argument with a compiled pattern") + raise ValueError("cannot process flags argument with a compiled pattern") return pattern else: @@ -490,10 +496,11 @@ def _compile(pattern, flags=0, kwargs={}): caught_exception = e if caught_exception: - raise error(str(caught_exception)) + raise error(caught_exception.msg, caught_exception.pattern, + caught_exception.pos) if not source.at_end(): - raise error("trailing characters in pattern at position %d" % source.pos) + raise error("unbalanced parenthesis", pattern, source.pos) # Check the global flags for conflicts. version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION @@ -520,7 +527,7 @@ def _compile(pattern, flags=0, kwargs={}): parsed.dump(indent=0, reverse=reverse) # Fix the group references. - parsed.fix_groups(reverse, False) + parsed.fix_groups(pattern, reverse, False) # Optimise the parsed pattern. parsed = parsed.optimise(info) @@ -591,19 +598,23 @@ def _compile(pattern, flags=0, kwargs={}): if len(_cache) >= _MAXCACHE: _cache_lock.acquire() try: - _shrink_cache(_cache, _named_args, _MAXCACHE) + _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE) finally: _cache_lock.release() - args_needed = frozenset(args_needed) + if not debugging: + if (info.flags & LOCALE) == 0: + pattern_locale = None - # Store this regular expression and named list. - pattern_key = (pattern, type(pattern), flags, args_needed, DEFAULT_VERSION, - pattern_locale) - _cache[pattern_key] = compiled_pattern + args_needed = frozenset(args_needed) - # Store what keyword arguments are needed. - _named_args[args_key] = args_needed + # Store this regular expression and named list. + pattern_key = (pattern, type(pattern), flags, args_needed, + DEFAULT_VERSION, pattern_locale) + _cache[pattern_key] = compiled_pattern + + # Store what keyword arguments are needed. + _named_args[args_key] = args_needed return compiled_pattern diff --git a/src/regex/_regex.c b/src/regex/_regex.c index d15b45093d..8433f218c3 100644 --- a/src/regex/_regex.c +++ b/src/regex/_regex.c @@ -678,6 +678,15 @@ typedef struct SplitterObject { Py_ssize_t index; int status; } SplitterObject; +#if PY_VERSION_HEX >= 0x02060000 + +/* The CaptureObject. */ +typedef struct CaptureObject { + PyObject_HEAD + Py_ssize_t group_index; + MatchObject** match_indirect; +} CaptureObject; +#endif /* Info used when compiling a pattern to nodes. */ typedef struct RE_CompileArgs { @@ -727,6 +736,30 @@ typedef struct { /* Function types for getting info from a MatchObject. */ typedef PyObject* (*RE_GetByIndexFunc)(MatchObject* self, Py_ssize_t index); +#if defined(PYPY_VERSION) +/* PyPy does not define PyLong_FromUnicode, so include our own implementation. + */ +Py_LOCAL_INLINE(PyObject*) PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, + int base) { + PyObject* result; + char* buffer = (char*)PyMem_MALLOC(length + 1); + + if (buffer == NULL) + return NULL; + + if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) { + PyMem_FREE(buffer); + + return NULL; + } + + result = PyLong_FromString(buffer, NULL, base); + PyMem_FREE(buffer); + + return result; +} + +#endif /* Returns the magnitude of a 'Py_ssize_t' value. */ Py_LOCAL_INLINE(Py_ssize_t) abs_ssize_t(Py_ssize_t x) { return x >= 0 ? x : -x; @@ -1640,12 +1673,12 @@ Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state, int prop; int prop_m1; Py_ssize_t pos_m1; - Py_UCS4 char_p1; Py_ssize_t pos_p1; int prop_p1; + Py_UCS4 char_p1; Py_ssize_t pos_m2; - Py_UCS4 char_m2; int prop_m2; + Py_UCS4 char_m2; char_at = state->char_at; @@ -2033,8 +2066,8 @@ Py_LOCAL_INLINE(void) set_error(int status, PyObject* object) { object->ob_type->tp_name); break; case RE_ERROR_NOT_UNICODE: - PyErr_Format(PyExc_TypeError, - "expected unicode instance, %.200s found", object->ob_type->tp_name); + PyErr_Format(PyExc_TypeError, "expected unicode instance, not %.200s", + object->ob_type->tp_name); break; case RE_ERROR_NO_SUCH_GROUP: PyErr_SetString(PyExc_IndexError, "no such group"); @@ -2158,8 +2191,8 @@ Py_LOCAL_INLINE(BOOL) in_range(Py_UCS4 lower, Py_UCS4 upper, Py_UCS4 ch) { /* Checks whether a character is in a range, ignoring case. */ Py_LOCAL_INLINE(BOOL) in_range_ign(RE_EncodingTable* encoding, RE_LocaleInfo* locale_info, Py_UCS4 lower, Py_UCS4 upper, Py_UCS4 ch) { - Py_UCS4 cases[RE_MAX_CASES]; int count; + Py_UCS4 cases[RE_MAX_CASES]; int i; count = encoding->all_cases(locale_info, ch, cases); @@ -2186,8 +2219,8 @@ static BOOL same_char_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo* /* Checks whether 2 characters are the same, ignoring case. */ Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo* locale_info, Py_UCS4 ch1, Py_UCS4 ch2) { - Py_UCS4 cases[RE_MAX_CASES]; int count; + Py_UCS4 cases[RE_MAX_CASES]; int i; if (ch1 == ch2) @@ -2597,7 +2630,7 @@ Py_LOCAL_INLINE(BOOL) in_set_union_ign(RE_EncodingTable* encoding, /* Checks whether a character is in a set. */ Py_LOCAL_INLINE(BOOL) matches_SET(RE_EncodingTable* encoding, - RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { +RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { switch (node->op) { case RE_OP_SET_DIFF: case RE_OP_SET_DIFF_REV: @@ -2618,7 +2651,7 @@ Py_LOCAL_INLINE(BOOL) matches_SET(RE_EncodingTable* encoding, /* Checks whether a character is in a set, ignoring case. */ Py_LOCAL_INLINE(BOOL) matches_SET_IGN(RE_EncodingTable* encoding, - RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { +RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { Py_UCS4 cases[RE_MAX_CASES]; int case_count; @@ -7625,11 +7658,11 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_REV(RE_SafeState* safe_state, Py_LOCAL_INLINE(int) search_start(RE_SafeState* safe_state, RE_NextNode* next, RE_Position* new_position, int search_index) { RE_State* state; - Py_ssize_t text_pos; + Py_ssize_t start_pos; RE_Node* test; RE_Node* node; - Py_ssize_t start_pos; RE_SearchPosition* info; + Py_ssize_t text_pos; state = safe_state->re_state; @@ -8874,6 +8907,34 @@ Py_LOCAL_INLINE(BOOL) guard_repeat(RE_SafeState* safe_state, size_t index, return guard(safe_state, guard_list, text_pos, protect); } +/* Guards a range of positions against further matching for a repeat. */ +Py_LOCAL_INLINE(BOOL) guard_repeat_range(RE_SafeState* safe_state, size_t + index, Py_ssize_t lo_pos, Py_ssize_t hi_pos, RE_STATUS_T guard_type, BOOL + protect) { + RE_State* state; + RE_GuardList* guard_list; + Py_ssize_t pos; + + state = safe_state->re_state; + + /* Is a guard active here? */ + if (!(state->pattern->repeat_info[index].status & guard_type)) + return TRUE; + + /* Which guard list? */ + if (guard_type & RE_STATUS_BODY) + guard_list = &state->repeats[index].body_guard_list; + else + guard_list = &state->repeats[index].tail_guard_list; + + for (pos = lo_pos; pos <= hi_pos; pos++) { + if (!guard(safe_state, guard_list, pos, protect)) + return FALSE; + } + + return TRUE; +} + /* Checks whether a position is guarded against further matching for a repeat. */ Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index, @@ -9332,9 +9393,9 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state, Py_ssize_t len; Py_ssize_t consumed; Py_UCS4 codepoints[RE_MAX_FOLDED]; - PyObject* string_set; Py_ssize_t first; Py_ssize_t last; + PyObject* string_set; state = safe_state->re_state; full_case_fold = state->encoding->full_case_fold; @@ -9868,10 +9929,10 @@ found: Py_LOCAL_INLINE(int) retry_fuzzy_match_item(RE_SafeState* safe_state, BOOL search, Py_ssize_t* text_pos, RE_Node** node, BOOL advance) { RE_State* state; - RE_FuzzyData data; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; RE_BacktrackData* bt_data; + RE_FuzzyData data; int step; state = safe_state->re_state; @@ -9972,13 +10033,13 @@ Py_LOCAL_INLINE(int) retry_fuzzy_insert(RE_SafeState* safe_state, Py_ssize_t* text_pos, RE_Node** node) { RE_State* state; RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; RE_BacktrackData* bt_data; Py_ssize_t new_text_pos; RE_Node* new_node; int step; Py_ssize_t limit; RE_Node* fuzzy_node; - RE_CODE* values; state = safe_state->re_state; fuzzy_info = &state->fuzzy_info; @@ -10111,10 +10172,10 @@ Py_LOCAL_INLINE(int) retry_fuzzy_match_string(RE_SafeState* safe_state, BOOL search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos, BOOL* matched) { RE_State* state; - RE_FuzzyData data; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; RE_BacktrackData* bt_data; + RE_FuzzyData data; RE_Node* new_node; state = safe_state->re_state; @@ -10217,10 +10278,10 @@ Py_LOCAL_INLINE(int) fuzzy_match_string_fld(RE_SafeState* safe_state, BOOL search, Py_ssize_t* text_pos, RE_Node* node, Py_ssize_t* string_pos, int* folded_pos, int folded_len, BOOL* matched, int step) { RE_State* state; + Py_ssize_t new_text_pos; RE_FuzzyData data; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; - Py_ssize_t new_text_pos; RE_BacktrackData* bt_data; state = safe_state->re_state; @@ -10297,12 +10358,12 @@ Py_LOCAL_INLINE(int) retry_fuzzy_match_string_fld(RE_SafeState* safe_state, BOOL search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos, int* folded_pos, BOOL* matched) { RE_State* state; - RE_FuzzyData data; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; RE_BacktrackData* bt_data; Py_ssize_t new_text_pos; RE_Node* new_node; + RE_FuzzyData data; state = safe_state->re_state; fuzzy_info = &state->fuzzy_info; @@ -10414,11 +10475,11 @@ Py_LOCAL_INLINE(int) fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL Py_ssize_t* group_pos, int* gfolded_pos, int gfolded_len, BOOL* matched, int step) { RE_State* state; + Py_ssize_t new_text_pos; RE_FuzzyData data; + Py_ssize_t new_group_pos; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; - Py_ssize_t new_text_pos; - Py_ssize_t new_group_pos; RE_BacktrackData* bt_data; state = safe_state->re_state; @@ -10499,13 +10560,13 @@ Py_LOCAL_INLINE(int) retry_fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL search, Py_ssize_t* text_pos, RE_Node** node, int* folded_pos, Py_ssize_t* group_pos, int* gfolded_pos, BOOL* matched) { RE_State* state; - RE_FuzzyData data; RE_FuzzyInfo* fuzzy_info; RE_CODE* values; RE_BacktrackData* bt_data; Py_ssize_t new_text_pos; - Py_ssize_t new_group_pos; RE_Node* new_node; + Py_ssize_t new_group_pos; + RE_FuzzyData data; state = safe_state->re_state; fuzzy_info = &state->fuzzy_info; @@ -10569,7 +10630,8 @@ found: } /* Locates the required string, if there's one. */ -Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { +Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state, + BOOL search) { RE_State* state; PatternObject* pattern; Py_ssize_t found_pos; @@ -10592,21 +10654,29 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } found_pos = string_search(safe_state, pattern->req_string, - state->text_pos, state->slice_end, &is_partial); + state->text_pos, limit, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = found_pos + - (Py_ssize_t)pattern->req_string->value_count; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos + + (Py_ssize_t)pattern->req_string->value_count; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10621,20 +10691,28 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING_FLD: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } found_pos = string_search_fld(safe_state, pattern->req_string, - state->text_pos, state->slice_end, &end_pos, &is_partial); + state->text_pos, limit, &end_pos, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = end_pos; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = end_pos; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10649,20 +10727,28 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING_FLD_REV: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } found_pos = string_search_fld_rev(safe_state, pattern->req_string, - state->text_pos, state->slice_start, &end_pos, &is_partial); + state->text_pos, limit, &end_pos, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = end_pos; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = end_pos; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10677,21 +10763,29 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING_IGN: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } found_pos = string_search_ign(safe_state, pattern->req_string, - state->text_pos, state->slice_end, &is_partial); + state->text_pos, limit, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = found_pos + - (Py_ssize_t)pattern->req_string->value_count; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos + + (Py_ssize_t)pattern->req_string->value_count; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10706,21 +10800,29 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING_IGN_REV: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } found_pos = string_search_ign_rev(safe_state, pattern->req_string, - state->text_pos, state->slice_start, &is_partial); + state->text_pos, limit, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = found_pos - - (Py_ssize_t)pattern->req_string->value_count; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos - + (Py_ssize_t)pattern->req_string->value_count; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10735,21 +10837,29 @@ Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) { case RE_OP_STRING_REV: { BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } found_pos = string_search_rev(safe_state, pattern->req_string, - state->text_pos, state->slice_start, &is_partial); + state->text_pos, limit, &is_partial); if (found_pos < 0) /* The required string wasn't found. */ return -1; - if (is_partial) - /* We found a partial match, so start matching from there. */ - return found_pos; - - /* Record where the required string matched. */ - state->req_pos = found_pos; - state->req_end = found_pos - - (Py_ssize_t)pattern->req_string->value_count; + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos - + (Py_ssize_t)pattern->req_string->value_count; + } if (pattern->req_offset >= 0) { /* Step back from the required string to where we should start @@ -10845,10 +10955,10 @@ Py_LOCAL_INLINE(int) basic_match(RE_SafeState* safe_state, RE_Node* start_node, Py_ssize_t string_pos; BOOL do_search_start; Py_ssize_t found_pos; + int status; + RE_Node* node; int folded_pos; int gfolded_pos; - RE_Node* node; - int status; TRACE(("<>\n")) state = safe_state->re_state; @@ -10914,7 +11024,7 @@ start_match: if (!pattern->req_string || recursive_call) found_pos = state->text_pos; else { - found_pos = locate_required_string(safe_state); + found_pos = locate_required_string(safe_state, search); if (found_pos < 0) return RE_ERROR_FAILURE; } @@ -14489,6 +14599,17 @@ backtrack: state->text_pos = pos; goto advance; } else { + /* Don't try this repeated match again. */ + if (step > 0) { + if (!guard_repeat_range(safe_state, bt_data->repeat.index, + limit, pos, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + } else if (step < 0) { + if (!guard_repeat_range(safe_state, bt_data->repeat.index, + pos, limit, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + } + /* We've backtracked the repeat as far as we can. */ rp_data->start = bt_data->repeat.text_pos; rp_data->count = bt_data->repeat.count; @@ -15236,7 +15357,6 @@ backtrack: if (status < 0) return RE_ERROR_PARTIAL; - if (matched) goto advance; @@ -15255,7 +15375,6 @@ backtrack: if (status < 0) return RE_ERROR_PARTIAL; - if (matched) goto advance; @@ -15450,14 +15569,14 @@ Py_LOCAL_INLINE(int) do_match(RE_SafeState* safe_state, BOOL search) { Py_ssize_t available; BOOL get_best; BOOL enhance_match; - BOOL must_advance; RE_GroupData* best_groups; Py_ssize_t best_match_pos; - Py_ssize_t best_text_pos = 0; /* Initialise to stop compiler warning. */ - int status; + BOOL must_advance; Py_ssize_t slice_start; Py_ssize_t slice_end; + int status; size_t best_fuzzy_counts[RE_FUZZY_COUNT]; + Py_ssize_t best_text_pos = 0; /* Initialise to stop compiler warning. */ TRACE(("<>\n")) state = safe_state->re_state; @@ -15765,8 +15884,8 @@ Py_LOCAL_INLINE(BOOL) state_init_2(RE_State* state, PatternObject* pattern, PyObject* string, RE_StringInfo* str_info, Py_ssize_t start, Py_ssize_t end, BOOL overlapped, int concurrent, BOOL partial, BOOL use_lock, BOOL visible_captures, BOOL match_all) { - Py_ssize_t final_pos; int i; + Py_ssize_t final_pos; state->groups = NULL; state->repeats = NULL; @@ -16463,9 +16582,9 @@ static PyObject* match_get_span_by_index(MatchObject* self, Py_ssize_t index) { /* Gets a MatchObject's spans by integer index. */ static PyObject* match_get_spans_by_index(MatchObject* self, Py_ssize_t index) { - RE_GroupData* group; PyObject* result; PyObject* item; + RE_GroupData* group; size_t i; if (index < 0 || (size_t)index > self->group_count) { @@ -16518,9 +16637,9 @@ error: /* Gets a MatchObject's captures by integer index. */ static PyObject* match_get_captures_by_index(MatchObject* self, Py_ssize_t index) { - RE_GroupData* group; PyObject* result; PyObject* slice; + RE_GroupData* group; size_t i; if (index < 0 || (size_t)index > self->group_count) { @@ -17257,9 +17376,9 @@ Py_LOCAL_INLINE(PyObject*) match_get_group_dict(MatchObject* self) { goto failed; for (g = 0; g < PyList_GET_SIZE(keys); g++) { - int status; PyObject* key; PyObject* value; + int status; /* PyList_GET_ITEM borrows a reference. */ key = PyList_GET_ITEM(keys, g); @@ -17286,6 +17405,92 @@ failed: return NULL; } +static PyTypeObject Capture_Type = { + PyObject_HEAD_INIT(NULL) + 0, + "_" RE_MODULE "." "Capture", + sizeof(MatchObject) +}; + +/* Creates a new CaptureObject. */ +Py_LOCAL_INLINE(PyObject*) make_capture_object(MatchObject** match_indirect, + Py_ssize_t index) { + CaptureObject* capture; + + capture = PyObject_NEW(CaptureObject, &Capture_Type); + if (!capture) + return NULL; + + capture->group_index = index; + capture->match_indirect = match_indirect; + + return (PyObject*)capture; +} + +#if PY_VERSION_HEX >= 0x02060000 +/* Makes a MatchObject's capture dictionary. */ +Py_LOCAL_INLINE(PyObject*) make_capture_dict(MatchObject* match, MatchObject** + match_indirect) { + PyObject* result; + PyObject* keys; + PyObject* values = NULL; + Py_ssize_t g; + + result = PyDict_New(); + if (!result) + return result; + + keys = PyMapping_Keys(match->pattern->groupindex); + if (!keys) + goto failed; + + values = PyMapping_Values(match->pattern->groupindex); + if (!values) + goto failed; + + for (g = 0; g < PyList_GET_SIZE(keys); g++) { + PyObject* key; + PyObject* value; + Py_ssize_t v; + int status; + + /* PyList_GET_ITEM borrows a reference. */ + key = PyList_GET_ITEM(keys, g); + if (!key) + goto failed; + + /* PyList_GET_ITEM borrows a reference. */ + value = PyList_GET_ITEM(values, g); + if (!value) + goto failed; + + v = PyLong_AsLong(value); + if (v == -1 && PyErr_Occurred()) + goto failed; + + value = make_capture_object(match_indirect, v); + if (!value) + goto failed; + + status = PyDict_SetItem(result, key, value); + Py_DECREF(value); + if (status < 0) + goto failed; + } + + Py_DECREF(values); + Py_DECREF(keys); + + return result; + +failed: + Py_XDECREF(values); + Py_XDECREF(keys); + Py_DECREF(result); + return NULL; +} +#endif + /* MatchObject's 'expandf' method. */ static PyObject* match_expandf(MatchObject* self, PyObject* str_template) { PyObject* format_func; @@ -17304,14 +17509,15 @@ static PyObject* match_expandf(MatchObject* self, PyObject* str_template) { for (g = 0; g < self->group_count + 1; g++) /* PyTuple_SetItem borrows the reference. */ - PyTuple_SetItem(args, (Py_ssize_t)g, match_get_group_by_index(self, - (Py_ssize_t)g, Py_None)); + PyTuple_SetItem(args, (Py_ssize_t)g, make_capture_object(&self, + (Py_ssize_t)g)); - kwargs = match_get_group_dict(self); + kwargs = make_capture_dict(self, &self); if (!kwargs) goto error; result = PyObject_Call(format_func, args, kwargs); + Py_DECREF(kwargs); Py_DECREF(args); Py_DECREF(format_func); @@ -18459,6 +18665,159 @@ static void splitter_dealloc(PyObject* self_) { Py_DECREF(self->pattern); PyObject_DEL(self); } +#if PY_VERSION_HEX >= 0x02060000 + +/* Converts a captures index to an integer. + * + * A negative capture index in 'expandf' and 'subf' is passed as a string + * because negative indexes are not supported by 'str.format'. + */ +Py_LOCAL_INLINE(Py_ssize_t) index_to_integer(PyObject* item) { + Py_ssize_t value; + + value = PyInt_AsSsize_t(item); + if (value != -1 || !PyErr_Occurred()) + return value; + + PyErr_Clear(); + + value = PyLong_AsLong(item); + if (value != -1 || !PyErr_Occurred()) + return value; + + PyErr_Clear(); + + /* Is the index a string representation of an integer? */ + if (PyUnicode_Check(item)) { + PyObject* int_obj; + Py_UNICODE* characters; + Py_ssize_t length; + + characters = (Py_UNICODE*)PyUnicode_AS_DATA(item); + length = PyUnicode_GET_SIZE(item); + int_obj = PyLong_FromUnicode(characters, length, 0); + if (!int_obj) + goto error; + + value = PyLong_AsLong(int_obj); + Py_DECREF(int_obj); + if (!PyErr_Occurred()) + return value; + } else if (PyString_Check(item)) { + char* characters; + PyObject* int_obj; + + characters = PyString_AsString(item); + int_obj = PyLong_FromString(characters, NULL, 0); + if (!int_obj) + goto error; + + value = PyLong_AsLong(int_obj); + Py_DECREF(int_obj); + if (!PyErr_Occurred()) + return value; + } + +error: + PyErr_Format(PyExc_TypeError, "list indices must be integers, not %.200s", + item->ob_type->tp_name); + + return -1; +} + +/* CaptureObject's length method. */ +Py_LOCAL_INLINE(Py_ssize_t) capture_length(CaptureObject* self) { + MatchObject* match; + RE_GroupData* group; + + if (self->group_index == 0) + return 1; + + match = *self->match_indirect; + group = &match->groups[self->group_index - 1]; + + return (Py_ssize_t)group->capture_count; +} + +/* CaptureObject's '__getitem__' method. */ +static PyObject* capture_getitem(CaptureObject* self, PyObject* item) { + Py_ssize_t index; + MatchObject* match; + Py_ssize_t start; + Py_ssize_t end; + + index = index_to_integer(item); + if (index == -1 && PyErr_Occurred()) + return NULL; + + match = *self->match_indirect; + + if (self->group_index == 0) { + if (index < 0) + index += 1; + + if (index != 0) { + PyErr_SetString(PyExc_IndexError, "list index out of range"); + return NULL; + } + + start = match->match_start; + end = match->match_end; + } else { + RE_GroupData* group; + RE_GroupSpan* span; + + group = &match->groups[self->group_index - 1]; + + if (index < 0) + index += group->capture_count; + + if (index < 0 || index >= (Py_ssize_t)group->capture_count) { + PyErr_SetString(PyExc_IndexError, "list index out of range"); + return NULL; + } + + span = &group->captures[index]; + + start = span->start; + end = span->end; + } + + return get_slice(match->substring, start - match->substring_offset, end - + match->substring_offset); +} + +static PyMappingMethods capture_as_mapping = { + (lenfunc)capture_length, /* mp_length */ + (binaryfunc)capture_getitem, /* mp_subscript */ + 0, /* mp_ass_subscript */ +}; + +/* CaptureObject's methods. */ +static PyMethodDef capture_methods[] = { + {"__getitem__", (PyCFunction)capture_getitem, METH_O|METH_COEXIST}, + {NULL, NULL} +}; + +/* Deallocates a CaptureObject. */ +static void capture_dealloc(PyObject* self_) { + CaptureObject* self; + + self = (CaptureObject*)self_; + PyObject_DEL(self); +} + +/* CaptureObject's 'str' method. */ +static PyObject* capture_str(PyObject* self_) { + CaptureObject* self; + MatchObject* match; + + self = (CaptureObject*)self_; + match = *self->match_indirect; + + return match_get_group_by_index(match, self->group_index, Py_None); +} +#endif static PyMemberDef splitter_members[] = { {"pattern", T_OBJECT, offsetof(SplitterObject, pattern), READONLY, @@ -18725,20 +19084,26 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* Py_ssize_t start; Py_ssize_t end; BOOL is_callable = FALSE; - BOOL is_literal = FALSE; - BOOL is_template = FALSE; PyObject* replacement = NULL; + BOOL is_literal = FALSE; #if PY_VERSION_HEX >= 0x02060000 BOOL is_format = FALSE; #endif + BOOL is_template = FALSE; RE_State state; RE_SafeState safe_state; JoinInfo join_info; Py_ssize_t sub_count; Py_ssize_t last_pos; - PyObject* item; - Py_ssize_t end_pos; Py_ssize_t step; + PyObject* item; + MatchObject* match; +#if PY_VERSION_HEX >= 0x02060000 + BOOL built_capture = FALSE; +#endif + PyObject* args = NULL; + PyObject* kwargs = NULL; + Py_ssize_t end_pos; /* Get the string. */ if (!get_string(string, &str_info)) @@ -18842,13 +19207,17 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* } /* The MatchObject, and therefore repeated captures, will be visible only - * if the replacement is callable. + * if the replacement is callable or subf is used. */ - if (!state_init_2(&state, self, string, &str_info, start, end, FALSE, - concurrent, FALSE, FALSE, is_callable, FALSE)) { #if PY_VERSION_HEX >= 0x02060000 + if (!state_init_2(&state, self, string, &str_info, start, end, FALSE, + concurrent, FALSE, FALSE, is_callable || (sub_type & RE_SUBF) != 0, + FALSE)) { release_buffer(&str_info); +#else + if (!state_init_2(&state, self, string, &str_info, start, end, FALSE, + concurrent, FALSE, FALSE, is_callable, FALSE)) { #endif Py_XDECREF(replacement); return NULL; @@ -18898,10 +19267,7 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* #if PY_VERSION_HEX >= 0x02060000 } else if (is_format) { /* The replacement is a format string. */ - MatchObject* match; - PyObject* args; size_t g; - PyObject* kwargs; /* We need to create the arguments for the 'format' method. We'll * start by creating a MatchObject. @@ -18910,31 +19276,38 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* if (!match) goto error; - /* The args are a tuple of the capture group matches. */ - args = PyTuple_New((Py_ssize_t)state.pattern->public_group_count + - 1); - if (!args) { - Py_DECREF(match); - goto error; - } + /* We'll build the args and kwargs the first time. They'll be using + * capture objects which refer to the match object indirectly; this + * means that args and kwargs can be reused with different match + * objects. + */ + if (!built_capture) { + /* The args are a tuple of the capture group matches. */ + args = PyTuple_New(match->group_count + 1); + if (!args) { + Py_DECREF(match); + goto error; + } - for (g = 0; g < state.pattern->public_group_count + 1; g++) - /* PyTuple_SetItem borrows the reference. */ - PyTuple_SetItem(args, (Py_ssize_t)g, - match_get_group_by_index(match, (Py_ssize_t)g, Py_None)); + for (g = 0; g < match->group_count + 1; g++) + /* PyTuple_SetItem borrows the reference. */ + PyTuple_SetItem(args, (Py_ssize_t)g, + make_capture_object(&match, (Py_ssize_t)g)); - /* The kwargs are a dict of the named capture group matches. */ - kwargs = match_get_group_dict(match); - if (!kwargs) { - Py_DECREF(args); - Py_DECREF(match); - goto error; + /* The kwargs are a dict of the named capture group matches. */ + kwargs = make_capture_dict(match, &match); + if (!kwargs) { + Py_DECREF(args); + Py_DECREF(match); + goto error; + } + + built_capture = TRUE; } /* Call the 'format' method. */ item = PyObject_Call(replacement, args, kwargs); - Py_DECREF(kwargs); - Py_DECREF(args); + Py_DECREF(match); if (!item) goto error; @@ -19054,6 +19427,13 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* state_fini(&state); +#if PY_VERSION_HEX >= 0x02060000 + if (built_capture) { + Py_DECREF(kwargs); + Py_DECREF(args); + } + +#endif if (!item) return NULL; @@ -19063,6 +19443,13 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* return item; error: +#if PY_VERSION_HEX >= 0x02060000 + if (built_capture) { + Py_DECREF(kwargs); + Py_DECREF(args); + } + +#endif clear_join_list(&join_info); state_fini(&state); Py_XDECREF(replacement); @@ -19339,15 +19726,15 @@ static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject* kwargs) { Py_ssize_t start; Py_ssize_t end; - RE_State state; int conc; + RE_State state; RE_SafeState safe_state; PyObject* list; Py_ssize_t step; int status; - size_t g; Py_ssize_t b; Py_ssize_t e; + size_t g; PyObject* string; PyObject* pos = Py_None; @@ -19593,8 +19980,8 @@ PyDoc_STRVAR(pattern_doc, "Compiled regex object"); /* Deallocates a PatternObject. */ static void pattern_dealloc(PyObject* self_) { PatternObject* self; - int partial_side; size_t i; + int partial_side; self = (PatternObject*)self_; @@ -20241,8 +20628,8 @@ Py_LOCAL_INLINE(void) use_nodes(RE_Node* node) { * Optimising the nodes might result in some nodes no longer being used. */ Py_LOCAL_INLINE(void) discard_unused_nodes(PatternObject* pattern) { - size_t new_count; size_t i; + size_t new_count; /* Mark the nodes which are being used. */ use_nodes(pattern->start_node); @@ -20841,8 +21228,8 @@ Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) { Py_LOCAL_INLINE(int) build_ATOMIC(RE_CompileArgs* args) { RE_Node* atomic_node; RE_CompileArgs subargs; - RE_Node* success_node; int status; + RE_Node* success_node; /* codes: opcode, sequence, end. */ if (args->code + 1 > args->end_code) @@ -21201,8 +21588,8 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) { RE_Node* start_node; RE_Node* end_node; RE_CompileArgs subargs; - Py_ssize_t min_width; int status; + Py_ssize_t min_width; /* codes: opcode, sequence, next, sequence, end. */ if (args->code + 2 > args->end_code) @@ -21212,6 +21599,10 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) { args->code += 2; + /* Record that we have a reference to a group. */ + if (!record_ref_group(args->pattern, group)) + return RE_ERROR_MEMORY; + /* Create nodes for the start and end of the structure. */ start_node = create_node(args->pattern, RE_OP_GROUP_EXISTS, 0, 0, 1); end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); @@ -21281,9 +21672,9 @@ Py_LOCAL_INLINE(int) build_LOOKAROUND(RE_CompileArgs* args) { RE_CODE flags; BOOL forward; RE_Node* lookaround_node; - RE_Node* success_node; RE_CompileArgs subargs; int status; + RE_Node* success_node; /* codes: opcode, flags, forward, sequence, end. */ if (args->code + 3 > args->end_code) @@ -21423,10 +21814,6 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) { if (args->code + 3 > args->end_code) return RE_ERROR_ILLEGAL; - /* This includes special cases such as optional items, which we'll check - * for and treat specially. They don't need repeat counts, which helps us - * avoid unnecessary work when matching. - */ greedy = args->code[0] == RE_OP_GREEDY_REPEAT; min_count = args->code[1]; max_count = args->code[2]; @@ -21435,50 +21822,7 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) { args->code += 3; - if (min_count == 0 && max_count == 1) { - /* Optional sequence. */ - RE_Node* branch_node; - RE_Node* join_node; - RE_CompileArgs subargs; - - /* Create the start and end nodes. */ - branch_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); - join_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); - if (!branch_node || !join_node) - return RE_ERROR_MEMORY; - - /* Compile the sequence and check that we've reached the end of it. */ - subargs = *args; - subargs.has_captures = FALSE; - subargs.is_fuzzy = FALSE; - status = build_sequence(&subargs); - if (status != RE_ERROR_SUCCESS) - return status; - - if (subargs.code[0] != RE_OP_END) - return RE_ERROR_ILLEGAL; - - args->code = subargs.code; - args->has_captures |= subargs.has_captures; - args->is_fuzzy |= subargs.is_fuzzy; - - ++args->code; - - if (greedy) { - /* It's a greedy option. */ - add_node(branch_node, subargs.start); - add_node(branch_node, join_node); - } else { - /* It's a lazy option. */ - add_node(branch_node, join_node); - add_node(branch_node, subargs.start); - } - add_node(subargs.end, join_node); - - /* Append the optional sequence. */ - add_node(args->end, branch_node); - args->end = join_node; - } else if (min_count == 1 && max_count == 1) { + if (min_count == 1 && max_count == 1) { /* Singly-repeated sequence. */ RE_CompileArgs subargs; @@ -22175,17 +22519,17 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) { PyObject* named_list_indexes; Py_ssize_t req_offset; PyObject* required_chars; - size_t req_length; - RE_CODE* req_chars; Py_ssize_t req_flags; size_t public_group_count; Py_ssize_t code_len; RE_CODE* code; Py_ssize_t i; + RE_CODE* req_chars; + size_t req_length; PatternObject* self; - BOOL ascii; - BOOL locale; BOOL unicode; + BOOL locale; + BOOL ascii; BOOL ok; if (!PyArg_ParseTuple(args, "OnOOOOOnOnn:re_compile", &pattern, &flags, @@ -22373,14 +22717,14 @@ static PyObject* get_properties(PyObject* self_, PyObject* args) { static PyObject* fold_case(PyObject* self_, PyObject* args) { RE_StringInfo str_info; Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo locale_info; Py_ssize_t folded_charsize; void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); - RE_EncodingTable* encoding; Py_ssize_t buf_size; void* folded; Py_ssize_t folded_len; PyObject* result; - RE_LocaleInfo locale_info; Py_ssize_t flags; PyObject* string; @@ -22527,8 +22871,8 @@ static PyObject* fold_case(PyObject* self_, PyObject* args) { */ static PyObject* get_expand_on_folding(PyObject* self, PyObject* unused) { int count; - int i; PyObject* result; + int i; /* How many characters are there? */ count = sizeof(re_expand_on_folding) / sizeof(re_expand_on_folding[0]); @@ -22582,12 +22926,12 @@ static PyObject* has_property_value(PyObject* self_, PyObject* args) { */ static PyObject* get_all_cases(PyObject* self_, PyObject* args) { RE_EncodingTable* encoding; + RE_LocaleInfo locale_info; int count; Py_UCS4 cases[RE_MAX_CASES]; - Py_UCS4 folded[RE_MAX_FOLDED]; PyObject* result; int i; - RE_LocaleInfo locale_info; + Py_UCS4 folded[RE_MAX_FOLDED]; Py_ssize_t flags; Py_ssize_t character; @@ -22794,6 +23138,15 @@ PyMODINIT_FUNC init_regex(void) { Splitter_Type.tp_iternext = splitter_iternext; Splitter_Type.tp_methods = splitter_methods; Splitter_Type.tp_members = splitter_members; +#if PY_VERSION_HEX >= 0x02060000 + + /* Initialise Capture_Type. */ + Capture_Type.tp_dealloc = capture_dealloc; + Capture_Type.tp_str = capture_str; + Capture_Type.tp_as_mapping = &capture_as_mapping; + Capture_Type.tp_flags = Py_TPFLAGS_DEFAULT; + Capture_Type.tp_methods = capture_methods; +#endif /* Initialize object types */ if (PyType_Ready(&Pattern_Type) < 0) @@ -22804,6 +23157,10 @@ PyMODINIT_FUNC init_regex(void) { return; if (PyType_Ready(&Splitter_Type) < 0) return; +#if PY_VERSION_HEX >= 0x02060000 + if (PyType_Ready(&Capture_Type) < 0) + return; +#endif error_exception = NULL; diff --git a/src/regex/_regex_core.py b/src/regex/_regex_core.py index aa0d63c8b8..e300b669e9 100644 --- a/src/regex/_regex_core.py +++ b/src/regex/_regex_core.py @@ -31,9 +31,21 @@ __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", # The regex exception. class error(Exception): - def __init__(self, message, set_error=False): + def __init__(self, message, pattern=None, pos=None): + newline = u'\n' if isinstance(pattern, unicode) else '\n' + self.msg = message + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + + message = "%s at position %d" % (message, pos) + + if newline in pattern: + message += " (line %d, column %d)" % (self.lineno, self.colno) + Exception.__init__(self, message) - self.set_error = set_error # The exception for when a positional flag has been turned on in the old # behaviour. @@ -210,7 +222,7 @@ OP = Namespace() for i, op in enumerate(OPCODES.split()): setattr(OP, op, i) -def _shrink_cache(cache_dict, args_dict, max_length, divisor=5): +def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): """Make room in the given cache. Args: @@ -247,10 +259,18 @@ def _shrink_cache(cache_dict, args_dict, max_length, divisor=5): # Ignore problems if the cache changed from another thread. pass - # Rebuild the arguments dictionary. + # Rebuild the arguments and locale-sensitivity dictionaries. args_dict.clear() + sensitivity_dict = {} for pattern, pattern_type, flags, args, default_version, locale in cache_dict: args_dict[pattern, pattern_type, flags, default_version, locale] = args + try: + sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] + except KeyError: + pass + + locale_sensitive.clear() + locale_sensitive.update(sensitivity_dict) def _fold_case(info, string): "Folds the case of a string." @@ -384,8 +404,11 @@ def apply_quantifier(source, info, counts, characters, case_flags, ch, element = Character(characters[-1], case_flags=case_flags) else: # The quantifier applies to the last item in the sequence. - if applied or not sequence: - raise error("nothing to repeat at position %d" % saved_pos) + if applied: + raise error("multiple repeat", source.string, saved_pos) + + if not sequence: + raise error("nothing to repeat", source.string, saved_pos) element = sequence.pop() @@ -420,7 +443,8 @@ def apply_constraint(source, info, constraints, characters, case_flags, else: # The constraint applies to the last item in the sequence. if applied or not sequence: - raise error("nothing for fuzzy constraint at position %d" % saved_pos) + raise error("nothing for fuzzy constraint", source.string, + saved_pos) element = sequence.pop() @@ -473,7 +497,8 @@ def parse_limited_quantifier(source): max_count = int(max_count) if max_count else None if max_count is not None and min_count > max_count: - raise error("min repeat greater than max repeat at position %d" % saved_pos) + raise error("min repeat greater than max repeat", source.string, + saved_pos) else: if not min_count: source.pos = saved_pos @@ -482,7 +507,7 @@ def parse_limited_quantifier(source): min_count = max_count = int(min_count) if is_above_limit(min_count) or is_above_limit(max_count): - raise error("repeat count too big at position %d" % saved_pos) + raise error("repeat count too big", source.string, saved_pos) if not source.match ("}"): source.pos = saved_pos @@ -507,7 +532,7 @@ def parse_fuzzy(source, ch): return None if not source.match("}"): - raise error("expected } at position %d" % source.pos) + raise error("expected }", source.string, source.pos) return constraints @@ -544,7 +569,7 @@ def parse_cost_constraint(source, constraints): max_cost -= 1 if max_cost < 0: - raise error("bad fuzzy cost limit at position %d" % cost_pos) + raise error("bad fuzzy cost limit", source.string, cost_pos) constraints[constraint] = 0, max_cost elif ch in DIGITS: @@ -575,7 +600,7 @@ def parse_cost_constraint(source, constraints): max_cost -= 1 if not 0 <= min_cost <= max_cost: - raise error("bad fuzzy cost limit at position %d" % cost_pos) + raise error("bad fuzzy cost limit", source.string, cost_pos) constraints[constraint] = min_cost, max_cost except ValueError: @@ -586,10 +611,10 @@ def parse_cost_constraint(source, constraints): def parse_constraint(source, constraints, ch): "Parses a constraint." if ch not in "deis": - raise error("bad fuzzy constraint at position %d" % source.pos) + raise error("bad fuzzy constraint", source.string, source.pos) if ch in constraints: - raise error("repeated fuzzy constraint at position %d" % source.pos) + raise error("repeated fuzzy constraint", source.string, source.pos) return ch @@ -605,7 +630,7 @@ def parse_fuzzy_compare(source): def parse_cost_equation(source, constraints): "Parses a cost equation." if "cost" in constraints: - raise error("more than one cost equation at position %d" % source.pos) + raise error("more than one cost equation", source.string, source.pos) cost = {} @@ -615,7 +640,7 @@ def parse_cost_equation(source, constraints): max_inc = parse_fuzzy_compare(source) if max_inc is None: - raise error("missing fuzzy cost limit at position %d" % source.pos) + raise error("missing fuzzy cost limit", source.string, source.pos) max_cost = int(parse_count(source)) @@ -623,7 +648,7 @@ def parse_cost_equation(source, constraints): max_cost -= 1 if max_cost < 0: - raise error("bad fuzzy cost limit at position %d" % source.pos) + raise error("bad fuzzy cost limit", source.string, source.pos) cost["max"] = max_cost @@ -637,7 +662,7 @@ def parse_cost_term(source, cost): raise ParseError() if ch in cost: - raise error("repeated fuzzy cost at position %d" % source.pos) + raise error("repeated fuzzy cost", source.string, source.pos) cost[ch] = int(coeff or 1) @@ -816,10 +841,11 @@ def parse_extension(source, info): return Group(info, group, subpattern) if ch == "=": # (?P=...: a named group reference. - name = parse_name(source) + name = parse_name(source, allow_numeric=True) source.expect(")") if info.is_open_group(name): - raise error("can't refer to an open group at position %d" % saved_pos) + raise error("cannot refer to an open group", source.string, + saved_pos) return make_ref_group(info, name, saved_pos) if ch == ">" or ch == "&": @@ -827,7 +853,7 @@ def parse_extension(source, info): return parse_call_named_group(source, info, saved_pos) source.pos = saved_pos - raise error("unknown extension at position %d" % saved_pos) + raise error("unknown extension", source.string, saved_pos) def parse_comment(source): "Parses a comment." @@ -941,7 +967,8 @@ def parse_flags(source, info): if source.match("-"): flags_off = parse_flag_set(source) if not flags_off: - raise error("bad inline flags: no flags after '-' at position %d" % source.pos) + raise error("bad inline flags: no flags after '-'", source.string, + source.pos) else: flags_off = 0 @@ -973,10 +1000,12 @@ def parse_flags_subpattern(source, info): flags_on, flags_off = parse_flags(source, info) if flags_off & GLOBAL_FLAGS: - raise error("bad inline flags: can't turn off global flag at position %d" % source.pos) + raise error("bad inline flags: cannot turn off global flag", + source.string, source.pos) if flags_on & flags_off: - raise error("bad inline flags: flag turned on and off at position %d" % source.pos) + raise error("bad inline flags: flag turned on and off", source.string, + source.pos) # Handle flags which are global in all regex behaviours. new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS @@ -996,7 +1025,7 @@ def parse_flags_subpattern(source, info): parse_positional_flags(source, info, flags_on, flags_off) return FLAGS - raise error("unknown extension at position %d" % source.pos) + raise error("unknown extension", source.string, source.pos) def parse_positional_flags(source, info, flags_on, flags_off): "Parses positional flags." @@ -1004,7 +1033,8 @@ def parse_positional_flags(source, info, flags_on, flags_off): if version == VERSION0: # Positional flags are global and can only be turned on. if flags_off: - raise error("bad inline flags: can't turn flags off at position %d" % source.pos) + raise error("bad inline flags: cannot turn flags off", + source.string, source.pos) new_global_flags = flags_on & ~info.global_flags if new_global_flags: @@ -1017,19 +1047,22 @@ def parse_positional_flags(source, info, flags_on, flags_off): source.ignore_space = bool(info.flags & VERBOSE) -def parse_name(source, allow_numeric=False): +def parse_name(source, allow_numeric=False, allow_group_0=False): "Parses a name." name = source.get_while(set(")>"), include=False) if not name: - raise error("bad group name at position %d" % source.pos) + raise error("missing group name", source.string, source.pos) if name.isdigit(): - if not allow_numeric: - raise error("bad group name at position %d" % source.pos) + min_group = 0 if allow_group_0 else 1 + if not allow_numeric or int(name) < min_group: + raise error("bad character in group name", source.string, + source.pos) else: if not is_identifier(name): - raise error("bad group name at position %d" % source.pos) + raise error("bad character in group name", source.string, + source.pos) return name @@ -1064,10 +1097,10 @@ def parse_escape(source, info, in_set): source.ignore_space = saved_ignore if not ch: # A backslash at the end of the pattern. - raise error("bad escape at position %d" % source.pos) + raise error("bad escape (end of pattern)", source.string, source.pos) if ch in HEX_ESCAPES: # A hexadecimal escape sequence. - return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set) + return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set, ch) elif ch == "g" and not in_set: # A group reference. saved_pos = source.pos @@ -1150,7 +1183,7 @@ def parse_numeric_escape(source, info, ch, in_set): # Group reference. source.pos = saved_pos if info.is_open_group(digits): - raise error("can't refer to an open group at position %d" % source.pos) + raise error("cannot refer to an open group", source.string, source.pos) return make_ref_group(info, digits, source.pos) @@ -1168,15 +1201,21 @@ def parse_octal_escape(source, info, digits, in_set): value = int("".join(digits), 8) return make_character(info, value, in_set) except ValueError: - raise error("bad octal escape at position %d" % source.pos) + if digits[0] in OCT_DIGITS: + raise error("incomplete escape \\%s" % ''.join(digits), + source.string, source.pos) + else: + raise error("bad escape \\%s" % digits[0], source.string, + source.pos) -def parse_hex_escape(source, info, expected_len, in_set): +def parse_hex_escape(source, info, expected_len, in_set, type): "Parses a hex escape sequence." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: - raise error("bad hex escape at position %d" % source.pos) + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, source.pos) digits.append(ch) value = int("".join(digits), 16) @@ -1189,7 +1228,7 @@ def parse_group_ref(source, info): name = parse_name(source, True) source.expect(">") if info.is_open_group(name): - raise error("can't refer to an open group at position %d" % source.pos) + raise error("cannot refer to an open group", source.string, source.pos) return make_ref_group(info, name, saved_pos) @@ -1199,7 +1238,7 @@ def parse_string_set(source, info): name = parse_name(source, True) source.expect(">") if name is None or name not in info.kwargs: - raise error("undefined named list at position %d" % source.pos) + raise error("undefined named list", source.string, source.pos) return make_string_set(info, name) @@ -1213,7 +1252,8 @@ def parse_named_char(source, info, in_set): value = unicodedata.lookup(name) return make_character(info, ord(value), in_set) except KeyError: - raise error("undefined character name at position %d" % source.pos) + raise error("undefined character name", source.string, + source.pos) source.pos = saved_pos return make_character(info, ord("N"), in_set) @@ -1227,12 +1267,12 @@ def parse_property(source, info, positive, in_set): prop_name, name = parse_property_name(source) if source.match("}"): # It's correctly delimited. - prop = lookup_property(prop_name, name, positive != negate, source_pos=source.pos) + prop = lookup_property(prop_name, name, positive != negate, source) return make_property(info, prop, in_set) elif ch and ch in "CLMNPSZ": # An abbreviated property, eg \pL. - prop = lookup_property(None, ch, positive) - return make_property(info, prop, in_set, source_pos=source.pos) + prop = lookup_property(None, ch, positive, source) + return make_property(info, prop, in_set) # Not a property, so treat as a literal "p" or "P". source.pos = saved_pos @@ -1276,7 +1316,7 @@ def parse_set(source, info): item = parse_set_union(source, info) if not source.match("]"): - raise error("missing ] at position %d" % source.pos) + raise error("missing ]", source.string, source.pos) finally: source.ignore_space = saved_ignore @@ -1354,17 +1394,26 @@ def parse_set_member(source, info): "Parses a member in a character set." # Parse a set item. start = parse_set_item(source, info) + saved_pos1 = source.pos if (not isinstance(start, Character) or not start.positive or not source.match("-")): # It's not the start of a range. return start + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + # It looks like the start of a range of characters. - saved_pos = source.pos + saved_pos2 = source.pos + if version == VERSION1 and source.match("-"): + # It's actually the set difference operator '--', so return the + # character. + source.pos = saved_pos1 + return start + if source.match("]"): # We've reached the end of the set, so return both the character and # hyphen. - source.pos = saved_pos + source.pos = saved_pos2 return SetUnion(info, [start, Character(ord("-"))]) # Parse a set item. @@ -1375,7 +1424,7 @@ def parse_set_member(source, info): # It _is_ a range. if start.value > end.value: - raise error("bad character range at position %d" % source.pos) + raise error("bad character range", source.string, source.pos) if start.value == end.value: return start @@ -1407,7 +1456,7 @@ def parse_set_item(source, info): item = parse_set_union(source, info) if not source.match("]"): - raise error("missing ] at position %d" % source.pos) + raise error("missing ]", source.string, source.pos) if negate: item = item.with_flags(positive=not item.positive) @@ -1416,7 +1465,7 @@ def parse_set_item(source, info): ch = source.get() if not ch: - raise error("bad set at position %d" % source.pos, True) + raise error("unterminated character set", source.string, source.pos) return Character(ord(ch)) @@ -1427,7 +1476,7 @@ def parse_posix_class(source, info): if not source.match(":]"): raise ParseError() - return lookup_property(prop_name, name, positive=not negate, source_pos=source.pos) + return lookup_property(prop_name, name, not negate, source) def float_to_rational(flt): "Converts a float to a rational pair." @@ -1442,7 +1491,7 @@ def float_to_rational(flt): def numeric_to_rational(numeric): "Converts a numeric string to a rational string, if possible." - if numeric[0] == "-": + if numeric[ : 1] == "-": sign, numeric = numeric[0], numeric[1 : ] else: sign = "" @@ -1468,7 +1517,7 @@ def standardise_name(name): except (ValueError, ZeroDivisionError): return "".join(ch for ch in name if ch not in "_- ").upper() -def lookup_property(property, value, positive, source_pos=None): +def lookup_property(property, value, positive, source=None): "Looks up a property." # Normalise the names (which may still be lists). property = standardise_name(property) if property else None @@ -1481,12 +1530,18 @@ def lookup_property(property, value, positive, source_pos=None): # Both the property and the value are provided. prop = PROPERTIES.get(property) if not prop: - raise error("unknown property at position %d" % source_pos) + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) prop_id, value_dict = prop val_id = value_dict.get(value) if val_id is None: - raise error("unknown property value at position %d" % source_pos) + if not source: + raise error("unknown property value") + + raise error("unknown property value", source.string, source.pos) if "YES" in value_dict and val_id == 0: positive, val_id = not positive, 1 @@ -1526,7 +1581,10 @@ def lookup_property(property, value, positive, source_pos=None): return Property((prop_id << 16) | val_id, positive) # Unknown property. - raise error("unknown property at position %d" % source_pos) + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) def _compile_replacement(source, pattern, is_unicode): "Compiles a replacement template escape sequence." @@ -1539,7 +1597,7 @@ def _compile_replacement(source, pattern, is_unicode): if ch in HEX_ESCAPES and (ch == "x" or is_unicode): # A hexadecimal escape sequence. - return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch])] + return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)] if ch == "g": # A group preference. @@ -1595,18 +1653,19 @@ def _compile_replacement(source, pattern, is_unicode): if not ch: # A trailing backslash. - raise error("bad escape at position %d" % source.pos) + raise error("bad escape (end of pattern)", source.string, source.pos) # An escaped non-backslash is a backslash followed by the literal. return False, [ord("\\"), ord(ch)] -def parse_repl_hex_escape(source, expected_len): +def parse_repl_hex_escape(source, expected_len, type): "Parses a hex escape sequence in a replacement string." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: - raise error("bad hex escape at position %d" % source.pos) + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, source.pos) digits.append(ch) return int("".join(digits), 16) @@ -1622,7 +1681,8 @@ def parse_repl_named_char(source): value = unicodedata.lookup(name) return ord(value) except KeyError: - raise error("undefined character name at position %d" % source.pos) + raise error("undefined character name", source.string, + source.pos) source.pos = saved_pos return None @@ -1630,13 +1690,13 @@ def parse_repl_named_char(source): def compile_repl_group(source, pattern): "Compiles a replacement template group reference." source.expect("<") - name = parse_name(source, True) + name = parse_name(source, True, True) source.expect(">") if name.isdigit(): index = int(name) if not 0 <= index <= pattern.groups: - raise error("invalid group at position %d" % source.pos) + raise error("invalid group reference", source.string, source.pos) return index @@ -1689,7 +1749,7 @@ class RegexBase(object): return self.rebuild(positive, case_flags, zerowidth) - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): pass def optimise(self, info): @@ -1797,8 +1857,8 @@ class Atomic(RegexBase): RegexBase.__init__(self) self.subpattern = subpattern - def fix_groups(self, reverse, fuzzy): - self.subpattern.fix_groups(reverse, fuzzy) + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): self.subpattern = self.subpattern.optimise(info) @@ -1857,9 +1917,9 @@ class Branch(RegexBase): RegexBase.__init__(self) self.branches = branches - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): for b in self.branches: - b.fix_groups(reverse, fuzzy) + b.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): # Flatten branches within branches. @@ -2235,27 +2295,27 @@ class CallGroup(RegexBase): self._key = self.__class__, self.group - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: - raise error("unknown group at position %d" % self.position) + raise error("invalid group reference", pattern, self.position) if not 0 <= self.group <= self.info.group_count: - raise error("unknown group at position %d" % self.position) + raise error("unknown group", pattern, self.position) if self.group > 0 and self.info.open_group_count[self.group] > 1: - raise error("ambiguous group reference at position %d" % self.position) + raise error("ambiguous group reference", pattern, self.position) self.info.group_calls.append((self, reverse, fuzzy)) self._key = self.__class__, self.group def remove_captures(self): - raise error("group reference not allowed at position %d" % self.position) + raise error("group reference not allowed", pattern, self.position) def _compile(self, reverse, fuzzy): return [(OP.GROUP_CALL, self.call_ref)] @@ -2352,20 +2412,20 @@ class Conditional(RegexBase): self.no_item = no_item self.position = position - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: - raise error("unknown group at position %d" % self.position) + raise error("unknown group", pattern, self.position) if not 1 <= self.group <= self.info.group_count: - raise error("unknown group at position %d" % self.position) + raise error("invalid group reference", pattern, self.position) - self.yes_item.fix_groups(reverse, fuzzy) - self.no_item.fix_groups(reverse, fuzzy) + self.yes_item.fix_groups(pattern, reverse, fuzzy) + self.no_item.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): yes_item = self.yes_item.optimise(info) @@ -2496,8 +2556,8 @@ class Fuzzy(RegexBase): constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max": constraints["e"][1]} - def fix_groups(self, reverse, fuzzy): - self.subpattern.fix_groups(reverse, True) + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, True) def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) @@ -2612,8 +2672,8 @@ class GreedyRepeat(RegexBase): self.min_count = min_count self.max_count = max_count - def fix_groups(self, reverse, fuzzy): - self.subpattern.fix_groups(reverse, fuzzy) + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) @@ -2700,9 +2760,9 @@ class Group(RegexBase): self.call_ref = None - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): self.info.defined_groups[self.group] = (self, reverse, fuzzy) - self.subpattern.fix_groups(reverse, fuzzy) + self.subpattern.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) @@ -2788,8 +2848,8 @@ class LookAround(RegexBase): self.positive = bool(positive) self.subpattern = subpattern - def fix_groups(self, reverse, fuzzy): - self.subpattern.fix_groups(self.behind, fuzzy) + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, self.behind, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) @@ -2982,22 +3042,22 @@ class RefGroup(RegexBase): self._key = self.__class__, self.group, self.case_flags - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: - raise error("unknown group at position %d" % self.position) + raise error("unknown group", pattern, self.position) if not 1 <= self.group <= self.info.group_count: - raise error("unknown group at position %d" % self.position) + raise error("invalid group reference", pattern, self.position) self._key = self.__class__, self.group, self.case_flags def remove_captures(self): - raise error("group reference not allowed at position %d" % self.position) + raise error("group reference not allowed", pattern, self.position) def _compile(self, reverse, fuzzy): flags = 0 @@ -3024,9 +3084,9 @@ class Sequence(RegexBase): self.items = items - def fix_groups(self, reverse, fuzzy): + def fix_groups(self, pattern, reverse, fuzzy): for s in self.items: - s.fix_groups(reverse, fuzzy) + s.fix_groups(pattern, reverse, fuzzy) def optimise(self, info): # Flatten the sequences. @@ -3208,7 +3268,7 @@ class SetBase(RegexBase): print "%s%s %s%s" % (INDENT * indent, self._op_name, POS_TEXT[self.positive], CASE_TEXT[self.case_flags]) for i in self.items: - i.dump(indent + 1) + i.dump(indent + 1, reverse) def _handle_case_folding(self, info, in_set): # Is the set case-sensitive? @@ -3494,9 +3554,9 @@ class String(RegexBase): class Literal(String): def _dump(self, indent, reverse): for c in self.characters: - display = ascii("".join(chr(c))).lstrip("bu") - print("{}CHARACTER MATCH {}{}".format(INDENT * indent, - display, CASE_TEXT[self.case_flags])) + display = repr(unichr(c)).lstrip("bu") + print "%sCHARACTER MATCH %s%s" % (INDENT * indent, display, + CASE_TEXT[self.case_flags]) class StringSet(RegexBase): _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False): @@ -3792,7 +3852,7 @@ class Source(object): def expect(self, substring): if not self.match(substring): - raise error("missing %s at position %d" % (substring, self.pos)) + raise error("missing %s" % substring, self.string, self.pos) def at_end(self): string = self.string @@ -3953,7 +4013,7 @@ class Scanner: source.ignore_space = bool(info.flags & VERBOSE) parsed = _parse_pattern(source, info) if not source.at_end(): - raise error("trailing characters at position %d" % source.pos) + raise error("unbalanced parenthesis", source.string, source.pos) # We want to forbid capture groups within each phrase. patterns.append(parsed.remove_captures()) @@ -3977,7 +4037,8 @@ class Scanner: # Complain if there are any group calls. They are not supported by the # Scanner class. if info.call_refs: - raise error("recursive regex not supported by Scanner") + raise error("recursive regex not supported by Scanner", + source.string, source.pos) reverse = bool(info.flags & REVERSE)